├── tests
│   ├── __init__.py
│   ├── test_entities.py
│   ├── chromeget.py
│   ├── test_torproxy.py
│   ├── test_processors.py
│   └── test_sqlfeedstorage.py
├── scrapy_plus
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── starturls.py
│   │   ├── spiders.py
│   │   └── parser.py
│   ├── extensions
│   │   ├── __init__.py
│   │   ├── oss.py
│   │   └── sql.py
│   ├── pipelines
│   │   ├── __init__.py
│   │   ├── mongo.py
│   │   ├── images.py
│   │   └── files.py
│   ├── dupefilters
│   │   ├── __init__.py
│   │   ├── redis.py
│   │   └── redisbloom.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── taobao.py
│   │   ├── netease.py
│   │   └── douban.py
│   ├── items
│   │   ├── __init__.py
│   │   ├── NewsItem.py
│   │   ├── ProductItem.py
│   │   └── BookItem.py
│   ├── middlewares
│   │   ├── __init__.py
│   │   ├── proxy.py
│   │   ├── autologin.py
│   │   ├── chrome.py
│   │   ├── tor.py
│   │   ├── ua.py
│   │   ├── huaban.py
│   │   └── splash.py
│   └── processors.py
├── requirements.txt
├── setup.py
├── .gitignore
└── README.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapy_plus/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapy_plus/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapy_plus/extensions/__init__.py:
--------------------------------------------------------------------------------
1 | from .oss import OSSFeedStorage
2 | from .sql import SQLFeedStorage, SQLItemExporter
3 |
--------------------------------------------------------------------------------
/scrapy_plus/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | from .mongo import MongoDBPipeline
2 | from .images import ImagesPipeline
3 |
4 | __all__ = ["MongoDBPipeline", "ImagesPipeline"]
5 |
--------------------------------------------------------------------------------
/scrapy_plus/dupefilters/__init__.py:
--------------------------------------------------------------------------------
1 | from .redis import RedisDupeFilter
2 | from .redisbloom import RedisBloomDupeFilter
3 |
4 | __all__ = ["RedisBloomDupeFilter", "RedisDupeFilter"]
--------------------------------------------------------------------------------
/scrapy_plus/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | from .douban import BookSpider
2 | from .netease import NeteaseSpider
3 | from .taobao import TaobaoSpider
4 |
5 | __all__ = ["BookSpider", "NeteaseSpider", "TaobaoSpider"]
6 |
--------------------------------------------------------------------------------
/scrapy_plus/items/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from .NewsItem import NewsItem
3 | from .ProductItem import ProductItem
4 | from .BookItem import BookItem
5 | 
6 | __all__ = ["NewsItem", "ProductItem", "BookItem"]
7 | 
--------------------------------------------------------------------------------
/scrapy_plus/middlewares/__init__.py:
--------------------------------------------------------------------------------
1 | from .autologin import LoginMiddleWare
2 | from .chrome import ChromeMiddleware
3 | from .proxy import RandomProxyMiddleware
4 | from .splash import SplashSpiderMiddleware
5 | from .tor import TorProxyMiddleware
6 | from .ua import RandomUserAgentMiddleware
7 |
--------------------------------------------------------------------------------
/tests/test_entities.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from sqlalchemy.ext.declarative import declarative_base
4 | from sqlalchemy import Column, Integer, String, Float, Text, DateTime
5 |
6 | Base = declarative_base()
7 |
8 |
9 | class Book(Base):
10 | """
11 |     Entity used as test data.
12 | """
13 | __tablename__ = "books"
14 | id = Column(Integer, primary_key=True)
15 | name = Column(String)
16 | alias = Column(String)
17 | summary = Column(Text)
18 |
19 |
--------------------------------------------------------------------------------
/scrapy_plus/middlewares/proxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import random
4 |
5 |
6 | class RandomProxyMiddleware(object):
7 | """
8 |     Random proxy middleware: each request gets a proxy picked at random from the HTTP_PROXIES list defined in settings.py.
9 | """
10 | @classmethod
11 | def from_crawler(cls, crawler):
12 | return cls(proxies=crawler.settings.getlist('HTTP_PROXIES'))
13 |
14 |     def __init__(self, proxies=None):
15 |         self.proxies = proxies or []
16 |
17 | def process_request(self, request, spider):
18 | request.meta['proxy'] = random.choice(self.proxies)
19 |
--------------------------------------------------------------------------------
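Note: a minimal settings.py sketch for wiring up RandomProxyMiddleware; the middleware priority and proxy address are placeholders, not values taken from this repository:

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_plus.middlewares.RandomProxyMiddleware': 750,  # priority chosen arbitrarily
    }
    HTTP_PROXIES = [
        'http://127.0.0.1:8118',  # example proxy, replace with real ones
    ]
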
/tests/chromeget.py:
--------------------------------------------------------------------------------
1 | # coding=utf8
2 | import os
3 | from selenium import webdriver
4 | from selenium.webdriver.common.keys import Keys
5 | from selenium.webdriver.chrome.options import Options
6 |
7 | # Use Chrome in headless mode instead of the old PhantomJS approach
8 | chrome_options = Options()
9 | chrome_options.add_argument("--headless")  # run headless
10 |
11 | browser = webdriver.Chrome(executable_path="/usr/local/Caskroom/chromedriver/2.46/chromedriver", chrome_options=chrome_options)
12 |
13 | browser.get("http://www.baidu.com")
14 | #browser.get("https://s.taobao.com/search?q=Vue2&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306")
15 | browser.implicitly_wait(1)
16 |
17 | element = browser.find_element_by_id('kw')
18 | #button = browser.find_element_by_css_selector('form button.btn-search')
19 |
20 | print(element.get_attribute('name'))
21 | browser.close()
--------------------------------------------------------------------------------
/scrapy_plus/items/NewsItem.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy.item import Item, Field
3 | from scrapy.loader.processors import TakeFirst, MapCompose, Compose, Identity, Join
4 | from w3lib.html import remove_tags
5 |
6 |
7 | class NewsItem(Item):
8 | title = Field(output_processor=TakeFirst())
9 | desc = Field(input_processor=MapCompose(str.strip,
10 | stop_on_none=True),
11 | output_processor=TakeFirst())
12 | link = Field(output_processor=TakeFirst())
13 | pub_date = Field(input_processor=MapCompose(lambda v: v.split()[0],
14 | stop_on_none=True),
15 | output_processor=TakeFirst())
16 | body = Field(input_processor=MapCompose(remove_tags, str.strip,
17 | stop_on_none=True),
18 | output_processor=TakeFirst())
19 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aliyun-python-sdk-core==2.13.30
2 | aliyun-python-sdk-core-v3==2.13.11
3 | aliyun-python-sdk-kms==2.13.0
4 | attrs==20.3.0
5 | Automat==20.2.0
6 | certifi==2020.12.5
7 | cffi==1.14.4
8 | chardet==4.0.0
9 | constantly==15.1.0
10 | crcmod==1.7
11 | cryptography==3.3.2
12 | cssselect==1.1.0
13 | dateparser==1.0.0
14 | hyperlink==20.0.1
15 | idna==2.10
16 | incremental==17.5.0
17 | itemadapter==0.2.0
18 | itemloaders==1.0.4
19 | jmespath==0.10.0
20 | lxml==4.6.5
21 | oss2==2.13.1
22 | parsel==1.6.0
23 | Protego==0.1.16
24 | pyasn1==0.4.8
25 | pyasn1-modules==0.2.8
26 | pycparser==2.20
27 | pycryptodome==3.9.9
28 | PyDispatcher==2.0.5
29 | PyHamcrest==2.0.2
30 | pymongo==3.11.2
31 | pyOpenSSL==20.0.1
32 | python-dateutil==2.8.1
33 | pytz==2020.4
34 | queuelib==1.5.0
35 | redis==3.5.3
36 | regex==2020.11.13
37 | requests==2.25.1
38 | Scrapy==2.5.1
39 | scrapy-splash==0.8.0
40 | selenium==3.141.0
41 | service-identity==18.1.0
42 | six==1.15.0
43 | SQLAlchemy==1.3.20
44 | stem==1.8.0
45 | Twisted==20.3.0
46 | tzlocal==2.1
47 | urllib3==1.26.5
48 | w3lib==1.22.0
49 | zope.interface==5.2.0
50 |
--------------------------------------------------------------------------------
/tests/test_torproxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import unittest
4 | from scrapy_plus.middlewares.tor import TorProxyMiddleware
5 | from scrapy.http import Request
6 | from urllib3 import ProxyManager
7 |
8 |
9 | class TorProxyMiddlewareTestCase(unittest.TestCase):
10 |
11 | def test_tor_should_change_diff_ips(self):
12 | tor = TorProxyMiddleware(tor_proxy='127.0.0.1:8118',
13 | tor_password='mypassword',
14 | after_times=2)
15 | request = Request(url='http://www.baidu.com')
16 | ip = self.get_ip()
17 | for i in range(1, 10):
18 | tor.process_request(request, None)
19 | if i > 1 and (i % 2) != 0:
20 | new_ip = self.get_ip()
21 | self.assertNotEqual(ip, new_ip)
22 | ip = new_ip
23 |
24 | def get_ip(self):
25 | http = ProxyManager('http://127.0.0.1:8118')
26 | body = http.request('GET', 'http://icanhazip.com')
27 | return str(body.data, 'utf-8').replace('\n', '')
28 |
29 |
30 | if __name__ == '__main__':
31 | unittest.main()
32 |
--------------------------------------------------------------------------------
/scrapy_plus/dupefilters/redis.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import logging
3 | from redis import Redis
4 | from scrapy.dupefilters import BaseDupeFilter
5 |
6 |
7 |
8 | class RedisDupeFilter(BaseDupeFilter):
9 | """
10 |     Redis-backed duplicate request filter
11 | """
12 | def __init__(self, host='localhost', port=6379, db=0):
13 | self.redis = Redis(host=host, port=port, db=db)
14 | self.logger = logging.getLogger(__name__)
15 |
16 | @classmethod
17 | def from_settings(cls, settings):
18 | host = settings.get('REDIS_HOST', 'localhost')
19 |         redis_port = settings.getint('REDIS_PORT', 6379)
20 |         redis_db = settings.getint('REDIS_DUP_DB', 0)
21 | return cls(host, redis_port, redis_db)
22 |
23 | def request_seen(self, request):
24 | fp = request.url
25 | key = 'UrlFingerprints'
26 | if not self.redis.sismember(key, fp):
27 | self.redis.sadd(key, fp)
28 | return False
29 | return True
30 |
31 | def log(self, request, spider):
32 |         msg = "Filtered duplicate request: %(request)s"
33 | self.logger.debug(msg, {'request': request}, extra={'spider': spider})
34 | spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
35 |
36 |
37 |
--------------------------------------------------------------------------------
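Note: a minimal settings.py sketch for switching Scrapy to this Redis-based dupefilter; the values shown are the defaults the code falls back to:

    # settings.py (sketch)
    DUPEFILTER_CLASS = 'scrapy_plus.dupefilters.RedisDupeFilter'
    REDIS_HOST = 'localhost'
    REDIS_PORT = 6379
    REDIS_DUP_DB = 0
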
/scrapy_plus/items/ProductItem.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy import Item, Field
4 | from scrapy.loader.processors import TakeFirst
5 | from scrapy_plus.processors import Text, CleanText, Price, Image, Url, Number
6 |
7 |
8 | class ProductItem(Item):
9 |     """
10 |     Product entity
11 |     """
12 |     name = Field(input_processor=CleanText(),
13 |                  output_processor=TakeFirst())  # product name
14 |     link = Field(input_processor=Url(),
15 |                  output_processor=TakeFirst())  # link URL
16 |     image_urls = Field(input_processor=Image(),
17 |                        output_processor=TakeFirst())  # product image URL
18 |     image_files = Field()  # local paths of the downloaded images
19 |     price = Field(input_processor=Price(),
20 |                   output_processor=TakeFirst())  # price
21 |     deal = Field(input_processor=Number(),
22 |                  output_processor=TakeFirst())  # number of completed deals
23 |     free_shipping = Field(input_processor=CleanText(),
24 |                           output_processor=TakeFirst())  # free shipping or not
25 |     shop = Field(input_processor=CleanText(),
26 |                  output_processor=TakeFirst())  # Taobao shop name
27 |     location = Field(input_processor=CleanText(),
28 |                      output_processor=TakeFirst())  # seller location
29 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from setuptools import setup, find_packages
4 |
5 | with open('requirements.txt') as reqs_file:
6 |     REQS = reqs_file.read().splitlines()
7 |
8 | with open('README.md', encoding='utf-8') as readme_file:
9 | README = readme_file.read()
10 |
11 | setup(
12 | name='scrapy_plus',
13 | version='1.0.5',
14 | packages=find_packages(exclude=["tests"]),
15 | install_requires=REQS,
16 | url='http://www.github.com/dotnetage/scrapy_plus',
17 | license='BSD',
18 | author='Ray',
19 | author_email='csharp2002@hotmail.com',
20 | description="scrapy 常用爬网必备工具包",
21 | long_description=README,
22 | long_description_content_type='text/markdown',
23 | zip_safe=False,
24 | platforms='any',
25 | keywords=('scrapy', 'crawl', 'redis', 'tor'),
26 | classifiers=['Development Status :: 4 - Beta',
27 | 'Intended Audience :: Developers',
28 | 'License :: OSI Approved :: BSD License',
29 | 'Natural Language :: English',
30 | 'Operating System :: OS Independent',
31 | 'Programming Language :: Python :: 3.6',
32 | 'Topic :: Software Development :: Libraries',
33 | 'Topic :: Utilities'])
34 |
--------------------------------------------------------------------------------
/tests/test_processors.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import unittest
3 | from scrapy_plus.processors import Number, Text, Date, Price
4 |
5 |
6 | class ProcessorTestCase(unittest.TestCase):
7 |
8 | def test_number_processor(self):
9 | tests_text = "共93.2人"
10 | expected = 93.2
11 | processor = Number()
12 |
13 | actual = processor([tests_text])
14 | self.assertEqual(actual[0], expected)
15 |
16 | def test_text_processor(self):
17 |         tests_text = "\nThis is a text with some html tags\n"
18 | expected_text = "This is a text with some html tags"
19 | processor = Text()
20 |
21 | actual = processor([tests_text])
22 | self.assertEqual(actual[0], expected_text)
23 |
24 | def test_price_processor(self):
25 | tests_text = "¥24.2 元"
26 | expected = 24.2
27 | processor = Price()
28 |
29 | actual = processor([tests_text])
30 | self.assertEqual(actual[0], expected)
31 |
32 | def test_date_processor(self):
33 | tests_text = "2015年2月3日"
34 | expected_text = '2015-02-03T00:00:00'
35 | processor = Date()
36 |
37 | actual = processor([tests_text])
38 | self.assertEqual(actual[0].strftime('%Y-%m-%dT%H:%M:%S'), expected_text)
39 |
--------------------------------------------------------------------------------
/scrapy_plus/pipelines/mongo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import pymongo
4 | import logging
5 |
6 | logger = logging.getLogger(__name__)
7 |
8 |
9 | class MongoDBPipeline(object):
10 | """
11 |     MongoDB item pipeline
12 | 
13 |     Configuration:
14 |     ITEM_PIPELINES = {'scrapy_plus.pipelines.MongoDBPipeline': 300}
15 | 
16 |     MONGODB_SERVER = "localhost"
17 |     MONGODB_PORT = 27017
18 |     MONGODB_DB = "database name"
19 |     MONGODB_COLLECTION = "collection name"
20 | """
21 |
22 | def __init__(self, server=None, port=None, db_name=None, col=None):
23 | connection = pymongo.MongoClient(server, port)
24 | db = connection[db_name]
25 | self.collection = db[col]
26 |
27 | @classmethod
28 | def from_settings(cls, settings):
29 |         server = settings['MONGODB_SERVER']
30 | port = settings['MONGODB_PORT']
31 | db_name = settings['MONGODB_DB']
32 | collection_name = settings['MONGODB_COLLECTION']
33 | return cls(server, port, db_name, collection_name)
34 |
35 | def process_item(self, item, spider):
36 |         self.collection.insert_one(dict(item))
37 |         logger.debug("Item inserted into MongoDB", extra={'spider': spider})
38 | spider.crawler.stats.inc_value(
39 | 'mongodb/inserted', spider=spider)
40 | return item
41 |
--------------------------------------------------------------------------------
/scrapy_plus/spiders/taobao.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import Spider, Request
3 | import urllib
4 | from ..items import ProductItem
5 | from scrapy.loader import ItemLoader
6 |
7 |
8 | class TaobaoSpider(Spider):
9 | name = 'taobao'
10 | allowed_domains = ['s.taobao.com']
11 | base_url = 'https://s.taobao.com/search?q=%s'
12 |
13 | def start_requests(self):
14 | keywords = self.gen_keywords()
15 |
16 | for kw in keywords:
17 | url = self.base_url % urllib.parse.quote(kw.encode('utf-8'))
18 | yield Request(url, self.parse, meta={'kw': kw})
19 |
20 | def gen_keywords(self):
21 |         raise NotImplementedError
22 |
23 | def parse(self, response):
24 |
25 | products = response.css('#mainsrp-itemlist .items .item')
26 |
27 | for product in products:
28 | loader = ItemLoader(item=ProductItem(), selector=product)
29 | loader.add_css('price', '.price>strong::text')
30 | loader.add_css('name', 'div.title>a::text')
31 | loader.add_css('shop', '.shopname>span::text')
32 |             loader.add_css('image_urls', '.pic img::attr(data-src)')
33 | loader.add_css('deal', '.deal-cnt::text')
34 | loader.add_css('location', '.location::text')
35 | loader.add_css('link', 'div.title>a::attr(href)')
36 | loader.add_css('free_shipping', '.icon-service-free')
37 |
38 | yield loader.load_item()
39 |
--------------------------------------------------------------------------------
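Note: gen_keywords() is abstract (it raises NotImplementedError), so TaobaoSpider is meant to be subclassed. A minimal, hypothetical subclass sketch; the spider name and keywords are made up:

    # myproject/spiders/vue_books.py (hypothetical example)
    from scrapy_plus.spiders import TaobaoSpider

    class VueBookSpider(TaobaoSpider):
        name = 'taobao_vue'

        def gen_keywords(self):
            # return any iterable of search keywords
            return ['Vue2', 'scrapy']
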
/scrapy_plus/pipelines/images.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy.pipelines.images import ImagesPipeline as _ImagesPipeline
4 | from scrapy.pipelines.files import FSFilesStore, S3FilesStore, GCSFilesStore
5 | from .files import OSSFilesStore
6 |
7 | class ImagesPipeline(_ImagesPipeline):
8 | STORE_SCHEMES = {
9 | '': FSFilesStore,
10 | 'file': FSFilesStore,
11 | 's3': S3FilesStore,
12 | 'gs': GCSFilesStore,
13 | 'oss':OSSFilesStore
14 | }
15 |
16 | def __init__(self, store_uri, download_func=None, settings=None):
17 | super(ImagesPipeline, self).__init__(store_uri, settings=settings,
18 | download_func=download_func)
19 |
20 | @classmethod
21 | def from_settings(cls, settings):
22 | s3store = cls.STORE_SCHEMES['s3']
23 | s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
24 | s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
25 | s3store.POLICY = settings['IMAGES_STORE_S3_ACL']
26 |
27 | gcs_store = cls.STORE_SCHEMES['gs']
28 | gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
29 | gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None
30 |
31 | ossStore = cls.STORE_SCHEMES['oss']
32 | ossStore.OSS_ACCESS_KEY = settings['OSS_ACCESS_KEY']
33 | ossStore.OSS_ACCESS_SECRET = settings['OSS_ACCESS_SECRET']
34 |
35 | store_uri = settings['IMAGES_STORE']
36 | return cls(store_uri, settings=settings)
--------------------------------------------------------------------------------
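Note: this subclass only adds the oss:// scheme on top of Scrapy's stock ImagesPipeline. A hedged settings.py sketch; bucket, endpoint and prefix are placeholders, and the pipeline priority of 1 is just the conventional value from the Scrapy docs:

    # settings.py (sketch)
    ITEM_PIPELINES = {
        'scrapy_plus.pipelines.ImagesPipeline': 1,
    }
    # <scheme>://<bucket>.<endpoint>/<object-prefix>
    IMAGES_STORE = 'oss://my-bucket.oss-cn-hangzhou.aliyuncs.com/images'
    OSS_ACCESS_KEY = '...'
    OSS_ACCESS_SECRET = '...'
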
/scrapy_plus/spiders/netease.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy.linkextractors import LinkExtractor
3 | from scrapy.spiders import CrawlSpider, Rule
4 | from ..items import NewsItem
5 | from scrapy.loader import ItemLoader
6 |
7 |
8 | class NeteaseSpider(CrawlSpider):
9 | name = 'netease'
10 | allowed_domains = ['163.com']
11 | urls = 'https://www.163.com/'
12 | start_urls = urls.split(',')
13 |
14 | rules = (
15 | Rule(LinkExtractor(allow=r'(\w+):\/\/([^/:]+)\/(\d{2})+\/(\d{4})+\/(\d{2})+\/([^#]*)'),
16 | callback='parse_item', follow=True),
17 | )
18 |
19 | def parse_item(self, response):
20 | loader = ItemLoader(item=NewsItem(), response=response)
21 | loader.add_css('title', '#epContentLeft>h1::text')
22 | loader.add_css('pub_date', '#epContentLeft .post_time_source::text')
23 | loader.add_css('desc', '#epContentLeft .post_desc::text')
24 |
25 |         # Games channel: play.163.com
26 | loader.add_css('title', 'h1.article-h1::text')
27 | loader.add_css('desc', '.artical-summary::text')
28 |
29 |         # "Renjian" channel: renjian.163.com
30 | loader.add_css('title', '.bannertext>.daxie_sub_title::text')
31 | loader.add_css('pub_date', '.sub_title>.pub_time::text')
32 |
33 |         # Sports channel: sports.163.com
34 |         loader.add_css('title', '.m-article .article-top>.article-title::text')
35 |         loader.add_xpath('body', '//div[@class="article-details"]')
36 |
37 | loader.add_xpath('body', '//div[@id="endText"]')
38 | loader.add_value('link', response.url)
39 | return loader.load_item()
40 |
--------------------------------------------------------------------------------
/scrapy_plus/middlewares/autologin.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy import signals
4 | from scrapy.exceptions import IgnoreRequest
5 | from scrapy.http import HtmlResponse, FormRequest
6 | from logging import getLogger
7 |
8 |
9 | class LoginMiddleWare():
10 | """
11 |     Pre-login form middleware: submits the login form before crawling when no session cookie is present.
12 | """
13 | @classmethod
14 | def from_crawler(cls, crawler):
15 | return cls(login_url=crawler.settings.get('LOGIN_URL'),
16 | user_name=crawler.settings.get('LOGIN_USR'),
17 | password=crawler.settings.get('LOGIN_PWD'),
18 | user_ele=crawler.settings.get('LOGIN_USR_FIELD'),
19 | pwd_ele=crawler.settings.get('LOGIN_PWD_FIELD'))
20 |
21 | def __init__(self, login_url, user_name, password, user_ele='username', pwd_ele='password'):
22 |         self.logger = getLogger(__name__)  # set up the logger
23 | self.login_url = login_url
24 | self.user_name = user_name
25 | self.password = password
26 | self.user_ele = user_ele
27 | self.pwd_ele = pwd_ele
28 |
29 | def process_request(self, request, spider):
30 | cookies = request.headers.getlist('Cookie')
31 | if cookies is None or len(cookies)==0:
32 | return FormRequest(url=self.login_url,
33 | formdata={self.user_ele: self.user_name, self.pwd_ele: self.password})
34 | return request
35 |
36 | def process_response(self, request, response, spider):
37 |         if b"authentication failed" in response.body:
38 |             raise IgnoreRequest()
39 |         return response
40 | def process_exception(self, request, exception, spider):
41 |         self.logger.error("Login failed")
42 |
--------------------------------------------------------------------------------
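Note: a minimal settings.py sketch for LoginMiddleWare, matching the settings read in from_crawler; the URL, credentials and priority are placeholders:

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_plus.middlewares.LoginMiddleWare': 543,  # priority chosen arbitrarily
    }
    LOGIN_URL = 'https://example.com/login'
    LOGIN_USR = 'user@example.com'
    LOGIN_PWD = 'secret'
    LOGIN_USR_FIELD = 'username'   # name attribute of the username form field
    LOGIN_PWD_FIELD = 'password'   # name attribute of the password form field
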
/tests/test_sqlfeedstorage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import unittest
4 | from scrapy_plus.extensions import SQLFeedStorage
5 | from scrapy_plus.extensions.sql import EntityFileFaker
6 |
7 | from sqlalchemy.engine import create_engine
8 | from sqlalchemy.orm import sessionmaker
9 |
10 |
11 | class SQLFeedStorageTestCase(unittest.TestCase):
12 | """
13 |     Unit tests for SQLFeedStorage
14 | """
15 |
16 | def setUp(self):
17 | self.book = {
18 | 'id': 1,
19 | 'name': "Vue2实践揭秘",
20 | 'alias': "Vue2实践揭秘 - 电子工业出版社出版",
21 | 'summary': "这是一本关于Vue2实践的书籍,由浅入深层层揭显Vue2中的隐秘。"
22 | }
23 |
24 | self.connection_str = "sqlite:///test.db"
25 |
26 |     def test_entity_faker_should_batch_update(self):
27 | from tests.test_entities import Base, Book
28 |
29 | engine = create_engine(self.connection_str)
30 | Base.metadata.bind = engine
31 | DBSession = sessionmaker(bind=engine)
32 |
33 | Base.metadata.create_all()
34 | faker = EntityFileFaker(DBSession(), Book)
35 |
36 | faker.write(self.book.keys(), self.book)
37 | faker.close()
38 |
39 | session = DBSession()
40 | books = session.query(Book).all()
41 | self.assertEqual(books.__len__(), 1)
42 | faker.close()
43 |
44 | Base.metadata.drop_all()
45 |
46 | def test_sql_feed_storage_should_create_database(self):
47 | storage = SQLFeedStorage('sqlite:///test1.db',
48 | 'tests.test_entities', 'Base', 'Book')
49 | file = storage.open(None)
50 | file.write(self.book.keys(), self.book)
51 | storage.store(file)
52 |
53 |
54 | if __name__ == '__main__':
55 | unittest.main()
56 |
--------------------------------------------------------------------------------
/scrapy_plus/utils/starturls.py:
--------------------------------------------------------------------------------
1 | import re
2 | import six
3 | from datetime import datetime
4 | from itertools import product
5 | from scrapy import Request
6 |
7 |
8 |
9 | class FragmentGenerator(object):
10 | def _process_fixed(self, fragment):
11 | return [fragment]
12 |
13 | def _process_list(self, fragment):
14 | return fragment.split(' ')
15 |
16 | def _process_date(self, fragment):
17 | now = datetime.now()
18 | return [now.strftime(fragment)]
19 |
20 | def _process_range(self, fragment):
21 | a, b = fragment.split('-')
22 |
23 | if a.isalpha() and b.isalpha():
24 | a, b = [ord(w.lower()) for w in [a, b]]
25 | return (chr(w) for w in six.moves.range(a, b + 1))
26 | else:
27 | a, b = int(a), int(b)
28 | return (str(i) for i in six.moves.range(a, b + 1))
29 |
30 | def _process_fragment(self, fragment):
31 | processor = getattr(self, '_process_{}'.format(fragment['type']))
32 | return processor(fragment['value'])
33 |
34 | def process_fragments(self, spec):
35 | return map(self._process_fragment, spec['fragments'])
36 |
37 | def __call__(self, spec):
38 | generated = product(*self.process_fragments(spec))
39 | for fragment_list in generated:
40 | yield ''.join(fragment_list)
41 | _NEWLINE_RE = re.compile('[\r\n]')
42 |
43 |
44 | class FeedGenerator(object):
45 | def __init__(self, callback):
46 | self.callback = callback
47 |
48 | def __call__(self, url):
49 | return Request(url, callback=self.parse_urls)
50 |
51 | def parse_urls(self, response):
52 | newline_urls = _NEWLINE_RE.split(response.text)
53 | urls = [url for url in newline_urls if url]
54 | for url in urls:
55 | yield Request(url, callback=self.callback)
--------------------------------------------------------------------------------
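Note: a small usage sketch of FragmentGenerator based on the fragment types it handles (fixed, list, date, range); the spec values are made up for illustration:

    # generates https://example.com/page/1 ... /page/3
    from scrapy_plus.utils.starturls import FragmentGenerator

    spec = {'fragments': [
        {'type': 'fixed', 'value': 'https://example.com/page/'},
        {'type': 'range', 'value': '1-3'},
    ]}
    for url in FragmentGenerator()(spec):
        print(url)
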
/scrapy_plus/items/BookItem.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | from scrapy import Item, Field
4 | from ..processors import Number, Date, Price, Text, CleanText
5 | from scrapy.loader.processors import TakeFirst, Join
6 |
7 |
8 | class BookItem(Item):
9 |     # book title
10 |     name = Field(input_processor=CleanText(),
11 |                  output_processor=TakeFirst())
12 |     # authors
13 |     authors = Field(input_processor=CleanText(),
14 |                     output_processor=TakeFirst())
15 |     # publishing house
16 |     publishing_house = Field(input_processor=CleanText(),
17 |                              output_processor=TakeFirst())
18 |     # producer / imprint
19 |     publisher = Field(input_processor=CleanText(),
20 |                       output_processor=TakeFirst())
21 |     # original title
22 |     origin_name = Field(input_processor=CleanText(),
23 |                         output_processor=TakeFirst())
24 |     # translators
25 |     translators = Field(input_processor=CleanText(),
26 |                         output_processor=TakeFirst())
27 |     # publication date
28 |     pub_date = Field(input_processor=Date(),
29 |                      output_processor=TakeFirst())
30 |     # number of pages
31 |     pages = Field(input_processor=Number(),
32 |                   output_processor=TakeFirst())
33 |     # list price
34 |     price = Field(input_processor=Price(),
35 |                   output_processor=TakeFirst())
36 |     # ISBN
37 |     isbn = Field(input_processor=CleanText(),
38 |                  output_processor=TakeFirst())
39 |     # Douban rating
40 |     rates = Field(input_processor=Number(),
41 |                   output_processor=TakeFirst())
42 |     # number of ratings
43 |     rating_count = Field(input_processor=Number(),
44 |                          output_processor=TakeFirst())
45 |     # summary
46 |     summary = Field(input_processor=Text(),
47 |                     output_processor=Join())
48 |     # about the authors
49 |     about_authors = Field(input_processor=CleanText(),
50 |                           output_processor=TakeFirst())
51 |
--------------------------------------------------------------------------------
/scrapy_plus/extensions/oss.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy.extensions.feedexport import BlockingFeedStorage
4 | import oss2
5 | import os
6 | from urllib.parse import urlparse
7 |
8 |
9 | class OSSFeedStorage(BlockingFeedStorage):
10 | """
11 |     Aliyun (Alibaba Cloud) OSS feed storage backend
12 |     """
13 | 
14 |     def __init__(self, uri):
15 |         # <scheme>://<bucket>.<public endpoint>/<object>
16 |         u = urlparse(uri)
17 |         self.uri = uri
18 |         self.bucket_name = u.hostname.split('.')[0]
19 |         self.endpoint = '.'.join(u.hostname.split('.')[1:])
20 | self.path = u.path
21 |
22 | def open(self, spider):
23 | access_key = spider.crawler.settings.get('OSS_ACCESS_KEY')
24 | access_secret = spider.crawler.settings.get('OSS_ACCESS_SECRET')
25 |
26 |         # Create the Bucket object; all object-level operations go through it
27 | self.bucket = oss2.Bucket(oss2.Auth(access_key, access_secret),
28 | self.endpoint, self.bucket_name)
29 |
30 | def _store_in_thread(self, file):
31 |         # Use the helper to pick a part size; aim for 128 KB parts
32 | total_size = os.path.getsize(file)
33 | part_size = oss2.determine_part_size(total_size, preferred_size=128 * 1024)
34 |
35 |         # Initialize the multipart upload; the returned Upload ID is needed by the following calls
36 | key = file.replace('../', '')
37 | upload_id = self.bucket.init_multipart_upload(key).upload_id
38 |
39 |         # Upload the parts one by one.
40 |         # oss2.SizedFileAdapter() wraps fileobj so that only size_to_upload bytes can be read from it
41 | with open(file, 'rb') as fileobj:
42 | parts = []
43 | part_number = 1
44 | offset = 0
45 | while offset < total_size:
46 | size_to_upload = min(part_size, total_size - offset)
47 | result = self.bucket.upload_part(key, upload_id, part_number,
48 | oss2.SizedFileAdapter(fileobj, size_to_upload))
49 | parts.append(oss2.models.PartInfo(part_number, result.etag, size=size_to_upload, part_crc=result.crc))
50 |
51 | offset += size_to_upload
52 | part_number += 1
53 |
54 |         # Complete the multipart upload
55 | self.bucket.complete_multipart_upload(key, upload_id, parts)
56 |
57 |         # Sanity check: the uploaded object matches the local file
58 | with open(file, 'rb') as fileobj:
59 | assert self.bucket.get_object(key).read() == fileobj.read()
60 |
--------------------------------------------------------------------------------
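Note: a hedged settings.py sketch for registering the OSS feed storage; the bucket and endpoint are placeholders, and the URI layout follows the <bucket>.<endpoint> comment in __init__:

    # settings.py (sketch)
    FEED_STORAGES = {'oss': 'scrapy_plus.extensions.OSSFeedStorage'}
    FEED_URI = 'oss://my-bucket.oss-cn-hangzhou.aliyuncs.com/feeds/items.json'
    OSS_ACCESS_KEY = '...'
    OSS_ACCESS_SECRET = '...'
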
/scrapy_plus/spiders/douban.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy.spiders import CrawlSpider, Rule
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.loader import ItemLoader
5 | from ..items import BookItem
6 |
7 |
8 | class BookSpider(CrawlSpider):
9 | name = "doubanbook"
10 | start_urls = ['https://book.douban.com/tag/']
11 |     rules = (Rule(LinkExtractor(allow=(r'/tag/(.*?)')), follow=True),
12 |              Rule(LinkExtractor(allow=(r'/tag/(.*?)\?start='),
13 |                                 tags=('link'), attrs=('href')), follow=True),
14 |              Rule(LinkExtractor(allow=(r'/subject/.*'), ), follow=False, callback='parse_item'))
15 |
16 | def parse_item(self, response):
17 | loader = ItemLoader(item=BookItem(), response=response)
18 |         loader.add_css('name', "h1 span::text")  # title
19 |         loader.add_css('summary', '.related_info #link-report .intro p::text')  # summary
20 | loader.add_xpath('authors', u'//span[.//text()[normalize-space(.)="作者:"]]/following::text()[1]')
21 | loader.add_xpath('authors', u'//span[.//text()[normalize-space(.)="作者:"]]/following::text()[2]')
22 | loader.add_xpath('publishing_house', u'//span[.//text()[normalize-space(.)="出版社:"]]/following::text()[1]')
23 | loader.add_xpath('publisher', u'//span[.//text()[normalize-space(.)="出品方:"]]/following::text()[1]')
24 | loader.add_xpath('publisher', u'//span[.//text()[normalize-space(.)="出品方:"]]/following::text()[2]')
25 | loader.add_xpath('origin_name', u'//span[.//text()[normalize-space(.)="原作名:"]]/following::text()[1]')
26 | loader.add_xpath('translators', u'//span[.//text()[normalize-space(.)="译者:"]]/following::text()[1]')
27 | loader.add_xpath('translators', u'//span[.//text()[normalize-space(.)="译者"]]/following::text()[2]')
28 | loader.add_xpath('pub_date', u'//span[.//text()[normalize-space(.)="出版年:"]]/following::text()[1]')
29 | loader.add_xpath('pages', u'//span[.//text()[normalize-space(.)="页数:"]]/following::text()[1]')
30 | loader.add_xpath('price', u'//span[.//text()[normalize-space(.)="定价:"]]/following::text()[1]')
31 | loader.add_xpath('isbn', u'//span[.//text()[normalize-space(.)="ISBN:"]]/following::text()[1]')
32 |         loader.add_css('rates', ".rating_num::text")  # rating score
33 |         loader.add_css('rating_count', ".rating_people>span::text")  # number of ratings
34 | return loader.load_item()
35 |
36 |
--------------------------------------------------------------------------------
/scrapy_plus/middlewares/chrome.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy import signals
4 | from scrapy.exceptions import IgnoreRequest
5 | from selenium import webdriver
6 | from selenium.common.exceptions import TimeoutException
7 | from selenium.webdriver.common.by import By
8 | from selenium.webdriver.support.ui import WebDriverWait
9 | from selenium.webdriver.support import expected_conditions as EC
10 | from scrapy.http import HtmlResponse
11 | from logging import getLogger
12 | import random
13 |
14 |
15 | class ChromeMiddleware():
16 | """
17 |     Headless Chrome browser emulation downloader middleware.
18 | """
19 | @classmethod
20 | def from_crawler(cls, crawler):
21 | return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
22 | exec_path=crawler.settings.get('CHROMEDRIVER'))
23 |
24 | def __init__(self, timeout=None, exec_path=''):
25 |         self.logger = getLogger(__name__)  # set up the logger
26 |         self.timeout = timeout
27 |         options = webdriver.ChromeOptions()
28 |         options.add_argument('headless')  # run Chrome headless
29 |         self.browser = webdriver.Chrome(
30 |             executable_path=exec_path, chrome_options=options)
31 | 
32 |         self.browser.set_window_size(1400, 700)  # browser window size
33 |         self.browser.set_page_load_timeout(self.timeout)  # page-load timeout
34 | self.wait = WebDriverWait(self.browser, self.timeout)
35 |
36 | def __del__(self):
37 |         self.browser.close()  # close the browser when the middleware is destroyed
38 |
39 | def process_request(self, request, spider):
40 | """
41 |         Fetch the page with headless Chrome.
42 |         :param request: the Request object
43 |         :param spider: the Spider object
44 |         :return: HtmlResponse
45 |         """
46 |         self.logger.debug(u'Starting Chrome...')
47 | # page = request.meta.get('sn', 1)
48 |
49 | try:
50 | self.browser.get(request.url)
51 |
52 |             # wait until the item list on the page has fully loaded
53 | self.wait.until(EC.presence_of_element_located(
54 | (By.CSS_SELECTOR, '.m-itemlist .items .item')))
55 |
56 | return HtmlResponse(url=request.url,
57 | body=self.browser.page_source,
58 | request=request,
59 | encoding='utf-8',
60 | status=200)
61 |
62 | except TimeoutException:
63 |             # on timeout, return a 500 response
64 | return HtmlResponse(url=request.url, status=500, request=request)
65 |
66 |
--------------------------------------------------------------------------------
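Note: a minimal settings.py sketch for ChromeMiddleware, matching the settings read in from_crawler; the chromedriver path and priority are placeholders:

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_plus.middlewares.ChromeMiddleware': 543,  # priority chosen arbitrarily
    }
    SELENIUM_TIMEOUT = 20                          # seconds to wait for page load
    CHROMEDRIVER = '/usr/local/bin/chromedriver'   # path to the chromedriver binary
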
/scrapy_plus/pipelines/files.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import os
3 | import oss2
4 | from urllib.parse import urlparse
5 |
6 | from twisted.internet import threads
7 |
8 |
9 | class OSSFilesStore(object):
10 | OSS_ACCESS_KEY = ""
11 | OSS_ACCESS_SECRET = ""
12 |
13 | def __init__(self, uri):
14 |         # <scheme>://<bucket>.<public endpoint>/<object>
15 |         u = urlparse(uri)
16 |         self.uri = uri
17 |         self.bucket_name = u.hostname.split('.')[0]
18 |         self.endpoint = '.'.join(u.hostname.split('.')[1:])
19 | self.objectPath = u.path
20 | self.bucket = oss2.Bucket(oss2.Auth(self.OSS_ACCESS_KEY, self.OSS_ACCESS_SECRET),
21 | self.endpoint, self.bucket_name)
22 |
23 | def stat_file(self, path, info):
24 |
25 | def _onsuccess(meta):
26 | checksum = meta.headers['ETag']
27 |             last_modified = meta.headers['Last-Modified']
28 | return {'checksum': checksum, 'last_modified': last_modified}
29 |
30 | return threads.deferToThread(self.bucket.get_object_meta, path).addCallback(_onsuccess)
31 |
32 |
33 |
34 | def persist_file(self, path, buf, info, meta=None, headers=None):
35 |         # Use the helper to pick a part size; aim for 128 KB parts
36 |         total_size = len(buf.getvalue())
37 |         part_size = oss2.determine_part_size(total_size, preferred_size=128 * 1024)
38 | 
39 |         # Initialize the multipart upload; the returned Upload ID is needed by the following calls
40 |         key = os.path.join(self.objectPath, path)
41 |         upload_id = self.bucket.init_multipart_upload(key).upload_id
42 | 
43 |         # Upload the parts one by one.
44 |         # oss2.SizedFileAdapter() wraps buf so that only size_to_upload bytes can be read from it
45 | parts = []
46 | part_number = 1
47 | offset = 0
48 | while offset < total_size:
49 | size_to_upload = min(part_size, total_size - offset)
50 | result = self.bucket.upload_part(key, upload_id, part_number,
51 | oss2.SizedFileAdapter(buf, size_to_upload))
52 | parts.append(oss2.models.PartInfo(part_number,
53 | result.etag,
54 | size=size_to_upload,
55 | part_crc=result.crc))
56 |
57 | offset += size_to_upload
58 | part_number += 1
59 |
60 | # 完成分片上传
61 | self.bucket.complete_multipart_upload(key, upload_id, parts)
62 |
63 |
--------------------------------------------------------------------------------
/scrapy_plus/middlewares/tor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Tor ("onion") proxy middleware
4 | """
5 | # from scrapy import signals
6 | from logging import getLogger
7 | from stem.control import Controller
8 | import stem
9 | # import random
10 | import time
11 |
12 | logger = getLogger(__name__)
13 |
14 | class TorProxyMiddleware(object):
15 | """
16 |     Tor proxy middleware
17 | 
18 |     ## settings.py configuration
19 | 
20 |     - TOR_PROXY - local HTTP proxy address in front of Tor
21 |     - TOR_CTRL_PORT - local Tor control port
22 |     - TOR_PASSWORD - password for the Tor control port
23 |     - TOR_CHANGE_AFTER_TIMES - request a new Tor identity after this many requests
24 |     """
25 | def __init__(self, tor_proxy='127.0.0.1:8118', tor_control_port=9051, tor_password=None, after_times=50):
26 |
27 | if not tor_proxy:
28 | raise Exception('http proxy setting should not be empty')
29 |
30 | if not tor_control_port:
31 | raise Exception('tor control port setting should not be empty')
32 |
33 | if not tor_password:
34 | raise Exception('tor password setting should not be empty')
35 |
36 | self.http_proxy = tor_proxy
37 | self.tor_control_port = tor_control_port
38 | self.tor_password = tor_password
39 | self.count = 0
40 | self.times = after_times
41 |
42 | @classmethod
43 | def from_crawler(cls, crawler):
44 | tor_proxy = crawler.settings.get('TOR_PROXY')
45 |         tor_control_port = crawler.settings.getint('TOR_CTRL_PORT')  # defaults to 9051
46 | tor_password = crawler.settings.get('TOR_PASSWORD')
47 | after_times = crawler.settings.get('TOR_CHANGE_AFTER_TIMES')
48 | return cls(tor_proxy, tor_control_port, tor_password, after_times)
49 |
50 | def process_request(self, request, spider):
51 |         # If the Retry middleware is enabled and this request has already been retried twice, switch to a new IP
52 | retry_times = request.meta.get('retry_times', 0)
53 |
54 |         if (self.count > 0 and self.count % self.times == 0) or retry_times >= 2:
55 |             logger.debug("Switching to a new Tor IP address")
56 | self.ip_renew(spider)
57 |
58 | self.count += 1
59 |
60 | request.meta['proxy'] = self.http_proxy
61 |
62 |     def ip_renew(self, spider):
63 | """access tor ControlPort to signal tor get a new IP
64 | """
65 | with Controller.from_port(port=self.tor_control_port) as controller:
66 | controller.authenticate(password=self.tor_password)
67 | controller.signal(stem.Signal.NEWNYM)
68 | time.sleep(controller.get_newnym_wait())
69 | controller.close()
70 | spider.crawler.stats.inc_value('renew_ip/count')
71 |
--------------------------------------------------------------------------------
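Note: a minimal settings.py sketch for TorProxyMiddleware, matching the settings read in from_crawler; the proxy address assumes an HTTP proxy chained to Tor on the local machine, and the priority is an arbitrary choice:

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_plus.middlewares.TorProxyMiddleware': 750,  # priority chosen arbitrarily
    }
    TOR_PROXY = 'http://127.0.0.1:8118'   # local HTTP proxy in front of Tor
    TOR_CTRL_PORT = 9051                  # Tor control port
    TOR_PASSWORD = 'mypassword'           # Tor control password
    TOR_CHANGE_AFTER_TIMES = 50           # renew the identity every N requests
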
/scrapy_plus/middlewares/ua.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import random
4 |
5 | _agents = [
6 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
7 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
8 | 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
9 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
10 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
11 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
12 |     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
13 | 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
14 | 'Mozilla/5.0 (Linux; U; Android 2.2) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
15 | 'Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
16 | 'Mozilla/5.0 (Linux; Android 6.0.1; SM-G532G Build/MMB29T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.83 Mobile Safari/537.36',
17 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.5.6 (KHTML, like Gecko) Version/11.0.3 Safari/604.5.6',
18 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
19 |     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
20 |     'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
21 | 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
22 | 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
23 | 'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
24 | ]
25 |
26 |
27 | class RandomUserAgentMiddleware(object):
28 | """
29 |     Random User-Agent middleware
30 | """
31 |
32 | @classmethod
33 | def from_crawler(cls, crawler):
34 | return cls(user_agents=crawler.settings.getlist('USER_AGENTS', None))
35 |
36 | def __init__(self, user_agents=None):
37 |         self.user_agents = user_agents if user_agents else _agents
38 |
39 | def process_request(self, request, spider):
40 |         if self.user_agents:
41 | request.headers.setdefault(
42 | b'User-Agent', random.choice(self.user_agents))
43 |
--------------------------------------------------------------------------------
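Note: a minimal settings.py sketch; USER_AGENTS is optional, since the middleware falls back to its built-in list when the setting is absent. The priority number is an assumption:

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_plus.middlewares.RandomUserAgentMiddleware': 400,  # priority chosen arbitrarily
    }
    # USER_AGENTS = ['Mozilla/5.0 ...', '...']   # optional custom UA pool
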
/scrapy_plus/dupefilters/redisbloom.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import logging
3 | from scrapy.utils.request import request_fingerprint
4 | from redis import Redis
5 | from hashlib import md5
6 | from scrapy.dupefilters import BaseDupeFilter
7 |
8 | BLOOMFILTER_HASH_NUMBER = 6
9 | BLOOMFILTER_BIT = 30
10 |
11 |
12 | class SimpleHash(object):
13 | def __init__(self, cap, seed):
14 | self.cap = cap
15 | self.seed = seed
16 |
17 | def hash(self, value):
18 | ret = 0
19 | for i in range(len(value)):
20 | ret += self.seed * ret + ord(value[i])
21 | return (self.cap - 1) & ret
22 |
23 |
24 | class RedisBloomDupeFilter(BaseDupeFilter):
25 |
26 | def __init__(self, host='localhost', port=6379, db=0, blockNum=1, key='bloomfilter'):
27 | self.redis = Redis(host=host, port=port, db=db)
28 |
29 |         self.bit_size = 1 << 31  # a Redis string holds at most 512 MB; 2**31 bits = 256 MB is used here
30 | self.seeds = [5, 7, 11, 13, 31, 37, 61]
31 | self.key = key
32 | self.blockNum = blockNum
33 | self.hashfunc = []
34 | for seed in self.seeds:
35 | self.hashfunc.append(SimpleHash(self.bit_size, seed))
36 |
37 | self.logger = logging.getLogger(__name__)
38 |
39 | @classmethod
40 | def from_settings(cls, settings):
41 | _port = settings.getint('REDIS_PORT', 6379)
42 | _host = settings.get('REDIS_HOST', '127.0.0.1')
43 | _db = settings.get('REDIS_DUP_DB', 0)
44 | key = settings.get('BLOOMFILTER_REDIS_KEY', 'bloomfilter')
45 | block_number = settings.getint(
46 | 'BLOOMFILTER_BLOCK_NUMBER', 1)
47 |
48 | return cls(_host, _port, _db, blockNum=block_number, key=key)
49 |
50 | def request_seen(self, request):
51 | fp = request_fingerprint(request)
52 | if self.exists(fp):
53 | return True
54 |
55 | self.insert(fp)
56 | return False
57 |
58 | def exists(self, str_input):
59 | if not str_input:
60 | return False
61 | m5 = md5()
62 | m5.update(str(str_input).encode('utf-8'))
63 | _input = m5.hexdigest()
64 | ret = True
65 | name = self.key + str(int(_input[0:2], 16) % self.blockNum)
66 | for f in self.hashfunc:
67 | loc = f.hash(_input)
68 | ret = ret & self.redis.getbit(name, loc)
69 | return ret
70 |
71 | def insert(self, str_input):
72 | m5 = md5()
73 | m5.update(str(str_input).encode('utf-8'))
74 | _input = m5.hexdigest()
75 | name = self.key + str(int(_input[0:2], 16) % self.blockNum)
76 | for f in self.hashfunc:
77 | loc = f.hash(_input)
78 | self.redis.setbit(name, loc, 1)
79 |
80 | def log(self, request, spider):
81 |         msg = "Filtered duplicate request: %(request)s"
82 | self.logger.debug(msg, {'request': request}, extra={'spider': spider})
83 | spider.crawler.stats.inc_value(
84 | 'redisbloomfilter/filtered', spider=spider)
--------------------------------------------------------------------------------
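Note: a minimal settings.py sketch for the Bloom-filter variant; all values shown match the defaults used in from_settings:

    # settings.py (sketch)
    DUPEFILTER_CLASS = 'scrapy_plus.dupefilters.RedisBloomDupeFilter'
    REDIS_HOST = '127.0.0.1'
    REDIS_PORT = 6379
    REDIS_DUP_DB = 0
    BLOOMFILTER_REDIS_KEY = 'bloomfilter'
    BLOOMFILTER_BLOCK_NUMBER = 1
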
/scrapy_plus/utils/spiders.py:
--------------------------------------------------------------------------------
1 | from scrapy.spiders import CrawlSpider
2 | from scrapy.loader import ItemLoader
3 | from scrapy.utils.response import get_base_url
4 |
5 | from .starturls import FeedGenerator, FragmentGenerator
6 |
7 |
8 | class RequiredFieldMissing(Exception):
9 | def __init__(self, msg):
10 | self.msg = msg
11 |
12 | def __str__(self):
13 | return self.msg
14 |
15 |
16 | class PortiaItemLoader(ItemLoader):
17 | def get_value(self, value, *processors, **kw):
18 | required = kw.pop('required', False)
19 | val = super(PortiaItemLoader, self).get_value(value, *processors, **kw)
20 | if required and not val:
21 | raise RequiredFieldMissing(
22 | 'Missing required field "{value}" for "{item}"'.format(
23 | value=value, item=self.item.__class__.__name__))
24 | return val
25 |
26 |
27 | class BasePortiaSpider(CrawlSpider):
28 | items = []
29 |
30 | def start_requests(self):
31 | for url in self.start_urls:
32 | if isinstance(url, dict):
33 | type_ = url['type']
34 | if type_ == 'generated':
35 | for generated_url in FragmentGenerator()(url):
36 | yield self.make_requests_from_url(generated_url)
37 | elif type_ == 'feed':
38 | yield FeedGenerator(self.parse)(url)
39 | else:
40 | yield self.make_requests_from_url(url)
41 |
42 | def parse_item(self, response):
43 | for sample in self.items:
44 | items = []
45 | try:
46 | for definition in sample:
47 | items.extend(
48 | [i for i in self.load_item(definition, response)]
49 | )
50 | except RequiredFieldMissing as exc:
51 | self.logger.warning(str(exc))
52 | if items:
53 | for item in items:
54 | yield item
55 | break
56 |
57 | def load_item(self, definition, response):
58 | query = response.xpath if definition.type == 'xpath' else response.css
59 | selectors = query(definition.selector)
60 | for selector in selectors:
61 | selector = selector if selector else None
62 | ld = PortiaItemLoader(
63 | item=definition.item(),
64 | selector=selector,
65 | response=response,
66 | baseurl=get_base_url(response)
67 | )
68 | for field in definition.fields:
69 | if hasattr(field, 'fields'):
70 | if field.name is not None:
71 | ld.add_value(field.name,
72 | self.load_item(field, selector))
73 | elif field.type == 'xpath':
74 | ld.add_xpath(field.name, field.selector, *field.processors,
75 | required=field.required)
76 | else:
77 | ld.add_css(field.name, field.selector, *field.processors,
78 | required=field.required)
79 | yield ld.load_item()
80 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 | ### Eclipse template
108 |
109 | .metadata
110 | bin/
111 | tmp/
112 | *.tmp
113 | *.bak
114 | *.swp
115 | *~.nib
116 | local.properties
117 | .settings/
118 | .loadpath
119 | .recommenders
120 |
121 | # External tool builders
122 | .externalToolBuilders/
123 |
124 | # Locally stored "Eclipse launch configurations"
125 | *.launch
126 |
127 | # PyDev specific (Python IDE for Eclipse)
128 | *.pydevproject
129 |
130 | # CDT-specific (C/C++ Development Tooling)
131 | .cproject
132 |
133 | # CDT- autotools
134 | .autotools
135 |
136 | # Java annotation processor (APT)
137 | .factorypath
138 |
139 | # PDT-specific (PHP Development Tools)
140 | .buildpath
141 |
142 | # sbteclipse plugin
143 | .target
144 |
145 | # Tern plugin
146 | .tern-project
147 |
148 | # TeXlipse plugin
149 | .texlipse
150 |
151 | # STS (Spring Tool Suite)
152 | .springBeans
153 |
154 | # Code Recommenders
155 | .recommenders/
156 |
157 | # Annotation Processing
158 | .apt_generated/
159 |
160 | # Scala IDE specific (Scala & Java development for Eclipse)
161 | .cache-main
162 | .scala_dependencies
163 | .worksheet
164 | ### VirtualEnv template
165 | # Virtualenv
166 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
167 | .Python
168 | [Bb]in
169 | [Ii]nclude
170 | [Ll]ib
171 | [Ll]ib64
172 | [Ll]ocal
173 | [Ss]cripts
174 | pyvenv.cfg
175 | .venv
176 | pip-selfcheck.json
177 |
178 | build
179 | .vscode/
180 | .idea/
--------------------------------------------------------------------------------
/scrapy_plus/extensions/sql.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from zope.interface import Interface, implementer
3 | from scrapy.extensions.feedexport import IFeedStorage
4 | from scrapy.exceptions import NotConfigured
5 | from sqlalchemy.engine import create_engine
6 | from sqlalchemy.orm import sessionmaker
7 | from importlib import import_module
8 | from scrapy.exporters import BaseItemExporter
9 | from logging import getLogger
10 |
11 | logger = getLogger(__name__)
12 |
13 |
14 | class EntityFileFaker():
15 |
16 | def __init__(self, session, entity_cls):
17 | self.session = session
18 | if entity_cls is None:
19 | raise NotConfigured
20 |
21 | self.entity_cls = entity_cls
22 |
23 | def write(self, keys, values):
24 | """
25 |         Write the given values into a new entity and add it to the session.
26 |         :param keys: names of the entity attributes to set
27 |         :param values: mapping from attribute name to field value
28 | """
29 | entity = self.entity_cls()
30 | for key in keys:
31 | val = values.get(key)
32 | if val is not None:
33 | entity.__setattr__(key, val)
34 | self.session.add(entity)
35 |
36 | def close(self):
37 | self.session.commit()
38 | self.session.close()
39 |
40 |
41 | @implementer(IFeedStorage)
42 | class SQLFeedStorage():
43 | """
44 |     SQL feed storage backend
45 |     @uri - SQL connection string
46 | """
47 |
48 | @classmethod
49 | def from_crawler(cls, crawler, uri):
50 | return cls(uri,
51 | crawler.settings.get('ORM_MODULE'),
52 | crawler.settings.get('ORM_METABASE'),
53 | crawler.settings.get('ORM_ENTITY'))
54 |
55 | def __init__(self, uri, mod_name=None, metabase_name=None, entity_name=None):
56 | """
57 |         Initialize the SQL storage backend.
58 |         FEED_URI is used as the connection string.
59 | """
60 | self.connection_str = uri
61 | self.mod_name = mod_name
62 | self.metabase = metabase_name
63 | self.entity_name = entity_name
64 |
65 | def open(self, spider):
66 | """
67 |         Open the SQL database via the connection string and return the session wrapper.
68 | """
69 | engine = create_engine(self.connection_str)
70 |
71 |         # Dynamically import the declarative base and entity class
72 | mod = import_module(self.mod_name)
73 | metabase = getattr(mod, self.metabase)
74 | entity_cls = getattr(mod, self.entity_name)
75 | metabase.metadata.bind = engine
76 | metabase.metadata.create_all()
77 |
78 | DBSession = sessionmaker(bind=engine)
79 | return EntityFileFaker(session=DBSession(), entity_cls=entity_cls)
80 |
81 | def store(self, file):
82 | """
83 |         Commit the changes to the database and close the session.
84 | """
85 | file.close()
86 |
87 |
88 | class SQLItemExporter(BaseItemExporter):
89 | """
90 |     Converts Item data into entities and writes them through the feed storage.
91 | """
92 |
93 | def __init__(self, file, **kwargs):
94 | self.file = file
95 | self._configure(kwargs, dont_fail=True)
96 |
97 | def export_item(self, item):
98 | """
99 |         Insert the Item into the database.
100 |         FEED_EXPORT_FIELDS can be used to choose which Item fields are serialized to the database.
101 | """
102 |
103 |         fields = self.fields_to_export or list(item.fields.keys())
104 |         self.file.write(fields, item)
105 |
106 |     # TODO: converting to entities involves data-type conversion; the Item needs serialization control
107 |
--------------------------------------------------------------------------------
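Note: a hedged settings.py sketch for the SQL feed export path, using the entity module from tests/test_entities.py as the ORM example; the sqlite path, scheme key and format name are placeholders:

    # settings.py (sketch)
    FEED_STORAGES = {'sqlite': 'scrapy_plus.extensions.SQLFeedStorage'}
    FEED_EXPORTERS = {'sql': 'scrapy_plus.extensions.SQLItemExporter'}
    FEED_URI = 'sqlite:///books.db'
    FEED_FORMAT = 'sql'
    ORM_MODULE = 'tests.test_entities'   # module defining the declarative Base and the entity
    ORM_METABASE = 'Base'
    ORM_ENTITY = 'Book'
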
/scrapy_plus/middlewares/huaban.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from selenium import webdriver
3 | from selenium.common.exceptions import TimeoutException
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support.ui import WebDriverWait
6 | from selenium.webdriver.support import expected_conditions as EC
7 | from scrapy.http import HtmlResponse
8 | from logging import getLogger
9 |
10 |
11 | class HuabanMiddleware():
12 |
13 | @classmethod
14 | def from_crawler(cls, crawler):
15 | return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
16 | exec_path=crawler.settings.get('CHROMEDRIVER'),
17 | username=crawler.settings.get('HUABAN_USR'),
18 | password=crawler.settings.get('HUABAN_PWD'))
19 |
20 | def __init__(self, timeout=None, exec_path='', username='', password=''):
21 |         self.logger = getLogger(__name__)  # set up the logger
22 | self.timeout = timeout
23 | self.usr = username
24 | self.pwd = password
25 | options = webdriver.ChromeOptions()
26 |         options.add_argument('headless')  # run Chrome headless
27 | self.browser = webdriver.Chrome(executable_path=exec_path,
28 | options=options)
29 |
30 |         self.browser.set_window_size(1400, 700)  # browser window size
31 |         self.browser.set_page_load_timeout(self.timeout)  # page-load timeout
32 | self.wait = WebDriverWait(self.browser, self.timeout)
33 |
34 | def __del__(self):
35 |         self.browser.close()  # close the browser when the middleware is destroyed
36 |
37 | def login(self):
38 |
39 | login_button = self.browser.find_element_by_css_selector('.login.btn')
40 | login_button.click()
41 | form = self.browser.find_element_by_css_selector('form.mail-login')
42 | email_input = form.find_element_by_name('email')
43 | password_input = form.find_element_by_name('password')
44 | email_input.send_keys(self.usr)
45 | password_input.send_keys(self.pwd)
46 | form.submit()
47 | self._wait()
48 |
49 | def _wait(self):
50 | self.wait.until(EC.presence_of_element_located(
51 | (By.CSS_SELECTOR, '#index_footer')))
52 |
53 | def process_request(self, request, spider):
54 | """
55 |         Fetch the page with headless Chrome, logging in first if needed.
56 |         :param request: the Request object
57 |         :param spider: the Spider object
58 |         :return: HtmlResponse
59 |         """
60 |         self.logger.debug(u'Starting Chrome...')
61 |
62 | try:
63 | self.browser.get(request.url)
64 |             # wait until the page footer has been rendered
65 | self.browser.implicitly_wait(3)
66 |
67 | cookies = self.browser.get_cookies()
68 | is_login = False
69 | for cookie in cookies:
70 | if cookie['name'] == 'sid':
71 | is_login = True
72 | break
73 |
74 | if not is_login:
75 | self.login()
76 | self.browser.get(request.url)
77 | self.browser.implicitly_wait(3)
78 |
79 | return HtmlResponse(url=request.url,
80 | body=self.browser.page_source,
81 | request=request,
82 | encoding='utf-8',
83 | status=200)
84 |
85 | except TimeoutException:
86 |             # on timeout, return a 500 response
87 | return HtmlResponse(url=request.url, status=500, request=request)
88 |
--------------------------------------------------------------------------------
/scrapy_plus/middlewares/splash.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import signals
3 | from scrapy_splash import SplashRequest
4 |
5 | class SplashSpiderMiddleware():
6 | """
7 |     Splash spider middleware: forwards the spider's start requests to a Splash service so the spider gets browser rendering.
8 | """
9 |     # The Lua script below defines a helper that waits until the given CSS selector matches an element on the page
10 | lua_source = """
11 | function wait_for_element(splash, css, maxwait)
12 | -- Wait until a selector matches an element
13 | -- in the page. Return an error if waited more
14 | -- than maxwait seconds.
15 | if maxwait == nil then
16 | maxwait = 10
17 | end
18 | return splash:wait_for_resume(string.format([[
19 | function main(splash) {
20 | var selector = '%s';
21 | var maxwait = %s;
22 | var end = Date.now() + maxwait*1000;
23 |
24 | function check() {
25 | if(document.querySelector(selector)) {
26 | splash.resume('Element found');
27 | } else if(Date.now() >= end) {
28 | var err = 'Timeout waiting for element';
29 | splash.error(err + " " + selector);
30 | } else {
31 | setTimeout(check, 200);
32 | }
33 | }
34 | check();
35 | }
36 | ]], css, maxwait))
37 | end
38 |
39 | function main(splash, args)
40 | splash:go(args.url)
41 | wait_for_element(splash, args.wait_for_element)
42 | return splash:html()
43 | end
44 | """
45 |
46 | @classmethod
47 | def from_crawler(cls, crawler):
48 | # This method is used by Scrapy to create your spiders.
49 | s = cls(wait_for_element=crawler.settings.get('WAIT_FOR_ELEMENT'))
50 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
51 | return s
52 |
53 | def __init__(self, wait_for_element=None):
54 | self.wait_for_element = wait_for_element
55 |
56 | def process_spider_input(self, response, spider):
57 | # Called for each response that goes through the spider
58 | # middleware and into the spider.
59 |
60 | # Should return None or raise an exception.
61 | return None
62 |
63 | def process_spider_output(self, response, result, spider):
64 | # Called with the results returned from the Spider, after
65 | # it has processed the response.
66 |
67 | # Must return an iterable of Request, dict or Item objects.
68 | for i in result:
69 | yield i
70 |
71 | def process_spider_exception(self, response, exception, spider):
72 | # Called when a spider or process_spider_input() method
73 | # (from other spider middleware) raises an exception.
74 |
75 | # Should return either None or an iterable of Response, dict
76 | # or Item objects.
77 | pass
78 |
79 | def process_start_requests(self, start_requests, spider):
80 | # Called with the start requests of the spider, and works
81 | # similarly to the process_spider_output() method, except
82 | # that it doesn’t have a response associated.
83 |
84 | # Must return only requests (not items).
85 | for request in start_requests:
86 | yield SplashRequest(request.url,
87 | request.callback,
88 | endpoint='execute',
89 | meta=dict(request.meta),
90 | args={
91 | 'lua_source': self.lua_source,
92 | 'wait_for_element': self.wait_for_element,
93 | 'wait': 3}
94 | )
95 |
96 | def spider_opened(self, spider):
97 | spider.logger.info('Spider opened: %s' % spider.name)
98 |
--------------------------------------------------------------------------------
/scrapy_plus/utils/parser.py:
--------------------------------------------------------------------------------
1 | from collections import deque
2 | try:
3 | from HTMLParser import HTMLParser
4 | except ImportError:
5 | from html.parser import HTMLParser
6 |
7 | ALLOWED_TAGS = frozenset({
8 | 'abbr', 'acronym', 'address', 'bdo', 'big', 'blockquote', 'br', 'cite',
9 | 'code', 'dd', 'del', 'dfn', 'dl', 'dt', 'em', 'ins', 'kbd', 'li',
10 | 'listing', 'ol', 'p', 'plaintext', 'pre', 'q', 'samp', 'small', 'strong',
11 | 'sub', 'sup', 'table', 'tbody', 'td', 'th', 'time', 'tr', 'tt', 'ul', 'var'
12 | })
13 | REPLACE_TAGS = {
14 | 'b': 'strong',
15 | 'h1': 'strong',
16 | 'h2': 'strong',
17 | 'h3': 'strong',
18 | 'h4': 'strong',
19 | 'h5': 'strong',
20 | 'h6': 'strong',
21 | 'i': 'em'
22 | }
23 | PURGE_TAGS = ('script', 'img', 'input', 'style')
24 | ALLOWED_ATTRS = frozenset({
25 |     'height', 'width', 'colspan', 'cellspacing', 'cellpadding', 'border',
26 | 'bgcolor', 'alt', 'align', 'valign', 'dir', 'headers', 'reversed',
27 | 'rows', 'rowspan', 'scope', 'span', 'start', 'summary', 'title', 'value'
28 | })
29 | class AllowAll(object):
30 | def __contains__(self, value):
31 | return True
32 |
33 |
34 | class SafeHtmlParser(HTMLParser):
35 | """Parser for making raw html safe for displaying.
36 |
37 | HTML is made safe by the removal of some tags and the replacement of
38 | others. The HTML generated should be safe for display and shouldn't cause
39 | formatting problems.
40 |
41 | Behaviour can be customized through the following keyword arguments:
42 | allowed_tags is a set of tags that are allowed
43 | replace_tags is a mapping of tags to alternative tags to substitute.
44 | tags_to_purge are tags that, if encountered, all content between the
45 | opening and closing tag is removed.
46 |
47 | For example:
48 | >>> t = SafeHtmlParser().feed
49 |     >>> t(u'<strong>test <blink>test</blink></strong>')
50 |     u'<strong>test test</strong>'
51 |
52 |     Some tags, like script, are completely removed
53 |     >>> t(u'<script>alert("test")</script>test')
54 |     u'test'
55 |
56 |     replace_tags defines tags that are converted. By default all headers and
57 |     bold are converted to strong, and italic to em.
58 |     >>> t(u'<h1>header</h1> test <b>bold</b> <i>indent</i>')
59 |     u'<strong>header</strong> test <strong>bold</strong> <em>indent</em>'
60 |
61 |     tags_to_purge defines the tags that have enclosing content removed:
62 |     >>> t(u'<p>test</p> <script>alert("test")</script>')
63 |     u'<p>test</p>'
64 |
65 |     Comments are stripped, but entities are not converted
66 |     >>> t(u'<!-- a comment --> only &pound;42')
67 |     u'only &pound;42'
68 |
69 |     Paired tags are closed
70 |     >>> t(u'<p>test')
71 |     u'<p>test</p>'
72 |
73 |     >>> t(u'<p>test <i><br/>test')
74 |     u'<p>test <em><br>test</em></p>'
75 |
76 | """
77 | def __init__(self, allowed_tags=ALLOWED_TAGS, replace_tags=REPLACE_TAGS,
78 | tags_to_purge=PURGE_TAGS, allowed_attrs=ALLOWED_ATTRS):
79 | self.reset()
80 | self._body = []
81 | self.skip = False
82 | self._unclosed = deque()
83 | if allowed_tags is None:
84 | allowed_tags = AllowAll()
85 | if allowed_attrs is None:
86 | allowed_attrs = AllowAll()
87 | self.allowed_tags = allowed_tags
88 | self.replace_tags = replace_tags
89 | self.tags_to_purge = tags_to_purge
90 | self.allowed_attrs = allowed_attrs
91 |         super(SafeHtmlParser, self).__init__(convert_charrefs=False)  # keep entities for handle_entityref
92 |
93 | def feed(self, data):
94 | self._body, self._unclosed, self.skip = [], deque(), False
95 | self.rawdata = self.rawdata + data
96 | self.goahead(0)
97 | self._close_remaining_tags()
98 | return ''.join(self._body).strip()
99 |
100 | def handle_starttag(self, tag, attrs):
101 | self._handle_open(tag, attrs)
102 | self._unclosed.appendleft(tag)
103 |
104 | def handle_startendtag(self, tag, attrs):
105 | self._handle_open(tag, attrs, closed=True)
106 |
107 | def handle_endtag(self, tag):
108 | tag = tag.lower()
109 | try:
110 | last_opened = self._unclosed.popleft()
111 | while last_opened != tag:
112 | self._body.append(self._build_close_tag(last_opened))
113 | last_opened = self._unclosed.popleft()
114 | except IndexError:
115 | return
116 | if self.skip and tag in self.tags_to_purge:
117 | self.skip = False
118 | return
119 | if tag not in self.allowed_tags and tag not in self.replace_tags:
120 | return
121 | self._body.append(self._build_close_tag(tag))
122 |
123 | def handle_data(self, data):
124 | if self.skip:
125 | return
126 | self._body.append(data)
127 |
128 | def handle_entityref(self, name):
129 | self._body.append('&{};'.format(name))
130 |
131 | def _handle_open(self, tag, attrs, closed=False):
132 | tag = tag.lower()
133 | if tag in self.tags_to_purge:
134 | if not closed:
135 | self.skip = True
136 | return
137 | if tag not in self.allowed_tags and tag not in self.replace_tags:
138 | return
139 | self._body.append(self._build_open_tag(tag, attrs))
140 |
141 | def _build_open_tag(self, tag, attrs):
142 | tag = self.replace_tags.get(tag, tag)
143 | attrs = [(k, v) for k, v in attrs if k.lower() in self.allowed_attrs]
144 | return '<{tag}{has_attrs}{attrs}>'.format(
145 | tag=tag,
146 | has_attrs=' ' * bool(attrs),
147 |             attrs=(' '.join('{}="{}"'.format(*a) for a in attrs)
148 | if attrs else '')
149 | )
150 |
151 | def _build_close_tag(self, tag):
152 | tag = self.replace_tags.get(tag, tag)
153 | return '{}>'.format(tag)
154 |
155 | def _close_remaining_tags(self):
156 | for tag in self._unclosed:
157 | self._body.append(self._build_close_tag(tag))
158 |
--------------------------------------------------------------------------------
/scrapy_plus/processors.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | import re
3 | import six
4 |
5 | from urllib.parse import urljoin, urlparse, urlunparse
6 |
7 | from copy import deepcopy
8 | from itertools import chain
9 | try:
10 | from itertools import izip_longest
11 | except ImportError:
12 | from itertools import zip_longest as izip_longest
13 |
14 | from dateparser.date import DateDataParser
15 | from scrapy.loader.processors import Identity as _Identity
16 | from scrapy.utils.markup import unquote_markup
17 | from w3lib.html import remove_tags
18 | from .utils.parser import SafeHtmlParser
19 |
20 |
21 | # Regexps from scrapely (_CSS_IMAGERE.pattern etc.)
22 | _CSS_IMAGERE = re.compile(r'background(?:-image)?\s*:\s*url\((.*?)\)')
23 | _GENERIC_PATH_RE = re.compile('/?(?:[^/]+/)*(?:.+)')
24 | _IMAGE_PATH_RE = re.compile(r'/?(?:[^/]+/)*(?:.+\.(?:mng|pct|bmp|gif|jpg|jpeg|'
25 | r'png|pst|psp|tif|tiff|ai|drw|dxf|eps|ps|svg))')
26 | _NUMERIC_ENTITIES = re.compile(r'([0-9]+)(?:;|\s)', re.U)
27 | _PRICE_NUMBER_RE = re.compile(r'(?:^|[^a-zA-Z0-9])(\d+(?:\.\d+)?)'
28 | r'(?:$|[^a-zA-Z0-9])')
29 | _NUMBER_RE = re.compile(r'(-?\d+(?:\.\d+)?)')
30 | _DECIMAL_RE = re.compile(r'(\d[\d\,]*(?:(?:\.\d+)|(?:)))', re.U | re.M)
31 | _VALPARTS_RE = re.compile(r'([\.,]?\d+)')
32 | _SENTINEL = object()
33 |
34 |
35 | def _strip_url(text):
36 | if text:
37 | return text.strip("\t\r\n '\"")
38 |
39 |
40 | def extract_image_url(text):
41 | text = _strip_url(text)
42 | imgurl = None
43 | if text:
44 | # check if the text is style content
45 | match = _CSS_IMAGERE.search(text)
46 | text = match.groups()[0] if match else text
47 | parsed = urlparse(text)
48 | path = None
49 | match = _IMAGE_PATH_RE.search(parsed.path)
50 | if match:
51 | path = match.group()
52 | elif parsed.query:
53 | match = _GENERIC_PATH_RE.search(parsed.path)
54 | if match:
55 | path = match.group()
56 | if path is not None:
57 | parsed = list(parsed)
58 | parsed[2] = path
59 | imgurl = urlunparse(parsed)
60 | if not imgurl:
61 | imgurl = text
62 | return imgurl
63 |
64 | class Text():
65 | def __call__(self, values):
66 | return [remove_tags(v).strip()
67 | if v and isinstance(v, six.string_types) else v
68 | for v in values]
69 |
70 |
71 | class Number():
72 | def __call__(self, values):
73 | numbers = []
74 | for value in values:
75 |             if isinstance(value, (dict, list)):
76 |                 numbers.append(value)
77 |                 continue
78 |             txt = _NUMERIC_ENTITIES.sub(lambda m: six.unichr(int(m.groups()[0])), value)
79 | numbers.append(_NUMBER_RE.findall(txt))
80 | return list(chain(*numbers))
81 |
82 |
83 | class Price():
84 | def __call__(self, values):
85 | prices = []
86 | for value in values:
87 |             if isinstance(value, (dict, list)):
88 |                 prices.append(value)
89 |                 continue
90 |             txt = _NUMERIC_ENTITIES.sub(lambda m: six.unichr(int(m.groups()[0])), value)
91 | m = _DECIMAL_RE.search(txt)
92 | if m:
93 | value = m.group(1)
94 | parts = _VALPARTS_RE.findall(value)
95 | decimalpart = parts.pop(-1)
96 | if decimalpart[0] == "," and len(decimalpart) <= 3:
97 | decimalpart = decimalpart.replace(",", ".")
98 | value = "".join(parts + [decimalpart]).replace(",", "")
99 | prices.append(value)
100 | return prices
101 |
102 |
103 | class Date(Text):
104 | def __init__(self, format='%Y-%m-%dT%H:%M:%S'):
105 | self.format = format
106 |
107 | def __call__(self, values):
108 | values = super(Date, self).__call__(values)
109 | dates = []
110 | for text in values:
111 |             if isinstance(text, (dict, list)):
112 |                 dates.append(text)
113 |                 continue
114 |             try:
115 |                 date = DateDataParser().get_date_data(text)['date_obj']
116 |                 dates.append(date.strftime(self.format))
117 |             except ValueError:
118 |                 pass
119 |         return dates
120 |
121 | class Url(Text):
122 | def __call__(self, values, loader_context=None):
123 | values = super(Url, self).__call__(values)
124 | urls = []
125 | for value in values:
126 |             if isinstance(value, (dict, list)):
127 |                 urls.append(value)
128 |                 continue
129 |             value = _strip_url(unquote_markup(value))
130 |             base = loader_context.get('baseurl', '') if loader_context else ''
131 |             urls.append(urljoin(base, value))
132 |         return urls
132 |
133 | class CleanText():
134 | def __call__(self, values):
135 |         return [v.replace('\n', '').replace(' ', '').strip() for v in values]
136 |
137 | class Image(Text):
138 | def __call__(self, values):
139 | return super(Image, self).__call__([
140 | val if isinstance(val, (dict, list)) else extract_image_url(val)
141 | for val in values
142 | ])
143 |
144 |
145 | class SafeHtml(Text):
146 |
147 | def __init__(self, parser=None):
148 | if parser is None:
149 | parser = SafeHtmlParser()
150 | self.parser = parser
151 |
152 | def __call__(self, values):
153 | results = []
154 | for val in values:
155 |             if isinstance(val, (dict, list)):
156 |                 results.append(val)
157 |                 continue
158 |             results.append(self.parser.feed(str(val)))
159 |         return results
160 |
161 | class Regex():
162 | def __init__(self, regexp):
163 | if isinstance(regexp, six.string_types):
164 | regexp = re.compile(regexp)
165 | self.regexp = regexp.pattern
166 | self._regexp = regexp
167 |
168 | def __call__(self, values):
169 | results = []
170 | for value in values:
171 |             if isinstance(value, (dict, list)):
172 |                 results.append(value)
173 |                 continue
174 |             if not value:
175 |                 continue
176 |             match = self._regexp.search(value)
177 |             if not match:
178 |                 continue
179 |             results.append(
180 |                 u"".join([g for g in match.groups() or match.group() if g]))
181 | return results
182 |
183 | def __deepcopy__(self, memo):
184 | """Overwrite deepcopy so that the regexp is recalculated."""
185 | return type(self)(deepcopy(self.regexp, memo))
186 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy+
2 |
3 | scrapy-plus is a companion extension toolkit for Scrapy. It bundles a series of plug-ins that are commonly needed while developing crawlers, so future crawler projects can be built quickly.
4 |
5 | > For more detailed usage, please see the column [《从0学爬虫专栏》](https://www.imooc.com/read/34).
6 |
7 | ```
8 | $ pip install scrapy_plus
9 | ```
10 |
11 | Scrapy+ provides the following:
12 |
13 | - Dupe filters
14 |   - Redis dupe filter
15 |   - Redis Bloom dupe filter
16 | - Middlewares
17 |   - Auto-login middleware
18 |   - Huaban middleware
19 |   - Generic Chrome middleware
20 |   - Splash rendering middleware
21 |   - Tor middleware
22 |   - Random User-Agent middleware
23 |   - Random proxy middleware
24 | - Pipelines
25 |   - MongoDB storage pipeline
26 |   - OSS image pipeline (Aliyun)
27 | - SQL feed storage
28 | - Input/output processors
29 | - Spiders
30 |   - `BookSpider`
31 |   - `NeteaseSpider`
32 |   - `TaobaoSpider`
33 |
34 |
35 | ## Dupe filters
36 |
37 | These are abstracted and refined from the Redis dupe filter and the more efficient Bloom filter introduced in Chapter 4 of the column (scaling the Netease crawler with large-scale data processing).
38 |
39 | All dupe filters live under `scrapy_plus.dupefilters`.
40 |
41 | ### Redis dupe filter
42 |
43 | Module path of `RedisDupeFilter`:
44 |
45 | ```python
46 | scrapy_plus.dupefilters.RedisDupeFilter
47 | ```
48 |
49 | Stores the URLs that have already been visited in a Redis `Set`.
50 |
51 | **Usage**
52 |
53 | First install Redis or start a Redis container; see the Redis installation notes in Chapter 4, Section 2 of the column (deduplication and high-performance crawler tuning).
54 |
55 | `RedisDupeFilter` is extremely simple to use: just add the following to your `settings.py`:
56 |
57 | ```py
58 | # replace the default dupe filter
59 | DUPEFILTER_CLASS = 'scrapy_plus.dupefilters.RedisDupeFilter'
60 | REDIS_PORT = 6379         # Redis server port
61 | REDIS_HOST = '127.0.0.1'  # Redis server address
62 | REDIS_DB = 0              # Redis database index
63 | ```
64 |
65 | **Default configuration**
66 |
67 | ```py
68 | REDIS_PORT = 6379         # Redis server port
69 | REDIS_HOST = '127.0.0.1'  # Redis server address
70 | REDIS_DB = 0              # Redis database index
71 | ```
72 |
73 | If you keep Redis's default installation settings, adding just this one line to `settings.py` is enough:
74 |
75 | ```
76 | DUPEFILTER_CLASS = 'scrapy_plus.dupefilters.RedisDupeFilter'
77 | ```
78 |
79 |
80 |
81 | ### Redis Bloom dupe filter
82 |
83 | This is the most efficient dupe filter: `RedisDupeFilter` with a Bloom filter added on top.
84 |
85 | Module path of `RedisBloomDupeFilter`:
86 |
87 | ```
88 | scrapy_plus.dupefilters.RedisBloomDupeFilter
89 | ```
90 |
91 | **Usage**
92 |
93 | Usage is the same as for `RedisDupeFilter`: add the following to `settings.py`:
94 |
95 | ```py
96 | # replace the default dupe filter
97 | DUPEFILTER_CLASS = 'scrapy_plus.dupefilters.RedisBloomDupeFilter'
98 | REDIS_PORT = 6379         # Redis server port
99 | REDIS_HOST = '127.0.0.1'  # Redis server address
100 | REDIS_DB = 0              # Redis database index
101 | ```
102 |
103 | **Default configuration**
104 |
105 | ```
106 | REDIS_PORT = 6379         # Redis server port
107 | REDIS_HOST = '127.0.0.1'  # Redis server address
108 | REDIS_DB = 0              # Redis database index
109 | BLOOMFILTER_REDIS_KEY = 'bloomfilter'  # Redis key used for deduplication
110 | BLOOMFILTER_BLOCK_NUMBER = 1           # number of filter blocks
111 | ```
112 |
113 | Compared with `RedisDupeFilter`, `RedisBloomDupeFilter` adds two settings:
114 |
115 | - `BLOOMFILTER_REDIS_KEY` - name of the deduplication key in Redis.
116 | - `BLOOMFILTER_BLOCK_NUMBER` - number of blocks used by the Bloom filter.
117 |
118 | The defaults are recommended for both options, but you can tune them to fit your project.
119 |
120 |
121 |
122 | ## Middlewares
123 |
124 | Scrapy+'s middlewares live in the `scrapy_plus.middlewares` package.
125 |
126 | ### Auto-login middleware
127 |
128 | A generic middleware for any site that exposes a login URL. `LoginMiddleWare` checks whether the session is already logged in and skips the login if it is.
129 |
130 | To use this middleware, `COOKIES_ENABLED` must be switched on in `settings.py`.
131 |
132 | For example, if the page contains a login `<form>` along these lines (the markup below is only an illustration):
133 | ```html
134 | <form action="/login" method="post">
135 |   <input type="text" name="email">
136 |   <input type="password" name="password">
137 |   <input type="submit" value="Sign in">
138 | </form>
139 | ```
140 | then `LoginMiddleWare` can complete the login automatically.
141 |
142 | **Module path**
143 |
144 | ```
145 | scrapy_plus.middlewares.LoginMiddleWare
146 | ```
147 |
148 | The auto-login middleware is configured as follows:
149 |
150 | ```python
151 | COOKIES_ENABLED = True
152 | LOGIN_URL = 'login URL of the site'
153 | LOGIN_USR = 'username'
154 | LOGIN_PWD = 'password'
155 | LOGIN_USR_FIELD = 'name attribute of the username input'
156 | LOGIN_PWD_FIELD = 'name attribute of the password input'
157 | DOWNLOADER_MIDDLEWARES = {
158 |     'scrapy_plus.middlewares.LoginMiddleWare': 330
159 | }
160 | ```
161 |
162 |
163 |
164 | ### Huaban middleware
165 |
166 | A middleware based on headless Chrome that logs in to huaban.com automatically and renders Huaban's JavaScript pages correctly.
167 |
168 |
169 |
170 | **Module path**
171 |
172 | ```python
173 | scrapy_plus.middlewares.HuabanMiddleware
174 | ```
175 |
176 |
177 |
178 | **Usage**
179 |
180 | First, install chromedriver; see the chromedriver installation notes in Chapter 6, Section 4 of the column (handling JS pages with headless Chrome).
181 |
182 | Second, you need a registered huaban.com account.
183 |
184 | Finally, add the following settings to `settings.py`:
185 |
186 | ```python
187 | SELENIUM_TIMEOUT = 30  # page-load timeout in seconds
188 | CHROMEDRIVER = "/path/to/chromedriver"  # path to the chromedriver binary
189 | # Example on macOS:
190 | # CHROMEDRIVER = "/usr/local/Caskroom/chromedriver/75.0.3770.90/chromedriver"
191 | DOWNLOADER_MIDDLEWARES = {
192 |     'scrapy_plus.middlewares.HuabanMiddleware': 100
193 | }
194 | HUABAN_USR = "your huaban.com username"
195 | HUABAN_PWD = "your huaban.com password"
196 | ```
197 |
198 | With this middleware in place you can write the Huaban spider's logic just like an ordinary spider.
199 |
200 |
201 |
202 | ### Generic Chrome middleware
203 |
204 | A headless-Chrome emulation middleware: the crawler fetches target URLs through Chrome, which neatly solves JavaScript-heavy pages; a minimal spider sketch follows the settings below.
205 |
206 | > Use this middleware only for sites that do not require a login.
207 |
208 | ```python
209 | SELENIUM_TIMEOUT = 30  # page-load timeout in seconds
210 | CHROMEDRIVER = "/path/to/chromedriver"  # path to the chromedriver binary
211 | DOWNLOADER_MIDDLEWARES = {
212 |     'scrapy_plus.middlewares.ChromeMiddleware': 800
213 | }
214 | ```
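
With the middleware enabled, the spider itself stays an ordinary Scrapy spider. A minimal sketch (the spider name, URL and selector below are placeholders):

```python
import scrapy


class JsPageSpider(scrapy.Spider):
    """A plain spider; the Chrome middleware renders the JavaScript for us."""
    name = 'jspage'
    start_urls = ['https://example.com/js-heavy-page']

    def parse(self, response):
        # response.body already contains the DOM rendered by headless Chrome
        for title in response.css('h2.title::text').getall():
            yield {'title': title}
```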
215 |
216 |
217 |
218 | ### Splash rendering middleware
219 |
220 | An extension built on scrapy_splash that simplifies its usage: spiders get their JavaScript pages rendered by Splash without having to issue `SplashRequest` themselves.
221 |
222 | **Module path**
223 |
224 | ```python
225 | scrapy_plus.middlewares.SplashSpiderMiddleware
226 | ```
227 |
228 |
229 | The Splash middleware forwards requests to a configured Splash service, giving the spider browser-emulation capabilities.
230 |
231 | ```python
232 | WAIT_FOR_ELEMENT = "CSS selector"  # the page counts as rendered once this element appears
233 | SPLASH_URL = 'http://localhost:8050'  # address of your Splash service (adjust as needed)
234 |
235 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
236 |
237 | DOWNLOADER_MIDDLEWARES = {
238 |     'scrapy_splash.SplashCookiesMiddleware': 723,
239 |     'scrapy_splash.SplashMiddleware': 725,
240 |     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810
241 | }
242 |
243 | SPIDER_MIDDLEWARES = {
244 |     'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
245 |     'scrapy_plus.middlewares.SplashSpiderMiddleware': 800
246 | }
248 | ```
249 |
250 |
251 |
252 | ### Random User-Agent middleware
253 |
254 | Assigns a random User-Agent to every crawl request. See Chapter 5, Section 5 of the column (client emulation against anti-crawling) for the details.
255 |
256 | > Disable `scrapy.downloadermiddlewares.useragent.UserAgentMiddleware` before using it.
257 |
258 | **模块位置**
259 |
260 |
261 | ```python
262 | scrapy_plus.middlewares.RandomUserAgentMiddleware
263 | ```
264 |
265 | **Usage**
266 |
267 | Add the following to `settings.py`:
268 |
269 | ```python
270 | DOWNLOADER_MIDDLEWARES = {
271 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
272 |     'scrapy_plus.middlewares.RandomUserAgentMiddleware': 500
273 | }
274 | ```
275 |
276 | By default `RandomUserAgentMiddleware` ships with a set of commonly used UAs; you can also add your own by
277 | setting `USER_AGENTS` in `settings.py`, as shown below:
278 |
279 | ```python
280 | ## Add as many UAs as you like; the middleware picks one at random
281 | USER_AGENTS = [
282 | 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
283 | 'Mozilla/5.0 (Linux; U; Android 2.2) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
284 | 'Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
285 | 'Mozilla/5.0 (Linux; Android 6.0.1; SM-G532G Build/MMB29T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.83 Mobile Safari/537.36',
286 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.5.6 (KHTML, like Gecko) Version/11.0.3 Safari/604.5.6'
287 | ]
288 | ```
289 |
290 | ### Random proxy middleware
291 |
292 | For every request the crawler sends, a proxy is picked at random from the `HTTP_PROXIES` list in the settings.
293 |
294 | **Module path**
295 |
296 | ```python
297 | scrapy_plus.middlewares.RandomProxyMiddleware
298 | ```
299 |
300 |
301 |
302 | Add the following to `settings.py`:
303 |
304 | ```python
305 | DOWNLOADER_MIDDLEWARES = {
306 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
307 |     'scrapy_plus.middlewares.RandomProxyMiddleware': 750
308 | }
309 | # proxy list
310 | HTTP_PROXIES=[
311 | '203.11.43.22:8080'
312 | ]
313 | ```
314 |
315 |
316 |
317 | ### Tor middleware
318 |
319 | **Module path**
320 |
321 | ```python
322 | scrapy_plus.middlewares.TorProxyMiddleware
323 | ```
324 |
325 | The onion-routing (Tor) proxy middleware keeps switching your spider's IP address. You need to install tor and privoxy first; see 《虫术——Python绝技》 for the detailed setup. A registration sketch follows the settings below.
326 |
327 | ```py
328 | # Tor proxy
329 | TOR_PROXY = 'http://127.0.0.1:8118'  # 8118 is Privoxy's default port
330 | TOR_CTRL_PORT = 9051
331 | TOR_PASSWORD = 'mypassword'
332 | TOR_CHANGE_AFTER_TIMES = 50  # switch to a new IP address after this many requests
333 | ```
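
The block above only configures Tor itself; like the other middlewares, `TorProxyMiddleware` still has to be registered in `DOWNLOADER_MIDDLEWARES` (the priority below is only illustrative):

```python
DOWNLOADER_MIDDLEWARES = {
    'scrapy_plus.middlewares.TorProxyMiddleware': 750
}
```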
334 |
335 |
336 |
337 | ## Pipelines
338 |
339 | Scrapy+'s pipelines live in the `scrapy_plus.pipelines` package.
340 |
341 | ### MongoDB storage pipeline
342 |
343 | Writes `Item` objects straight into MongoDB. Install MongoDB or start a MongoDB Docker instance before using this pipeline.
344 |
345 | **Module path**
346 |
347 | ```python
348 | scrapy_plus.pipelines.MongoDBPipeline
349 | ```
350 |
351 | Items are written directly into a MongoDB database.
352 |
353 | **Usage**
354 |
355 | Add the following settings to `settings.py`:
356 |
357 | ```py
358 | ITEM_PIPELINES = {'scrapy_plus.pipelines.MongoDBPipeline': 2}
359 |
360 | MONGODB_SERVER = "localhost"  # MongoDB server address
361 | MONGODB_PORT = 27017          # MongoDB server port
362 | MONGODB_DB = "database name"
363 | MONGODB_COLLECTION = "collection name"
364 | ```
365 |
366 | ### OSS image pipeline (Aliyun)
367 |
368 | Scrapy's built-in `ImagesPipeline` can only store images locally or on S3/Google Cloud, which are not usable from mainland China. Chapter 6, Section 5 of the column (storing the Huaban crawler's images on Aliyun) walks through how this pipeline is implemented.
369 |
370 | **Module path**
371 |
372 | ```
373 | scrapy_plus.pipelines.ImagesPipeline
374 | ```
375 |
376 | **Usage**
377 |
378 | Add the following settings to `settings.py` (an illustrative item definition follows the settings):
379 |
380 | ```
381 | IMAGE_STORE = 'oss://<bucket>.<public endpoint>.aliyuncs.com/<subdirectory>'
382 | OSS_ACCESS_KEY = 'your OSS access key'
383 | OSS_ACCESS_SECRET = 'your OSS access secret'
384 | # Item field that holds the image URLs to download
385 | IMAGES_URLS_FIELD = 'img_urls'
386 | # Item field that receives the download results
387 | IMAGES_RESULT_FIELD = 'img_files'
388 | # enable the image storage pipeline
389 | ITEM_PIPELINES = {
390 |     'scrapy_plus.pipelines.ImagesPipeline': 2
391 | }
392 |
393 | ```
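
The two field names above refer to fields on your item class. A hypothetical item using the default field names could look like this:

```python
import scrapy


class PinItem(scrapy.Item):
    # filled by the spider with the image URLs to download
    img_urls = scrapy.Field()
    # filled by the pipeline with the download results
    img_files = scrapy.Field()
```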
394 |
395 |
396 |
397 |
398 |
399 | ## Feed storage
400 |
401 | ### SQL feed storage
402 |
403 | Writes the scraped data to a SQL storage backend in a single batch. Multiple SQL databases are supported: sqlite, postgresql and mysql.
404 |
405 | **Module path**
406 |
407 | ```
408 | scrapy_plus.extensions.SQLFeedStorage
409 | ```
410 |
411 | **Usage**
412 |
413 | `SQLFeedStorage` takes a little more setup: you first need to model your data with SQLAlchemy (a sketch of such a model module is shown after the settings below); see Chapter 5, Section 4 of the column (SQL-based data export) for the full walkthrough.
414 |
415 | Add the following settings to `settings.py`:
416 |
417 | ```py
418 | # data model used by the storage backend
419 | ORM_MODULE = 'movies.entities'
420 | ORM_METABASE = 'Base'
421 | ORM_ENTITY = 'Movie'
422 |
423 | FEED_FORMAT = 'entity'
424 | FEED_EXPORTERS = {
425 |     'entity': 'scrapy_plus.extensions.SQLItemExporter'
426 | }
427 |
428 | FEED_URI = 'dialect+driver://username:password@host:port/database'  # SQLAlchemy-style database URI
429 | FEED_STORAGES = {
430 |     'sqlite': 'scrapy_plus.extensions.SQLFeedStorage',
431 |     'postgresql': 'scrapy_plus.extensions.SQLFeedStorage',
432 |     'mysql': 'scrapy_plus.extensions.SQLFeedStorage'
433 | }
434 | ```
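
For reference, here is a minimal sketch of the `movies.entities` module assumed by the settings above, using SQLAlchemy's declarative mapping (the `Movie` columns are purely illustrative):

```python
# movies/entities.py
from sqlalchemy import Column, Float, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()      # referenced by ORM_METABASE


class Movie(Base):             # referenced by ORM_ENTITY
    __tablename__ = 'movies'

    id = Column(Integer, primary_key=True)
    title = Column(String(256))
    score = Column(Float)
```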
435 |
436 |
437 |
438 | ## Input/output processors
439 |
440 | Scrapy+'s processors live in the `scrapy_plus.processors` module. Sections 1 and 2 of Chapter 5 of the column explain how input/output processors work and how to write your own. Scrapy+ goes a step further and ships the eight most commonly used processors (a short `ItemLoader` sketch follows the module path below):
441 |
442 | - `Text` - extracts plain text strings
443 | - `Number` - extracts numeric strings
444 | - `Price` - extracts price-formatted strings
445 | - `Date` - extracts date strings
446 | - `Url` - extracts URL strings
447 | - `Image` - extracts image URL strings
448 | - `SafeHtml` - outputs HTML with all executable markup removed
449 | - `Regex` - extracts the substrings that match a regular expression
450 |
451 | **Module path**
452 |
453 | ```python
454 | scrapy_plus.processors
455 | ```
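
The processors plug into Scrapy's `ItemLoader` like any other input/output processor. A minimal sketch (the field names are only an illustration):

```python
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

from scrapy_plus.processors import Price, Text, Url


class ProductLoader(ItemLoader):
    default_output_processor = TakeFirst()
    # input processors clean every scraped fragment before it is stored
    name_in = Text()
    price_in = Price()
    link_in = Url()
```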
456 |
457 |
458 |
459 | ## Spiders
460 |
461 | Scrapy+ ships the spiders from the column in the `scrapy_plus.spiders` package, so readers can use them by editing `settings.py` alone, without writing any spider code.
462 |
463 | Scrapy+ provides three spider classes in total:
464 |
465 | Class | Spider name | Description
466 | --|--|--
467 | `BookSpider` | `'doubanbook'` | Crawls Douban book pages and yields `scrapy_plus.items.BookItem` objects.
468 | `NeteaseSpider` | `'netease'` | Crawls Netease news and yields `scrapy_plus.items.NewsItem` objects.
469 | `TaobaoSpider` | `'taobao'` | Crawls Taobao search result pages and yields `scrapy_plus.items.ProductItem` objects.
470 |
471 | To use them, edit `settings.py` as follows:
472 |
473 | ```
474 | BOT_NAME = 'spider name'
475 | SPIDER_MODULES = ['scrapy_plus.spiders']  # package that contains the spider classes
476 | NEWSPIDER_MODULE = 'scrapy_plus.spiders'  # module used when generating new spiders
477 | ```
478 |
479 | The Taobao spider is a classic example from 《虫术——Python绝技》. It requires a slow-crawl setup together with the auto-login Chrome middleware, and it must be subclassed with `gen_keywords` implemented before it can be used:
480 |
481 | ```python
482 | from scrapy_plus.spiders import TaobaoSpider
483 |
484 | class MyTaobaoSpider(TaobaoSpider):
485 | def gen_keywords(self):
486 | return ["小米","红酒"]
487 | ```
488 |
489 | The spider crawls Taobao search result pages, so you return the keywords to search for, and the spider scrapes the products found under each keyword.
--------------------------------------------------------------------------------