├── .gitigore
├── README.md
├── main.py
├── scrapy.cfg
└── zi5book
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        ├── exception.py
        └── zi5book_spider.py

/.gitigore:
--------------------------------------------------------------------------------
.idea/
zi5book/__pycache__/
zi5book/spiders/__pycache__/
*.pyc
.DS_Store

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# zi5book

Consider starring the repo first. A crawler for every Kindle e-book on book.zi5.me: the whole site is crawled in a distributed fashion, books are organized by author and title, and each book is downloaded in both mobi and epub formats.

# tips

Pillow must be installed. Pillow has native dependencies whose downloads may require a proxy/VPN. Verified working as of 2019-01-30.

> sudo apt-get install libjpeg-dev

> pip3 install pillow

# Installation (latest)

# If you do not have a Python 3 environment

Download Anaconda3: https://www.anaconda.com/download/#linux

https://repo.anaconda.com/archive/

wget https://repo.anaconda.com/archive/Anaconda3-5.0.1-Linux-x86_64.sh

chmod +x Anaconda3-5.0.1-Linux-x86_64.sh

./Anaconda3-5.0.1-Linux-x86_64.sh

Answer "yes" to every prompt except the final one that installs VSCode.

# Install dependencies

conda install scrapy (pip install scrapy also works, but occasionally fails to build)

pip install scrapy_redis

pip install pymongo

# Install Redis and MongoDB

sudo apt-get install redis-server

sudo apt-get install mongodb

# Run

git clone https://github.com/guapier/zi5book.git

cd zi5book

python3 main.py

# Fixes for errors you may run into

```bash
UnicodeEncodeError: 'ascii' codec can't encode characters in position 25-31: ordinal not in range(128)

sudo apt-get install language-pack-zh-hans

# First install the Chinese language pack from Ubuntu's language settings.

# Open /etc/environment and append the following two lines:
LANG=zh_CN.UTF-8
LANGUAGE=zh_CN:zh:en_US:en

# Open /var/lib/locales/supported.d/local and add the zh_CN.GB2312 character set:
en_US.UTF-8 UTF-8
zh_CN.UTF-8 UTF-8
zh_CN.GBK GBK
zh_CN GB2312

# After saving, run:
sudo locale-gen

# Open /etc/default/locale and change it to:
LANG="zh_CN.UTF-8"
LANGUAGE="zh_CN:zh:en_US:en"
```

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from scrapy.cmdline import execute

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "zi5book_spider"])
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = zi5book.settings

[deploy]
#url = http://localhost:6800/
project = zi5book
--------------------------------------------------------------------------------
/zi5book/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guapier/zi5book/5492989f7141e0d4dc21334612e8748aff20d3a4/zi5book/__init__.py
--------------------------------------------------------------------------------
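main.py above simply shells into Scrapy's command line. If you would rather drive the crawl from Python directly (for example to tweak a setting at runtime), a minimal sketch using Scrapy's CrawlerProcess would look like this; the LOG_LEVEL override is illustrative and not part of this repository:

```python
# run_crawl.py -- sketch of an alternative to main.py; run it from the project
# root so get_project_settings() can find scrapy.cfg / zi5book/settings.py.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():
    settings = get_project_settings()
    settings.set('LOG_LEVEL', 'INFO')      # illustrative override, not in the repo
    process = CrawlerProcess(settings)
    process.crawl('zi5book_spider')        # same spider name main.py invokes
    process.start()                        # blocks until the crawl finishes


if __name__ == '__main__':
    main()
```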
/zi5book/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Zi5BookItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()
    name = scrapy.Field()
    time = scrapy.Field()
    author = scrapy.Field()
    publisher = scrapy.Field()
    comment = scrapy.Field()
    view = scrapy.Field()
    ISBN = scrapy.Field()
    rates = scrapy.Field()
    updated = scrapy.Field()
    desc = scrapy.Field()
    tags = scrapy.Field()
    up = scrapy.Field()
    down = scrapy.Field()
    image_urls = scrapy.Field()
    file_urls = scrapy.Field()
    images = scrapy.Field()
    files = scrapy.Field()

--------------------------------------------------------------------------------
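The image_urls/images and file_urls/files field pairs are the contract Scrapy's media pipelines expect: the spider fills the *_urls fields and the pipelines write the download results back. A sketch of building an item by hand (all values below are invented for illustration, not scraped from book.zi5.me):

```python
# Illustrative only -- the field values are made up.
from zi5book.items import Zi5BookItem

item = Zi5BookItem()
item['name'] = 'Some Book'
item['author'] = 'Some Author'
item['image_urls'] = ['http://book.zi5.me/path/to/cover.jpg']   # consumed by MyImagePipelines
item['file_urls'] = ['http://book.zi5.me/path/to/book.mobi']    # consumed by MyFilePipelines
# After the pipelines run, 'images' and 'files' hold the download results.
print(dict(item))
```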
/zi5book/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from random import choice
from scrapy.exceptions import NotConfigured


class Zi5BookSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RotateUserAgentMiddleware(object):
    """Rotate the user-agent header for each request."""

    def __init__(self, user_agents):
        self.enabled = False
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Build the middleware from the USER_AGENT_CHOICES setting.
        user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])

        if not user_agents:
            raise NotConfigured("USER_AGENT_CHOICES not set or empty")

        o = cls(user_agents)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
        # Rotation is opt-in: a spider enables it by setting rotate_user_agent = True.
        self.enabled = getattr(spider, 'rotate_user_agent', self.enabled)

    def process_request(self, request, spider):
        if not self.enabled or not self.user_agents:
            return
        request.headers['user-agent'] = choice(self.user_agents)
--------------------------------------------------------------------------------
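RotateUserAgentMiddleware only rotates when a spider opts in through a rotate_user_agent attribute (see spider_opened above); the project's own spider does not set that flag, so with the shipped code rotation stays off. A sketch of a spider that would turn it on (the spider itself is hypothetical, not part of this repo):

```python
# Hypothetical opt-in spider, shown only to illustrate the rotate_user_agent flag.
import scrapy


class RotatingExampleSpider(scrapy.Spider):
    name = 'rotating_example'          # not a spider that exists in this project
    rotate_user_agent = True           # picked up by RotateUserAgentMiddleware.spider_opened()
    start_urls = ['http://book.zi5.me/']

    def parse(self, response):
        # The User-Agent of this response's request was chosen at random
        # from USER_AGENT_CHOICES in settings.py.
        self.logger.info(response.request.headers.get('User-Agent'))
```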
/zi5book/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import logging

# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
# Define your item pipelines here
#
import scrapy
from scrapy.exceptions import DropItem
# from scrapy.contrib.pipeline.images import ImagesPipeline
# from scrapy.contrib.pipeline.files import FilesPipeline
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings


class MongoDBPipeline(object):
    def __init__(self):
        settings = get_project_settings()
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        self.db = connection[settings['MONGODB_DB']]
        self.collection = self.db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            try:
                self.collection.insert_one(dict(item))
                logging.debug("add {}".format(item['name']))
            except (pymongo.errors.WriteError, KeyError):
                raise DropItem("Duplicated Item: {}".format(item['name']))
        return item


class MyImagePipelines(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            # Pass the item along so file_path() can use the author and book
            # name when building the file name.
            yield scrapy.Request(image_url, meta={'item': item})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        item = request.meta['item']
        # Take the file extension from the URL.
        file_ext = request.url.split('/')[-1].split('.')[-1]
        # Final path: full/{author}/{book name}/{book name}.{extension}
        filename = u'full/{0[author]}/{0[name]}/{0[name]}.{1}'.format(item, file_ext)
        return filename


class MyFilePipelines(FilesPipeline):
    def get_media_requests(self, item, info):
        for file_url in item['file_urls']:
            # Pass the item along so file_path() can use the author and book
            # name when building the file name.
            yield scrapy.Request(file_url, meta={'item': item})

    def item_completed(self, results, item, info):
        file_paths = [x['path'] for ok, x in results if ok]
        if not file_paths:
            raise DropItem("Item contains no files")
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        item = request.meta['item']
        # Take the file extension from the URL (mobi or epub).
        file_ext = request.url.split('/')[-1].split('.')[-1]
        # Final path: full/{author}/{book name}/{book name}.{extension}
        filename = u'full/{0[author]}/{0[name]}/{0[name]}.{1}'.format(item, file_ext)
        return filename
--------------------------------------------------------------------------------
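MongoDBPipeline treats a MongoDB write error as a duplicate, but the collection is created without any unique index, so re-running the crawl will happily store the same book twice. A one-off setup sketch that makes the duplicate check real; indexing on the url field is an assumption, any stable per-book field would do:

```python
# One-off setup sketch, not part of the repository: a unique index makes a
# repeated insert raise DuplicateKeyError (a WriteError subclass), which
# MongoDBPipeline then turns into a DropItem.
import pymongo

client = pymongo.MongoClient('localhost', 27017)   # matches MONGODB_SERVER / MONGODB_PORT
collection = client['zi5book']['book']             # matches MONGODB_DB / MONGODB_COLLECTION
collection.create_index([('url', pymongo.ASCENDING)], unique=True)
print(collection.index_information())
```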
/zi5book/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for zi5book project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'zi5book'

SPIDER_MODULES = ['zi5book.spiders']
NEWSPIDER_MODULE = 'zi5book.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'zi5book (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.25
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Cookies (enabled by default)
COOKIES_ENABLED = True

USER_AGENT_CHOICES = [
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)',
    'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
    'DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)',
    'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
    'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)',
    'ia_archiver (+http://www.alexa.com/site/help/webmasters; crawler@alexa.com)',
]

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'zi5book.middlewares.RotateUserAgentMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'zi5book.middlewares.RotateUserAgentMiddleware': 543,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
    'zi5book.pipelines.MongoDBPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'scrapy.pipelines.files.FilesPipeline': 1
    'zi5book.pipelines.MyImagePipelines': 1,
    'zi5book.pipelines.MyFilePipelines': 2
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

FILES_STORE = './files'
IMAGES_STORE = './images'

# 90 days of delay for files expiration
FILES_EXPIRES = 90

# 30 days of delay for images expiration
IMAGES_EXPIRES = 30

IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (270, 270),
}

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# ITEM_PIPELINES = {
#     'zi5book.pipelines.MongoDBPipeline': 300,
#     'scrapy_redis.pipelines.RedisPipeline': 300
# }
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "zi5book"
MONGODB_COLLECTION = 'book'
--------------------------------------------------------------------------------
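With the scrapy_redis scheduler, dupefilter, and RedisPipeline enabled above, crawl state lives in Redis rather than in process memory. A small sketch for peeking at that state while a crawl runs; the key names follow scrapy_redis's documented defaults ('<spider name>:dupefilter' and '<spider name>:items') and are an assumption, not something configured explicitly in this settings file:

```python
# Inspection sketch, not part of the repository.
import redis

r = redis.StrictRedis(host='localhost', port=6379)        # matches REDIS_HOST / REDIS_PORT
print('seen requests:', r.scard('zi5book_spider:dupefilter'))   # dupefilter fingerprints (a set)
print('queued items:', r.llen('zi5book_spider:items'))          # items pushed by RedisPipeline (a list)
```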
/zi5book/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/zi5book/spiders/exception.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author: Samray


class ParseNotSupportedError(Exception):
    def __init__(self, url):
        self.url = url

    def __str__(self):
        return 'url {} could not be parsed'.format(self.url)

--------------------------------------------------------------------------------
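ParseNotSupportedError is defined here but never raised by the spider below. If you wanted the detail parser to fail loudly on page layouts it cannot handle, a sketch of how it could be wired in (the missing-title check is illustrative, not something the shipped spider does):

```python
# Illustrative only: the spider below does not raise this exception.
from zi5book.spiders.exception import ParseNotSupportedError


def check_detail_page(response):
    """Raise ParseNotSupportedError when the book title cannot be located."""
    name = response.css('.h1-wrapper > h1:nth-child(1)::text').extract_first()
    if name is None:
        raise ParseNotSupportedError(response.url)
    return name
```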
/zi5book/spiders/zi5book_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy import Request

from zi5book.items import Zi5BookItem


class Zi5bookSpiderSpider(scrapy.Spider):
    name = 'zi5book_spider'
    start_urls = []
    headers = {
        'pragma': "no-cache",
        'cookie': "pgv_pvi=2762272768; PHPSESSID=a987cbecdca352e260c085da785d8aa7; pgv_si=s1139271680",
        'dnt': "1",
        'accept-encoding': "gzip, deflate",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3626.96 Safari/537.36",
        'accept': "text/html, */*; q=0.01",
        'cache-control': "no-cache",
        'x-requested-with': "XMLHttpRequest",
        'proxy-connection': "keep-alive",
        'referer': "http://book.zi5.me/",
    }

    def start_requests(self):
        # The site's listing is paginated; crawl the first 50 listing pages.
        page_url = 'http://book.zi5.me/page/{0}'
        for i in range(1, 51):
            yield Request(page_url.format(str(i)), headers=self.headers)

    def parse(self, response):
        # Each thumbnail on a listing page links to a book detail page.
        thumbs = response.css('div.thumb-holder')
        for thumb in thumbs:
            detail_url = thumb.css('a.colorbox::attr(href)').extract_first()
            yield Request(detail_url, callback=self.parse_detail, headers=self.headers)

    def parse_detail(self, response):
        item = Zi5BookItem()
        item['name'] = response.css('.h1-wrapper > h1:nth-child(1)::text').extract_first()
        item['author'] = response.css('.post-meta-top > div:nth-child(2) > a:nth-child(1)::text').extract_first()
        item['time'] = response.css('.post-meta-top > div:nth-child(2)::text').extract_first().replace('\xa0|\xa0 ', '')
        item['publisher'] = response.css('.post-meta-top > div:nth-child(2) > a:nth-child(2)::text').extract_first()
        item['comment'] = response.css('.post-meta-top > div:nth-child(1) > a:nth-child(1)::text').extract_first()
        item['view'] = response.css('.post-meta-top > div:nth-child(1)::text').extract_first().replace('|', '').replace('views', '').strip()
        # item['ISBN'] = response.css(
        #     '#post-936 > div:nth-child(3) > div:nth-child(1) > a:nth-child(1)::text').extract_first()
        ISBN = re.findall("title='跳转至豆瓣'>(.*?)</a>", response.text)
        if ISBN:
            item['ISBN'] = ISBN[0]
        else:
            item['ISBN'] = ''

        item['rates'] = response.css('.rateNum::text').extract_first()
        # item['updated'] = response.css('#post-936 > div:nth-child(3)::text').extract_first()
        updated = re.findall('更新时间:(.*?)', response.text)
        if updated:
            item['updated'] = updated[0]
        else:
            item['updated'] = ''

        item['desc'] = ''.join(response.xpath('//p[@class="description"]/text()').extract())
        item['image_urls'] = [response.urljoin(image_url) for image_url in
                              response.css('.post-content > img:nth-child(1)::attr(src)').extract()]
        item['up'] = response.css('.thumbs-rating-up::text').extract_first()
        item['down'] = response.css('.thumbs-rating-down::text').extract_first()
        item['tags'] = ''.join(response.css('.post-meta-category-tag a::text').extract())
        item['file_urls'] = response.css('a.download-link::attr(href)').extract()

        yield item
--------------------------------------------------------------------------------
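Once a crawl has run, the scraped metadata ends up in the MongoDB collection configured in settings.py, while the downloaded mobi/epub files land under FILES_STORE. A minimal post-crawl check sketch (the projected fields come from the item definition above; this script is not part of the repository):

```python
# Post-crawl sanity check sketch.
import pymongo

client = pymongo.MongoClient('localhost', 27017)   # MONGODB_SERVER / MONGODB_PORT
books = client['zi5book']['book']                  # MONGODB_DB / MONGODB_COLLECTION

print('books stored:', books.count_documents({}))
for book in books.find({}, {'name': 1, 'author': 1, 'ISBN': 1}).limit(5):
    print(book.get('name'), '-', book.get('author'), '-', book.get('ISBN'))
```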