├── .gitigore
├── README.md
├── main.py
├── scrapy.cfg
└── zi5book
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        ├── exception.py
        └── zi5book_spider.py

/.gitigore:
--------------------------------------------------------------------------------
.idea/
zi5book/__pycache__/
zi5book/spiders/__pycache__/
*.pyc
.DS_Store

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# zi5book

Consider starring the repo first. A crawler for every Kindle e-book on book.zi5.me: the whole site is crawled in a distributed fashion, books are organized by author and title, and each book is downloaded in both mobi and epub formats.

# tips

Pillow must be installed. Pillow has native dependencies whose downloads may require a proxy/VPN. Verified working as of 2019-01-30.

> sudo apt-get install libjpeg-dev

> pip3 install pillow

# Installation (latest)

# If you do not have a Python 3 environment

Download Anaconda3: https://www.anaconda.com/download/#linux

https://repo.anaconda.com/archive/

wget https://repo.anaconda.com/archive/Anaconda3-5.0.1-Linux-x86_64.sh

chmod +x Anaconda3-5.0.1-Linux-x86_64.sh

./Anaconda3-5.0.1-Linux-x86_64.sh

Answer "yes" to every prompt except the final one that installs VSCode.

# Install dependencies

conda install scrapy (pip install scrapy also works, but occasionally fails to build)

pip install scrapy_redis

pip install pymongo

# Install Redis and MongoDB

sudo apt-get install redis-server

sudo apt-get install mongodb

# Run

git clone https://github.com/guapier/zi5book.git

cd zi5book

python3 main.py

# Fixes for errors you may run into

```bash
UnicodeEncodeError: 'ascii' codec can't encode characters in position 25-31: ordinal not in range(128)

sudo apt-get install language-pack-zh-hans

# First install the Chinese language pack from Ubuntu's language settings.

# Open /etc/environment and append the following two lines:
LANG=zh_CN.UTF-8
LANGUAGE=zh_CN:zh:en_US:en

# Open /var/lib/locales/supported.d/local and add the zh_CN.GB2312 character set:
en_US.UTF-8 UTF-8
zh_CN.UTF-8 UTF-8
zh_CN.GBK GBK
zh_CN GB2312

# After saving, run:
sudo locale-gen

# Open /etc/default/locale and change it to:
LANG="zh_CN.UTF-8"
LANGUAGE="zh_CN:zh:en_US:en"
```

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from scrapy.cmdline import execute

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "zi5book_spider"])
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = zi5book.settings

[deploy]
#url = http://localhost:6800/
project = zi5book
--------------------------------------------------------------------------------
/zi5book/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guapier/zi5book/5492989f7141e0d4dc21334612e8748aff20d3a4/zi5book/__init__.py
--------------------------------------------------------------------------------
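main.py above simply shells into Scrapy's command line. If you would rather drive the crawl from Python directly (for example to tweak a setting at runtime), a minimal sketch using Scrapy's CrawlerProcess would look like this; the LOG_LEVEL override is illustrative and not part of this repository:

```python
# run_crawl.py -- sketch of an alternative to main.py; run it from the project
# root so get_project_settings() can find scrapy.cfg / zi5book/settings.py.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():
    settings = get_project_settings()
    settings.set('LOG_LEVEL', 'INFO')      # illustrative override, not in the repo
    process = CrawlerProcess(settings)
    process.crawl('zi5book_spider')        # same spider name main.py invokes
    process.start()                        # blocks until the crawl finishes


if __name__ == '__main__':
    main()
```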
/zi5book/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Zi5BookItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()
    name = scrapy.Field()
    time = scrapy.Field()
    author = scrapy.Field()
    publisher = scrapy.Field()
    comment = scrapy.Field()
    view = scrapy.Field()
    ISBN = scrapy.Field()
    rates = scrapy.Field()
    updated = scrapy.Field()
    desc = scrapy.Field()
    tags = scrapy.Field()
    up = scrapy.Field()
    down = scrapy.Field()
    image_urls = scrapy.Field()
    file_urls = scrapy.Field()
    images = scrapy.Field()
    files = scrapy.Field()

--------------------------------------------------------------------------------
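The image_urls/images and file_urls/files field pairs are the contract Scrapy's media pipelines expect: the spider fills the *_urls fields and the pipelines write the download results back. A sketch of building an item by hand (all values below are invented for illustration, not scraped from book.zi5.me):

```python
# Illustrative only -- the field values are made up.
from zi5book.items import Zi5BookItem

item = Zi5BookItem()
item['name'] = 'Some Book'
item['author'] = 'Some Author'
item['image_urls'] = ['http://book.zi5.me/path/to/cover.jpg']   # consumed by MyImagePipelines
item['file_urls'] = ['http://book.zi5.me/path/to/book.mobi']    # consumed by MyFilePipelines
# After the pipelines run, 'images' and 'files' hold the download results.
print(dict(item))
```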
/zi5book/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from random import choice
from scrapy.exceptions import NotConfigured


class Zi5BookSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RotateUserAgentMiddleware(object):
    """Rotate the user-agent header for each request."""

    def __init__(self, user_agents):
        self.enabled = False
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Build the middleware from the USER_AGENT_CHOICES setting.
        user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])

        if not user_agents:
            raise NotConfigured("USER_AGENT_CHOICES not set or empty")

        o = cls(user_agents)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
        # Rotation is opt-in: a spider enables it by setting rotate_user_agent = True.
        self.enabled = getattr(spider, 'rotate_user_agent', self.enabled)

    def process_request(self, request, spider):
        if not self.enabled or not self.user_agents:
            return
        request.headers['user-agent'] = choice(self.user_agents)
--------------------------------------------------------------------------------
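RotateUserAgentMiddleware only rotates when a spider opts in through a rotate_user_agent attribute (see spider_opened above); the project's own spider does not set that flag, so with the shipped code rotation stays off. A sketch of a spider that would turn it on (the spider itself is hypothetical, not part of this repo):

```python
# Hypothetical opt-in spider, shown only to illustrate the rotate_user_agent flag.
import scrapy


class RotatingExampleSpider(scrapy.Spider):
    name = 'rotating_example'          # not a spider that exists in this project
    rotate_user_agent = True           # picked up by RotateUserAgentMiddleware.spider_opened()
    start_urls = ['http://book.zi5.me/']

    def parse(self, response):
        # The User-Agent of this response's request was chosen at random
        # from USER_AGENT_CHOICES in settings.py.
        self.logger.info(response.request.headers.get('User-Agent'))
```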
/zi5book/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import logging

# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
# Define your item pipelines here
#
import scrapy
from scrapy.exceptions import DropItem
# from scrapy.contrib.pipeline.images import ImagesPipeline
# from scrapy.contrib.pipeline.files import FilesPipeline
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings


class MongoDBPipeline(object):
    def __init__(self):
        settings = get_project_settings()
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        self.db = connection[settings['MONGODB_DB']]
        self.collection = self.db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            try:
                self.collection.insert_one(dict(item))
                logging.debug("add {}".format(item['name']))
            except (pymongo.errors.WriteError, KeyError):
                raise DropItem("Duplicated Item: {}".format(item['name']))
        return item


class MyImagePipelines(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            # Pass the item along so file_path() can use the author and book
            # name when building the file name.
            yield scrapy.Request(image_url, meta={'item': item})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        item = request.meta['item']
        # Take the file extension from the URL.
        file_ext = request.url.split('/')[-1].split('.')[-1]
        # Final path: full/{author}/{book name}/{book name}.{extension}
        filename = u'full/{0[author]}/{0[name]}/{0[name]}.{1}'.format(item, file_ext)
        return filename


class MyFilePipelines(FilesPipeline):
    def get_media_requests(self, item, info):
        for file_url in item['file_urls']:
            # Pass the item along so file_path() can use the author and book
            # name when building the file name.
            yield scrapy.Request(file_url, meta={'item': item})

    def item_completed(self, results, item, info):
        file_paths = [x['path'] for ok, x in results if ok]
        if not file_paths:
            raise DropItem("Item contains no files")
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        item = request.meta['item']
        # Take the file extension from the URL (mobi or epub).
        file_ext = request.url.split('/')[-1].split('.')[-1]
        # Final path: full/{author}/{book name}/{book name}.{extension}
        filename = u'full/{0[author]}/{0[name]}/{0[name]}.{1}'.format(item, file_ext)
        return filename
--------------------------------------------------------------------------------
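MongoDBPipeline treats a MongoDB write error as a duplicate, but the collection is created without any unique index, so re-running the crawl will happily store the same book twice. A one-off setup sketch that makes the duplicate check real; indexing on the url field is an assumption, any stable per-book field would do:

```python
# One-off setup sketch, not part of the repository: a unique index makes a
# repeated insert raise DuplicateKeyError (a WriteError subclass), which
# MongoDBPipeline then turns into a DropItem.
import pymongo

client = pymongo.MongoClient('localhost', 27017)   # matches MONGODB_SERVER / MONGODB_PORT
collection = client['zi5book']['book']             # matches MONGODB_DB / MONGODB_COLLECTION
collection.create_index([('url', pymongo.ASCENDING)], unique=True)
print(collection.index_information())
```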
/zi5book/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for zi5book project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'zi5book'

SPIDER_MODULES = ['zi5book.spiders']
NEWSPIDER_MODULE = 'zi5book.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'zi5book (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.25
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Cookies (enabled by default)
COOKIES_ENABLED = True

USER_AGENT_CHOICES = [
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)',
    'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
    'DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)',
    'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
    'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)',
    'ia_archiver (+http://www.alexa.com/site/help/webmasters; crawler@alexa.com)',
]

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'zi5book.middlewares.RotateUserAgentMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'zi5book.middlewares.RotateUserAgentMiddleware': 543,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
    'zi5book.pipelines.MongoDBPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'scrapy.pipelines.files.FilesPipeline': 1
    'zi5book.pipelines.MyImagePipelines': 1,
    'zi5book.pipelines.MyFilePipelines': 2
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

FILES_STORE = './files'
IMAGES_STORE = './images'

# 90 days of delay for files expiration
FILES_EXPIRES = 90

# 30 days of delay for images expiration
IMAGES_EXPIRES = 30

IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (270, 270),
}

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# ITEM_PIPELINES = {
#     'zi5book.pipelines.MongoDBPipeline': 300,
#     'scrapy_redis.pipelines.RedisPipeline': 300
# }
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "zi5book"
MONGODB_COLLECTION = 'book'
--------------------------------------------------------------------------------
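With the scrapy_redis scheduler, dupefilter, and RedisPipeline enabled above, crawl state lives in Redis rather than in process memory. A small sketch for peeking at that state while a crawl runs; the key names follow scrapy_redis's documented defaults ('<spider name>:dupefilter' and '<spider name>:items') and are an assumption, not something configured explicitly in this settings file:

```python
# Inspection sketch, not part of the repository.
import redis

r = redis.StrictRedis(host='localhost', port=6379)        # matches REDIS_HOST / REDIS_PORT
print('seen requests:', r.scard('zi5book_spider:dupefilter'))   # dupefilter fingerprints (a set)
print('queued items:', r.llen('zi5book_spider:items'))          # items pushed by RedisPipeline (a list)
```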
/zi5book/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/zi5book/spiders/exception.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author: Samray


class ParseNotSupportedError(Exception):
    def __init__(self, url):
        self.url = url

    def __str__(self):
        return 'url {} could not be parsed'.format(self.url)

--------------------------------------------------------------------------------
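ParseNotSupportedError is defined here but never raised by the spider below. If you wanted the detail parser to fail loudly on page layouts it cannot handle, a sketch of how it could be wired in (the missing-title check is illustrative, not something the shipped spider does):

```python
# Illustrative only: the spider below does not raise this exception.
from zi5book.spiders.exception import ParseNotSupportedError


def check_detail_page(response):
    """Raise ParseNotSupportedError when the book title cannot be located."""
    name = response.css('.h1-wrapper > h1:nth-child(1)::text').extract_first()
    if name is None:
        raise ParseNotSupportedError(response.url)
    return name
```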
/zi5book/spiders/zi5book_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy import Request

from zi5book.items import Zi5BookItem


class Zi5bookSpiderSpider(scrapy.Spider):
    name = 'zi5book_spider'
    start_urls = []
    headers = {
        'pragma': "no-cache",
        'cookie': "pgv_pvi=2762272768; PHPSESSID=a987cbecdca352e260c085da785d8aa7; pgv_si=s1139271680",
        'dnt': "1",
        'accept-encoding': "gzip, deflate",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3626.96 Safari/537.36",
        'accept': "text/html, */*; q=0.01",
        'cache-control': "no-cache",
        'x-requested-with': "XMLHttpRequest",
        'proxy-connection': "keep-alive",
        'referer': "http://book.zi5.me/",
    }

    def start_requests(self):
        # The site's listing is paginated; crawl the first 50 listing pages.
        page_url = 'http://book.zi5.me/page/{0}'
        for i in range(1, 51):
            yield Request(page_url.format(str(i)), headers=self.headers)

    def parse(self, response):
        # Each thumbnail on a listing page links to a book detail page.
        thumbs = response.css('div.thumb-holder')
        for thumb in thumbs:
            detail_url = thumb.css('a.colorbox::attr(href)').extract_first()
            yield Request(detail_url, callback=self.parse_detail, headers=self.headers)

    def parse_detail(self, response):
        item = Zi5BookItem()
        item['name'] = response.css('.h1-wrapper > h1:nth-child(1)::text').extract_first()
        item['author'] = response.css('.post-meta-top > div:nth-child(2) > a:nth-child(1)::text').extract_first()
        item['time'] = response.css('.post-meta-top > div:nth-child(2)::text').extract_first().replace('\xa0|\xa0 ', '')
        item['publisher'] = response.css('.post-meta-top > div:nth-child(2) > a:nth-child(2)::text').extract_first()
        item['comment'] = response.css('.post-meta-top > div:nth-child(1) > a:nth-child(1)::text').extract_first()
        item['view'] = response.css('.post-meta-top > div:nth-child(1)::text').extract_first().replace('|', '').replace('views', '').strip()
        # item['ISBN'] = response.css(
        #     '#post-936 > div:nth-child(3) > div:nth-child(1) > a:nth-child(1)::text').extract_first()
        ISBN = re.findall("title='跳转至豆瓣'>(.*?)</a>", response.text)
        if ISBN:
            item['ISBN'] = ISBN[0]
        else:
            item['ISBN'] = ''

        item['rates'] = response.css('.rateNum::text').extract_first()
        # item['updated'] = response.css('#post-936 > div:nth-child(3)::text').extract_first()
        updated = re.findall('更新时间:(.*?)', response.text)
        if updated:
            item['updated'] = updated[0]
        else:
            item['updated'] = ''

        item['desc'] = ''.join(response.xpath('//p[@class="description"]/text()').extract())
        item['image_urls'] = [response.urljoin(image_url) for image_url in
                              response.css('.post-content > img:nth-child(1)::attr(src)').extract()]
        item['up'] = response.css('.thumbs-rating-up::text').extract_first()
        item['down'] = response.css('.thumbs-rating-down::text').extract_first()
        item['tags'] = ''.join(response.css('.post-meta-category-tag a::text').extract())
        item['file_urls'] = response.css('a.download-link::attr(href)').extract()

        yield item
--------------------------------------------------------------------------------
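Once a crawl has run, the scraped metadata ends up in the MongoDB collection configured in settings.py, while the downloaded mobi/epub files land under FILES_STORE. A minimal post-crawl check sketch (the projected fields come from the item definition above; this script is not part of the repository):

```python
# Post-crawl sanity check sketch.
import pymongo

client = pymongo.MongoClient('localhost', 27017)   # MONGODB_SERVER / MONGODB_PORT
books = client['zi5book']['book']                  # MONGODB_DB / MONGODB_COLLECTION

print('books stored:', books.count_documents({}))
for book in books.find({}, {'name': 1, 'author': 1, 'ISBN': 1}).limit(5):
    print(book.get('name'), '-', book.get('author'), '-', book.get('ISBN'))
```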