├── .gitattributes
├── README.md
├── ScrapyTest01
│   ├── .idea
│   │   ├── ScrapyTest01.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── ScrapyTest01
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── items.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   └── qidian.cpython-36.pyc
│   │       └── qidian.py
│   ├── book.json
│   ├── scrapy.cfg
│   └── startScrapy.py
├── ScrapyTest02
│   ├── .idea
│   │   ├── ScrapyTest02.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── vcs.xml
│   │   └── workspace.xml
│   ├── ScrapyTest02
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── items.cpython-36.pyc
│   │   │   ├── middlewares.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   └── qidian_font.cpython-36.pyc
│   │       └── qidian_font.py
│   ├── book_info.json
│   ├── scrapy.cfg
│   └── start.py
├── ScrapyTest03
│   ├── .idea
│   │   ├── ScrapyTest03.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── vcs.xml
│   │   └── workspace.xml
│   ├── ScrapyTest03
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── items.cpython-36.pyc
│   │   │   ├── middlewares.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   └── ajax_zhihu.cpython-36.pyc
│   │       └── ajax_zhihu.py
│   ├── scrapy.cfg
│   └── start.py
├── ScrapyTest04
│   ├── .idea
│   │   ├── ScrapyTest04.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── vcs.xml
│   │   └── workspace.xml
│   ├── ScrapyTest04
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   └── doubanLogin.cpython-36.pyc
│   │       └── doubanLogin.py
│   ├── people.html
│   ├── scrapy.cfg
│   └── start.py
├── ScrapyTest05
│   ├── .idea
│   │   ├── ScrapyTest05.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── vcs.xml
│   │   └── workspace.xml
│   ├── ScrapyTest05
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   └── zhihuLogin.cpython-36.pyc
│   │       └── zhihuLogin.py
│   ├── scrapy.cfg
│   ├── start.py
│   └── zhihu.html
├── ScrapyTest06
│   ├── .idea
│   │   ├── ScrapyTest06.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── vcs.xml
│   │   └── workspace.xml
│   ├── ScrapyTest06
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   └── zhihuSelenium.cpython-36.pyc
│   │       └── zhihuSelenium.py
│   ├── scrapy.cfg
│   ├── start.py
│   └── zhihu.html
├── ScrapyTest07
│   ├── .idea
│   │   ├── ScrapyTest07.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── vcs.xml
│   │   └── workspace.xml
│   ├── ScrapyTest07
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── items.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   └── qzone.cpython-36.pyc
│   │       └── qzone.py
│   ├── scrapy.cfg
│   ├── start.py
│   └── statistical.py
├── ScrapyTest08
│   └── .idea
│       ├── ScrapyTest08.iml
│       ├── vcs.xml
│       └── workspace.xml
├── DaiLi
│   ├── .idea
│   │   ├── DaiLi.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── DaiLi
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── items.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   └── xici.cpython-36.pyc
│   │       ├── proxy.txt
│   │       └── xici.py
│   ├── proxy.txt
│   ├── scrapy.cfg
│   └── start.py
├── dytt_redis_master
│   ├── .idea
│   │   ├── dytt_redis_master.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── vcs.xml
│   │   └── workspace.xml
│   ├── dytt_redis_master
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── items.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   └── dytt_master.cpython-36.pyc
│   │       └── dytt_master.py
│   ├── scrapy.cfg
│   └── start.py
├── dytt_redis_slaver
│   ├── .idea
│   │   ├── dytt_redis_slaver.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── vcs.xml
│   │   └── workspace.xml
│   ├── dytt_redis_slaver
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── items.cpython-36.pyc
│   │   │   ├── middlewares.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   └── dytt_slaver.cpython-36.pyc
│   │       └── dytt_slaver.py
│   ├── movie.json
│   ├── scrapy.cfg
│   └── start.py
└── redis-mysql.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=python
2 | *.css linguist-language=python
3 | *.html linguist-language=python
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy_notes
2 | Scrapy study notes from a code monkey who aspires to become a programmer
3 |
4 |
5 | Zhihu column: https://zhuanlan.zhihu.com/zhiqi-scrapy
6 |
--------------------------------------------------------------------------------
/ScrapyTest01/.idea/ScrapyTest01.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/ScrapyTest01/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/ScrapyTest01/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/__init__.py
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class Scrapytest01Item(scrapy.Item):
12 | # Book title
13 | title = scrapy.Field()
14 | # Author
15 | author = scrapy.Field()
16 | # Synopsis
17 | abstract = scrapy.Field()
18 |
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class Scrapytest01SpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class Scrapytest01DownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import json
9 |
10 | class Scrapytest01Pipeline(object):
11 |
12 | def __init__(self):
13 | self.file = open('book.json', 'w')
14 |
15 | def process_item(self, item, spider):
16 | content = json.dumps(dict(item), ensure_ascii=False) + "\n"
17 | self.file.write(content)
18 | return item
19 |
20 | def close_spider(self, spider):
21 | self.file.close()
--------------------------------------------------------------------------------
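
The pipeline above writes one JSON object per line with json.dumps. For comparison, here is a minimal sketch of the same JSON-lines output using Scrapy's built-in JsonLinesItemExporter; the class name JsonLinesExportPipeline and the reuse of book.json are illustrative assumptions, not part of the project.

from scrapy.exporters import JsonLinesItemExporter

class JsonLinesExportPipeline(object):
    """Sketch: same book.json output as Scrapytest01Pipeline, via an exporter."""

    def open_spider(self, spider):
        # The exporter writes bytes, so the file is opened in binary mode.
        self.file = open('book.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

It would be enabled the same way as the original, by pointing ITEM_PIPELINES at the new class.
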
/ScrapyTest01/ScrapyTest01/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for ScrapyTest01 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'ScrapyTest01'
13 |
14 | SPIDER_MODULES = ['ScrapyTest01.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest01.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
20 |
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'ScrapyTest01.middlewares.Scrapytest01SpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'ScrapyTest01.middlewares.Scrapytest01DownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'ScrapyTest01.pipelines.Scrapytest01Pipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/spiders/__pycache__/qidian.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/spiders/__pycache__/qidian.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/spiders/qidian.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from ScrapyTest01.items import Scrapytest01Item
4 |
5 |
6 | class QidianSpider(scrapy.Spider):
7 | name = 'qidian'
8 | allowed_domains = ['qidian.com']
9 | start_urls = ['https://www.qidian.com/rank/hotsales']
10 |
11 | def parse(self, response):
12 | # with open('book.html', 'w') as f:
13 | # f.write(response.body.decode('utf-8'))
14 |
15 |
16 | # List that collects the scraped book items
17 | book_items = []
18 |
19 | # item = Scrapytest01Item()
20 | # title = response.xpath("//div[@class='book-mid-info']/h4/a/text()").extract()
21 | # author = response.xpath("//div[@class='book-mid-info']/p/a[@class='name']/text()").extract()
22 | # abstract = response.xpath("//div[@class='book-mid-info']/p[@class='intro']/text()").extract()
23 | #
24 | # item['title'] = title
25 | # item['author'] = author
26 | # item['abstract'] = abstract
27 | #
28 | # book_items.append(item)
29 |
30 | for each in response.xpath("//div[@class='book-mid-info']"):
31 | item = Scrapytest01Item()
32 | title = each.xpath("h4/a/text()").extract()[0]
33 | author = each.xpath("p/a[@class='name']/text()").extract()[0]
34 | abstract = each.xpath("p[@class='intro']/text()").extract()[0].strip()
35 | item['title'] = title
36 | item['author'] = author
37 | item['abstract'] = abstract
38 |
39 | book_items.append(item)
40 |
41 | return book_items
--------------------------------------------------------------------------------
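
parse() above collects every item into book_items and returns the whole list at the end. A common alternative is to yield each item as soon as it is built, which streams results to the pipeline instead of holding them in memory. A minimal sketch of such a variant follows; the spider class and name below are hypothetical, the XPaths are unchanged.

import scrapy
from ScrapyTest01.items import Scrapytest01Item


class QidianYieldSpider(scrapy.Spider):
    """Hypothetical variant of QidianSpider that yields items one at a time."""
    name = 'qidian_yield'
    allowed_domains = ['qidian.com']
    start_urls = ['https://www.qidian.com/rank/hotsales']

    def parse(self, response):
        # Yield each item as soon as it is built instead of returning a list.
        for each in response.xpath("//div[@class='book-mid-info']"):
            item = Scrapytest01Item()
            item['title'] = each.xpath("h4/a/text()").extract_first()
            item['author'] = each.xpath("p/a[@class='name']/text()").extract_first()
            item['abstract'] = each.xpath("p[@class='intro']/text()").extract_first('').strip()
            yield item
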
/ScrapyTest01/book.json:
--------------------------------------------------------------------------------
1 | {"title": "大王饶命", "author": "会说话的肘子", "abstract": "高中生吕树在一场车祸中改变人生,当灵气复苏时代来袭,他要做这时代的领跑者。物竞天择,胜者为王。……全订验证群号:696087569"}
2 | {"title": "太初", "author": "高楼大厦", "abstract": "一树生的万朵花,天下道门是一家。法术千般变化,人心却亘古不变"}
3 | {"title": "牧神记", "author": "宅猪", "abstract": "大墟的祖训说,天黑,别出门。大墟残老村的老弱病残们从江边捡到了一个婴儿,取名秦牧,含辛茹苦将他养大。这一天夜幕降临,黑暗笼罩大墟,秦牧走出了家门……做个春风中荡漾的反派吧!瞎子对他"}
4 | {"title": "修真聊天群", "author": "圣骑士的传说", "abstract": "某天,宋书航意外加入了一个仙侠中二病资深患者的交流群,里面的群友们都以‘道友’相称,群名片都是各种府主、洞主、真人、天师。连群主走失的宠物犬都称为大妖犬离家出走。整天聊的是炼丹、闯"}
5 | {"title": "圣墟", "author": "辰东", "abstract": "在破败中崛起,在寂灭中复苏。沧海成尘,雷电枯竭,那一缕幽雾又一次临近大地,世间的枷锁被打开了,一个全新的世界就此揭开神秘的一角……"}
6 | {"title": "汉乡", "author": "孑与2", "abstract": "我们接受了祖先的遗产,这让中华辉煌了数千年,我们是如此的心安理得,从未想过要回归那个在刀耕火种中苦苦寻找出路的时代。反哺我们苦难的祖先,并从中找到故乡的真正意义,将是本书要讲的故事"}
7 | {"title": "诡秘之主", "author": "爱潜水的乌贼", "abstract": "蒸汽与机械的浪潮中,谁能触及非凡?历史和黑暗的迷雾里,又是谁在耳语?我从诡秘中醒来,睁眼看见这个世界:枪械,大炮,巨舰,飞空艇,差分机;魔药,占卜,诅咒,倒吊人,封印物……光明依旧"}
8 | {"title": "重生之最强人生", "author": "俊秀才", "abstract": "一次跑步锻炼,殷俊从迷雾中穿出来时,已经回到了78年的香江。这一年,还没有霸占荧幕的综艺节目,没有熟悉的特效大片,也没有耳熟能详的歌曲。这一年,香江电视剧还没有好戏频出,香江电影也"}
9 | {"title": "明朝败家子", "author": "上山打老虎额", "abstract": "弘治十一年。这是一个美好的清晨。此时朱厚照初成年。此时王守仁和唐伯虎磨刀霍霍,预备科举。此时小冰河期已经来临,绵长的严寒肆虐着大地。此时在南和伯府里,地主家的傻儿子,南和伯的嫡传继"}
10 | {"title": "带着仓库到大明", "author": "迪巴拉爵士", "abstract": "方醒穿了,带着两个仓库穿了!别人穿越是带着王霸之气,方醒却是只想种田!“我只想在这个时代悠闲的活着!”坐拥大别墅,顺便教几个弟子,努力让他们往上爬,好给自己当靠山!可谁想弟子有些不"}
11 | {"title": "恶魔就在身边", "author": "汉宝", "abstract": "陈曌能召唤恶魔,能够看到死亡。“别西卜,用你暴食者的能力,为这位客户治疗一下厌食症。”“雷蒙,这位老年人想重新获得男性的能力,你懂的。”“老黑,你和我说实话,这人什么时候死,怎么死"}
12 | {"title": "凡人修仙之仙界篇", "author": "忘语", "abstract": "凡人修仙,风云再起时空穿梭,轮回逆转金仙太乙,大罗道祖三千大道,法则至尊《凡人修仙传》仙界篇,一个韩立叱咤仙界的故事,一个凡人小子修仙的不灭传说。特说明下,没有看过前传的书友,并不"}
13 | {"title": "深夜书屋", "author": "纯洁滴小龙", "abstract": "一家只在深夜开门营业的书屋,欢迎您的光临。————————《舵主群》:587980337(进群粉丝值验证)《读书群》:523978007(无需验证)《战斗群》:457654443("}
14 | {"title": "超神机械师", "author": "齐佩甲", "abstract": "韩萧,《星海》骨灰级代练,被来自东(zuo)方(zhe)的神秘力量扔进穿越大军,携带玩家面板变成NPC,回到《星海》公测之前,毅然选择难度最高的机械系。战舰列队纵横星海,星辰机甲夭"}
15 | {"title": "帝霸", "author": "厌笔萧生", "abstract": "千万年前,李七夜栽下一株翠竹。八百万年前,李七夜养了一条鲤鱼。五百万年前,李七夜收养一个小女孩。今天,李七夜一觉醒来,翠竹修练成神灵,鲤鱼化作金龙,小女孩成为九界女帝。这是一个养成"}
16 | {"title": "诸界末日在线", "author": "烟火成城", "abstract": "当游戏中的第一件法宝出现在现实世界,所有人类都为之陷入疯狂。这是最好的时代,人们进入游戏,将装备、神通、修为统统带回现实世界;这是最坏的时代,游戏中的妖魔侵入现实,实力恐怖到让人绝"}
17 | {"title": "大医凌然", "author": "志鸟村", "abstract": "医学院学生凌然有一个小目标,要成为世界上最伟大的医生,结果不小心实现了。"}
18 | {"title": "神话版三国", "author": "坟土荒草", "abstract": "陈曦看着将一块数百斤巨石撇出去的士卒,无语望苍天,这真的是东汉末年?吕布单枪匹马凿穿万人部队,这怎么看都不科学。赵子龙真心龙魂附体了,一剑断山,这真的是人?典韦单人护着曹操杀出敌营"}
19 | {"title": "斗罗大陆III龙王传说", "author": "唐家三少", "abstract": "伴随着魂导科技的进步,斗罗大陆上的人类征服了海洋,又发现了两片大陆。魂兽也随着人类魂师的猎杀无度走向灭亡,沉睡无数年的魂兽之王在星斗大森林最后的净土苏醒,它要带领仅存的族人,向人类"}
20 | {"title": "道君", "author": "跃千愁", "abstract": "一个地球神级盗墓宗师,闯入修真界的故事……桃花源里,有歌声。山外青山,白骨山。五花马,千金裘,倚天剑。应我多情,啾啾鬼鸣,美人薄嗔。天地无垠,谁家旗鼓,碧落黄泉,万古高楼。为义气争"}
21 |
--------------------------------------------------------------------------------
/ScrapyTest01/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = ScrapyTest01.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest01
12 |
--------------------------------------------------------------------------------
/ScrapyTest01/startScrapy.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | # cmdline.execute("scrapy crawl qidian -o books.json".split())
4 |
5 | cmdline.execute("scrapy crawl qidian".split())
--------------------------------------------------------------------------------
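
startScrapy.py drives the crawl through the scrapy CLI via cmdline.execute. For completeness, a sketch of the same run using CrawlerProcess directly, which avoids shelling out to the CLI; this is an alternative, not what the project itself uses.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from ScrapyTest01.spiders.qidian import QidianSpider

# Load the project settings (pipelines, user agent, ...) and run the spider.
process = CrawlerProcess(get_project_settings())
process.crawl(QidianSpider)
process.start()  # blocks until the crawl is finished
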
/ScrapyTest02/.idea/ScrapyTest02.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/ScrapyTest02/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/ScrapyTest02/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/ScrapyTest02/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__init__.py
--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class Scrapytest02Item(scrapy.Item):
12 | # Book title
13 | title = scrapy.Field()
14 | # Author
15 | author = scrapy.Field()
16 | # Book information
17 | information = scrapy.Field()
18 | # Book introduction
19 | Introduction = scrapy.Field()
20 | # Word count
21 | word_num = scrapy.Field()
22 | # Click count
23 | clicks_num = scrapy.Field()
24 | # Recommendation count
25 | recommended_num = scrapy.Field()
26 |
27 |
28 |
--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | import random
9 | import base64
10 |
11 | from ScrapyTest02.settings import USER_AGENTS
12 | from ScrapyTest02.settings import PROXIES
13 |
14 | # Pick a random User-Agent for each request
15 | class RandomUserAgent(object):
16 | def process_request(self, request, spider):
17 | useragent = random.choice(USER_AGENTS)
18 | #print useragent
19 | request.headers.setdefault("User-Agent", useragent)
20 |
21 | # class RandomProxy(object):
22 | # def process_request(self, request, spider):
23 | # proxy = random.choice(PROXIES)
24 | #
25 | # if proxy['user_passwd'] is None:
26 | # # Proxy entry without authentication credentials
27 | # request.meta['proxy'] = "http://" + proxy['ip_port']
28 | #
29 | # else:
30 | # # Base64-encode the username:password pair
31 | # base64_userpasswd = base64.b64encode(proxy['user_passwd'])
32 | # # Put it in the Proxy-Authorization header format expected by the proxy server
33 | # request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
34 | #
35 | # request.meta['proxy'] = "http://" + proxy['ip_port']
--------------------------------------------------------------------------------
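
The RandomProxy middleware above is commented out and, as written, passes a str to base64.b64encode, which only works on Python 2. A possible Python 3 version is sketched below; the PROXIES entries are the placeholder dicts from settings.py, and the credential handling assumes user_passwd holds a "user:password" string.

import base64
import random

from ScrapyTest02.settings import PROXIES

class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = "http://" + proxy['ip_port']
        if proxy.get('user_passwd'):
            # b64encode needs bytes on Python 3, and the header wants text back.
            creds = base64.b64encode(proxy['user_passwd'].encode('utf-8')).decode('ascii')
            request.headers['Proxy-Authorization'] = 'Basic ' + creds
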
/ScrapyTest02/ScrapyTest02/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | import json
10 | import pymysql
11 |
12 | class JSONPipeline(object):
13 |
14 | def __init__(self):
15 | self.file = open('book_info.json', 'w')
16 |
17 | def process_item(self, item, spider):
18 | content = json.dumps(dict(item), ensure_ascii=False) + "\n"
19 | self.file.write(content)
20 | return item
21 |
22 | def close_spider(self, spider):
23 | self.file.close()
24 |
25 | class DBPipeline(object):
26 |
27 | def __init__(self):
28 | self.connect = pymysql.connect(
29 | host='localhost',
30 | db='Scrapy',
31 | user='root',
32 | passwd='zhiqi'
33 | )
34 | # Database cursor used to run queries
35 | self.cursor = self.connect.cursor()
36 |
37 | def process_item(self, item, spider):
38 | try:
39 | # Write the item into the database
40 | self.cursor.execute("INSERT INTO qidian(title,author,information,introduction,word_num,clicks_num,recommended_num) VALUES (%s,%s,%s,%s,%s,%s,%s)",(item['title'],item['author'],item['information'],item['Introduction'],item['word_num'],item['clicks_num'],item['recommended_num']))
41 | # Commit the transaction
42 | self.connect.commit()
43 | except Exception as e:
44 | # Print the error message
45 | print(e)
46 |
47 | return item
48 |
49 | def close_spider(self, spider):
50 | # Close the cursor
51 | self.cursor.close()
52 | # Close the connection
53 | self.connect.close()
--------------------------------------------------------------------------------
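
DBPipeline assumes a qidian table already exists in the Scrapy database. The repository does not include the schema, so the script below is only a guess at one that matches the INSERT statement (column types and sizes are assumptions); the connection parameters mirror the pipeline above.

import pymysql

connect = pymysql.connect(host='localhost', db='Scrapy', user='root',
                          passwd='zhiqi', charset='utf8mb4')
cursor = connect.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS qidian (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        author VARCHAR(255),
        information TEXT,
        introduction TEXT,
        word_num VARCHAR(64),
        clicks_num VARCHAR(64),
        recommended_num VARCHAR(64)
    ) DEFAULT CHARSET=utf8mb4
""")
connect.commit()
cursor.close()
connect.close()
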
/ScrapyTest02/ScrapyTest02/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for ScrapyTest02 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'ScrapyTest02'
13 |
14 | SPIDER_MODULES = ['ScrapyTest02.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest02.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'
20 |
21 | USER_AGENTS = [
22 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
23 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
24 | 'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
25 | 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
26 | 'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
27 | 'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
28 | 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
29 | ]
30 |
31 | PROXIES = [
32 | {"ip_port" :"ip:端口", "user_passwd": ""},
33 | {"ip_port" :"ip:端口", "user_passwd": ""},
34 | {"ip_port" :"ip:端口", "user_passwd": ""},
35 | {"ip_port" :"ip:端口", "user_passwd": ""},
36 | {"ip_port" :"ip:端口", "user_passwd": ""},
37 | ]
38 |
39 | MYSQL_HOST = 'localhost'
40 | MYSQL_DBNAME = 'Scrapy'
41 | MYSQL_USER = 'root'
42 | MYSQL_PASSWD = 'zhiqi'
43 |
44 |
45 |
46 | # Obey robots.txt rules
47 | # ROBOTSTXT_OBEY = True
48 |
49 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
50 | #CONCURRENT_REQUESTS = 32
51 |
52 | # Configure a delay for requests for the same website (default: 0)
53 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
54 | # See also autothrottle settings and docs
55 | DOWNLOAD_DELAY = 2
56 | # The download delay setting will honor only one of:
57 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
58 | # CONCURRENT_REQUESTS_PER_IP = 16
59 |
60 | # Disable cookies (enabled by default)
61 | #COOKIES_ENABLED = False
62 |
63 | # Disable Telnet Console (enabled by default)
64 | #TELNETCONSOLE_ENABLED = False
65 |
66 | # Override the default request headers:
67 | #DEFAULT_REQUEST_HEADERS = {
68 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
69 | # 'Accept-Language': 'en',
70 | #}
71 |
72 | # Enable or disable spider middlewares
73 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
74 | # SPIDER_MIDDLEWARES = {
75 | # 'ScrapyTest02.middlewares.Scrapytest02SpiderMiddleware': 543,
76 | #
77 | # }
78 |
79 | # Enable or disable downloader middlewares
80 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
81 | DOWNLOADER_MIDDLEWARES = {
82 | 'ScrapyTest02.middlewares.RandomUserAgent': 543,
83 | # 'ScrapyTest02.middlewares.RandomProxy': 533,
84 | }
85 |
86 | # Enable or disable extensions
87 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
88 | #EXTENSIONS = {
89 | # 'scrapy.extensions.telnet.TelnetConsole': None,
90 | #}
91 |
92 | # Configure item pipelines
93 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
94 | ITEM_PIPELINES = {
95 | 'ScrapyTest02.pipelines.JSONPipeline': 300,
96 | 'ScrapyTest02.pipelines.DBPipeline': 280,
97 | }
98 |
99 | # Enable and configure the AutoThrottle extension (disabled by default)
100 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
101 | #AUTOTHROTTLE_ENABLED = True
102 | # The initial download delay
103 | #AUTOTHROTTLE_START_DELAY = 5
104 | # The maximum download delay to be set in case of high latencies
105 | #AUTOTHROTTLE_MAX_DELAY = 60
106 | # The average number of requests Scrapy should be sending in parallel to
107 | # each remote server
108 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
109 | # Enable showing throttling stats for every response received:
110 | #AUTOTHROTTLE_DEBUG = False
111 |
112 | # Enable and configure HTTP caching (disabled by default)
113 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
114 | #HTTPCACHE_ENABLED = True
115 | #HTTPCACHE_EXPIRATION_SECS = 0
116 | #HTTPCACHE_DIR = 'httpcache'
117 | #HTTPCACHE_IGNORE_HTTP_CODES = []
118 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
119 |
--------------------------------------------------------------------------------
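
settings.py defines MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWD, but DBPipeline hard-codes the same values. A sketch of how a pipeline could read them from the crawler settings instead; the class name is hypothetical and nothing in the project wires it up.

import pymysql

class SettingsAwareDBPipeline(object):
    def __init__(self, host, db, user, passwd):
        self.host, self.db, self.user, self.passwd = host, db, user, passwd

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection parameters out of settings.py.
        s = crawler.settings
        return cls(s.get('MYSQL_HOST'), s.get('MYSQL_DBNAME'),
                   s.get('MYSQL_USER'), s.get('MYSQL_PASSWD'))

    def open_spider(self, spider):
        self.connect = pymysql.connect(host=self.host, db=self.db,
                                       user=self.user, passwd=self.passwd)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        return item  # the INSERT logic from DBPipeline would go here

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
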
/ScrapyTest02/ScrapyTest02/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/spiders/__pycache__/qidian_font.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/spiders/__pycache__/qidian_font.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/spiders/qidian_font.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from ScrapyTest02.items import Scrapytest02Item
4 | from lxml import etree
5 | import re
6 | from fontTools.ttLib import TTFont
7 | from io import BytesIO
8 |
9 | from scrapy.linkextractors import LinkExtractor
10 | from scrapy.spiders import CrawlSpider, Rule
11 |
12 | # class QidianFontSpider(scrapy.Spider):
13 | class QidianFontSpider(CrawlSpider):
14 | name = 'qidian_font'
15 | allowed_domains = ['qidian.com', 'qidian.gtimg.com']
16 | # start_urls = ['https://book.qidian.com/info/1010191960']
17 |
18 | start_urls = []
19 | for i in range(1, 26):
20 | start_urls.append('https://www.qidian.com/rank/hotsales?page=' + str(i))
21 |
22 | rules = (
23 | Rule(LinkExtractor(allow=r'info/\d+'), callback='parse_item'),
24 | )
25 |
26 | def __init__(self,*args, **kwargs):
27 | self.WORD_TO_NUM = {"zero": "0","one": "1","two": "2","three":"3","four":"4","five":"5","six":"6","seven":"7","eight":"8","nine":"9","period":"."}
28 |
29 | super(QidianFontSpider, self).__init__(*args, **kwargs)  # a CrawlSpider subclass must call the parent __init__
30 | self.font_dic = {}
31 |
32 | def parse_item(self, response):
33 |
34 | #item = Scrapytest02Item()
35 | title = response.xpath('//div[@class="book-info "]/h1/em/text()').extract()[0]
36 | author = response.xpath('//div[@class="book-info "]//a[@class="writer"]/text()').extract()[0]
37 | information = response.xpath('//div[@class="book-info "]/p[@class="intro"]/text()').extract()[0]
38 | Introduction = response.xpath('//div[@class="book-intro"]//p/text()').extract()[0].strip()
39 | # word_num = response.xpath('//div[@class="book-info "]/p[3]/em[1]/span/text()').extract()[0]
40 | # clicks_num = response.xpath('//div[@class="book-info "]/p[3]/em[2]/span/text()').extract()[0]
41 | # recommended_num = response.xpath('//div[@class="book-info "]/p[3]/em[3]/span/text()').extract()[0]
42 | #
43 | # item['title'] = title
44 | # item['author'] = author
45 | # item['information'] = information
46 | # item['Introduction'] = Introduction
47 | # item['word_num'] = word_num
48 | # item['clicks_num'] = clicks_num
49 | # item['recommended_num'] = recommended_num
50 | #
51 | # yield item
52 |
53 | # Get the name of the anti-spider font
54 | font_style = response.xpath('//div[@class="book-info "]//style/text()').extract()[0]
55 | font_name = font_style.split(';')[0].split(':')[1].strip()
56 |
57 | html = etree.HTML(response.text)
58 | # Get the obfuscated character codes for the word count
59 | word_num_coding = self.get_coding(html, 'p[3]/em[1]/span')
60 | # Get the obfuscated character codes for the click count
61 | clicks_num_coding = self.get_coding(html, 'p[3]/em[2]/span')
62 | # Get the obfuscated character codes for the recommendation count
63 | recommended_num_coding = self.get_coding(html, 'p[3]/em[3]/span')
64 |
65 | # Temporary dict used to pass values to the callback via meta
66 | temp = {}
67 | temp['word_num_coding'] = word_num_coding
68 | temp['clicks_num_coding'] = clicks_num_coding
69 | temp['recommended_num_coding'] = recommended_num_coding
70 | temp['title'] = title
71 | temp['author'] = author
72 | temp['information'] = information
73 | temp['Introduction'] = Introduction
74 |
75 | font_link = 'https://qidian.gtimg.com/qd_anti_spider/' + font_name + '.woff'
76 | if font_link not in self.font_dic.keys():
77 | yield scrapy.Request(font_link, callback=self.parse_detial, meta=temp, dont_filter=True)
78 | else:
79 | yield self.processing_data(self.font_dic.get(font_link), temp)
80 |
81 | def get_coding(self,html, word_num_title_xpath):
82 | # Root node of the book info block
83 | root_node = html.xpath('//div[@class="book-info "]')
84 | # The tag that holds the obfuscated number
85 | num_title = root_node[0].find(word_num_title_xpath)
86 | # Serialize the tag back to a string
87 | num_text = etree.tostring(num_title).decode()
88 | # Regex-match the encoded character references
89 | groups = re.search(r'>(.*?);<', num_text)
90 | # Take the encoded number
91 | num_coding = groups[1]
92 | # Return the encoded number
93 | return num_coding
94 |
95 | def parse_detial(self, response):
96 | font = TTFont(BytesIO(response.body))
97 | cmap = font.getBestCmap()
98 | font.close()
99 | self.font_dic[response.url] = cmap
100 |
101 | return self.processing_data(cmap, response.meta)
102 |
103 | def processing_data(self, cmap, meta):
104 | word_num = self.decode_num(cmap, meta, 'word_num_coding')
105 | clicks_num = self.decode_num(cmap, meta, 'clicks_num_coding')
106 | recommended_num = self.decode_num(cmap, meta, 'recommended_num_coding')
107 | item = Scrapytest02Item()
108 | item['title'] = meta.get('title')
109 | item['author'] = meta.get('author')
110 | item['information'] = meta.get('information')
111 | item['Introduction'] = meta.get('Introduction')
112 | item['word_num'] = word_num + '万字'
113 | item['clicks_num'] = clicks_num + '万总会员点击'
114 | item['recommended_num'] = recommended_num + '万总推荐'
115 | return item
116 |
117 | def decode_num(self, cmap, meta,code_name):
118 | word_num = ''
119 | num_coding_list = meta.get(code_name).replace('&#', '').split(';')
120 | for num in num_coding_list:
121 | ch = cmap.get(int(num))
122 | word_num += self.WORD_TO_NUM[ch]
123 | return word_num
--------------------------------------------------------------------------------
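
The core of qidian_font.py is decode_num(): the page renders numbers as HTML character references whose code points only make sense inside the downloaded .woff font, where the cmap maps each code point to an English digit name. A self-contained sketch of that mapping with made-up values (the cmap entries and the encoded string below are placeholders, not real qidian data):

WORD_TO_NUM = {"zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
               "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9",
               "period": "."}

fake_cmap = {100240: "one", 100241: "period", 100242: "five"}  # placeholder cmap from a .woff
fake_coding = "&#100240;&#100241;&#100242"                     # placeholder, as captured by the regex

digits = ""
for num in fake_coding.replace('&#', '').split(';'):
    digits += WORD_TO_NUM[fake_cmap[int(num)]]

print(digits)  # -> 1.5
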
/ScrapyTest02/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = ScrapyTest02.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest02
12 |
--------------------------------------------------------------------------------
/ScrapyTest02/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute("scrapy crawl qidian_font".split())
--------------------------------------------------------------------------------
/ScrapyTest03/.idea/ScrapyTest03.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/ScrapyTest03/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/ScrapyTest03/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/ScrapyTest03/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__init__.py
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class Scrapytest03Item(scrapy.Item):
12 | # Article title
13 | title = scrapy.Field()
14 | # Author name
15 | name = scrapy.Field()
16 | # Author headline / bio
17 | headline = scrapy.Field()
18 | # Article URL
19 | url = scrapy.Field()
20 |
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class Scrapytest03SpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class Scrapytest03DownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymysql
8 |
9 | class DBPipeline(object):
10 |
11 | def __init__(self):
12 | self.connect = pymysql.connect(
13 | host='localhost',
14 | db='Scrapy',
15 | user='root',
16 | passwd='zhiqi'
17 | )
18 | # Database cursor used to run queries
19 | self.cursor = self.connect.cursor()
20 |
21 | def process_item(self, item, spider):
22 | try:
23 | # Write the item into the database
24 | self.cursor.execute("INSERT INTO ZhPyZnCom(title,author,headline,url) VALUES (%s,%s,%s,%s)",(item['title'],item['name'],item['headline'],item['url']))
25 | # Commit the transaction
26 | self.connect.commit()
27 | except Exception as e:
28 | # Print the error message
29 | print(e)
30 |
31 | return item
32 |
33 | def close_spider(self, spider):
34 | # Close the cursor
35 | self.cursor.close()
36 | # Close the connection
37 | self.connect.close()
38 |
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for ScrapyTest03 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'ScrapyTest03'
13 |
14 | SPIDER_MODULES = ['ScrapyTest03.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest03.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 |
21 | # Obey robots.txt rules
22 | #ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 2
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'ScrapyTest03.middlewares.Scrapytest03SpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'ScrapyTest03.middlewares.Scrapytest03DownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'ScrapyTest03.pipelines.DBPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/spiders/__pycache__/ajax_zhihu.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/spiders/__pycache__/ajax_zhihu.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/spiders/ajax_zhihu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import json
4 | from ScrapyTest03.items import Scrapytest03Item
5 |
6 | class AjaxZhihuSpider(scrapy.Spider):
7 |     name = 'ajax_zhihu'
8 |     allowed_domains = ['zhihu.com']
9 |     start_urls = ['https://www.zhihu.com/api/v4/columns/zimei/articles?limit=20&offset=0']
10 |
11 |     def parse(self, response):
12 |         # print(response.body)
13 |         # The API returns UTF-8 encoded JSON, so decode via response.text instead of 'gbk'
14 |         jsonBody = json.loads(response.text)
15 |         articles = jsonBody['data']
16 |         for art in articles:
17 |             item = Scrapytest03Item()
18 |             item['title'] = art['title']
19 |             item['name'] = art['author']['name']
20 |             item['headline'] = art['author']['headline']
21 |             item['url'] = art['url']
22 |             yield item
23 |
24 |         # Keep following the API's "next" page until no articles are returned
25 |         if articles:
26 |             yield scrapy.Request(jsonBody['paging']['next'], callback=self.parse)
27 |         else:
28 |             print("Finished fetching!")
29 |
--------------------------------------------------------------------------------
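A note on the pagination above: the spider stops only once `data` comes back empty. A minimal alternative sketch, assuming the Zhihu v4 API also exposes `paging.is_end` next to `paging.next` (an assumption, not verified here):

    paging = jsonBody.get('paging', {})
    if not paging.get('is_end', True) and paging.get('next'):
        yield scrapy.Request(paging['next'], callback=self.parse)

--------------------------------------------------------------------------------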
/ScrapyTest03/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = ScrapyTest03.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest03
12 |
--------------------------------------------------------------------------------
/ScrapyTest03/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute("scrapy crawl ajax_zhihu".split())
--------------------------------------------------------------------------------
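start.py drives the spider through Scrapy's command line. An equivalent sketch that runs it in-process with CrawlerProcess (same spider name, project settings loaded explicitly):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # Load the project's settings.py and run the spider in this process
    process = CrawlerProcess(get_project_settings())
    process.crawl('ajax_zhihu')
    process.start()

--------------------------------------------------------------------------------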
/ScrapyTest04/.idea/ScrapyTest04.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest04/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest04/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest04/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest04/ScrapyTest04/__init__.py
--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest04/ScrapyTest04/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest04/ScrapyTest04/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class Scrapytest04Item(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class Scrapytest04SpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class Scrapytest04DownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class Scrapytest04Pipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for ScrapyTest04 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'ScrapyTest04'
13 |
14 | SPIDER_MODULES = ['ScrapyTest04.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest04.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 |
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'ScrapyTest04.middlewares.Scrapytest04SpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'ScrapyTest04.middlewares.Scrapytest04DownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'ScrapyTest04.pipelines.Scrapytest04Pipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest04/ScrapyTest04/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/spiders/__pycache__/doubanLogin.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest04/ScrapyTest04/spiders/__pycache__/doubanLogin.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/spiders/doubanLogin.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 | from io import BytesIO
5 | from PIL import Image
6 |
7 | class DoubanloginSpider(scrapy.Spider):
8 |     name = 'doubanLogin'
9 |     allowed_domains = ['douban.com']
10 |     start_urls = ['https://accounts.douban.com/login']
11 |
12 |     def parse(self, response):
13 |         temp = {}
14 |         captcha = response.xpath('//img[@id = "captcha_image"]/@src').extract()
15 |         if captcha:
16 |             # A captcha is shown: fetch the image first
17 |             temp['captcha'] = captcha[0]
18 |             temp['response'] = response
19 |             print(captcha)
20 |             yield scrapy.Request(captcha[0], callback=self.get_captcha, meta=temp)
21 |         else:
22 |             # No captcha: submit the login form directly
23 |             yield scrapy.FormRequest.from_response(
24 |                 response,
25 |                 formdata={
26 |                     "source": "index_nav",
27 |                     # "redir": "https://www.douban.com/people/182875833/",
28 |                     "form_email": "1XXXXXXXXXXX",
29 |                     "form_password": "zXXXXXXXXXX!",
30 |                     "user_login": "登录"
31 |                 },
32 |                 callback=self.parse_page
33 |             )
34 |
35 |     def get_captcha(self, response):
36 |         # Display the captcha image so it can be solved by hand
37 |         captcha_img = Image.open(BytesIO(response.body))
38 |         captcha_img.show()
39 |
40 |         captcha_id = re.search(r'id=\w+:en', response.meta['captcha']).group(0).split('=')[1]
41 |         captcha_solution = input("Enter the captcha: ")
42 |         # Submit the login form with the captcha fields and handle the result in parse_page
43 |         yield scrapy.FormRequest.from_response(
44 |             response.meta['response'],
45 |             formdata={
46 |                 "source": "index_nav",
47 |                 # "redir": "https://www.douban.com/people/182875833/",
48 |                 "form_email": "1XXXXXXXXXX",
49 |                 "form_password": "zxXXXXXXXX!",
50 |                 "captcha-solution": captcha_solution,
51 |                 "captcha-id": captcha_id,
52 |                 "user_login": "登录"
53 |             },
54 |             callback=self.parse_page
55 |         )
56 |
57 |     def parse_page(self, response):
58 |         url = "https://www.douban.com/people/182875833/"
59 |         yield scrapy.Request(url, callback=self.parse_newpage)
60 |
61 |     def parse_newpage(self, response):
62 |         with open("people.html", 'wb') as f:
63 |             f.write(response.body)
64 |
--------------------------------------------------------------------------------
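The spider above pulls the captcha id out of the image URL with a regular expression tied to the `id=...:en` pattern. A sketch of a more general extraction using only the standard library, assuming the id is carried in the `id` query parameter of the captcha URL (the exact URL format is an assumption):

    from urllib.parse import urlparse, parse_qs

    def captcha_id_from_url(captcha_url):
        # Returns the value of the 'id' query parameter, or None if it is absent
        query = parse_qs(urlparse(captcha_url).query)
        return query.get('id', [None])[0]

--------------------------------------------------------------------------------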
/ScrapyTest04/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = ScrapyTest04.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest04
12 |
--------------------------------------------------------------------------------
/ScrapyTest04/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute("scrapy crawl doubanLogin".split())
--------------------------------------------------------------------------------
/ScrapyTest05/.idea/ScrapyTest05.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest05/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest05/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest05/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest05/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest05/ScrapyTest05/__init__.py
--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest05/ScrapyTest05/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest05/ScrapyTest05/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class Scrapytest05Item(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class Scrapytest05SpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class Scrapytest05DownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class Scrapytest05Pipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for ScrapyTest05 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'ScrapyTest05'
13 |
14 | SPIDER_MODULES = ['ScrapyTest05.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest05.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 |
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'ScrapyTest05.middlewares.Scrapytest05SpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'ScrapyTest05.middlewares.Scrapytest05DownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'ScrapyTest05.pipelines.Scrapytest05Pipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest05/ScrapyTest05/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/spiders/__pycache__/zhihuLogin.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest05/ScrapyTest05/spiders/__pycache__/zhihuLogin.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/spiders/zhihuLogin.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 |
5 | class ZhihuloginSpider(scrapy.Spider):
6 |     name = 'zhihuLogin'
7 |     allowed_domains = ['zhihu.com']
8 |     start_urls = ['https://www.zhihu.com/inbox']
9 |
10 |     # Cookies copied from a logged-in browser session (values masked)
11 |     cookies = {
12 |         'UM_distinctid': '162xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
13 |         '__DAYU_PP': 'yxxxxxxxxxxxxxxxxxxxxxd',
14 |         '_zap': 'f50cxxxxxxxxxxxxxxxxxxxxxx048',
15 |         'z_c0': '"2|xxxxxxxxxxxxxxxxxxxxx|92:MixxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxX01ORzZnWVFTMGxOdF9B|5f5bxxxxxxxxxxxxxxxxxxxxxxxxx417"',
16 |         'CNZZDATA1256793290': '59643xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx524016409',
17 |         'd_c0': '"AOxxxxxxxxxxxxxxxxxxxxxxxxxxxxx728"',
18 |         'Hm_lvt_0bd5xxxxxxxxxxxxxxxxxxxxxxxxxxxxx14379',
19 |         'q_c1': '00cxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx9756000',
20 |         '_xsrf': 'd4xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxbjP',
21 |         '__utma': '518xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx89.2',
22 |         '__utmz': '51854390.1533xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx|utmcct=/',
23 |         '__utmv': '5185xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx160510=1',
24 |         'tgw_l7_route': '156dxxxxxxxxxxxxxxxxxxxxxxxxf36',
25 |     }
26 |
27 |     # Override Spider.start_requests so the saved cookies are attached to the initial request
28 |     def start_requests(self):
29 |         for url in self.start_urls:
30 |             yield scrapy.FormRequest(url, cookies=self.cookies, callback=self.parse_page)
31 |
32 |     # Handle the response fetched with the logged-in cookies
33 |     def parse_page(self, response):
34 |         with open("zhihu.html", "wb") as filename:
35 |             filename.write(response.body)
36 |
--------------------------------------------------------------------------------
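Because the FormRequest in start_requests carries no formdata, Scrapy issues an ordinary GET; the cookies dict does the actual authentication. A minimal equivalent sketch using a plain Request:

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, cookies=self.cookies, callback=self.parse_page)

--------------------------------------------------------------------------------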
/ScrapyTest05/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = ScrapyTest05.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest05
12 |
--------------------------------------------------------------------------------
/ScrapyTest05/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute("scrapy crawl zhihuLogin".split())
--------------------------------------------------------------------------------
/ScrapyTest06/.idea/ScrapyTest06.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest06/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest06/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest06/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest06/ScrapyTest06/__init__.py
--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest06/ScrapyTest06/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest06/ScrapyTest06/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class Scrapytest06Item(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class Scrapytest06SpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class Scrapytest06DownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class Scrapytest06Pipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for ScrapyTest06 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'ScrapyTest06'
13 |
14 | SPIDER_MODULES = ['ScrapyTest06.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest06.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 |
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'ScrapyTest06.middlewares.Scrapytest06SpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'ScrapyTest06.middlewares.Scrapytest06DownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'ScrapyTest06.pipelines.Scrapytest06Pipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest06/ScrapyTest06/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/spiders/__pycache__/zhihuSelenium.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest06/ScrapyTest06/spiders/__pycache__/zhihuSelenium.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/spiders/zhihuSelenium.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from selenium import webdriver
4 | from selenium.webdriver import ActionChains
5 | import time
6 | import base64
7 | from io import BytesIO
8 | from PIL import Image
9 | import random
10 | import re
11 |
12 |
13 | class ZhihuseleniumSpider(scrapy.Spider):
14 |     name = 'zhihuSelenium'
15 |     allowed_domains = ['www.zhihu.com']
16 |     start_urls = ['https://www.zhihu.com/inbox']
17 |
18 |     def start_requests(self):
19 |         # chrome_options = webdriver.ChromeOptions()
20 |         # chrome_options.add_argument('--headless')
21 |         # browser = webdriver.Chrome(chrome_options=chrome_options)
22 |         browser = webdriver.Chrome()
23 |         browser.set_window_size(100, 100)
24 |         browser.get('https://www.zhihu.com/signup')
25 |         # Switch from the sign-up tab to the login tab
26 |         browser.find_element_by_xpath("//div[@class ='SignContainer-switch']/span").click()
27 |         # Enter the account name
28 |         browser.find_element_by_name("username").send_keys("1XXXXXXXXXX")
29 |         # Enter the password
30 |         browser.find_element_by_name("password").send_keys("XXXXXXXXXXXXXXX")
31 |         # Check whether a captcha is shown
32 |         Captcha_element = browser.find_element_by_xpath("//form[@class='SignFlow']/div[3]//img")
33 |         Captcha_base64 = Captcha_element.get_attribute('src')
34 |         print(Captcha_base64)
35 |         # If there is a captcha:
36 |         if Captcha_base64 != 'data:image/jpg;base64,null':
37 |             # Decode and display the captcha image
38 |             img_data1 = Captcha_base64.split(',')[-1]
39 |             data1 = base64.b64decode(img_data1)
40 |             image = Image.open(BytesIO(data1))
41 |             image.show()
42 |             Captcha_type = Captcha_element.get_attribute('class')
43 |             # English captcha: type the characters into the input box
44 |             if Captcha_type == 'Captcha-englishImg':
45 |                 Captcha = input('Enter the characters shown in the captcha: ')
46 |                 browser.find_element_by_name("captcha").send_keys(Captcha)
47 |             # Otherwise it is the Chinese captcha: click the upside-down characters by coordinates
48 |             else:
49 |                 # Each character is roughly (160.5 - 5.5) / 7 = 22 px wide
50 |                 # and spans roughly 13.5-35.5 px vertically
51 |                 handstand = input('Enter the positions of the upside-down characters (comma separated): ')
52 |                 handstand_serial_nums = handstand.split(',')
53 |                 for handstand_serial_num in handstand_serial_nums:
54 |                     x = 5.5 + (int(handstand_serial_num) - 1) * 22 + random.uniform(10, 20)  # add a random offset
55 |                     y = random.uniform(15, 30)
56 |                     click_pos = (x, y)
57 |                     print(click_pos)
58 |                     ActionChains(browser).move_to_element_with_offset(Captcha_element, x, y).perform()
59 |                     ActionChains(browser).click().perform()
60 |         # Click the login button
61 |         browser.find_element_by_xpath("//button[@type='submit']").click()
62 |         time.sleep(2)  # wait for the post-login redirect
63 |         print(browser.title)
64 |         # If the page title contains "首页" (home page), the login succeeded
65 |         if re.search(r'首页', browser.title):
66 |             print('Login succeeded!')
67 |             cookies = browser.get_cookies()
68 |             browser.close()
69 |             for url in self.start_urls:
70 |                 yield scrapy.FormRequest(url, cookies=cookies, callback=self.parse_page)
71 |         else:
72 |             print("Login failed!")
73 |
74 |     # Handle the response fetched with the logged-in cookies
75 |     def parse_page(self, response):
76 |         with open("zhihu.html", "wb") as filename:
77 |             filename.write(response.body)
--------------------------------------------------------------------------------
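Selenium's get_cookies() returns a list of dicts with keys such as 'name' and 'value' (plus browser-specific extras). Scrapy accepts a list of cookie dicts directly, but a plain name-to-value mapping avoids any surprises from the extra keys. A minimal sketch:

    # Convert Selenium's cookie list into a plain {name: value} dict for Scrapy
    cookies = {c['name']: c['value'] for c in browser.get_cookies()}
    yield scrapy.Request(url, cookies=cookies, callback=self.parse_page)

--------------------------------------------------------------------------------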
/ScrapyTest06/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = ScrapyTest06.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest06
12 |
--------------------------------------------------------------------------------
/ScrapyTest06/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute("scrapy crawl zhihuSelenium".split())
--------------------------------------------------------------------------------
/ScrapyTest06/zhihu.html:
--------------------------------------------------------------------------------
[Saved snapshot of the Zhihu private-messages page — page title: 私信 - 知乎 (Private Messages - Zhihu); the HTML markup was stripped from this export.]
--------------------------------------------------------------------------------
/ScrapyTest07/.idea/ScrapyTest07.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest07/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest07/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest07/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/__init__.py
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class Scrapytest07Item(scrapy.Item):
12 |     # Post ("shuoshuo") content
13 |     content = scrapy.Field()
14 |     # Publication time
15 |     created_time = scrapy.Field()
16 |     # Location name
17 |     location_name = scrapy.Field()
18 |     # Longitude
19 |     location_pos_x = scrapy.Field()
20 |     # Latitude
21 |     location_pos_y = scrapy.Field()
22 |     # Device the post was made from
23 |     source_name = scrapy.Field()
24 |
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class Scrapytest07SpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class Scrapytest07DownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import pymysql
9 |
10 |
11 | class DBPipeline(object):
12 | def __init__(self):
13 | self.connect = pymysql.connect(
14 | host='localhost',
15 | db='Scrapy',
16 | user='root',
17 | passwd='zhiqi'
18 | )
19 |         # database cursor used to execute SQL statements
20 | self.cursor = self.connect.cursor()
21 |
22 | def process_item(self, item, spider):
23 | try:
24 |             # insert the record into the my_qzone table
25 |             self.cursor.execute("INSERT INTO my_qzone (content,created_time,location_name,location_pos_x,location_pos_y,source_name) VALUES (%s,%s,%s,%s,%s,%s)",(item['content'],item['created_time'],item['location_name'],item['location_pos_x'],item['location_pos_y'],item['source_name']))
26 |             self.connect.commit()  # commit the insert
27 | except Exception as e:
28 |             # print the error message
29 | print(e)
30 |
31 | return item
32 |
33 | def close_spider(self, spider):
34 |         # close the cursor
35 |         self.cursor.close()
36 |         # close the connection
37 | self.connect.close()
38 |
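
The INSERT above assumes a my_qzone table already exists in the Scrapy database. A minimal sketch of a matching schema, created here via pymysql; the column types and the leading id column are assumptions inferred from the INSERT statement and from the way statistical.py indexes rows, not taken from the repository:

import pymysql

# Assumed schema (types are guesses); statistical.py reads columns 1..6, so an auto-increment id is assumed at index 0.
conn = pymysql.connect(host='localhost', user='root', passwd='zhiqi', db='Scrapy', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS my_qzone (
            id INT AUTO_INCREMENT PRIMARY KEY,
            content TEXT,
            created_time VARCHAR(32),
            location_name VARCHAR(128),
            location_pos_x VARCHAR(64),
            location_pos_y VARCHAR(64),
            source_name VARCHAR(128)
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()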
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for ScrapyTest07 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'ScrapyTest07'
13 |
14 | SPIDER_MODULES = ['ScrapyTest07.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest07.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 |
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'ScrapyTest07.middlewares.Scrapytest07SpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'ScrapyTest07.middlewares.Scrapytest07DownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'ScrapyTest07.pipelines.DBPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/spiders/__pycache__/qzone.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/spiders/__pycache__/qzone.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/spiders/qzone.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from selenium import webdriver
4 | import time
5 | import json
6 | from ScrapyTest07.items import Scrapytest07Item
7 |
8 | class QzoneSpider(scrapy.Spider):
9 | name = 'qzone'
10 | allowed_domains = ['qq.com']
11 |
12 | def __init__(self):
13 | self.cookies = ''
14 | self.page_num = 0
15 | self.g_tk = ''
16 |
17 | def start_requests(self):
18 | browser = webdriver.Chrome()
19 | browser.get('https://user.qzone.qq.com')
20 |         # the login form is inside an iframe, so switch to that frame first
21 | browser.switch_to.frame('login_frame')
22 | browser.find_element_by_id('switcher_plogin').click()
23 |         browser.find_element_by_id('u').send_keys('QQ')  # placeholder: your QQ number
24 |         browser.find_element_by_id('p').send_keys('密码')  # placeholder ('密码' = password): your QQ password
25 | browser.find_element_by_id('login_button').click()
26 | time.sleep(2)
27 | try:
28 |             # fetch the slider-captcha images
29 | bg_link = browser.find_element_by_id('slideBkg').get_attribute('src')
30 | block_link = browser.find_element_by_id('slideBlock').get_attribute('src')
31 | time.sleep(10)
32 | print(bg_link, block_link)
33 |         except Exception:
34 | pass
35 |         # obtain g_tk from the login cookies
36 |         cookie = {}  # cookie name -> value
37 |         self.cookies = browser.get_cookies()
38 |         for elem in self.cookies:  # collect the cookies
39 |             cookie[elem['name']] = elem['value']
40 |
41 |         self.g_tk = self.getGTK(cookie)  # compute g_tk with getGTK()
42 | browser.close()
43 | # https://user.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=QQ号&pos=0&num=20&g_tk=438032980
44 | start_url = 'https://user.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=QQ号&pos=0&num=20&g_tk=' + str(self.g_tk)
45 | yield scrapy.Request(start_url, cookies=self.cookies, callback=self.get_msg)
46 |
47 | def getGTK(self, cookie):
48 | hashes = 5381
49 | for letter in cookie['p_skey']:
50 | hashes += (hashes << 5) + ord(letter)
51 | return hashes & 0x7fffffff
52 |
53 | def get_msg(self, response):
54 |         response_fix = response.body.decode('utf-8')[10:-2]  # strip the JSONP callback wrapper around the JSON payload
55 | # print(response_fix)
56 | jsonBody = json.loads(response_fix)
57 | msglist = jsonBody['msglist']
58 |         if msglist is not None:
59 | for msg in msglist:
60 | item = Scrapytest07Item()
61 | item['content'] = msg['content']
62 |                 # convert the Unix timestamp to local time
63 |                 time_local = time.localtime(int(msg['created_time']))
64 |                 # reformat as YYYY-MM-DD HH:MM:SS
65 |                 dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
66 | item['created_time'] = dt
67 | item['location_name'] = msg['lbs']['name']
68 | item['location_pos_x'] = msg['lbs']['pos_x']
69 | item['location_pos_y'] = msg['lbs']['pos_y']
70 | item['source_name'] = msg['source_name']
71 | print(item)
72 | yield item
73 | self.page_num += 1
74 | url = 'https://user.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=QQ号&pos=' + str(self.page_num * 20) + '&num=20&g_tk=' + str(self.g_tk)
75 | yield scrapy.Request(url, cookies=self.cookies, callback=self.get_msg)
76 | else:
77 |             print('All posts fetched!')
78 |
79 |
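
getGTK() above is the usual QZone g_tk hash computed over the p_skey cookie; the emotion_cgi_msglist_v6 request is rejected without a matching g_tk. A tiny standalone check of the same hash (the cookie value below is made up):

def g_tk_of(p_skey):
    hashes = 5381
    for letter in p_skey:
        hashes += (hashes << 5) + ord(letter)
    return hashes & 0x7fffffff

print(g_tk_of('AbCdEfGh'))  # hypothetical p_skey value; prints the derived g_tk integer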
--------------------------------------------------------------------------------
/ScrapyTest07/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = ScrapyTest07.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest07
12 |
--------------------------------------------------------------------------------
/ScrapyTest07/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute("scrapy crawl qzone".split())
4 |
--------------------------------------------------------------------------------
/ScrapyTest07/statistical.py:
--------------------------------------------------------------------------------
1 | from pyecharts import Bar
2 | from pyecharts import Line
3 | from pyecharts import WordCloud
4 | from pyecharts import Geo
5 | from pyecharts import Pie
6 | from pyecharts.engine import create_default_environment
7 | import pymysql
8 | import jieba
9 | import re
10 |
11 | db = pymysql.connect(host='localhost', user='root', password='zhiqi', database='Scrapy')
12 | cursor = db.cursor()
13 |
14 | sql = "SELECT * FROM my_qzone"
15 |
16 | contents = []
17 | years = []
18 | months = []
19 | days = []
20 | hours = []
21 | locations = {}
22 | source_names = []
23 |
24 | try:
25 | cursor.execute(sql)
26 | result = cursor.fetchall()
27 | for row in result:
28 | contents.append(row[1])
29 | years.append(row[2].split(' ')[0].split('-')[0])
30 | months.append(row[2].split(' ')[0].split('-')[1])
31 | days.append(row[2].split(' ')[0].split('-')[2])
32 | hours.append(row[2].split(' ')[1].split(':')[0])
33 | locations[row[3]] = [row[4], row[5]]
34 | if row[6] != '':
35 | source_names.append(row[6])
36 | except Exception as e:
37 |     # print the error message
38 | print(e)
39 |
40 | cursor.close()
41 | db.close()
42 |
43 |
44 | # concatenate all posts into a single string
45 | str_content = ''
46 | for content in contents:
47 |     str_content += content
48 | str_content = ''.join(re.findall(u'[\u4e00-\u9fff]+', str_content))  # keep only the Chinese characters
49 | words = jieba.cut(str_content)
50 | stop_list = ['的', '了', '我', '是', '不',
51 | '你', '都', '就', '在', '也',
52 | '有', '去', '好', '说', '到',
53 | '又', '要', '这', '还', '啊',
54 | '吧', '给', '和', '人', '来',
55 | '被', '上', '没', '会', '能',
56 | '着', '多', '他', '一', '年',
57 | '看', '很', '谁', '再', '为',
58 | ]
59 | result_word_list = []
60 | for word in words:
61 | if word not in stop_list:
62 | result_word_list.append(word)
63 |
64 | geo = Geo(
65 | "位置信息",
66 | title_color="#fff",
67 | title_pos="center",
68 | width=1200,
69 | height=600,
70 | background_color="#404a59",
71 | )
72 |
73 | sorted_list = set(list(locations.keys()))
74 | list_infos = {}
75 | for info in sorted_list:
76 | list_infos[info] = list(locations.keys()).count(info)
77 |
78 | attr, value = geo.cast(list_infos)
79 | geo.add(
80 | "",
81 | attr,
82 | value,
83 | geo_cities_coords=locations
84 | )
85 | geo.render()
86 |
87 |
88 | def get_chart(original_list, form_type, table_name, series_name):
89 | sorted_list = sorted(list(set(original_list)))
90 | list_infos = {}
91 | for info in sorted_list:
92 | list_infos[info] = original_list.count(info)
93 | chart = form_type(table_name)
94 | chart.add(series_name, list(list_infos.keys()), list(list_infos.values()))
95 | return chart
96 |
97 |
98 | def drawing(chart, path, file_type='html'):
99 |     env = create_default_environment(file_type)
100 |     # create_default_environment() builds a pyecharts rendering environment for the chosen output format
101 |     # file_type: 'html', 'svg', 'png', 'jpeg', 'gif' or 'pdf'
102 | env.render_chart_to_file(chart, path=path)
103 |
104 | drawing(get_chart(years, Bar, '年发表统计图', '发表数'), 'years.html')
105 | drawing(get_chart(months, Bar, '月发表统计图', '发表数'), 'months.html')
106 | drawing(get_chart(days, Bar, '日发表统计图', '发表数'), 'days.html')
107 | drawing(get_chart(hours, Bar, '小时发表统计图', '发表数'), 'hours.html')
108 | drawing(get_chart(result_word_list, WordCloud, '', ''), 'word.html')
109 | drawing(get_chart(source_names, Pie, '', ''), 'source.html')
110 |
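
A compatibility note: the top-level Bar/Line/WordCloud/Geo/Pie imports and pyecharts.engine.create_default_environment used above belong to the pyecharts 0.x API; pyecharts 1.x moved the chart classes into pyecharts.charts and dropped this engine module. A quick way to confirm which release is installed (assuming a pip-installed pyecharts):

import pkg_resources

# The script above expects a 0.x release (e.g. pyecharts 0.5.x -- the exact pin is an assumption).
print(pkg_resources.get_distribution('pyecharts').version)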
--------------------------------------------------------------------------------
/ScrapyTest08/.idea/ScrapyTest08.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/.idea/DaiLi.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/__init__.py
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DailiItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | proxy = scrapy.Field()
14 |
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class DailiSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class DailiDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class DailiPipeline(object):
10 |
11 | def __init__(self):
12 | self.file = open('proxy.txt', 'w')
13 |
14 | def process_item(self, item, spider):
15 | self.file.write(str(item['proxy']) + '\n')
16 | return item
17 |
18 | def close_spider(self, spider):
19 | self.file.close()
20 |
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for DaiLi project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'DaiLi'
13 |
14 | SPIDER_MODULES = ['DaiLi.spiders']
15 | NEWSPIDER_MODULE = 'DaiLi.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 |
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 1
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'DaiLi.middlewares.DailiSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'DaiLi.middlewares.DailiDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'DaiLi.pipelines.DailiPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/spiders/__pycache__/xici.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/spiders/__pycache__/xici.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/spiders/proxy.txt:
--------------------------------------------------------------------------------
1 | {'http': 'http://106.75.9.39:8080', 'https': 'http://106.75.9.39:8080'}
2 | {'http': 'http://61.135.217.7:80', 'https': 'http://61.135.217.7:80'}
3 | {'http': 'http://118.190.95.35:9001', 'https': 'http://118.190.95.35:9001'}
4 | {'http': 'http://139.224.118.25:3128', 'https': 'http://139.224.118.25:3128'}
5 | {'http': 'http://182.38.14.237:8118', 'https': 'http://182.38.14.237:8118'}
6 |
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/spiders/xici.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from DaiLi.items import DailiItem
4 | import requests
5 |
6 |
7 | class XiciSpider(scrapy.Spider):
8 | name = 'xici'
9 | allowed_domains = ['xicidaili.com']
10 | start_urls = []
11 | for i in range(1, 6):
12 | start_urls.append('http://www.xicidaili.com/nn/' + str(i))
13 |
14 | def parse(self, response):
15 | ip = response.xpath('//tr[@class]/td[2]/text()').extract()
16 | port = response.xpath('//tr[@class]/td[3]/text()').extract()
17 | agreement_type = response.xpath('//tr[@class]/td[6]/text()').extract()
18 | proxies = zip(ip, port, agreement_type)
19 | # print(proxies)
20 |
21 |         # check whether each proxy actually works
22 | for ip, port, agreement_type in proxies:
23 | proxy = {'http': agreement_type.lower() + '://' + ip + ':' + port,
24 | 'https': agreement_type.lower() + '://' + ip + ':' + port}
25 | try:
26 |                 # send a test request through the proxy; a 200 status code means it is usable
27 | print(proxy)
28 | resp = requests.get('http://icanhazip.com', proxies=proxy, timeout=2)
29 | print(resp.status_code)
30 | if resp.status_code == 200:
31 | print(resp.text)
32 | # print('success %s' % ip)
33 | item = DailiItem()
34 | item['proxy'] = proxy #agreement_type + '://' + ip + ':' + port
35 | #print(item['proxy'])
36 | yield item
37 |             except Exception:
38 | print('fail %s' % ip)
39 |
40 |
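
Because DailiPipeline writes each proxy with str(), every line of proxy.txt is a Python dict literal (as the dumps above show). A minimal sketch of loading those entries back for reuse:

import ast

with open('proxy.txt') as f:
    proxies = [ast.literal_eval(line) for line in f if line.strip()]

print(proxies[0])  # e.g. {'http': 'http://106.75.9.39:8080', 'https': 'http://106.75.9.39:8080'}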
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/proxy.txt:
--------------------------------------------------------------------------------
1 | {'http': 'http://106.75.9.39:8080', 'https': 'http://106.75.9.39:8080'}
2 | {'http': 'http://61.135.217.7:80', 'https': 'http://61.135.217.7:80'}
3 | {'http': 'http://118.190.95.43:9001', 'https': 'http://118.190.95.43:9001'}
4 | {'http': 'http://118.190.95.35:9001', 'https': 'http://118.190.95.35:9001'}
5 |
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = DaiLi.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = DaiLi
12 |
--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute("scrapy crawl xici".split())
4 |
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/.idea/dytt_redis_master.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/__init__.py
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DyttRedisMasterItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | url = scrapy.Field()
14 |
15 |
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class DyttRedisMasterSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class DyttRedisMasterDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import redis
9 |
10 | class DyttRedisMasterPipeline(object):
11 | def __init__(self):
12 |         # connection parameters for Redis
13 |         self.REDIS_HOST = '127.0.0.1'
14 |         self.REDIS_PORT = 6379
15 |         # connect to Redis
16 |         self.r = redis.Redis(host=self.REDIS_HOST, port=self.REDIS_PORT)
17 |
18 | def process_item(self, item, spider):
19 |         # push the URL to be crawled onto the Redis list
20 | self.r.lpush('dytt:start_urls', item['url'])
21 | return item
22 |
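
This pipeline only seeds Redis: every category URL found by the master is pushed onto dytt:start_urls. On the slaver side, a scrapy-redis spider can consume that list by setting redis_key; a minimal sketch (the class name and parse body are illustrative, not the project's actual slaver spider):

from scrapy_redis.spiders import RedisSpider

class DyttDetailSpider(RedisSpider):      # hypothetical name
    name = 'dytt_detail'
    redis_key = 'dytt:start_urls'         # the list the master pipeline pushes to

    def parse(self, response):
        # each response here comes from a URL popped off the Redis list
        yield {'url': response.url}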
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for dytt_redis_master project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'dytt_redis_master'
13 |
14 | SPIDER_MODULES = ['dytt_redis_master.spiders']
15 | NEWSPIDER_MODULE = 'dytt_redis_master.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'dytt_redis_master (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'dytt_redis_master.middlewares.DyttRedisMasterSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'dytt_redis_master.middlewares.DyttRedisMasterDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'dytt_redis_master.pipelines.DyttRedisMasterPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/__pycache__/dytt_master.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/__pycache__/dytt_master.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/dytt_master.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from dytt_redis_master.items import DyttRedisMasterItem
6 |
7 | class DyttMasterSpider(CrawlSpider):
8 | name = 'dytt_master'
9 | allowed_domains = ['dy2018.com']
10 | start_urls = ['https://www.dy2018.com/0/']
11 |
12 | rules = (
13 | Rule(LinkExtractor(allow=r'/\d{1,2}/$'), callback='parse_item'),
14 | )
15 |
16 | def parse_item(self, response):
17 | # print(response.url)
18 | items = DyttRedisMasterItem()
19 | items['url'] = response.url
20 | yield items
21 |
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = dytt_redis_master.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dytt_redis_master
12 |
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute("scrapy crawl dytt_master".split())
4 |
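
Once the master has run, the queued category URLs can be inspected directly in Redis; a quick sketch, assuming the local instance configured in the pipeline above:

import redis

r = redis.Redis(host='127.0.0.1', port=6379)
print(r.llen('dytt:start_urls'))             # how many category URLs are queued
print(r.lrange('dytt:start_urls', 0, 4))     # peek at the first few entries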
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/.idea/dytt_redis_slaver.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__init__.py
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DyttRedisSlaverItem(scrapy.Item):
12 |     # film title
13 |     name = scrapy.Field()
14 |     # year
15 |     year = scrapy.Field()
16 |     # language
17 |     language = scrapy.Field()
18 |     # genre
19 |     movie_type = scrapy.Field()
20 |     # release date
21 |     release_date = scrapy.Field()
22 |     # rating
23 |     score = scrapy.Field()
24 |     # file size
25 |     file_size = scrapy.Field()
26 |     # running time
27 |     film_time = scrapy.Field()
28 |     # synopsis
29 |     introduction = scrapy.Field()
30 |     # poster
31 |     posters = scrapy.Field()
32 |     # download link
33 |     download_link = scrapy.Field()
34 |     # UTC crawl time
35 |     crawled = scrapy.Field()
36 |     # spider name
37 |     spider = scrapy.Field()
38 |
39 |
40 |
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 | import random
11 | import base64
12 | from dytt_redis_slaver.settings import USER_AGENTS
13 | from dytt_redis_slaver.settings import PROXIES
14 |
15 | # pick a random User-Agent for each request
16 | class RandomUserAgent(object):
17 |     def process_request(self, request, spider):
18 |         useragent = random.choice(USER_AGENTS)
19 |         # print(useragent)
20 |         request.headers.setdefault("User-Agent", useragent)
21 |
22 | # pick a random proxy for each request
23 | class RandomProxy(object):
24 |     def process_request(self, request, spider):
25 |         proxy = random.choice(PROXIES)
26 |         if proxy['user_passwd'] is None:
27 |             # proxy without authentication
28 |             request.meta['proxy'] = "http://" + proxy['ip_port']
29 |
30 |         else:
31 |             # base64-encode the credentials (encode to bytes for b64encode, decode back to str)
32 |             base64_userpasswd = base64.b64encode(proxy['user_passwd'].encode()).decode()
33 |             # pass them in the Proxy-Authorization header expected by the proxy server
34 |             request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
35 |             request.meta['proxy'] = "http://" + proxy['ip_port']
36 |
37 | class DyttRedisSlaverSpiderMiddleware(object):
38 | # Not all methods need to be defined. If a method is not defined,
39 | # scrapy acts as if the spider middleware does not modify the
40 | # passed objects.
41 |
42 | @classmethod
43 | def from_crawler(cls, crawler):
44 | # This method is used by Scrapy to create your spiders.
45 | s = cls()
46 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
47 | return s
48 |
49 | def process_spider_input(self, response, spider):
50 | # Called for each response that goes through the spider
51 | # middleware and into the spider.
52 |
53 | # Should return None or raise an exception.
54 | return None
55 |
56 | def process_spider_output(self, response, result, spider):
57 | # Called with the results returned from the Spider, after
58 | # it has processed the response.
59 |
60 | # Must return an iterable of Request, dict or Item objects.
61 | for i in result:
62 | yield i
63 |
64 | def process_spider_exception(self, response, exception, spider):
65 | # Called when a spider or process_spider_input() method
66 | # (from other spider middleware) raises an exception.
67 |
68 | # Should return either None or an iterable of Response, dict
69 | # or Item objects.
70 | pass
71 |
72 | def process_start_requests(self, start_requests, spider):
73 | # Called with the start requests of the spider, and works
74 | # similarly to the process_spider_output() method, except
75 | # that it doesn’t have a response associated.
76 |
77 | # Must return only requests (not items).
78 | for r in start_requests:
79 | yield r
80 |
81 | def spider_opened(self, spider):
82 | spider.logger.info('Spider opened: %s' % spider.name)
83 |
84 |
85 | class DyttRedisSlaverDownloaderMiddleware(object):
86 | # Not all methods need to be defined. If a method is not defined,
87 | # scrapy acts as if the downloader middleware does not modify the
88 | # passed objects.
89 |
90 | @classmethod
91 | def from_crawler(cls, crawler):
92 | # This method is used by Scrapy to create your spiders.
93 | s = cls()
94 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
95 | return s
96 |
97 | def process_request(self, request, spider):
98 | # Called for each request that goes through the downloader
99 | # middleware.
100 |
101 | # Must either:
102 | # - return None: continue processing this request
103 | # - or return a Response object
104 | # - or return a Request object
105 | # - or raise IgnoreRequest: process_exception() methods of
106 | # installed downloader middleware will be called
107 | return None
108 |
109 | def process_response(self, request, response, spider):
110 | # Called with the response returned from the downloader.
111 |
112 | # Must either;
113 | # - return a Response object
114 | # - return a Request object
115 | # - or raise IgnoreRequest
116 | return response
117 |
118 | def process_exception(self, request, exception, spider):
119 | # Called when a download handler or a process_request()
120 | # (from other downloader middleware) raises an exception.
121 |
122 | # Must either:
123 | # - return None: continue processing this exception
124 | # - return a Response object: stops process_exception() chain
125 | # - return a Request object: stops process_exception() chain
126 | pass
127 |
128 | def spider_opened(self, spider):
129 | spider.logger.info('Spider opened: %s' % spider.name)
130 |
131 |
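
RandomUserAgent and RandomProxy only run once they are registered as downloader middlewares in the slaver's settings. A sketch of a typical registration (the priority numbers are assumptions; the project's own settings may differ):

# dytt_redis_slaver/settings.py (illustrative priorities)
DOWNLOADER_MIDDLEWARES = {
    'dytt_redis_slaver.middlewares.RandomUserAgent': 543,
    'dytt_redis_slaver.middlewares.RandomProxy': 553,
}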
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import json
9 | from datetime import datetime
10 |
11 | class DyttRedisSlaverPipeline(object):
12 | def __init__(self):
13 | self.file = open('movie.json', 'w')
14 |
15 | def process_item(self, item, spider):
16 | content = json.dumps(dict(item), ensure_ascii=False) + "\n"
17 | self.file.write(content)
18 | return item
19 |
20 | def close_spider(self, spider):
21 | self.file.close()
22 |
23 |
24 | class InfoPipeline(object):
25 |
26 | def process_item(self, item, spider):
27 |         # utcnow() returns the current UTC time
28 |         item["crawled"] = datetime.utcnow()
29 |         # spider name
30 |         item["spider"] = spider.name
31 | return item
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for dytt_redis_slaver project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'dytt_redis_slaver'
13 |
14 | SPIDER_MODULES = ['dytt_redis_slaver.spiders']
15 | NEWSPIDER_MODULE = 'dytt_redis_slaver.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | # USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 |
21 | USER_AGENTS = [
22 | 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
23 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
24 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
25 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
26 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
27 | 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
28 |
29 | ]
30 |
31 | PROXIES = [
32 | {'ip_port': '118.190.95.43:9001', "user_passwd": None},
33 | {'ip_port': '61.135.217.7:80', "user_passwd": None},
34 | {'ip_port': '118.190.95.35:9001', "user_passwd": None},
35 | ]
36 |
37 |
38 |
39 | # Use the scheduler provided by scrapy-redis
40 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
41 | # Use the scrapy-redis request-fingerprint dupe filter
42 | DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
43 | 
44 | # Queue class used to order the URLs to crawl.
45 | # Default: priority queue (Scrapy's default behaviour), backed by a Redis sorted set, i.e. neither FIFO nor LIFO.
46 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
47 | # Optional: first in, first out (FIFO)
48 | # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
49 | # Optional: last in, first out (LIFO)
50 | # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'
51 | 
52 | # Keep the scrapy-redis queues in Redis so a crawl can be paused and resumed (i.e. the Redis queues are not cleared)
53 | SCHEDULER_PERSIST = True
54 | 
55 | # Only meaningful with SpiderQueue or SpiderStack: maximum idle time before the spider is closed
56 | # SCHEDULER_IDLE_BEFORE_CLOSE = 10
57 |
58 |
59 |
60 | # Obey robots.txt rules
61 | # ROBOTSTXT_OBEY = True
62 |
63 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
64 | # CONCURRENT_REQUESTS = 32
65 |
66 | # Configure a delay for requests for the same website (default: 0)
67 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
68 | # See also autothrottle settings and docs
69 | DOWNLOAD_DELAY = 2
70 | # The download delay setting will honor only one of:
71 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
72 | #CONCURRENT_REQUESTS_PER_IP = 16
73 |
74 | # Disable cookies (enabled by default)
75 | #COOKIES_ENABLED = False
76 |
77 | # Disable Telnet Console (enabled by default)
78 | #TELNETCONSOLE_ENABLED = False
79 |
80 | # Override the default request headers:
81 | #DEFAULT_REQUEST_HEADERS = {
82 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 | # 'Accept-Language': 'en',
84 | #}
85 |
86 | # Enable or disable spider middlewares
87 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
88 | #SPIDER_MIDDLEWARES = {
89 | # 'dytt_redis_slaver.middlewares.DyttRedisSlaverSpiderMiddleware': 543,
90 | #}
91 |
92 | # Enable or disable downloader middlewares
93 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
94 | DOWNLOADER_MIDDLEWARES = {
95 | 'dytt_redis_slaver.middlewares.RandomUserAgent': 543,
96 | # 'dytt_redis_slaver.middlewares.RandomProxy': 553,
97 | }
98 |
99 | # Enable or disable extensions
100 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
101 | #EXTENSIONS = {
102 | # 'scrapy.extensions.telnet.TelnetConsole': None,
103 | #}
104 |
105 | # Configure item pipelines
106 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
107 | # RedisPipeline writes each item into the Redis list keyed '<spider.name>:items', where it can be picked up later for distributed processing
108 | ITEM_PIPELINES = {
109 |     # 'dytt_redis_slaver.pipelines.DyttRedisSlaverPipeline': 300,
110 |     'dytt_redis_slaver.pipelines.InfoPipeline': 350,
111 |     'scrapy_redis.pipelines.RedisPipeline': 400
112 | }
113 |
114 | # Redis connection parameters
115 | REDIS_HOST = '127.0.0.1'
116 | REDIS_PORT = 6379
117 |
118 | # By default, RFPDupeFilter logs only the first duplicate request; set DUPEFILTER_DEBUG to True to log every duplicate.
119 | DUPEFILTER_DEBUG = True
120 |
121 |
122 | # Enable and configure the AutoThrottle extension (disabled by default)
123 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
124 | #AUTOTHROTTLE_ENABLED = True
125 | # The initial download delay
126 | #AUTOTHROTTLE_START_DELAY = 5
127 | # The maximum download delay to be set in case of high latencies
128 | #AUTOTHROTTLE_MAX_DELAY = 60
129 | # The average number of requests Scrapy should be sending in parallel to
130 | # each remote server
131 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
132 | # Enable showing throttling stats for every response received:
133 | #AUTOTHROTTLE_DEBUG = False
134 |
135 | # Enable and configure HTTP caching (disabled by default)
136 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
137 | #HTTPCACHE_ENABLED = True
138 | #HTTPCACHE_EXPIRATION_SECS = 0
139 | #HTTPCACHE_DIR = 'httpcache'
140 | #HTTPCACHE_IGNORE_HTTP_CODES = []
141 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
142 |
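
With SCHEDULER_PERSIST = True, the request queue, the dupe-filter fingerprints and the item list produced by RedisPipeline all live in Redis under the spider's name. A minimal sketch for checking on a running crawl with redis-py; the key names assume scrapy-redis's default '<spider>:requests' / '<spider>:dupefilter' / '<spider>:items' patterns, so adjust them if your scrapy-redis version differs:

    import redis

    r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    print(r.zcard('dytt_slaver:requests'))    # pending requests (sorted set used by SpiderPriorityQueue)
    print(r.scard('dytt_slaver:dupefilter'))  # fingerprints recorded by RFPDupeFilter
    print(r.llen('dytt_slaver:items'))        # items queued by scrapy_redis.pipelines.RedisPipeline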
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/__pycache__/dytt_slaver.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/__pycache__/dytt_slaver.cpython-36.pyc
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/dytt_slaver.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | #from scrapy.spiders import CrawlSpider, Rule
5 |
6 | # 1. Import RedisCrawlSpider instead of CrawlSpider
7 | from scrapy_redis.spiders import RedisCrawlSpider
8 | from scrapy.spiders import Rule
9 |
10 | from dytt_redis_slaver.items import DyttRedisSlaverItem
11 | import re
12 | from selenium import webdriver
13 | #import time
14 |
15 | # class DyttSlaverSpider(CrawlSpider):
16 | # 2. Change the parent class to RedisCrawlSpider
17 | class DyttSlaverSpider(RedisCrawlSpider):
18 | name = 'dytt_slaver'
19 |
20 |     # 3. Drop allowed_domains and start_urls
21 | # allowed_domains = ['dy2018.com']
22 | # start_urls = ['https://www.dy2018.com/2/index.html']
23 |
24 |     # 4. Add the redis_key the spider pops its start URLs from
25 | redis_key = 'dytt:start_urls'
26 |
27 | #page_links = LinkExtractor(allow=r'/index_\d*.html')
28 | movie_links = LinkExtractor(allow=r'/i/\d*.html', restrict_xpaths=('//div[@class="co_content8"]'))
29 |
30 | rules = (
31 |         # pagination rule
32 | #Rule(page_links),
33 | Rule(movie_links, callback='parse_item'),
34 | )
35 |
36 |     # # 5. Add an __init__() method to obtain allowed_domains dynamically
37 | # def __init__(self, *args, **kwargs):
38 | # domain = kwargs.pop('domain', '')
39 | # self.allowed_domains = filter(None, domain.split(','))
40 | # super(DyttSlaverSpider, self).__init__(*args, **kwargs)
41 |
42 | def parse_item(self, response):
43 | items = DyttRedisSlaverItem()
44 |
45 | str_resp = response.body.decode('gb2312', errors='ignore')
46 | rep_chars = [' ', '·', '“', '”', '…']
47 | for rep in rep_chars:
48 | str_resp = str_resp.replace(rep, '')
49 |         # the ◎ fields are followed by an HTML tag in the page source, so anchor the lazy groups on '<' (a trailing lazy group would only ever match an empty string)
50 |         title = re.search(r'◎片 名(.*?)<', str_resp).group(1).replace(u'\u3000', '')
51 |         try:
52 |             translation = re.search(r'◎译 名(.*?)<', str_resp).group(1).replace(u'\u3000', '')
53 |         except:
54 |             translation = ''
55 |         # name (original title | translated title)
56 |         items['name'] = title + "|" + translation
57 |         # year
58 |         items['year'] = re.search(r'◎年 代(.*?)<', str_resp).group(1).replace(u'\u3000', '')
59 |         # rating
60 |         try:
61 |             items['score'] = response.xpath("//strong[@class='rank']/text()").extract()[0].replace(u'\u3000', '')
62 |         except:
63 |             items['score'] = '无评分'
64 |         # language
65 |         items['language'] = re.search(r'◎语 言(.*?)<', str_resp).group(1).replace(u'\u3000', '')
66 |         # genre
67 |         items['movie_type'] = re.search(r'◎类 别(.*?)<', str_resp).group(1).replace(u'\u3000', '')
68 |         # release date
69 |         items['release_date'] = re.search(r'◎上映日期(.*?)<', str_resp).group(1).replace(u'\u3000', '')
70 |         # file size
71 |         items['file_size'] = re.search(r'◎文件大小(.*?)<', str_resp).group(1).replace(u'\u3000', '')
72 |         # running time
73 |         items['film_time'] = re.search(r'◎片 长(.*?)<', str_resp).group(1).replace(u'\u3000', '')
74 |         # synopsis
75 |         items['introduction'] = re.search(r'◎简 介\r\n<.+>(.*?)<', str_resp).group(1).replace(u'\u3000', '')
76 |         # poster
77 |         items['posters'] = response.xpath("//div[@id='Zoom']/*[1]/img/@src").extract()[0]
78 |         # download link
79 |         items['download_link'] = self.get_download_link(response.url)
80 |
81 | # print(items)
82 | yield items
83 |
84 | def get_download_link(self, url):
85 | chrome_options = webdriver.ChromeOptions()
86 | chrome_options.add_argument('--headless')
87 | chrome_options.add_argument('--disable-gpu')
88 | driver = webdriver.Chrome(chrome_options=chrome_options)
89 | driver.get(url)
90 | #time.sleep(1)
91 | link = re.search(r'\"(thunder:.*?)\"', driver.page_source).group(1)
92 |         driver.quit()  # quit() rather than close() so the chromedriver process also exits
93 | return link
94 |
95 |
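
Because allowed_domains and start_urls are gone, the spider simply idles until a URL appears under its redis_key. A minimal sketch for seeding the queue from the master (or by hand) with redis-py, reusing the category URL from the commented-out start_urls above:

    import redis

    r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    # the slaver pops 'dytt:start_urls' and begins crawling from the pushed URL
    r.lpush('dytt:start_urls', 'https://www.dy2018.com/2/index.html')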
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/movie.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/movie.json
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = dytt_redis_slaver.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dytt_redis_slaver
12 |
--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute("scrapy crawl dytt_slaver".split())
4 |
--------------------------------------------------------------------------------
/ScrapyTest08/redis-mysql.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import redis
5 | import pymysql
6 |
7 | def main():
8 |     # Redis connection info
9 | rediscli = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
10 |     # MySQL connection (charset set explicitly so Chinese text is stored correctly)
11 |     mysqlcli = pymysql.connect(host='127.0.0.1', user='root', passwd='zhiqi', db='Scrapy', port=3306, charset='utf8mb4', use_unicode=True)
12 |
13 | while True:
14 |         # blpop for FIFO, brpop for LIFO; blocks until a key/value pair is available
15 | source, data = rediscli.blpop(["dytt_slaver:items"])
16 | item = json.loads(data)
17 |
18 | try:
19 |             # get a cursor with cursor()
20 | cur = mysqlcli.cursor()
21 |             # run the SQL INSERT with execute()
22 | cur.execute("INSERT INTO dytt (name, year, language, "
23 | "movie_type, release_date, score, file_size, "
24 | "film_time, introduction, posters, download_link) VALUES "
25 | "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )",
26 | [item['name'], item['year'], item['language'],
27 | item['movie_type'], item['release_date'], item['score'],
28 | item['file_size'], item['film_time'], item['introduction'],
29 | item['posters'], item['download_link']])
30 |             # commit the SQL transaction
31 | mysqlcli.commit()
32 |             # close the cursor
33 |             cur.close()
34 |             print("inserted %s" % item['name'])
35 | except pymysql.Error as e:
36 |             print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
37 |
38 |
39 | if __name__ == '__main__':
40 | main()
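
The INSERT above presumes a dytt table with those eleven columns already exists in the Scrapy database; the repository does not include the schema. The sketch below is one possible layout, with column types that are assumptions rather than anything taken from the project:

    import pymysql

    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='zhiqi',
                           db='Scrapy', port=3306, charset='utf8mb4')
    with conn.cursor() as cur:
        # column names mirror the INSERT in redis-mysql.py; the types are guesses
        cur.execute("""
            CREATE TABLE IF NOT EXISTS dytt (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255), year VARCHAR(32), language VARCHAR(64),
                movie_type VARCHAR(128), release_date VARCHAR(64), score VARCHAR(32),
                file_size VARCHAR(64), film_time VARCHAR(64),
                introduction TEXT, posters VARCHAR(512), download_link TEXT
            ) DEFAULT CHARSET = utf8mb4
        """)
    conn.commit()
    conn.close()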
--------------------------------------------------------------------------------