├── .gitattributes
├── README.md
├── ScrapyTest01
    ├── .idea
    │   ├── ScrapyTest01.iml
    │   ├── misc.xml
    │   ├── modules.xml
    │   └── workspace.xml
    ├── ScrapyTest01
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── items.cpython-36.pyc
    │   │   ├── pipelines.cpython-36.pyc
    │   │   └── settings.cpython-36.pyc
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-36.pyc
    │   │       └── qidian.cpython-36.pyc
    │   │   └── qidian.py
    ├── book.json
    ├── scrapy.cfg
    └── startScrapy.py
├── ScrapyTest02
    ├── .idea
    │   ├── ScrapyTest02.iml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── vcs.xml
    │   └── workspace.xml
    ├── ScrapyTest02
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── items.cpython-36.pyc
    │   │   ├── middlewares.cpython-36.pyc
    │   │   ├── pipelines.cpython-36.pyc
    │   │   └── settings.cpython-36.pyc
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-36.pyc
    │   │       └── qidian_font.cpython-36.pyc
    │   │   └── qidian_font.py
    ├── book_info.json
    ├── scrapy.cfg
    └── start.py
├── ScrapyTest03
    ├── .idea
    │   ├── ScrapyTest03.iml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── vcs.xml
    │   └── workspace.xml
    ├── ScrapyTest03
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── items.cpython-36.pyc
    │   │   ├── middlewares.cpython-36.pyc
    │   │   ├── pipelines.cpython-36.pyc
    │   │   └── settings.cpython-36.pyc
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-36.pyc
    │   │       └── ajax_zhihu.cpython-36.pyc
    │   │   └── ajax_zhihu.py
    ├── scrapy.cfg
    └── start.py
├── ScrapyTest04
    ├── .idea
    │   ├── ScrapyTest04.iml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── vcs.xml
    │   └── workspace.xml
    ├── ScrapyTest04
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   └── settings.cpython-36.pyc
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-36.pyc
    │   │       └── doubanLogin.cpython-36.pyc
    │   │   └── doubanLogin.py
    ├── people.html
    ├── scrapy.cfg
    └── start.py
├── ScrapyTest05
    ├── .idea
    │   ├── ScrapyTest05.iml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── vcs.xml
    │   └── workspace.xml
    ├── ScrapyTest05
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   └── settings.cpython-36.pyc
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-36.pyc
    │   │       └── zhihuLogin.cpython-36.pyc
    │   │   └── zhihuLogin.py
    ├── scrapy.cfg
    ├── start.py
    └── zhihu.html
├── ScrapyTest06
    ├── .idea
    │   ├── ScrapyTest06.iml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── vcs.xml
    │   └── workspace.xml
    ├── ScrapyTest06
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   └── settings.cpython-36.pyc
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-36.pyc
    │   │       └── zhihuSelenium.cpython-36.pyc
    │   │   └── zhihuSelenium.py
    ├── scrapy.cfg
    ├── start.py
    └── zhihu.html
├── ScrapyTest07
    ├── .idea
    │   ├── ScrapyTest07.iml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── vcs.xml
    │   └── workspace.xml
    ├── ScrapyTest07
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── items.cpython-36.pyc
    │   │   ├── pipelines.cpython-36.pyc
    │   │   └── settings.cpython-36.pyc
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-36.pyc
    │   │       └── qzone.cpython-36.pyc
    │   │   └── qzone.py
    ├── scrapy.cfg
    ├── start.py
    └── statistical.py
└── ScrapyTest08
    ├── .idea
        ├── ScrapyTest08.iml
        ├── vcs.xml
        └── workspace.xml
    ├── DaiLi
        ├── .idea
        │   ├── DaiLi.iml
        │   ├── misc.xml
        │   ├── modules.xml
        │   └── workspace.xml
        ├── DaiLi
        │   ├── __init__.py
        │   ├── __pycache__
        │   │   ├── __init__.cpython-36.pyc
        │   │   ├── items.cpython-36.pyc
        │   │   ├── pipelines.cpython-36.pyc
        │   │   └── settings.cpython-36.pyc
        │   ├── items.py
        │   ├── middlewares.py
        │   ├── pipelines.py
        │   ├── settings.py
        │   └── spiders
        │   │   ├── __init__.py
        │   │   ├── __pycache__
        │   │       ├── __init__.cpython-36.pyc
        │   │       └── xici.cpython-36.pyc
        │   │   ├── proxy.txt
        │   │   └── xici.py
        ├── proxy.txt
        ├── scrapy.cfg
        └── start.py
    ├── dytt_redis_master
        ├── .idea
        │   ├── dytt_redis_master.iml
        │   ├── misc.xml
        │   ├── modules.xml
        │   ├── vcs.xml
        │   └── workspace.xml
        ├── dytt_redis_master
        │   ├── __init__.py
        │   ├── __pycache__
        │   │   ├── __init__.cpython-36.pyc
        │   │   ├── items.cpython-36.pyc
        │   │   ├── pipelines.cpython-36.pyc
        │   │   └── settings.cpython-36.pyc
        │   ├── items.py
        │   ├── middlewares.py
        │   ├── pipelines.py
        │   ├── settings.py
        │   └── spiders
        │   │   ├── __init__.py
        │   │   ├── __pycache__
        │   │       ├── __init__.cpython-36.pyc
        │   │       └── dytt_master.cpython-36.pyc
        │   │   └── dytt_master.py
        ├── scrapy.cfg
        └── start.py
    ├── dytt_redis_slaver
        ├── .idea
        │   ├── dytt_redis_slaver.iml
        │   ├── misc.xml
        │   ├── modules.xml
        │   ├── vcs.xml
        │   └── workspace.xml
        ├── dytt_redis_slaver
        │   ├── __init__.py
        │   ├── __pycache__
        │   │   ├── __init__.cpython-36.pyc
        │   │   ├── items.cpython-36.pyc
        │   │   ├── middlewares.cpython-36.pyc
        │   │   ├── pipelines.cpython-36.pyc
        │   │   └── settings.cpython-36.pyc
        │   ├── items.py
        │   ├── middlewares.py
        │   ├── pipelines.py
        │   ├── settings.py
        │   └── spiders
        │   │   ├── __init__.py
        │   │   ├── __pycache__
        │   │       ├── __init__.cpython-36.pyc
        │   │       └── dytt_slaver.cpython-36.pyc
        │   │   └── dytt_slaver.py
        ├── movie.json
        ├── scrapy.cfg
        └── start.py
    └── redis-mysql.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=python
2 | *.css linguist-language=python
3 | *.html linguist-language=python
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy_notes
2 | 一个向往成为程序员的码奴的Scrapy学习笔记
3 | 
4 | 
5 | 知乎专栏地址：https://zhuanlan.zhihu.com/zhiqi-scrapy
6 | 


--------------------------------------------------------------------------------
/ScrapyTest01/.idea/ScrapyTest01.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="projectConfiguration" value="Twisted Trial" />
10 |     <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/ScrapyTest01/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/ScrapyTest01/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/ScrapyTest01.iml" filepath="$PROJECT_DIR$/.idea/ScrapyTest01.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/__init__.py


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/__pycache__/items.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/__pycache__/pipelines.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class Scrapytest01Item(scrapy.Item):
12 |     # 书名
13 |     title = scrapy.Field()
14 |     # 作者
15 |     author = scrapy.Field()
16 |     # 简介
17 |     abstract = scrapy.Field()
18 | 


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | from scrapy import signals
  9 | 
 10 | 
 11 | class Scrapytest01SpiderMiddleware(object):
 12 |     # Not all methods need to be defined. If a method is not defined,
 13 |     # scrapy acts as if the spider middleware does not modify the
 14 |     # passed objects.
 15 | 
 16 |     @classmethod
 17 |     def from_crawler(cls, crawler):
 18 |         # This method is used by Scrapy to create your spiders.
 19 |         s = cls()
 20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 21 |         return s
 22 | 
 23 |     def process_spider_input(self, response, spider):
 24 |         # Called for each response that goes through the spider
 25 |         # middleware and into the spider.
 26 | 
 27 |         # Should return None or raise an exception.
 28 |         return None
 29 | 
 30 |     def process_spider_output(self, response, result, spider):
 31 |         # Called with the results returned from the Spider, after
 32 |         # it has processed the response.
 33 | 
 34 |         # Must return an iterable of Request, dict or Item objects.
 35 |         for i in result:
 36 |             yield i
 37 | 
 38 |     def process_spider_exception(self, response, exception, spider):
 39 |         # Called when a spider or process_spider_input() method
 40 |         # (from other spider middleware) raises an exception.
 41 | 
 42 |         # Should return either None or an iterable of Response, dict
 43 |         # or Item objects.
 44 |         pass
 45 | 
 46 |     def process_start_requests(self, start_requests, spider):
 47 |         # Called with the start requests of the spider, and works
 48 |         # similarly to the process_spider_output() method, except
 49 |         # that it doesn’t have a response associated.
 50 | 
 51 |         # Must return only requests (not items).
 52 |         for r in start_requests:
 53 |             yield r
 54 | 
 55 |     def spider_opened(self, spider):
 56 |         spider.logger.info('Spider opened: %s' % spider.name)
 57 | 
 58 | 
 59 | class Scrapytest01DownloaderMiddleware(object):
 60 |     # Not all methods need to be defined. If a method is not defined,
 61 |     # scrapy acts as if the downloader middleware does not modify the
 62 |     # passed objects.
 63 | 
 64 |     @classmethod
 65 |     def from_crawler(cls, crawler):
 66 |         # This method is used by Scrapy to create your spiders.
 67 |         s = cls()
 68 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 69 |         return s
 70 | 
 71 |     def process_request(self, request, spider):
 72 |         # Called for each request that goes through the downloader
 73 |         # middleware.
 74 | 
 75 |         # Must either:
 76 |         # - return None: continue processing this request
 77 |         # - or return a Response object
 78 |         # - or return a Request object
 79 |         # - or raise IgnoreRequest: process_exception() methods of
 80 |         #   installed downloader middleware will be called
 81 |         return None
 82 | 
 83 |     def process_response(self, request, response, spider):
 84 |         # Called with the response returned from the downloader.
 85 | 
 86 |         # Must either;
 87 |         # - return a Response object
 88 |         # - return a Request object
 89 |         # - or raise IgnoreRequest
 90 |         return response
 91 | 
 92 |     def process_exception(self, request, exception, spider):
 93 |         # Called when a download handler or a process_request()
 94 |         # (from other downloader middleware) raises an exception.
 95 | 
 96 |         # Must either:
 97 |         # - return None: continue processing this exception
 98 |         # - return a Response object: stops process_exception() chain
 99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 | 
102 |     def spider_opened(self, spider):
103 |         spider.logger.info('Spider opened: %s' % spider.name)
104 | 


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | import json
 9 | 
10 | class Scrapytest01Pipeline(object):
11 | 
12 |     def __init__(self):
13 |         self.file = open('book.json', 'w')
14 | 
15 |     def process_item(self, item, spider):
16 |         content = json.dumps(dict(item), ensure_ascii=False) + "\n"
17 |         self.file.write(content)
18 |         return item
19 | 
20 |     def close_spider(self, spider):
21 |         self.file.close()


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for ScrapyTest01 project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'ScrapyTest01'
13 | 
14 | SPIDER_MODULES = ['ScrapyTest01.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest01.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11'
20 | 
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'ScrapyTest01.middlewares.Scrapytest01SpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'ScrapyTest01.middlewares.Scrapytest01DownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 |     'ScrapyTest01.pipelines.Scrapytest01Pipeline': 300,
69 | }
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/spiders/__pycache__/qidian.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest01/ScrapyTest01/spiders/__pycache__/qidian.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest01/ScrapyTest01/spiders/qidian.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from ScrapyTest01.items import Scrapytest01Item
 4 | 
 5 | 
 6 | class QidianSpider(scrapy.Spider):
 7 |     name = 'qidian'
 8 |     allowed_domains = ['qidian.com/']
 9 |     start_urls = ['https://www.qidian.com/rank/hotsales']
10 | 
11 |     def parse(self, response):
12 |         # with open('book.html', 'w') as f:
13 |         #      f.write(response.body.decode('utf-8'))
14 | 
15 | 
16 |         # 存放书籍的集合
17 |         book_items = []
18 | 
19 |         # item = Scrapytest01Item()
20 |         # title = response.xpath("//div[@class='book-mid-info']/h4/a/text()").extract()
21 |         # author = response.xpath("//div[@class='book-mid-info']/p/a[@class='name']/text()").extract()
22 |         # abstract = response.xpath("//div[@class='book-mid-info']/p[@class='intro']/text()").extract()
23 |         #
24 |         # item['title'] = title
25 |         # item['author'] = author
26 |         # item['abstract'] = abstract
27 |         #
28 |         # book_items.append(item)
29 | 
30 |         for each in response.xpath("//div[@class='book-mid-info']"):
31 |             item = Scrapytest01Item()
32 |             title = each.xpath("h4/a/text()").extract()[0]
33 |             author = each.xpath("p/a[@class='name']/text()").extract()[0]
34 |             abstract = each.xpath("p[@class='intro']/text()").extract()[0].strip()
35 |             item['title'] = title
36 |             item['author'] = author
37 |             item['abstract'] = abstract
38 | 
39 |             book_items.append(item)
40 | 
41 |         return book_items


--------------------------------------------------------------------------------
/ScrapyTest01/book.json:
--------------------------------------------------------------------------------
 1 | {"title": "大王饶命", "author": "会说话的肘子", "abstract": "高中生吕树在一场车祸中改变人生，当灵气复苏时代来袭，他要做这时代的领跑者。物竞天择，胜者为王。……全订验证群号：696087569"}
 2 | {"title": "太初", "author": "高楼大厦", "abstract": "一树生的万朵花，天下道门是一家。法术千般变化，人心却亘古不变"}
 3 | {"title": "牧神记", "author": "宅猪", "abstract": "大墟的祖训说，天黑，别出门。大墟残老村的老弱病残们从江边捡到了一个婴儿，取名秦牧，含辛茹苦将他养大。这一天夜幕降临，黑暗笼罩大墟，秦牧走出了家门……做个春风中荡漾的反派吧！瞎子对他"}
 4 | {"title": "修真聊天群", "author": "圣骑士的传说", "abstract": "某天，宋书航意外加入了一个仙侠中二病资深患者的交流群，里面的群友们都以‘道友’相称，群名片都是各种府主、洞主、真人、天师。连群主走失的宠物犬都称为大妖犬离家出走。整天聊的是炼丹、闯"}
 5 | {"title": "圣墟", "author": "辰东", "abstract": "在破败中崛起，在寂灭中复苏。沧海成尘，雷电枯竭，那一缕幽雾又一次临近大地，世间的枷锁被打开了，一个全新的世界就此揭开神秘的一角……"}
 6 | {"title": "汉乡", "author": "孑与2", "abstract": "我们接受了祖先的遗产，这让中华辉煌了数千年，我们是如此的心安理得，从未想过要回归那个在刀耕火种中苦苦寻找出路的时代。反哺我们苦难的祖先，并从中找到故乡的真正意义，将是本书要讲的故事"}
 7 | {"title": "诡秘之主", "author": "爱潜水的乌贼", "abstract": "蒸汽与机械的浪潮中，谁能触及非凡？历史和黑暗的迷雾里，又是谁在耳语？我从诡秘中醒来，睁眼看见这个世界：枪械，大炮，巨舰，飞空艇，差分机；魔药，占卜，诅咒，倒吊人，封印物……光明依旧"}
 8 | {"title": "重生之最强人生", "author": "俊秀才", "abstract": "一次跑步锻炼，殷俊从迷雾中穿出来时，已经回到了78年的香江。这一年，还没有霸占荧幕的综艺节目，没有熟悉的特效大片，也没有耳熟能详的歌曲。这一年，香江电视剧还没有好戏频出，香江电影也"}
 9 | {"title": "明朝败家子", "author": "上山打老虎额", "abstract": "弘治十一年。这是一个美好的清晨。此时朱厚照初成年。此时王守仁和唐伯虎磨刀霍霍，预备科举。此时小冰河期已经来临，绵长的严寒肆虐着大地。此时在南和伯府里，地主家的傻儿子，南和伯的嫡传继"}
10 | {"title": "带着仓库到大明", "author": "迪巴拉爵士", "abstract": "方醒穿了，带着两个仓库穿了！别人穿越是带着王霸之气，方醒却是只想种田！“我只想在这个时代悠闲的活着！”坐拥大别墅，顺便教几个弟子，努力让他们往上爬，好给自己当靠山！可谁想弟子有些不"}
11 | {"title": "恶魔就在身边", "author": "汉宝", "abstract": "陈曌能召唤恶魔，能够看到死亡。“别西卜，用你暴食者的能力，为这位客户治疗一下厌食症。”“雷蒙，这位老年人想重新获得男性的能力，你懂的。”“老黑，你和我说实话，这人什么时候死，怎么死"}
12 | {"title": "凡人修仙之仙界篇", "author": "忘语", "abstract": "凡人修仙，风云再起时空穿梭，轮回逆转金仙太乙，大罗道祖三千大道，法则至尊《凡人修仙传》仙界篇，一个韩立叱咤仙界的故事，一个凡人小子修仙的不灭传说。特说明下，没有看过前传的书友，并不"}
13 | {"title": "深夜书屋", "author": "纯洁滴小龙", "abstract": "一家只在深夜开门营业的书屋，欢迎您的光临。————————《舵主群》：587980337（进群粉丝值验证）《读书群》：523978007（无需验证）《战斗群》：457654443（"}
14 | {"title": "超神机械师", "author": "齐佩甲", "abstract": "韩萧，《星海》骨灰级代练，被来自东（zuo）方(zhe)的神秘力量扔进穿越大军，携带玩家面板变成NPC，回到《星海》公测之前，毅然选择难度最高的机械系。战舰列队纵横星海，星辰机甲夭"}
15 | {"title": "帝霸", "author": "厌笔萧生", "abstract": "千万年前，李七夜栽下一株翠竹。八百万年前，李七夜养了一条鲤鱼。五百万年前，李七夜收养一个小女孩。今天，李七夜一觉醒来，翠竹修练成神灵，鲤鱼化作金龙，小女孩成为九界女帝。这是一个养成"}
16 | {"title": "诸界末日在线", "author": "烟火成城", "abstract": "当游戏中的第一件法宝出现在现实世界，所有人类都为之陷入疯狂。这是最好的时代，人们进入游戏，将装备、神通、修为统统带回现实世界；这是最坏的时代，游戏中的妖魔侵入现实，实力恐怖到让人绝"}
17 | {"title": "大医凌然", "author": "志鸟村", "abstract": "医学院学生凌然有一个小目标，要成为世界上最伟大的医生，结果不小心实现了。"}
18 | {"title": "神话版三国", "author": "坟土荒草", "abstract": "陈曦看着将一块数百斤巨石撇出去的士卒，无语望苍天，这真的是东汉末年？吕布单枪匹马凿穿万人部队，这怎么看都不科学。赵子龙真心龙魂附体了，一剑断山，这真的是人？典韦单人护着曹操杀出敌营"}
19 | {"title": "斗罗大陆III龙王传说", "author": "唐家三少", "abstract": "伴随着魂导科技的进步，斗罗大陆上的人类征服了海洋，又发现了两片大陆。魂兽也随着人类魂师的猎杀无度走向灭亡，沉睡无数年的魂兽之王在星斗大森林最后的净土苏醒，它要带领仅存的族人，向人类"}
20 | {"title": "道君", "author": "跃千愁", "abstract": "一个地球神级盗墓宗师，闯入修真界的故事……桃花源里，有歌声。山外青山，白骨山。五花马，千金裘，倚天剑。应我多情，啾啾鬼鸣，美人薄嗔。天地无垠，谁家旗鼓，碧落黄泉，万古高楼。为义气争"}
21 | 


--------------------------------------------------------------------------------
/ScrapyTest01/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = ScrapyTest01.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest01
12 | 


--------------------------------------------------------------------------------
/ScrapyTest01/startScrapy.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | 
3 | # cmdline.execute("scrapy crawl qidian -o books.json".split())
4 | 
5 | cmdline.execute("scrapy crawl qidian".split())


--------------------------------------------------------------------------------
/ScrapyTest02/.idea/ScrapyTest02.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="projectConfiguration" value="Twisted Trial" />
10 |     <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/ScrapyTest02/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/ScrapyTest02/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/ScrapyTest02.iml" filepath="$PROJECT_DIR$/.idea/ScrapyTest02.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/ScrapyTest02/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__init__.py


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__pycache__/items.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__pycache__/middlewares.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__pycache__/pipelines.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class Scrapytest02Item(scrapy.Item):
12 |     # 书名
13 |     title = scrapy.Field()
14 |     # 作者
15 |     author = scrapy.Field()
16 |     # 作品信息
17 |     information = scrapy.Field()
18 |     # 作品简介
19 |     Introduction = scrapy.Field()
20 |     # 字数
21 |     word_num = scrapy.Field()
22 |     # 点击量
23 |     clicks_num = scrapy.Field()
24 |     # 推荐数
25 |     recommended_num = scrapy.Field()
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/middlewares.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your spider middleware
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 7 | 
 8 | import random
 9 | import base64
10 | 
11 | from ScrapyTest02.settings import USER_AGENTS
12 | from ScrapyTest02.settings import PROXIES
13 | 
14 | # 随机的User-Agent
15 | class RandomUserAgent(object):
16 |     def process_request(self, request, spider):
17 |         useragent = random.choice(USER_AGENTS)
18 |         #print useragent
19 |         request.headers.setdefault("User-Agent", useragent)
20 | 
21 | # class RandomProxy(object):
22 | #     def process_request(self, request, spider):
23 | #         proxy = random.choice(PROXIES)
24 | #
25 | #         if proxy['user_passwd'] is None:
26 | #             # 没有代理账户验证的代理使用方式
27 | #             request.meta['proxy'] = "http://" + proxy['ip_port']
28 | #
29 | #         else:
30 | #             # 对账户密码进行base64编码转换
31 | #             base64_userpasswd = base64.b64encode(proxy['user_passwd'])
32 | #             # 对应到代理服务器的信令格式里
33 | #             request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
34 | #
35 | #             request.meta['proxy'] = "http://" + proxy['ip_port']


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | import json
10 | import pymysql
11 | 
12 | class JSONPipeline(object):
13 | 
14 |     def __init__(self):
15 |         self.file = open('book_info.json', 'w')
16 | 
17 |     def process_item(self, item, spider):
18 |         content = json.dumps(dict(item), ensure_ascii=False) + "\n"
19 |         self.file.write(content)
20 |         return item
21 | 
22 |     def close_spider(self, spider):
23 |         self.file.close()
24 | 
25 | class DBPipeline(object):
26 | 
27 |     def __init__(self):
28 |         self.connect = pymysql.connect(
29 |             host='localhost',
30 |             db='Scrapy',
31 |             user='root',
32 |             passwd='zhiqi'
33 |         )
34 |         # 数据库游标，用于操作数据库
35 |         self.cursor = self.connect.cursor()
36 | 
37 |     def process_item(self, item, spider):
38 |         try:
39 |             # 将信息写入数据库
40 |             self.cursor.execute("INSERT INTO qidian(title,author,information,introduction,word_num,clicks_num,recommended_num) VALUES (%s,%s,%s,%s,%s,%s,%s)",(item['title'],item['author'],item['information'],item['Introduction'],item['word_num'],item['clicks_num'],item['recommended_num']))
41 |             # 提交信息
42 |             self.connect.commit()
43 |         except Exception as e:
44 |             # 输出错误信息
45 |             print(e)
46 | 
47 |         return item
48 | 
49 |     def close_spider(self, spider):
50 |         # 关闭游标
51 |         self.cursor.close()
52 |         # 关闭连接
53 |         self.connect.close()


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/settings.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Scrapy settings for ScrapyTest02 project
  4 | #
  5 | # For simplicity, this file contains only settings considered important or
  6 | # commonly used. You can find more settings consulting the documentation:
  7 | #
  8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
  9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 11 | 
 12 | BOT_NAME = 'ScrapyTest02'
 13 | 
 14 | SPIDER_MODULES = ['ScrapyTest02.spiders']
 15 | NEWSPIDER_MODULE = 'ScrapyTest02.spiders'
 16 | 
 17 | 
 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
 19 | USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'
 20 | 
 21 | USER_AGENTS = [
 22 |     'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
 23 |     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
 24 |     'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
 25 |     'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
 26 |     'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
 27 |     'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
 28 |     'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
 29 | ]
 30 | 
 31 | PROXIES = [
 32 |         {"ip_port" :"ip:端口", "user_passwd": ""},
 33 |         {"ip_port" :"ip:端口", "user_passwd": ""},
 34 |         {"ip_port" :"ip:端口", "user_passwd": ""},
 35 |         {"ip_port" :"ip:端口", "user_passwd": ""},
 36 |         {"ip_port" :"ip:端口", "user_passwd": ""},
 37 | ]
 38 | 
 39 | MYSQL_HOST = 'localhost'
 40 | MYSQL_DBNAME = 'Scrapy'
 41 | MYSQL_USER = 'root'
 42 | MYSQL_PASSWD = 'zhiqi'
 43 | 
 44 | 
 45 | 
 46 | # Obey robots.txt rules
 47 | # ROBOTSTXT_OBEY = True
 48 | 
 49 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
 50 | #CONCURRENT_REQUESTS = 32
 51 | 
 52 | # Configure a delay for requests for the same website (default: 0)
 53 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 54 | # See also autothrottle settings and docs
 55 | DOWNLOAD_DELAY = 2
 56 | # The download delay setting will honor only one of:
 57 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
 58 | # CONCURRENT_REQUESTS_PER_IP = 16
 59 | 
 60 | # Disable cookies (enabled by default)
 61 | #COOKIES_ENABLED = False
 62 | 
 63 | # Disable Telnet Console (enabled by default)
 64 | #TELNETCONSOLE_ENABLED = False
 65 | 
 66 | # Override the default request headers:
 67 | #DEFAULT_REQUEST_HEADERS = {
 68 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 69 | #   'Accept-Language': 'en',
 70 | #}
 71 | 
 72 | # Enable or disable spider middlewares
 73 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 74 | # SPIDER_MIDDLEWARES = {
 75 | #     'ScrapyTest02.middlewares.Scrapytest02SpiderMiddleware': 543,
 76 | #
 77 | # }
 78 | 
 79 | # Enable or disable downloader middlewares
 80 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 81 | DOWNLOADER_MIDDLEWARES = {
 82 |    'ScrapyTest02.middlewares.RandomUserAgent': 543,
 83 | #    'ScrapyTest02.middlewares.RandomProxy': 533,
 84 | }
 85 | 
 86 | # Enable or disable extensions
 87 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
 88 | #EXTENSIONS = {
 89 | #    'scrapy.extensions.telnet.TelnetConsole': None,
 90 | #}
 91 | 
 92 | # Configure item pipelines
 93 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 94 | ITEM_PIPELINES = {
 95 |     'ScrapyTest02.pipelines.JSONPipeline': 300,
 96 |     'ScrapyTest02.pipelines.DBPipeline': 280,
 97 | }
 98 | 
 99 | # Enable and configure the AutoThrottle extension (disabled by default)
100 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
101 | #AUTOTHROTTLE_ENABLED = True
102 | # The initial download delay
103 | #AUTOTHROTTLE_START_DELAY = 5
104 | # The maximum download delay to be set in case of high latencies
105 | #AUTOTHROTTLE_MAX_DELAY = 60
106 | # The average number of requests Scrapy should be sending in parallel to
107 | # each remote server
108 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
109 | # Enable showing throttling stats for every response received:
110 | #AUTOTHROTTLE_DEBUG = False
111 | 
112 | # Enable and configure HTTP caching (disabled by default)
113 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
114 | #HTTPCACHE_ENABLED = True
115 | #HTTPCACHE_EXPIRATION_SECS = 0
116 | #HTTPCACHE_DIR = 'httpcache'
117 | #HTTPCACHE_IGNORE_HTTP_CODES = []
118 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
119 | 


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/spiders/__pycache__/qidian_font.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest02/ScrapyTest02/spiders/__pycache__/qidian_font.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest02/ScrapyTest02/spiders/qidian_font.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import scrapy
  3 | from ScrapyTest02.items import Scrapytest02Item
  4 | from lxml import etree
  5 | import re
  6 | from fontTools.ttLib import TTFont
  7 | from io import BytesIO
  8 | 
  9 | from scrapy.linkextractors import LinkExtractor
 10 | from scrapy.spiders import CrawlSpider, Rule
 11 | 
 12 | # class QidianFontSpider(scrapy.Spider):
 13 | class QidianFontSpider(CrawlSpider):
 14 |     name = 'qidian_font'
 15 |     allowed_domains = ['qidian.com', 'qidian.gtimg.com']
 16 |     # start_urls = ['https://book.qidian.com/info/1010191960']
 17 | 
 18 |     start_urls = []
 19 |     for i in range(1, 26):
 20 |         start_urls.append('https://www.qidian.com/rank/hotsales?page=' + str(i))
 21 | 
 22 |     rules = (
 23 |         Rule(LinkExtractor(allow=r'info/\d+'), callback='parse_item'),
 24 |     )
 25 | 
 26 |     def __init__(self,*args, **kwargs):
 27 |         self.WORD_TO_NUM = {"zero": "0","one": "1","two": "2","three":"3","four":"4","five":"5","six":"6","seven":"7","eight":"8","nine":"9","period":"."}
 28 | 
 29 |         super(QidianFontSpider, self).__init__(*args, **kwargs)  # crawlSpider一定要加上这句
 30 |         self.font_dic = {}
 31 | 
 32 |     def parse_item(self, response):
 33 | 
 34 |         #item = Scrapytest02Item()
 35 |         title = response.xpath('//div[@class="book-info "]/h1/em/text()').extract()[0]
 36 |         author = response.xpath('//div[@class="book-info "]//a[@class="writer"]/text()').extract()[0]
 37 |         information = response.xpath('//div[@class="book-info "]/p[@class="intro"]/text()').extract()[0]
 38 |         Introduction = response.xpath('//div[@class="book-intro"]//p/text()').extract()[0].strip()
 39 |         # word_num = response.xpath('//div[@class="book-info "]/p[3]/em[1]/span/text()').extract()[0]
 40 |         # clicks_num = response.xpath('//div[@class="book-info "]/p[3]/em[2]/span/text()').extract()[0]
 41 |         # recommended_num = response.xpath('//div[@class="book-info "]/p[3]/em[3]/span/text()').extract()[0]
 42 |         #
 43 |         # item['title'] = title
 44 |         # item['author'] = author
 45 |         # item['information'] = information
 46 |         # item['Introduction'] = Introduction
 47 |         # item['word_num'] = word_num
 48 |         # item['clicks_num'] = clicks_num
 49 |         # item['recommended_num'] = recommended_num
 50 |         #
 51 |         # yield item
 52 | 
 53 |         # 得到字体的名字
 54 |         font_style = response.xpath('//div[@class="book-info "]//style/text()').extract()[0]
 55 |         font_name = font_style.split(';')[0].split(':')[1].strip()
 56 | 
 57 |         html = etree.HTML(response.text)
 58 |         # 获取文章字数字体的编码
 59 |         word_num_coding = self.get_coding(html, 'p[3]/em[1]/span')
 60 |         # 获取文章点击量字体的编码
 61 |         clicks_num_coding = self.get_coding(html, 'p[3]/em[2]/span')
 62 |         # 获取文章总推荐字体的编码
 63 |         recommended_num_coding = self.get_coding(html, 'p[3]/em[3]/span')
 64 | 
 65 |         # 临时的字典，用于回调传参
 66 |         temp = {}
 67 |         temp['word_num_coding'] = word_num_coding
 68 |         temp['clicks_num_coding'] = clicks_num_coding
 69 |         temp['recommended_num_coding'] = recommended_num_coding
 70 |         temp['title'] = title
 71 |         temp['author'] = author
 72 |         temp['information'] = information
 73 |         temp['Introduction'] = Introduction
 74 | 
 75 |         font_link = 'https://qidian.gtimg.com/qd_anti_spider/' + font_name + '.woff'
 76 |         if font_link not in self.font_dic.keys():
 77 |             yield scrapy.Request(font_link, callback=self.parse_detial, meta=temp, dont_filter=True)
 78 |         else:
 79 |             yield self.processing_data(self.font_dic.get(font_link), temp)
 80 | 
 81 |     def get_coding(self,html, word_num_title_xpath):
 82 |         # 文章信息的根节点
 83 |         root_node = html.xpath('//div[@class="book-info "]')
 84 |         # 字数标签
 85 |         num_title = root_node[0].find(word_num_title_xpath)
 86 |         # 字数标签解码
 87 |         num_text = etree.tostring(num_title).decode()
 88 |         # 正则匹配
 89 |         groups = re.search(r'>(.*?);<', num_text)
 90 |         # 取出字数的字码
 91 |         num_coding = groups[1]
 92 |         # 返回字数的字码
 93 |         return num_coding
 94 | 
 95 |     def parse_detial(self, response):
 96 |         font = TTFont(BytesIO(response.body))
 97 |         cmap = font.getBestCmap()
 98 |         font.close()
 99 |         self.font_dic[response.url] = cmap
100 | 
101 |         return self.processing_data(cmap, response.meta)
102 | 
103 |     def processing_data(self, cmap, meta):
104 |         word_num = self.decode_num(cmap, meta, 'word_num_coding')
105 |         clicks_num = self.decode_num(cmap, meta, 'clicks_num_coding')
106 |         recommended_num = self.decode_num(cmap, meta, 'recommended_num_coding')
107 |         item = Scrapytest02Item()
108 |         item['title'] = meta.get('title')
109 |         item['author'] = meta.get('author')
110 |         item['information'] = meta.get('information')
111 |         item['Introduction'] = meta.get('Introduction')
112 |         item['word_num'] = word_num + '万字'
113 |         item['clicks_num'] = clicks_num + '万总会员点击'
114 |         item['recommended_num'] = recommended_num + '万总推荐'
115 |         return item
116 | 
117 |     def decode_num(self, cmap, meta,code_name):
118 |         word_num = ''
119 |         num_coding_list = meta.get(code_name).replace('&#', '').split(';')
120 |         for num in num_coding_list:
121 |             ch = cmap.get(int(num))
122 |             word_num += self.WORD_TO_NUM[ch]
123 |         return word_num


--------------------------------------------------------------------------------
/ScrapyTest02/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = ScrapyTest02.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest02
12 | 


--------------------------------------------------------------------------------
/ScrapyTest02/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | 
3 | cmdline.execute("scrapy crawl qidian_font".split())


--------------------------------------------------------------------------------
/ScrapyTest03/.idea/ScrapyTest03.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="projectConfiguration" value="Twisted Trial" />
10 |     <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/ScrapyTest03/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/ScrapyTest03/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/ScrapyTest03.iml" filepath="$PROJECT_DIR$/.idea/ScrapyTest03.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/ScrapyTest03/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__init__.py


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__pycache__/items.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__pycache__/middlewares.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__pycache__/pipelines.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class Scrapytest03Item(scrapy.Item):
12 |     # 文章标题
13 |     title = scrapy.Field()
14 |     # 作者
15 |     name = scrapy.Field()
16 |     # 作者简介
17 |     headline = scrapy.Field()
18 |     # 文章链接
19 |     url = scrapy.Field()
20 | 


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | from scrapy import signals
  9 | 
 10 | 
 11 | class Scrapytest03SpiderMiddleware(object):
 12 |     # Not all methods need to be defined. If a method is not defined,
 13 |     # scrapy acts as if the spider middleware does not modify the
 14 |     # passed objects.
 15 | 
 16 |     @classmethod
 17 |     def from_crawler(cls, crawler):
 18 |         # This method is used by Scrapy to create your spiders.
 19 |         s = cls()
 20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 21 |         return s
 22 | 
 23 |     def process_spider_input(self, response, spider):
 24 |         # Called for each response that goes through the spider
 25 |         # middleware and into the spider.
 26 | 
 27 |         # Should return None or raise an exception.
 28 |         return None
 29 | 
 30 |     def process_spider_output(self, response, result, spider):
 31 |         # Called with the results returned from the Spider, after
 32 |         # it has processed the response.
 33 | 
 34 |         # Must return an iterable of Request, dict or Item objects.
 35 |         for i in result:
 36 |             yield i
 37 | 
 38 |     def process_spider_exception(self, response, exception, spider):
 39 |         # Called when a spider or process_spider_input() method
 40 |         # (from other spider middleware) raises an exception.
 41 | 
 42 |         # Should return either None or an iterable of Response, dict
 43 |         # or Item objects.
 44 |         pass
 45 | 
 46 |     def process_start_requests(self, start_requests, spider):
 47 |         # Called with the start requests of the spider, and works
 48 |         # similarly to the process_spider_output() method, except
 49 |         # that it doesn’t have a response associated.
 50 | 
 51 |         # Must return only requests (not items).
 52 |         for r in start_requests:
 53 |             yield r
 54 | 
 55 |     def spider_opened(self, spider):
 56 |         spider.logger.info('Spider opened: %s' % spider.name)
 57 | 
 58 | 
 59 | class Scrapytest03DownloaderMiddleware(object):
 60 |     # Not all methods need to be defined. If a method is not defined,
 61 |     # scrapy acts as if the downloader middleware does not modify the
 62 |     # passed objects.
 63 | 
 64 |     @classmethod
 65 |     def from_crawler(cls, crawler):
 66 |         # This method is used by Scrapy to create your spiders.
 67 |         s = cls()
 68 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 69 |         return s
 70 | 
 71 |     def process_request(self, request, spider):
 72 |         # Called for each request that goes through the downloader
 73 |         # middleware.
 74 | 
 75 |         # Must either:
 76 |         # - return None: continue processing this request
 77 |         # - or return a Response object
 78 |         # - or return a Request object
 79 |         # - or raise IgnoreRequest: process_exception() methods of
 80 |         #   installed downloader middleware will be called
 81 |         return None
 82 | 
 83 |     def process_response(self, request, response, spider):
 84 |         # Called with the response returned from the downloader.
 85 | 
 86 |         # Must either;
 87 |         # - return a Response object
 88 |         # - return a Request object
 89 |         # - or raise IgnoreRequest
 90 |         return response
 91 | 
 92 |     def process_exception(self, request, exception, spider):
 93 |         # Called when a download handler or a process_request()
 94 |         # (from other downloader middleware) raises an exception.
 95 | 
 96 |         # Must either:
 97 |         # - return None: continue processing this exception
 98 |         # - return a Response object: stops process_exception() chain
 99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 | 
102 |     def spider_opened(self, spider):
103 |         spider.logger.info('Spider opened: %s' % spider.name)
104 | 


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | import pymysql
 8 | 
 9 | class DBPipeline(object):
10 | 
11 |     def __init__(self):
12 |         self.connect = pymysql.connect(
13 |             host='localhost',
14 |             db='Scrapy',
15 |             user='root',
16 |             passwd='zhiqi'
17 |         )
18 |         # 数据库游标，用于操作数据库
19 |         self.cursor = self.connect.cursor()
20 | 
21 |     def process_item(self, item, spider):
22 |         try:
23 |             # 将信息写入数据库
24 |             self.cursor.execute("INSERT INTO ZhPyZnCom(title,author,headline,url) VALUES (%s,%s,%s,%s)",(item['title'],item['name'],item['headline'],item['url']))
25 |             # 提交信息
26 |             self.connect.commit()
27 |         except Exception as e:
28 |             # 输出错误信息
29 |             print(e)
30 | 
31 |         return item
32 | 
33 |     def close_spider(self, spider):
34 |         # 关闭游标
35 |         self.cursor.close()
36 |         # 关闭连接
37 |         self.connect.close()
38 | 


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for ScrapyTest03 project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'ScrapyTest03'
13 | 
14 | SPIDER_MODULES = ['ScrapyTest03.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest03.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 | 
21 | # Obey robots.txt rules
22 | #ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 2
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'ScrapyTest03.middlewares.Scrapytest03SpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'ScrapyTest03.middlewares.Scrapytest03DownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 |     'ScrapyTest03.pipelines.DBPipeline': 300,
69 | }
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/spiders/__pycache__/ajax_zhihu.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest03/ScrapyTest03/spiders/__pycache__/ajax_zhihu.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest03/ScrapyTest03/spiders/ajax_zhihu.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | import json
 4 | from ScrapyTest03.items import Scrapytest03Item
 5 | 
 6 | class AjaxZhihuSpider(scrapy.Spider):
 7 |     name = 'ajax_zhihu'
 8 |     allowed_domains = ['zhihu.com']
 9 |     start_urls = ['https://www.zhihu.com/api/v4/columns/zimei/articles?limit=20&offset=0']
10 | 
11 |     def parse(self, response):
12 |         # print(response.body)
13 |         jsonBody = json.loads(response.body.decode('gbk').encode('utf-8'))
14 |         articles = jsonBody['data']
15 |         for art in articles:
16 |             item = Scrapytest03Item()
17 |             item['title'] = art['title']
18 |             item['name'] = art['author']['name']
19 |             item['headline'] = art['author']['headline']
20 |             item['url'] = art['url']
21 |             yield item
22 | 
23 |         if articles:
24 |             yield scrapy.Request(jsonBody['paging']['next'], callback=self.parse)
25 |         else:
26 |             print("获取完毕！")
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------
/ScrapyTest03/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = ScrapyTest03.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest03
12 | 


--------------------------------------------------------------------------------
/ScrapyTest03/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | 
3 | cmdline.execute("scrapy crawl ajax_zhihu".split())


--------------------------------------------------------------------------------
/ScrapyTest04/.idea/ScrapyTest04.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="projectConfiguration" value="Twisted Trial" />
10 |     <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/ScrapyTest04/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/ScrapyTest04/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/ScrapyTest04.iml" filepath="$PROJECT_DIR$/.idea/ScrapyTest04.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/ScrapyTest04/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest04/ScrapyTest04/__init__.py


--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest04/ScrapyTest04/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest04/ScrapyTest04/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class Scrapytest04Item(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | from scrapy import signals
  9 | 
 10 | 
 11 | class Scrapytest04SpiderMiddleware(object):
 12 |     # Not all methods need to be defined. If a method is not defined,
 13 |     # scrapy acts as if the spider middleware does not modify the
 14 |     # passed objects.
 15 | 
 16 |     @classmethod
 17 |     def from_crawler(cls, crawler):
 18 |         # This method is used by Scrapy to create your spiders.
 19 |         s = cls()
 20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 21 |         return s
 22 | 
 23 |     def process_spider_input(self, response, spider):
 24 |         # Called for each response that goes through the spider
 25 |         # middleware and into the spider.
 26 | 
 27 |         # Should return None or raise an exception.
 28 |         return None
 29 | 
 30 |     def process_spider_output(self, response, result, spider):
 31 |         # Called with the results returned from the Spider, after
 32 |         # it has processed the response.
 33 | 
 34 |         # Must return an iterable of Request, dict or Item objects.
 35 |         for i in result:
 36 |             yield i
 37 | 
 38 |     def process_spider_exception(self, response, exception, spider):
 39 |         # Called when a spider or process_spider_input() method
 40 |         # (from other spider middleware) raises an exception.
 41 | 
 42 |         # Should return either None or an iterable of Response, dict
 43 |         # or Item objects.
 44 |         pass
 45 | 
 46 |     def process_start_requests(self, start_requests, spider):
 47 |         # Called with the start requests of the spider, and works
 48 |         # similarly to the process_spider_output() method, except
 49 |         # that it doesn’t have a response associated.
 50 | 
 51 |         # Must return only requests (not items).
 52 |         for r in start_requests:
 53 |             yield r
 54 | 
 55 |     def spider_opened(self, spider):
 56 |         spider.logger.info('Spider opened: %s' % spider.name)
 57 | 
 58 | 
 59 | class Scrapytest04DownloaderMiddleware(object):
 60 |     # Not all methods need to be defined. If a method is not defined,
 61 |     # scrapy acts as if the downloader middleware does not modify the
 62 |     # passed objects.
 63 | 
 64 |     @classmethod
 65 |     def from_crawler(cls, crawler):
 66 |         # This method is used by Scrapy to create your spiders.
 67 |         s = cls()
 68 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 69 |         return s
 70 | 
 71 |     def process_request(self, request, spider):
 72 |         # Called for each request that goes through the downloader
 73 |         # middleware.
 74 | 
 75 |         # Must either:
 76 |         # - return None: continue processing this request
 77 |         # - or return a Response object
 78 |         # - or return a Request object
 79 |         # - or raise IgnoreRequest: process_exception() methods of
 80 |         #   installed downloader middleware will be called
 81 |         return None
 82 | 
 83 |     def process_response(self, request, response, spider):
 84 |         # Called with the response returned from the downloader.
 85 | 
 86 |         # Must either;
 87 |         # - return a Response object
 88 |         # - return a Request object
 89 |         # - or raise IgnoreRequest
 90 |         return response
 91 | 
 92 |     def process_exception(self, request, exception, spider):
 93 |         # Called when a download handler or a process_request()
 94 |         # (from other downloader middleware) raises an exception.
 95 | 
 96 |         # Must either:
 97 |         # - return None: continue processing this exception
 98 |         # - return a Response object: stops process_exception() chain
 99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 | 
102 |     def spider_opened(self, spider):
103 |         spider.logger.info('Spider opened: %s' % spider.name)
104 | 


--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class Scrapytest04Pipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for ScrapyTest04 project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'ScrapyTest04'
13 | 
14 | SPIDER_MODULES = ['ScrapyTest04.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest04.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 | 
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'ScrapyTest04.middlewares.Scrapytest04SpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'ScrapyTest04.middlewares.Scrapytest04DownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | #    'ScrapyTest04.pipelines.Scrapytest04Pipeline': 300,
69 | #}
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest04/ScrapyTest04/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/spiders/__pycache__/doubanLogin.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest04/ScrapyTest04/spiders/__pycache__/doubanLogin.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest04/ScrapyTest04/spiders/doubanLogin.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | import re
 4 | from io import BytesIO
 5 | from PIL import Image
 6 | 
 7 | class DoubanloginSpider(scrapy.Spider):
 8 |     name = 'doubanLogin'
 9 |     allowed_domains = ['douban.com']
10 |     start_urls = ['https://accounts.douban.com/login']
11 | 
12 |     def parse(self, response):
13 |         temp = {}
14 |         captcha = response.xpath('//img[@id = "captcha_image"]/@src').extract()
15 |         if captcha:
16 |             # 有验证码
17 |             temp['captcha'] = captcha[0]
18 |             temp['response'] = response
19 |             print(captcha)
20 |             yield scrapy.Request(captcha, callback=self.get_captcha, meta=temp)
21 |         else:
22 |             # 没有验证码
23 |             yield scrapy.FormRequest.from_response(
24 |                 response,
25 |                 formdata={
26 |                     "source": "index_nav",
27 |                     # "redir": "https://www.douban.com/people/182875833/",
28 |                     "form_email": "1XXXXXXXXXXX",
29 |                     "form_password": "zXXXXXXXXXX!",
30 |                     "user_login": "登录"
31 |                 },
32 |                 callback=self.parse_page
33 |             )
34 | 
35 | 
36 |     def get_captcha(self, response):
37 |         captcha_img = Image.open(BytesIO(response.body))
38 |         captcha_img.show()
39 | 
40 |         captcha_id = re.search(r'id=\w+:en', response.meta['captcha']).group(0).split('=')[1]
41 |         captcha_solution = input("请输入验证码：")
42 |         # 发送请求参数，并调用指定回调函数处理
43 |         yield scrapy.FormRequest.from_response(
44 |             response.meta['response'],
45 |             formdata={
46 |                 "source": "index_nav",
47 |                 # "redir": "https://www.douban.com/people/182875833/",
48 |                 "form_email": "1XXXXXXXXXX",
49 |                 "form_password": "zxXXXXXXXX!",
50 |                 "captcha-solution": captcha_solution,
51 |                 "captcha-id": captcha_id,
52 |                 "user_login": "登录"
53 |             },
54 |             callback=self.parse_page
55 |         )
56 | 
57 |     def parse_page(self, response):
58 |         url = "https://www.douban.com/people/182875833/"
59 |         yield scrapy.Request(url, callback=self.parse_newpage)
60 | 
61 |     def parse_newpage(self, response):
62 |         with open("people.html", 'wb') as f:
63 |             f.write(response.body)
64 | 


--------------------------------------------------------------------------------
/ScrapyTest04/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = ScrapyTest04.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest04
12 | 


--------------------------------------------------------------------------------
/ScrapyTest04/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | 
3 | cmdline.execute("scrapy crawl doubanLogin".split())


--------------------------------------------------------------------------------
/ScrapyTest05/.idea/ScrapyTest05.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="projectConfiguration" value="Twisted Trial" />
10 |     <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/ScrapyTest05/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/ScrapyTest05/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/ScrapyTest05.iml" filepath="$PROJECT_DIR$/.idea/ScrapyTest05.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/ScrapyTest05/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/ScrapyTest05/.idea/workspace.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project version="4">
  3 |   <component name="ChangeListManager">
  4 |     <list default="true" id="ab6688eb-cfaa-40c5-b16d-c1b271150902" name="Default Changelist" comment="">
  5 |       <change afterPath="$PROJECT_DIR$/zhihu.html" afterDir="false" />
  6 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest02/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest02/.idea/workspace.xml" afterDir="false" />
  7 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest02/ScrapyTest02/items.py" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest02/ScrapyTest02/items.py" afterDir="false" />
  8 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest02/ScrapyTest02/middlewares.py" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest02/ScrapyTest02/middlewares.py" afterDir="false" />
  9 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest02/ScrapyTest02/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest02/ScrapyTest02/pipelines.py" afterDir="false" />
 10 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest02/ScrapyTest02/settings.py" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest02/ScrapyTest02/settings.py" afterDir="false" />
 11 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest02/ScrapyTest02/spiders/qidian_font.py" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest02/ScrapyTest02/spiders/qidian_font.py" afterDir="false" />
 12 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest02/book_info.json" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest02/book_info.json" afterDir="false" />
 13 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest03/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest03/.idea/workspace.xml" afterDir="false" />
 14 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest04/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest04/.idea/workspace.xml" afterDir="false" />
 15 |     </list>
 16 |     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
 17 |     <option name="SHOW_DIALOG" value="false" />
 18 |     <option name="HIGHLIGHT_CONFLICTS" value="true" />
 19 |     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
 20 |     <option name="LAST_RESOLUTION" value="IGNORE" />
 21 |   </component>
 22 |   <component name="CoverageDataManager">
 23 |     <SUITE FILE_PATH="coverage/ScrapyTest05$start.coverage" NAME="start Coverage Results" MODIFIED="1534214727965" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
 24 |   </component>
 25 |   <component name="FUSProjectUsageTrigger">
 26 |     <session id="-592636394">
 27 |       <usages-collector id="statistics.lifecycle.project">
 28 |         <counts>
 29 |           <entry key="project.closed" value="1" />
 30 |           <entry key="project.open.time.1" value="1" />
 31 |           <entry key="project.opened" value="1" />
 32 |         </counts>
 33 |       </usages-collector>
 34 |       <usages-collector id="statistics.file.extensions.open">
 35 |         <counts>
 36 |           <entry key="html" value="2" />
 37 |           <entry key="py" value="4" />
 38 |         </counts>
 39 |       </usages-collector>
 40 |       <usages-collector id="statistics.file.types.open">
 41 |         <counts>
 42 |           <entry key="HTML" value="2" />
 43 |           <entry key="Python" value="4" />
 44 |         </counts>
 45 |       </usages-collector>
 46 |       <usages-collector id="statistics.file.extensions.edit">
 47 |         <counts>
 48 |           <entry key="py" value="47" />
 49 |         </counts>
 50 |       </usages-collector>
 51 |       <usages-collector id="statistics.file.types.edit">
 52 |         <counts>
 53 |           <entry key="Python" value="47" />
 54 |         </counts>
 55 |       </usages-collector>
 56 |     </session>
 57 |   </component>
 58 |   <component name="FileEditorManager">
 59 |     <leaf>
 60 |       <file pinned="false" current-in-tab="false">
 61 |         <entry file="file://$PROJECT_DIR$/ScrapyTest05/settings.py">
 62 |           <provider selected="true" editor-type-id="text-editor">
 63 |             <state relative-caret-position="208">
 64 |               <caret line="21" column="2" selection-start-line="21" selection-start-column="2" selection-end-line="21" selection-end-column="2" />
 65 |             </state>
 66 |           </provider>
 67 |         </entry>
 68 |       </file>
 69 |       <file pinned="false" current-in-tab="false">
 70 |         <entry file="file://$PROJECT_DIR$/ScrapyTest05/middlewares.py">
 71 |           <provider selected="true" editor-type-id="text-editor">
 72 |             <state relative-caret-position="-144" />
 73 |           </provider>
 74 |         </entry>
 75 |       </file>
 76 |       <file pinned="false" current-in-tab="true">
 77 |         <entry file="file://$PROJECT_DIR$/ScrapyTest05/spiders/zhihuLogin.py">
 78 |           <provider selected="true" editor-type-id="text-editor">
 79 |             <state relative-caret-position="298">
 80 |               <caret line="26" column="48" selection-start-line="26" selection-start-column="48" selection-end-line="26" selection-end-column="48" />
 81 |             </state>
 82 |           </provider>
 83 |         </entry>
 84 |       </file>
 85 |       <file pinned="false" current-in-tab="false">
 86 |         <entry file="file://$PROJECT_DIR$/zhihu.html">
 87 |           <provider selected="true" editor-type-id="text-editor">
 88 |             <state relative-caret-position="240">
 89 |               <caret line="10" column="10" selection-start-line="10" selection-start-column="10" selection-end-line="10" selection-end-column="10" />
 90 |             </state>
 91 |           </provider>
 92 |         </entry>
 93 |       </file>
 94 |       <file pinned="false" current-in-tab="false">
 95 |         <entry file="file://$PROJECT_DIR$/start.py">
 96 |           <provider selected="true" editor-type-id="text-editor">
 97 |             <state relative-caret-position="48">
 98 |               <caret line="2" column="40" selection-start-line="2" selection-start-column="40" selection-end-line="2" selection-end-column="40" />
 99 |             </state>
100 |           </provider>
101 |         </entry>
102 |       </file>
103 |     </leaf>
104 |   </component>
105 |   <component name="Git.Settings">
106 |     <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
107 |   </component>
108 |   <component name="IdeDocumentHistory">
109 |     <option name="CHANGED_PATHS">
110 |       <list>
111 |         <option value="$PROJECT_DIR$/start.py" />
112 |         <option value="$PROJECT_DIR$/ScrapyTest05/settings.py" />
113 |         <option value="$PROJECT_DIR$/ScrapyTest05/spiders/zhihuLogin.py" />
114 |       </list>
115 |     </option>
116 |   </component>
117 |   <component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
118 |   <component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
119 |   <component name="JsGulpfileManager">
120 |     <detection-done>true</detection-done>
121 |     <sorting>DEFINITION_ORDER</sorting>
122 |   </component>
123 |   <component name="ProjectFrameBounds" extendedState="6">
124 |     <option name="x" value="-2" />
125 |     <option name="y" value="29" />
126 |     <option name="width" value="1039" />
127 |     <option name="height" value="685" />
128 |   </component>
129 |   <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
130 |   <component name="ProjectView">
131 |     <navigator proportions="" version="1">
132 |       <foldersAlwaysOnTop value="true" />
133 |     </navigator>
134 |     <panes>
135 |       <pane id="Scope" />
136 |       <pane id="ProjectPane">
137 |         <subPane>
138 |           <expand>
139 |             <path>
140 |               <item name="ScrapyTest05" type="b2602c69:ProjectViewProjectNode" />
141 |               <item name="ScrapyTest05" type="462c0819:PsiDirectoryNode" />
142 |             </path>
143 |             <path>
144 |               <item name="ScrapyTest05" type="b2602c69:ProjectViewProjectNode" />
145 |               <item name="ScrapyTest05" type="462c0819:PsiDirectoryNode" />
146 |               <item name="ScrapyTest05" type="462c0819:PsiDirectoryNode" />
147 |             </path>
148 |             <path>
149 |               <item name="ScrapyTest05" type="b2602c69:ProjectViewProjectNode" />
150 |               <item name="ScrapyTest05" type="462c0819:PsiDirectoryNode" />
151 |               <item name="ScrapyTest05" type="462c0819:PsiDirectoryNode" />
152 |               <item name="spiders" type="462c0819:PsiDirectoryNode" />
153 |             </path>
154 |           </expand>
155 |           <select />
156 |         </subPane>
157 |       </pane>
158 |     </panes>
159 |   </component>
160 |   <component name="PropertiesComponent">
161 |     <property name="WebServerToolWindowFactoryState" value="false" />
162 |     <property name="last_opened_file_path" value="$PROJECT_DIR$" />
163 |     <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
164 |     <property name="nodejs_npm_path_reset_for_default_project" value="true" />
165 |   </component>
166 |   <component name="RunDashboard">
167 |     <option name="ruleStates">
168 |       <list>
169 |         <RuleState>
170 |           <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
171 |         </RuleState>
172 |         <RuleState>
173 |           <option name="name" value="StatusDashboardGroupingRule" />
174 |         </RuleState>
175 |       </list>
176 |     </option>
177 |   </component>
178 |   <component name="RunManager">
179 |     <configuration name="start" type="PythonConfigurationType" factoryName="Python" temporary="true">
180 |       <module name="ScrapyTest05" />
181 |       <option name="INTERPRETER_OPTIONS" value="" />
182 |       <option name="PARENT_ENVS" value="true" />
183 |       <envs>
184 |         <env name="PYTHONUNBUFFERED" value="1" />
185 |       </envs>
186 |       <option name="SDK_HOME" value="" />
187 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
188 |       <option name="IS_MODULE_SDK" value="true" />
189 |       <option name="ADD_CONTENT_ROOTS" value="true" />
190 |       <option name="ADD_SOURCE_ROOTS" value="true" />
191 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
192 |       <option name="SCRIPT_NAME" value="$PROJECT_DIR$/start.py" />
193 |       <option name="PARAMETERS" value="" />
194 |       <option name="SHOW_COMMAND_LINE" value="false" />
195 |       <option name="EMULATE_TERMINAL" value="false" />
196 |       <option name="MODULE_MODE" value="false" />
197 |       <option name="REDIRECT_INPUT" value="false" />
198 |       <option name="INPUT_FILE" value="" />
199 |       <method v="2" />
200 |     </configuration>
201 |     <recent_temporary>
202 |       <list>
203 |         <item itemvalue="Python.start" />
204 |       </list>
205 |     </recent_temporary>
206 |   </component>
207 |   <component name="SvnConfiguration">
208 |     <configuration />
209 |   </component>
210 |   <component name="TaskManager">
211 |     <task active="true" id="Default" summary="Default task">
212 |       <changelist id="ab6688eb-cfaa-40c5-b16d-c1b271150902" name="Default Changelist" comment="" />
213 |       <created>1534213258864</created>
214 |       <option name="number" value="Default" />
215 |       <option name="presentableId" value="Default" />
216 |       <updated>1534213258864</updated>
217 |     </task>
218 |     <servers />
219 |   </component>
220 |   <component name="ToolWindowManager">
221 |     <frame x="67" y="25" width="1048" height="823" extended-state="6" />
222 |     <layout>
223 |       <window_info id="Favorites" side_tool="true" />
224 |       <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.17097415" />
225 |       <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
226 |       <window_info anchor="bottom" id="Docker" show_stripe_button="false" />
227 |       <window_info anchor="bottom" id="Database Changes" show_stripe_button="false" />
228 |       <window_info anchor="bottom" id="Version Control" />
229 |       <window_info anchor="bottom" id="Python Console" />
230 |       <window_info anchor="bottom" id="Terminal" />
231 |       <window_info anchor="bottom" id="Event Log" side_tool="true" />
232 |       <window_info anchor="bottom" id="Message" order="0" />
233 |       <window_info anchor="bottom" id="Find" order="1" />
234 |       <window_info anchor="bottom" id="Run" order="2" weight="0.21060172" />
235 |       <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
236 |       <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
237 |       <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
238 |       <window_info anchor="bottom" id="TODO" order="6" />
239 |       <window_info anchor="right" id="SciView" />
240 |       <window_info anchor="right" id="Database" />
241 |       <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
242 |       <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
243 |       <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
244 |     </layout>
245 |   </component>
246 |   <component name="TypeScriptGeneratedFilesManager">
247 |     <option name="version" value="1" />
248 |   </component>
249 |   <component name="VcsContentAnnotationSettings">
250 |     <option name="myLimit" value="2678400000" />
251 |   </component>
252 |   <component name="editorHistoryManager">
253 |     <entry file="file://$PROJECT_DIR$/start.py">
254 |       <provider selected="true" editor-type-id="text-editor">
255 |         <state relative-caret-position="48">
256 |           <caret line="2" column="40" selection-start-line="2" selection-start-column="40" selection-end-line="2" selection-end-column="40" />
257 |         </state>
258 |       </provider>
259 |     </entry>
260 |     <entry file="file://$PROJECT_DIR$/ScrapyTest05/settings.py">
261 |       <provider selected="true" editor-type-id="text-editor">
262 |         <state relative-caret-position="208">
263 |           <caret line="21" column="2" selection-start-line="21" selection-start-column="2" selection-end-line="21" selection-end-column="2" />
264 |         </state>
265 |       </provider>
266 |     </entry>
267 |     <entry file="file://$PROJECT_DIR$/zhihu.html">
268 |       <provider selected="true" editor-type-id="text-editor">
269 |         <state relative-caret-position="240">
270 |           <caret line="10" column="10" selection-start-line="10" selection-start-column="10" selection-end-line="10" selection-end-column="10" />
271 |         </state>
272 |       </provider>
273 |     </entry>
274 |     <entry file="file://$PROJECT_DIR$/ScrapyTest05/middlewares.py">
275 |       <provider selected="true" editor-type-id="text-editor">
276 |         <state relative-caret-position="-144" />
277 |       </provider>
278 |     </entry>
279 |     <entry file="file://$PROJECT_DIR$/ScrapyTest05/spiders/zhihuLogin.py">
280 |       <provider selected="true" editor-type-id="text-editor">
281 |         <state relative-caret-position="298">
282 |           <caret line="26" column="48" selection-start-line="26" selection-start-column="48" selection-end-line="26" selection-end-column="48" />
283 |         </state>
284 |       </provider>
285 |     </entry>
286 |   </component>
287 | </project>


--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest05/ScrapyTest05/__init__.py


--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest05/ScrapyTest05/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest05/ScrapyTest05/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class Scrapytest05Item(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | from scrapy import signals
  9 | 
 10 | 
 11 | class Scrapytest05SpiderMiddleware(object):
 12 |     # Not all methods need to be defined. If a method is not defined,
 13 |     # scrapy acts as if the spider middleware does not modify the
 14 |     # passed objects.
 15 | 
 16 |     @classmethod
 17 |     def from_crawler(cls, crawler):
 18 |         # This method is used by Scrapy to create your spiders.
 19 |         s = cls()
 20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 21 |         return s
 22 | 
 23 |     def process_spider_input(self, response, spider):
 24 |         # Called for each response that goes through the spider
 25 |         # middleware and into the spider.
 26 | 
 27 |         # Should return None or raise an exception.
 28 |         return None
 29 | 
 30 |     def process_spider_output(self, response, result, spider):
 31 |         # Called with the results returned from the Spider, after
 32 |         # it has processed the response.
 33 | 
 34 |         # Must return an iterable of Request, dict or Item objects.
 35 |         for i in result:
 36 |             yield i
 37 | 
 38 |     def process_spider_exception(self, response, exception, spider):
 39 |         # Called when a spider or process_spider_input() method
 40 |         # (from other spider middleware) raises an exception.
 41 | 
 42 |         # Should return either None or an iterable of Response, dict
 43 |         # or Item objects.
 44 |         pass
 45 | 
 46 |     def process_start_requests(self, start_requests, spider):
 47 |         # Called with the start requests of the spider, and works
 48 |         # similarly to the process_spider_output() method, except
 49 |         # that it doesn’t have a response associated.
 50 | 
 51 |         # Must return only requests (not items).
 52 |         for r in start_requests:
 53 |             yield r
 54 | 
 55 |     def spider_opened(self, spider):
 56 |         spider.logger.info('Spider opened: %s' % spider.name)
 57 | 
 58 | 
 59 | class Scrapytest05DownloaderMiddleware(object):
 60 |     # Not all methods need to be defined. If a method is not defined,
 61 |     # scrapy acts as if the downloader middleware does not modify the
 62 |     # passed objects.
 63 | 
 64 |     @classmethod
 65 |     def from_crawler(cls, crawler):
 66 |         # This method is used by Scrapy to create your spiders.
 67 |         s = cls()
 68 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 69 |         return s
 70 | 
 71 |     def process_request(self, request, spider):
 72 |         # Called for each request that goes through the downloader
 73 |         # middleware.
 74 | 
 75 |         # Must either:
 76 |         # - return None: continue processing this request
 77 |         # - or return a Response object
 78 |         # - or return a Request object
 79 |         # - or raise IgnoreRequest: process_exception() methods of
 80 |         #   installed downloader middleware will be called
 81 |         return None
 82 | 
 83 |     def process_response(self, request, response, spider):
 84 |         # Called with the response returned from the downloader.
 85 | 
 86 |         # Must either;
 87 |         # - return a Response object
 88 |         # - return a Request object
 89 |         # - or raise IgnoreRequest
 90 |         return response
 91 | 
 92 |     def process_exception(self, request, exception, spider):
 93 |         # Called when a download handler or a process_request()
 94 |         # (from other downloader middleware) raises an exception.
 95 | 
 96 |         # Must either:
 97 |         # - return None: continue processing this exception
 98 |         # - return a Response object: stops process_exception() chain
 99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 | 
102 |     def spider_opened(self, spider):
103 |         spider.logger.info('Spider opened: %s' % spider.name)
104 | 


--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class Scrapytest05Pipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for ScrapyTest05 project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'ScrapyTest05'
13 | 
14 | SPIDER_MODULES = ['ScrapyTest05.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest05.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 | 
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'ScrapyTest05.middlewares.Scrapytest05SpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'ScrapyTest05.middlewares.Scrapytest05DownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | #    'ScrapyTest05.pipelines.Scrapytest05Pipeline': 300,
69 | #}
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest05/ScrapyTest05/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/spiders/__pycache__/zhihuLogin.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest05/ScrapyTest05/spiders/__pycache__/zhihuLogin.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest05/ScrapyTest05/spiders/zhihuLogin.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | 
 4 | 
 5 | class ZhihuloginSpider(scrapy.Spider):
 6 |     name = 'zhihuLogin'
 7 |     allowed_domains = ['zhihu.com']
 8 |     start_urls = ['https://www.zhihu.com/inbox']
 9 | 
10 |     cookies = {
11 |         'UM_distinctid': '162xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
12 |         '__DAYU_PP': 'yxxxxxxxxxxxxxxxxxxxxxd',
13 |         '_zap': 'f50cxxxxxxxxxxxxxxxxxxxxxx048',
14 |         'z_c0': '"2|xxxxxxxxxxxxxxxxxxxxx|92:MixxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxX01ORzZnWVFTMGxOdF9B|5f5bxxxxxxxxxxxxxxxxxxxxxxxxx417"',
15 |         'CNZZDATA1256793290': '59643xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx524016409',
16 |         'd_c0': '"AOxxxxxxxxxxxxxxxxxxxxxxxxxxxxx728"',
17 |         'Hm_lvt_0bd5xxxxxxxxxxxxxxxxxxxxxxxxxxxxx14379',
18 |         'q_c1': '00cxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx9756000',
19 |         '_xsrf': 'd4xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxbjP',
20 |         '__utma': '518xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx89.2',
21 |         '__utmz': '51854390.1533xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx|utmcct=/',
22 |         '__utmv': '5185xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx160510=1',
23 |         'tgw_l7_route': '156dxxxxxxxxxxxxxxxxxxxxxxxxf36',
24 |     }
25 | 
26 |     # 重写Spider类的start_requests方法，附带Cookie值，发送POST请求
27 |     def start_requests(self):
28 |         for url in self.start_urls:
29 |             yield scrapy.FormRequest(url, cookies=self.cookies, callback=self.parse_page)
30 | 
31 |     # 处理响应内容
32 |     def parse_page(self, response):
33 |         with open("zhihu.html", "wb") as filename:
34 |             filename.write(response.body)
35 | 


--------------------------------------------------------------------------------
/ScrapyTest05/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = ScrapyTest05.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest05
12 | 


--------------------------------------------------------------------------------
/ScrapyTest05/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | 
3 | cmdline.execute("scrapy crawl zhihuLogin".split())


--------------------------------------------------------------------------------
/ScrapyTest06/.idea/ScrapyTest06.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="projectConfiguration" value="Twisted Trial" />
10 |     <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/ScrapyTest06/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/ScrapyTest06/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/ScrapyTest06.iml" filepath="$PROJECT_DIR$/.idea/ScrapyTest06.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/ScrapyTest06/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest06/ScrapyTest06/__init__.py


--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest06/ScrapyTest06/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest06/ScrapyTest06/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class Scrapytest06Item(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | from scrapy import signals
  9 | 
 10 | 
 11 | class Scrapytest06SpiderMiddleware(object):
 12 |     # Not all methods need to be defined. If a method is not defined,
 13 |     # scrapy acts as if the spider middleware does not modify the
 14 |     # passed objects.
 15 | 
 16 |     @classmethod
 17 |     def from_crawler(cls, crawler):
 18 |         # This method is used by Scrapy to create your spiders.
 19 |         s = cls()
 20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 21 |         return s
 22 | 
 23 |     def process_spider_input(self, response, spider):
 24 |         # Called for each response that goes through the spider
 25 |         # middleware and into the spider.
 26 | 
 27 |         # Should return None or raise an exception.
 28 |         return None
 29 | 
 30 |     def process_spider_output(self, response, result, spider):
 31 |         # Called with the results returned from the Spider, after
 32 |         # it has processed the response.
 33 | 
 34 |         # Must return an iterable of Request, dict or Item objects.
 35 |         for i in result:
 36 |             yield i
 37 | 
 38 |     def process_spider_exception(self, response, exception, spider):
 39 |         # Called when a spider or process_spider_input() method
 40 |         # (from other spider middleware) raises an exception.
 41 | 
 42 |         # Should return either None or an iterable of Response, dict
 43 |         # or Item objects.
 44 |         pass
 45 | 
 46 |     def process_start_requests(self, start_requests, spider):
 47 |         # Called with the start requests of the spider, and works
 48 |         # similarly to the process_spider_output() method, except
 49 |         # that it doesn’t have a response associated.
 50 | 
 51 |         # Must return only requests (not items).
 52 |         for r in start_requests:
 53 |             yield r
 54 | 
 55 |     def spider_opened(self, spider):
 56 |         spider.logger.info('Spider opened: %s' % spider.name)
 57 | 
 58 | 
 59 | class Scrapytest06DownloaderMiddleware(object):
 60 |     # Not all methods need to be defined. If a method is not defined,
 61 |     # scrapy acts as if the downloader middleware does not modify the
 62 |     # passed objects.
 63 | 
 64 |     @classmethod
 65 |     def from_crawler(cls, crawler):
 66 |         # This method is used by Scrapy to create your spiders.
 67 |         s = cls()
 68 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 69 |         return s
 70 | 
 71 |     def process_request(self, request, spider):
 72 |         # Called for each request that goes through the downloader
 73 |         # middleware.
 74 | 
 75 |         # Must either:
 76 |         # - return None: continue processing this request
 77 |         # - or return a Response object
 78 |         # - or return a Request object
 79 |         # - or raise IgnoreRequest: process_exception() methods of
 80 |         #   installed downloader middleware will be called
 81 |         return None
 82 | 
 83 |     def process_response(self, request, response, spider):
 84 |         # Called with the response returned from the downloader.
 85 | 
 86 |         # Must either;
 87 |         # - return a Response object
 88 |         # - return a Request object
 89 |         # - or raise IgnoreRequest
 90 |         return response
 91 | 
 92 |     def process_exception(self, request, exception, spider):
 93 |         # Called when a download handler or a process_request()
 94 |         # (from other downloader middleware) raises an exception.
 95 | 
 96 |         # Must either:
 97 |         # - return None: continue processing this exception
 98 |         # - return a Response object: stops process_exception() chain
 99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 | 
102 |     def spider_opened(self, spider):
103 |         spider.logger.info('Spider opened: %s' % spider.name)
104 | 


--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class Scrapytest06Pipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for ScrapyTest06 project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'ScrapyTest06'
13 | 
14 | SPIDER_MODULES = ['ScrapyTest06.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest06.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 | 
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'ScrapyTest06.middlewares.Scrapytest06SpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'ScrapyTest06.middlewares.Scrapytest06DownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | #    'ScrapyTest06.pipelines.Scrapytest06Pipeline': 300,
69 | #}
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest06/ScrapyTest06/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/spiders/__pycache__/zhihuSelenium.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest06/ScrapyTest06/spiders/__pycache__/zhihuSelenium.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest06/ScrapyTest06/spiders/zhihuSelenium.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from selenium import webdriver
 4 | from selenium.webdriver import ActionChains
 5 | import time
 6 | import base64
 7 | from io import BytesIO
 8 | from PIL import Image
 9 | import random
10 | import re
11 | 
12 | 
13 | class ZhihuseleniumSpider(scrapy.Spider):
14 |     name = 'zhihuSelenium'
15 |     allowed_domains = ['www.zhihu.com']
16 |     start_urls = ['https://www.zhihu.com/inbox']
17 | 
18 |     def start_requests(self):
19 |         # chrome_options = webdriver.ChromeOptions()
20 |         # chrome_options.add_argument('--headless')
21 |         # browser = webdriver.Chrome(chrome_options=chrome_options)
22 |         browser = webdriver.Chrome()
23 |         browser.set_window_size(100,100)
24 |         browser.get('https://www.zhihu.com/signup')
25 |         # 切换到登陆
26 |         browser.find_element_by_xpath("//div[@class ='SignContainer-switch']/span").click()
27 |         # 输入账号
28 |         browser.find_element_by_name("username").send_keys("1XXXXXXXXXX")
29 |         # 输入密码
30 |         browser.find_element_by_name("password").send_keys("XXXXXXXXXXXXXXX")
31 |         # 查看是否有验证码
32 |         Captcha_element = browser.find_element_by_xpath("//form[@class='SignFlow']/div[3]//img")
33 |         Captcha_base64 = Captcha_element.get_attribute('src')
34 |         print(Captcha_base64)
35 |         # # 如果有验证码：
36 |         if Captcha_base64 != 'data:image/jpg;base64,null':
37 |             # 得到验证码图片
38 |             img_data1 = Captcha_base64.split(',')[-1]
39 |             data1 = base64.b64decode(img_data1)
40 |             image = Image.open(BytesIO(data1))
41 |             image.show()
42 |             Captcha_type = Captcha_element.get_attribute('class')
43 |             # 如果是英文验证码：
44 |             if Captcha_type == 'Captcha-englishImg':
45 |                 # 输入验证码字符并send_keys
46 |                 Captcha = input('请输入图片中的验证码：')
47 |                 browser.find_element_by_name("captcha").send_keys(Captcha)
48 |             # 否则：
49 |             else:
50 |                 # 输入坐标，鼠标模拟点击
51 |                 # 每个字宽度约 (160.5-5.5)/7=22
52 |                 # 每个字高度范围：13.5———35.5
53 |                 handstand = input('请输入倒立文字的序号（以‘,’分割）：')
54 |                 handstand_serial_nums = handstand.split(',')
55 |                 for handstand_serial_num in handstand_serial_nums:
56 |                     x = 5.5 + (int(handstand_serial_num) - 1) * 22 + random.uniform(10, 20) # 随机一个范围
57 |                     y = random.uniform(15, 30)
58 |                     click_pos = (x, y)
59 |                     print(click_pos)
60 |                     ActionChains(browser).move_to_element_with_offset(Captcha_element, x, y).perform()
61 |                     ActionChains(browser).click().perform()
62 |         # 点击登陆按钮
63 |         browser.find_element_by_xpath("//button[@type='submit']").click()
64 |         time.sleep(2)  # 等待登陆跳转
65 |         print(browser.title)
66 |         # 如果登陆title包含“首页”，登陆成功
67 |         if re.search(r'首页', browser.title):
68 |             print('登陆成功！！')
69 |             cookies = browser.get_cookies()
70 |             browser.close()
71 |             for url in self.start_urls:
72 |                 yield scrapy.FormRequest(url, cookies=cookies, callback=self.parse_page)
73 |         else:
74 |             print("登陆失败！")
75 | 
76 |     # 处理登陆后的响应内容
77 |     def parse_page(self, response):
78 |         with open("zhihu.html", "wb") as filename:
79 |             filename.write(response.body)


--------------------------------------------------------------------------------
/ScrapyTest06/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = ScrapyTest06.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest06
12 | 


--------------------------------------------------------------------------------
/ScrapyTest06/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | 
3 | cmdline.execute("scrapy crawl zhihuSelenium".split())


--------------------------------------------------------------------------------
/ScrapyTest06/zhihu.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="zh-CN" dropEffect="none" class="no-js page-inbox">
  3 | <head>
  4 | <meta charset="utf-8" />
  5 | 
  6 | <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />
  7 | <meta name="renderer" content="webkit" />
  8 | <meta http-equiv="X-ZA-Response-Id" content="320ee264cadd550a206f843bb769e6d1">
  9 | 
 10 | <meta http-equiv="X-ZA-Experiment" content="default:None,ge3:ge3_9,ge2:ge2_1,SE_I:c,nwebQAGrowth:experiment,is_office:false,nweb_growth_people:default,app_store_rate_dialog:close,search_advert_position:1,live_store:ls_a2_b2_c2_f2,nweb_search:nweb_search_heifetz,search_hybrid_tabs:pin-3#album-7,enable_vote_down_reason_menu:d,new_live_feed_mediacard:new,hybrid_zhmore_video:yes,new_mobile_column_appheader:new_header,ad_r:a,growth_search:s2,qaweb_related_readings_content_control:close,recommend_question:rec_question_old,recommend_live_detail:live_detail_word2vec_v2,android_pass_through_push:all,search_section_style:1,new_sign_bg:new,new_mobile_app_header:true,np:1,android_search_tab_style:search_tab_style_b,u_re:0,android_db_recommend_action:open,zcm-lighting:zcm,android_db_feed_hash_tag_style:button,mobile_feed_guide:block,qrcode_login:qrcode,is_new_noti_panel:no,nweb_search_suggest:experiment">
 11 | <title>私信 - 知乎</title>
 12 | 
 13 | <meta name="apple-itunes-app" content="app-id=432274380" />
 14 | 
 15 | 
 16 | <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1"/>
 17 | <meta http-equiv="mobile-agent" content="format=html5;url=https://www.zhihu.com/inbox">
 18 | <meta id="znonce" name="znonce" content="ad896f944e334526ab3131875753091a">
 19 | 
 20 | 
 21 | 
 22 | <link rel="apple-touch-icon" href="https://static.zhihu.com/static/revved/img/ios/touch-icon-152.87c020b9.png" sizes="152x152">
 23 | <link rel="apple-touch-icon" href="https://static.zhihu.com/static/revved/img/ios/touch-icon-120.496c913b.png" sizes="120x120">
 24 | <link rel="apple-touch-icon" href="https://static.zhihu.com/static/revved/img/ios/touch-icon-76.dcf79352.png" sizes="76x76">
 25 | <link rel="apple-touch-icon" href="https://static.zhihu.com/static/revved/img/ios/touch-icon-60.9911cffb.png" sizes="60x60">
 26 | 
 27 | 
 28 | <link rel="shortcut icon" href="https://static.zhihu.com/static/favicon.ico" type="image/x-icon">
 29 | 
 30 | <link rel="search" type="application/opensearchdescription+xml" href="https://static.zhihu.com/static/search.xml" title="知乎" />
 31 | <link rel="stylesheet" href="https://static.zhihu.com/static/revved/-/css/z.3e5da9dd.css">
 32 | 
 33 | <link rel="stylesheet" href="https://static.zhihu.com/static/revved/-/css/pages/inbox/main.1e966360.css">
 34 | 
 35 | 
 36 | 
 37 | 
 38 | 
 39 | <!--[if lt IE 9]>
 40 | <script src="https://static.zhihu.com/static/components/respond/dest/respond.min.js"></script>
 41 | <link href="https://static.zhihu.com/static/components/respond/cross-domain/respond-proxy.html" id="respond-proxy" rel="respond-proxy" />
 42 | <link href="/static/components/respond/cross-domain/respond.proxy.gif" id="respond-redirect" rel="respond-redirect" />
 43 | <script src="/static/components/respond/cross-domain/respond.proxy.js"></script>
 44 | <![endif]-->
 45 | <script src="https://static.zhihu.com/static/revved/-/js/instant.35b468b8.js"></script>
 46 | 
 47 | </head>
 48 | 
 49 | <body class="zhi ">
 50 | 
 51 | 
 52 | 
 53 | 
 54 | <div role="navigation" class="zu-top" data-za-module="TopNavBar">
 55 | <div class="zg-wrap modal-shifting clearfix" id="zh-top-inner">
 56 | <a href="/" class="zu-top-link-logo" id="zh-top-link-logo" data-za-c="view_home" data-za-a="visit_home" data-za-l="top_navigation_zhihu_logo">知乎</a>
 57 | 
 58 | <div class="top-nav-profile">
 59 | <a href="/people/zhiqi-kou" class="zu-top-nav-userinfo ">
 60 | <span class="name">Zhiqi Kou</span>
 61 | <img class="Avatar" src="https://pic2.zhimg.com/v2-2f7b6857f08b5a88ae09a99400220814_s.jpg" srcset="https://pic2.zhimg.com/v2-2f7b6857f08b5a88ae09a99400220814_xs.jpg 2x" alt="Zhiqi Kou" />
 62 | <span id="zh-top-nav-new-pm" class="zg-noti-number zu-top-nav-pm-count"
 63 | style="visibility:hidden" data-count="0">
 64 | 
 65 | </span>
 66 | </a>
 67 | <ul class="top-nav-dropdown" id="top-nav-profile-dropdown">
 68 | <li>
 69 | <a href="/people/zhiqi-kou">
 70 | <i class="zg-icon zg-icon-dd-home"></i>我的主页
 71 | </a>
 72 | </li>
 73 | 
 74 | <li>
 75 | <a href="/inbox">
 76 | <i class="zg-icon zg-icon-dd-pm"></i>私信
 77 | <span id="zh-top-nav-pm-count" class="zu-top-nav-pm-count zg-noti-number"
 78 | style="visibility:hidden" data-count="0">
 79 | 
 80 | </span>
 81 | </a>
 82 | </li>
 83 | <li>
 84 | <a href="/settings">
 85 | <i class="zg-icon zg-icon-dd-settings"></i>设置
 86 | </a>
 87 | </li>
 88 | <li>
 89 | <a href="/logout">
 90 | <i class="zg-icon zg-icon-dd-logout"></i>退出
 91 | </a>
 92 | </li>
 93 | </ul>
 94 | 
 95 | </div>
 96 | 
 97 | 
 98 | 
 99 | <button class="zu-top-add-question" id="zu-top-add-question">提问</button>
100 | 
101 | 
102 | <div role="search" id="zh-top-search" class="zu-top-search">
103 | <form method="GET" action="/search" id="zh-top-search-form" class="zu-top-search-form">
104 | 
105 | 
106 | 
107 | <input type="hidden" name="type" value="content">
108 | <label for="q" class="hide-text">知乎搜索</label><input type="text" class="zu-top-search-input" id="q" name="q" autocomplete="off" value="" maxlength="100" placeholder="搜索你感兴趣的内容…">
109 | <button type="submit" class="zu-top-search-button"><span class="hide-text">搜索</span><span class="sprite-global-icon-magnifier-dark"></span></button>
110 | </form>
111 | </div>
112 | 
113 | 
114 | 
115 | <div id="zg-top-nav" class="zu-top-nav">
116 | <ul class="zu-top-nav-ul zg-clear">
117 | 
118 | <li class="zu-top-nav-li " id="zh-top-nav-home">
119 | <a class="zu-top-nav-link" href="/" id="zh-top-link-home" data-za-c="view_home" data-za-a="visit_home" data-za-l="top_navigation_home">首页</a>
120 | </li>
121 | 
122 | 
123 | 
124 | <li class="top-nav-topic-selector zu-top-nav-li " id="zh-top-nav-topic">
125 | <a class="zu-top-nav-link" href="/topic" id="top-nav-dd-topic">话题</a>
126 | </li>
127 | 
128 | <li class="zu-top-nav-li " id="zh-top-nav-explore">
129 | <a class="zu-top-nav-link" href="/explore">发现</a>
130 | </li>
131 | 
132 | <li class="top-nav-noti zu-top-nav-li ">
133 | <a class="zu-top-nav-link" href="/notifications" id="zh-top-nav-count-wrap" role="button"><span class="mobi-arrow"></span>消息</a>
134 | </li>
135 | 
136 | 
137 | 
138 | </ul>
139 | <div class="zu-top-nav-live zu-noti7-popup zg-r5px no-hovercard" id="zh-top-nav-live-new" role="popup" tabindex="0">
140 | <div class="zu-top-nav-live-inner zg-r5px">
141 | <div class="zu-top-live-icon">&nbsp;</div>
142 | <div class="zu-home-noti-inner" id="zh-top-nav-live-new-inner">
143 | <div class="zm-noti7-popup-tab-container clearfix" tabindex="0">
144 | <button class="zm-noti7-popup-tab-item message">
145 | <span class="icon">消息</span>
146 | </button>
147 | <button class="zm-noti7-popup-tab-item user">
148 | <span class="icon">用户</span>
149 | </button>
150 | <button class="zm-noti7-popup-tab-item thanks">
151 | <span class="icon">赞同和感谢</span>
152 | </button>
153 | </div>
154 | </div>
155 | <div class="zm-noti7-frame-border top"></div>
156 | <div class="zm-noti7-frame">
157 | <div class="zm-noti7-content message">
158 | <div class="zm-noti7-content-inner">
159 | <div class="zm-noti7-content-body">
160 | <div class="zm-noti7-popup-loading">
161 | <span class="noti-spinner-loading"></span>
162 | </div>
163 | </div>
164 | </div>
165 | </div>
166 | <div class="zm-noti7-content user" style="display:none;">
167 | <div class="zm-noti7-content-inner">
168 | <div class="zm-noti7-content-body">
169 | <div class="zm-noti7-popup-loading">
170 | <span class="noti-spinner-loading"></span>
171 | </div>
172 | </div>
173 | </div>
174 | </div>
175 | <div class="zm-noti7-content thanks" style="display:none;">
176 | <div class="zm-noti7-content-inner">
177 | <div class="zm-noti7-content-body">
178 | <div class="zm-noti7-popup-loading">
179 | <span class="noti-spinner-loading"></span>
180 | </div>
181 | </div>
182 | </div>
183 | </div>
184 | </div>
185 | <div class="zm-noti7-frame-border bottom"></div>
186 | <div class="zm-noti7-popup-footer">
187 | <a href="/notifications" class="zm-noti7-popup-footer-all zg-right">查看全部 &raquo;</a>
188 | <a href="/settings/notification" class="zm-noti7-popup-footer-set" title="通知设置" ><i class="zg-icon zg-icon-settings"></i></a>
189 | </div>
190 | </div>
191 | </div>
192 | 
193 | </div>
194 | 
195 | </div>
196 | </div>
197 | 
198 | 
199 | <div class="zu-global-notify" id="zh-global-message" style="display:none">
200 | <div class="zg-wrap">
201 | <div class="zu-global-nitify-inner">
202 | <a class="zu-global-notify-close" href="javascript:;" title="关闭" name="close">x</a>
203 | <span class="zu-global-notify-icon"></span>
204 | <span class="zu-global-notify-msg"></span>
205 | </div>
206 | </div>
207 | </div>
208 | 
209 | 
210 | 
211 | 
212 | <div class="zg-wrap zu-main clearfix "  role="main">
213 | <div class="zu-main-content">
214 | <div class="zu-main-content-inner">
215 | 
216 | 
217 | 
218 | <div class="PageInbox-nav">
219 | <div class="PageInbox-nav-inner">
220 | 
221 | <ul class="PageInbox-nav-tabs">
222 | 
223 | <li>最近联系</li>
224 | 
225 | 
226 | </ul>
227 | 
228 | <button id="zh-create-pm" class="zg-btn-justify zg-btn-blue">写私信</button>
229 | </div>
230 | 
231 | </div>
232 | <div id="zh-pm-item-wrap">
233 | 
234 | 
235 | 
236 | </div>
237 | <a style="display:none" class="zg-btn-white zg-r3px zu-button-more" id="zh-load-more" href="javascript:;">更多</a>
238 | 
239 | </div>
240 | </div>
241 | 
242 | 
243 | <div class="zu-main-sidebar" data-za-module="RightSideBar">
244 | 
245 | <div class="zg-info-message" style="margin:0 0 10px;text-align:left;">
246 | 担心骚扰？可以 <a href="/settings/notification">设置</a> 为「开启陌生人私信箱」。</div>
247 | 
248 | </div>
249 | 
250 | 
251 | </div>
252 | 
253 | 
254 | <div id="zh-footer" class="zh-footer">
255 | <div class="content zg-wrap clearfix">
256 | <ul>
257 | 
258 | <li><a href="https://liukanshan.zhihu.com" target="_blank">刘看山</a></li>
259 | 
260 | <li><a href="/question/19581624" target="_blank">知乎指南</a></li>
261 | <li><a href="javascript:;" id="js-feedback-button">建议反馈</a></li>
262 | 
263 | <li><a href="/app" target="_blank">移动应用</a></li>
264 | <li><a href="/careers">加入知乎</a></li>
265 | <li><a href="/terms" target="_blank">知乎协议</a></li>
266 | <li><a href="/jubao" target="_blank">举报投诉</a></li>
267 | <li><a href="/terms/privacy" target="_blank">隐私政策</a></li>
268 | <li><a href="/contact">联系我们</a></li>
269 | 
270 | </ul>
271 | 
272 | <span class="copy">&copy; 2018 知乎</span>
273 | 
274 | </div>
275 | </div>
276 | 
277 | <script type="text/json" class="json-inline" data-name="guiders2">{"exclusive-popover":{},"section":{"home-topstory":{"content":"\u8fd9\u91cc\u662f\u6839\u636e\u4f60\u5173\u6ce8\u7684\u8bdd\u9898\u5b9a\u5236\u7684\u5185\u5bb9\u7cbe\u9009\uff0c\u5173\u6ce8\u5176\u4ed6\u4eba\u4e5f\u53ef\u4ee5\u8ba9\u5185\u5bb9\u66f4\u4e30\u5bcc","dismissText":"\u77e5\u9053\u4e86","title":"\u6700\u65b0\u52a8\u6001"}},"editor":[]}</script>
278 | 
279 | 
280 | 
281 | 
282 | 
283 | <script type="text/template" class="bind-phone-confirmation-dialog-template">
284 | <div class="clearfix bind-phone-confirmation-dialog" data-za-module="BindPhoneForm">
285 | <p>
286 | <span class="bind-phone-tip">应国家法规对于帐号实名的要求，进行下一步操作前，需要先完成手机绑定。</span>
287 | <a href="https://zhuanlan.zhihu.com/p/27257715" target="_blank">了解更多</a>
288 | </p>
289 | <div class="text-right">
290 | <a href="#" class="not-to-bind-phone">暂不绑定</a>&nbsp;&nbsp;
291 | <button class="zg-btn-blue go-to-bind-phone">去绑定手机</button>
292 | </div>
293 | </div>
294 | </script>
295 | 
296 | 
297 | 
298 | <script type="text/template" class="bind-phone-dialog-template">
299 | <div class="code-activate-view clearfix" data-za-module="BindPhoneForm">
300 | <div class="tip">绑定手机后，你就可以使用知乎的提问、回答、评论和编辑等功能。</div>
301 | <form class="send-code clearfix">
302 | <div class="input-wrapper">
303 | <label class="info" for="phone_no-to-send">手机：</label>
304 | <input type="text" id="phone_no-to-send" name="new_phone_no" class="text" required>
305 | </div>
306 | <button class="submit zg-btn-blue">获取验证码</button>
307 | </form>
308 | <form class="activate clearfix">
309 | <div class="input-wrapper">
310 | <label class="info" for="phone_no-to-bind">手机：</label>
311 | <input type="text" id="phone_no-to-bind" name="new_phone_no" class="text" required>
312 | </div>
313 | <div class="input-wrapper verification-code">
314 | <label class="info" for="digits">验证码：</label>
315 | <input type="text" id="digits" name="digits" class="text" placeholder="6 位数验证码" maxlength="6" required>
316 | <button type="button" class="resend-code">重发验证码</button>
317 | </div>
318 | <button class="submit zg-btn-blue">确认</button>
319 | </form>
320 | <div class="form-help-extra z-text-muted">
321 | <p>没有收到验证码？</p>
322 | <p>你也可以私信 <a target="_blank" href="http://www.zhihu.com/people/zhihuadmin">@知乎小管家</a> 或者发邮件至 <a href="mailto:i@zhihu.com">i@zhihu.com</a> 联系知乎管理员</p>
323 | </div>
324 | </div>
325 | </script>
326 | 
327 | 
328 | 
329 | <script type="text/template" class="bind-email-notification-dialog-template">
330 | <div class="clearfix bind-email-notification-dialog" data-za-module="BindMailForm">
331 | <p>
332 | 由于你此前绑定的邮箱未经验证，未来请使用手机号登录知乎。
333 | </p>
334 | <div class="text-right">
335 | <a class="zg-btn-blue js-reload" href="#">知道了</a>
336 | </div>
337 | </div>
338 | </script>
339 | <script type="text/json" class="json-inline" data-name="current_user">["Zhiqi Kou","zhiqi-kou","https:\/\/pic2.zhimg.com\/v2-2f7b6857f08b5a88ae09a99400220814_s.jpg","276a3e960e653b573f037857664502a9","",0,0,true,null,"",0,false,false,false,"713497997710737408",false,false,false,true,null,null,true]</script>
340 | <script type="text/json" class="json-inline" data-name="front_web_config">{"realname_win_config":{"timestamp":1501344000,"tip":"\u5e94\u56fd\u5bb6\u6cd5\u89c4\u5bf9\u4e8e\u5e10\u53f7\u5b9e\u540d\u7684\u8981\u6c42\uff0c\u8fdb\u884c\u4e0b\u4e00\u6b65\u64cd\u4f5c\u524d\uff0c\u9700\u8981\u5148\u5b8c\u6210\u624b\u673a\u7ed1\u5b9a\u3002","continue":0,"continue_time":3600,"skip_ut_verification":0}}</script>
341 | <script type="text/json" class="json-inline" data-name="user_status">[null,null,true]</script>
342 | <script type="text/json" class="json-inline" data-name="env">["zhihu.com","comet.zhihu.com",false,null,false,false]</script>
343 | <script type="text/json" class="json-inline" data-name="permissions">[]</script>
344 | 
345 | 
346 | <script type="text/json" class="json-inline" data-name="ga_vars">{"user_created":1462891178000,"now":1534406243000,"abtest_mask":"---------0--------------------","user_attr":[1,0,0,"-","-"],"user_hash":"276a3e960e653b573f037857664502a9"}</script>
347 | 
348 | <script type="text/json" class="json-inline" data-name="ra-urls">{"Copyright":"https:\/\/static.zhihu.com\/static\/revved\/-\/apps\/Copyright.773358e5.js","PayUIApp":"https:\/\/static.zhihu.com\/static\/revved\/-\/apps\/PayUIApp.36707672.js","CouponApp":"https:\/\/static.zhihu.com\/static\/revved\/-\/apps\/CouponApp.1f64e458.js","PaymentApp":"https:\/\/static.zhihu.com\/static\/revved\/-\/apps\/PaymentApp.a5a8951a.js","Community":"https:\/\/static.zhihu.com\/static\/revved\/-\/apps\/Community.4b35cdba.js","Report":"https:\/\/static.zhihu.com\/static\/revved\/-\/apps\/Report.ba8f1b4f.js","OrgOpHelp":"https:\/\/static.zhihu.com\/static\/revved\/-\/apps\/OrgOpHelp.9e284cd9.js","common":"https:\/\/static.zhihu.com\/static\/revved\/-\/apps\/common.8eaa69f7.js","BalanceApp":"https:\/\/static.zhihu.com\/static\/revved\/-\/apps\/BalanceApp.1e4cdac8.js","AnswerWarrant":"https:\/\/static.zhihu.com\/static\/revved\/-\/apps\/AnswerWarrant.a94bf518.js","CommentApp":"https:\/\/static.zhihu.com\/static\/revved\/-\/apps\/CommentApp.275ffa2a.js"}</script>
349 | 
350 | <script src="https://static.zhihu.com/static/revved/-/js/vendor.cb14a042.js"></script>
351 | <script src="https://static.zhihu.com/static/revved/-/js/closure/base.05b6d7f6.js"></script>
352 | 
353 | <script src="https://static.zhihu.com/static/revved/-/js/closure/common.58a68220.js"></script>
354 | 
355 | 
356 | <script src="https://static.zhihu.com/static/revved/-/js/closure/richtexteditor.c31820a9.js" async></script>
357 | 
358 | <script src="https://static.zhihu.com/static/revved/-/js/closure/page-main.bd0498f6.js"></script>
359 | <meta name="entry" content="ZH.entryPM" data-module-id="page-main">
360 | 
361 | 
362 | <script type="text/zscript" znonce="ad896f944e334526ab3131875753091a"></script>
363 | 
364 | <input type="hidden" name="_xsrf" value="34353930663938382d666336372d346266342d393061652d316337313135386435336139"/>
365 | </body>
366 | </html>


--------------------------------------------------------------------------------
/ScrapyTest07/.idea/ScrapyTest07.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="projectConfiguration" value="Twisted Trial" />
10 |     <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/ScrapyTest07/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/ScrapyTest07/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/ScrapyTest07.iml" filepath="$PROJECT_DIR$/.idea/ScrapyTest07.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/ScrapyTest07/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/__init__.py


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/__pycache__/items.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/__pycache__/pipelines.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class Scrapytest07Item(scrapy.Item):
12 |     # 说说内容
13 |     content = scrapy.Field()
14 |     # 发表时间
15 |     created_time = scrapy.Field()
16 |     # 发表地点
17 |     location_name = scrapy.Field()
18 |     # 经度
19 |     location_pos_x = scrapy.Field()
20 |     # 纬度
21 |     location_pos_y = scrapy.Field()
22 |     # 设备
23 |     source_name = scrapy.Field()
24 | 


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | from scrapy import signals
  9 | 
 10 | 
 11 | class Scrapytest07SpiderMiddleware(object):
 12 |     # Not all methods need to be defined. If a method is not defined,
 13 |     # scrapy acts as if the spider middleware does not modify the
 14 |     # passed objects.
 15 | 
 16 |     @classmethod
 17 |     def from_crawler(cls, crawler):
 18 |         # This method is used by Scrapy to create your spiders.
 19 |         s = cls()
 20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 21 |         return s
 22 | 
 23 |     def process_spider_input(self, response, spider):
 24 |         # Called for each response that goes through the spider
 25 |         # middleware and into the spider.
 26 | 
 27 |         # Should return None or raise an exception.
 28 |         return None
 29 | 
 30 |     def process_spider_output(self, response, result, spider):
 31 |         # Called with the results returned from the Spider, after
 32 |         # it has processed the response.
 33 | 
 34 |         # Must return an iterable of Request, dict or Item objects.
 35 |         for i in result:
 36 |             yield i
 37 | 
 38 |     def process_spider_exception(self, response, exception, spider):
 39 |         # Called when a spider or process_spider_input() method
 40 |         # (from other spider middleware) raises an exception.
 41 | 
 42 |         # Should return either None or an iterable of Response, dict
 43 |         # or Item objects.
 44 |         pass
 45 | 
 46 |     def process_start_requests(self, start_requests, spider):
 47 |         # Called with the start requests of the spider, and works
 48 |         # similarly to the process_spider_output() method, except
 49 |         # that it doesn’t have a response associated.
 50 | 
 51 |         # Must return only requests (not items).
 52 |         for r in start_requests:
 53 |             yield r
 54 | 
 55 |     def spider_opened(self, spider):
 56 |         spider.logger.info('Spider opened: %s' % spider.name)
 57 | 
 58 | 
 59 | class Scrapytest07DownloaderMiddleware(object):
 60 |     # Not all methods need to be defined. If a method is not defined,
 61 |     # scrapy acts as if the downloader middleware does not modify the
 62 |     # passed objects.
 63 | 
 64 |     @classmethod
 65 |     def from_crawler(cls, crawler):
 66 |         # This method is used by Scrapy to create your spiders.
 67 |         s = cls()
 68 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 69 |         return s
 70 | 
 71 |     def process_request(self, request, spider):
 72 |         # Called for each request that goes through the downloader
 73 |         # middleware.
 74 | 
 75 |         # Must either:
 76 |         # - return None: continue processing this request
 77 |         # - or return a Response object
 78 |         # - or return a Request object
 79 |         # - or raise IgnoreRequest: process_exception() methods of
 80 |         #   installed downloader middleware will be called
 81 |         return None
 82 | 
 83 |     def process_response(self, request, response, spider):
 84 |         # Called with the response returned from the downloader.
 85 | 
 86 |         # Must either;
 87 |         # - return a Response object
 88 |         # - return a Request object
 89 |         # - or raise IgnoreRequest
 90 |         return response
 91 | 
 92 |     def process_exception(self, request, exception, spider):
 93 |         # Called when a download handler or a process_request()
 94 |         # (from other downloader middleware) raises an exception.
 95 | 
 96 |         # Must either:
 97 |         # - return None: continue processing this exception
 98 |         # - return a Response object: stops process_exception() chain
 99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 | 
102 |     def spider_opened(self, spider):
103 |         spider.logger.info('Spider opened: %s' % spider.name)
104 | 


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | import pymysql
 9 | 
10 | 
11 | class DBPipeline(object):
12 |     def __init__(self):
13 |         self.connect = pymysql.connect(
14 |             host='localhost',
15 |             db='Scrapy',
16 |             user='root',
17 |             passwd='zhiqi'
18 |         )
19 |         # 数据库游标，用于操作数据库
20 |         self.cursor = self.connect.cursor()
21 | 
22 |     def process_item(self, item, spider):
23 |         try:
24 |             # 将信息写入数据库
25 |             self.cursor.execute("INSERT INTO my_qzone (content,created_time,location_name,location_pos_x,location_pos_y,source_name) VALUES (%s,%s,%s,%s,%s,%s)",(item['content'],item['created_time'],item['location_name'],item['location_pos_x'],item['location_pos_y'],item['source_name']))     # 提交信息
26 |             self.connect.commit()
27 |         except Exception as e:
28 |             # 输出错误信息
29 |             print(e)
30 | 
31 |         return item
32 | 
33 |     def close_spider(self, spider):
34 |         # 关闭游标
35 |         self.cursor.close()
36 |         # 关闭连接
37 |         self.connect.close()
38 | 


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for ScrapyTest07 project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'ScrapyTest07'
13 | 
14 | SPIDER_MODULES = ['ScrapyTest07.spiders']
15 | NEWSPIDER_MODULE = 'ScrapyTest07.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 | 
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'ScrapyTest07.middlewares.Scrapytest07SpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'ScrapyTest07.middlewares.Scrapytest07DownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 |    'ScrapyTest07.pipelines.DBPipeline': 300,
69 | }
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/spiders/__pycache__/qzone.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest07/ScrapyTest07/spiders/__pycache__/qzone.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest07/ScrapyTest07/spiders/qzone.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from selenium import webdriver
 4 | import time
 5 | import json
 6 | from ScrapyTest07.items import Scrapytest07Item
 7 | 
 8 | class QzoneSpider(scrapy.Spider):
 9 |     name = 'qzone'
10 |     allowed_domains = ['qq.com']
11 | 
12 |     def __init__(self):
13 |         self.cookies = ''
14 |         self.page_num = 0
15 |         self.g_tk = ''
16 | 
17 |     def start_requests(self):
18 |         browser = webdriver.Chrome()
19 |         browser.get('https://user.qzone.qq.com')
20 |         # 登录表单在页面的框架中，所以要切换到该框架
21 |         browser.switch_to.frame('login_frame')
22 |         browser.find_element_by_id('switcher_plogin').click()
23 |         browser.find_element_by_id('u').send_keys('QQ')
24 |         browser.find_element_by_id('p').send_keys('密码')
25 |         browser.find_element_by_id('login_button').click()
26 |         time.sleep(2)
27 |         try:
28 |             # 得到验证码图片
29 |             bg_link = browser.find_element_by_id('slideBkg').get_attribute('src')
30 |             block_link = browser.find_element_by_id('slideBlock').get_attribute('src')
31 |             time.sleep(10)
32 |             print(bg_link, block_link)
33 |         except:
34 |             pass
35 |         # 获得 gtk
36 |         cookie = {}  # 初始化cookie字典
37 |         self.cookies = browser.get_cookies()
38 |         for elem in self.cookies:  # 取cookies
39 |             cookie[elem['name']] = elem['value']
40 | 
41 |         self.g_tk = self.getGTK(cookie)  # 通过getGTK函数计算gtk
42 |         browser.close()
43 |         # https://user.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=QQ号&pos=0&num=20&g_tk=438032980
44 |         start_url = 'https://user.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=QQ号&pos=0&num=20&g_tk=' + str(self.g_tk)
45 |         yield scrapy.Request(start_url, cookies=self.cookies, callback=self.get_msg)
46 | 
47 |     def getGTK(self, cookie):
48 |         hashes = 5381
49 |         for letter in cookie['p_skey']:
50 |             hashes += (hashes << 5) + ord(letter)
51 |         return hashes & 0x7fffffff
52 | 
53 |     def get_msg(self, response):
54 |         response_fix = response.body.decode('utf-8')[10:-2]
55 |         # print(response_fix)
56 |         jsonBody = json.loads(response_fix)
57 |         msglist = jsonBody['msglist']
58 |         if msglist != None:
59 |             for msg in msglist:
60 |                 item = Scrapytest07Item()
61 |                 item['content'] = msg['content']
62 |                 # 转换成localtime
63 |                 time_local = time.localtime(int(msg['created_time']))
64 |                 # 转换成新的时间格式(XXXX-XX-XX XX:XX:XX)
65 |                 dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
66 |                 item['created_time'] = dt
67 |                 item['location_name'] = msg['lbs']['name']
68 |                 item['location_pos_x'] = msg['lbs']['pos_x']
69 |                 item['location_pos_y'] = msg['lbs']['pos_y']
70 |                 item['source_name'] = msg['source_name']
71 |                 print(item)
72 |                 yield item
73 |             self.page_num += 1
74 |             url = 'https://user.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=QQ号&pos=' + str(self.page_num * 20) + '&num=20&g_tk=' + str(self.g_tk)
75 |             yield scrapy.Request(url, cookies=self.cookies, callback=self.get_msg)
76 |         else:
77 |             print('全部数据获取完毕！')
78 | 
79 | 


--------------------------------------------------------------------------------
/ScrapyTest07/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = ScrapyTest07.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ScrapyTest07
12 | 


--------------------------------------------------------------------------------
/ScrapyTest07/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | 
3 | cmdline.execute("scrapy crawl qzone".split())
4 | 


--------------------------------------------------------------------------------
/ScrapyTest07/statistical.py:
--------------------------------------------------------------------------------
  1 | from pyecharts import Bar
  2 | from pyecharts import Line
  3 | from pyecharts import WordCloud
  4 | from pyecharts import Geo
  5 | from pyecharts import Pie
  6 | from pyecharts.engine import create_default_environment
  7 | import pymysql
  8 | import jieba
  9 | import re
 10 | 
 11 | db = pymysql.connect(host='localhost', user='root', password='zhiqi', database='Scrapy')
 12 | cursor = db.cursor()
 13 | 
 14 | sql = "SELECT * FROM my_qzone"
 15 | 
 16 | contents = []
 17 | years = []
 18 | months = []
 19 | days = []
 20 | hours = []
 21 | locations = {}
 22 | source_names = []
 23 | 
 24 | try:
 25 |     cursor.execute(sql)
 26 |     result = cursor.fetchall()
 27 |     for row in result:
 28 |         contents.append(row[1])
 29 |         years.append(row[2].split(' ')[0].split('-')[0])
 30 |         months.append(row[2].split(' ')[0].split('-')[1])
 31 |         days.append(row[2].split(' ')[0].split('-')[2])
 32 |         hours.append(row[2].split(' ')[1].split(':')[0])
 33 |         locations[row[3]] = [row[4], row[5]]
 34 |         if row[6] != '':
 35 |             source_names.append(row[6])
 36 | except Exception as e:
 37 |     # 输出错误信息
 38 |     print(e)
 39 | 
 40 | cursor.close()
 41 | db.close()
 42 | 
 43 | 
 44 | # 将所有说说连起来
 45 | str_content = ''
 46 | for content in contents:
 47 |     str_content += content
 48 | str_content = ''.join(re.findall(u'[\u4e00-\u9fff]+', str_content))
 49 | words = jieba.cut(str_content)
 50 | stop_list = ['的', '了', '我', '是', '不',
 51 |              '你', '都', '就', '在', '也',
 52 |              '有', '去', '好', '说', '到',
 53 |              '又', '要', '这', '还', '啊',
 54 |              '吧', '给', '和', '人', '来',
 55 |              '被', '上', '没', '会', '能',
 56 |              '着', '多', '他', '一', '年',
 57 |              '看', '很', '谁', '再', '为',
 58 |              ]
 59 | result_word_list = []
 60 | for word in words:
 61 |     if word not in stop_list:
 62 |         result_word_list.append(word)
 63 | 
 64 | geo = Geo(
 65 |     "位置信息",
 66 |     title_color="#fff",
 67 |     title_pos="center",
 68 |     width=1200,
 69 |     height=600,
 70 |     background_color="#404a59",
 71 | )
 72 | 
 73 | sorted_list = set(list(locations.keys()))
 74 | list_infos = {}
 75 | for info in sorted_list:
 76 |     list_infos[info] = list(locations.keys()).count(info)
 77 | 
 78 | attr, value = geo.cast(list_infos)
 79 | geo.add(
 80 |     "",
 81 |     attr,
 82 |     value,
 83 |     geo_cities_coords=locations
 84 | )
 85 | geo.render()
 86 | 
 87 | 
 88 | def get_chart(original_list, form_type, table_name, series_name):
 89 |     sorted_list = sorted(list(set(original_list)))
 90 |     list_infos = {}
 91 |     for info in sorted_list:
 92 |         list_infos[info] = original_list.count(info)
 93 |     chart = form_type(table_name)
 94 |     chart.add(series_name, list(list_infos.keys()), list(list_infos.values()))
 95 |     return chart
 96 | 
 97 | 
 98 | def drawing(chart, path, filet_type='html'):
 99 |     env = create_default_environment(filet_type)
100 |     # create_default_environment(filet_type)
101 |     # file_type: 'html', 'svg', 'png', 'jpeg', 'gif' or 'pdf'
102 |     env.render_chart_to_file(chart, path=path)
103 | 
104 | drawing(get_chart(years, Bar, '年发表统计图', '发表数'), 'years.html')
105 | drawing(get_chart(months, Bar, '月发表统计图', '发表数'), 'months.html')
106 | drawing(get_chart(days, Bar, '日发表统计图', '发表数'), 'days.html')
107 | drawing(get_chart(hours, Bar, '小时发表统计图', '发表数'), 'hours.html')
108 | drawing(get_chart(result_word_list, WordCloud, '', ''), 'word.html')
109 | drawing(get_chart(source_names, Pie, '', ''), 'source.html')
110 | 


--------------------------------------------------------------------------------
/ScrapyTest08/.idea/ScrapyTest08.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="projectConfiguration" value="Twisted Trial" />
10 |     <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/ScrapyTest08/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/ScrapyTest08/.idea/workspace.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project version="4">
  3 |   <component name="ChangeListManager">
  4 |     <list default="true" id="10cb7312-9480-4da7-9875-1aca153e23da" name="Default Changelist" comment="">
  5 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest05/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest05/.idea/workspace.xml" afterDir="false" />
  6 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest06/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest06/.idea/workspace.xml" afterDir="false" />
  7 |       <change beforePath="$PROJECT_DIR$/../ScrapyTest07/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../ScrapyTest07/.idea/workspace.xml" afterDir="false" />
  8 |     </list>
  9 |     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
 10 |     <option name="SHOW_DIALOG" value="false" />
 11 |     <option name="HIGHLIGHT_CONFLICTS" value="true" />
 12 |     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
 13 |     <option name="LAST_RESOLUTION" value="IGNORE" />
 14 |   </component>
 15 |   <component name="FUSProjectUsageTrigger">
 16 |     <session id="-1182176867">
 17 |       <usages-collector id="statistics.lifecycle.project">
 18 |         <counts>
 19 |           <entry key="project.closed" value="1" />
 20 |           <entry key="project.open.time.0" value="1" />
 21 |           <entry key="project.opened" value="1" />
 22 |         </counts>
 23 |       </usages-collector>
 24 |     </session>
 25 |   </component>
 26 |   <component name="Git.Settings">
 27 |     <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
 28 |   </component>
 29 |   <component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
 30 |   <component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
 31 |   <component name="JsGulpfileManager">
 32 |     <detection-done>true</detection-done>
 33 |     <sorting>DEFINITION_ORDER</sorting>
 34 |   </component>
 35 |   <component name="ProjectFrameBounds" extendedState="6">
 36 |     <option name="x" value="1" />
 37 |     <option name="y" value="25" />
 38 |     <option name="width" value="1033" />
 39 |     <option name="height" value="575" />
 40 |   </component>
 41 |   <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
 42 |   <component name="ProjectView">
 43 |     <navigator proportions="" version="1">
 44 |       <foldersAlwaysOnTop value="true" />
 45 |     </navigator>
 46 |     <panes>
 47 |       <pane id="Scope" />
 48 |       <pane id="ProjectPane">
 49 |         <subPane>
 50 |           <expand>
 51 |             <path>
 52 |               <item name="ScrapyTest08" type="b2602c69:ProjectViewProjectNode" />
 53 |               <item name="ScrapyTest08" type="462c0819:PsiDirectoryNode" />
 54 |             </path>
 55 |           </expand>
 56 |           <select />
 57 |         </subPane>
 58 |       </pane>
 59 |     </panes>
 60 |   </component>
 61 |   <component name="PropertiesComponent">
 62 |     <property name="WebServerToolWindowFactoryState" value="false" />
 63 |     <property name="last_opened_file_path" value="$PROJECT_DIR$/dytt_redis_slaver" />
 64 |     <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
 65 |     <property name="nodejs_npm_path_reset_for_default_project" value="true" />
 66 |   </component>
 67 |   <component name="RunDashboard">
 68 |     <option name="ruleStates">
 69 |       <list>
 70 |         <RuleState>
 71 |           <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
 72 |         </RuleState>
 73 |         <RuleState>
 74 |           <option name="name" value="StatusDashboardGroupingRule" />
 75 |         </RuleState>
 76 |       </list>
 77 |     </option>
 78 |   </component>
 79 |   <component name="SvnConfiguration">
 80 |     <configuration />
 81 |   </component>
 82 |   <component name="TaskManager">
 83 |     <task active="true" id="Default" summary="Default task">
 84 |       <changelist id="10cb7312-9480-4da7-9875-1aca153e23da" name="Default Changelist" comment="" />
 85 |       <created>1535693580353</created>
 86 |       <option name="number" value="Default" />
 87 |       <option name="presentableId" value="Default" />
 88 |       <updated>1535693580353</updated>
 89 |     </task>
 90 |     <servers />
 91 |   </component>
 92 |   <component name="ToolWindowManager">
 93 |     <frame x="67" y="25" width="1219" height="672" extended-state="6" />
 94 |     <layout>
 95 |       <window_info id="Favorites" side_tool="true" />
 96 |       <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.38317758" />
 97 |       <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
 98 |       <window_info anchor="bottom" id="Docker" show_stripe_button="false" />
 99 |       <window_info anchor="bottom" id="Database Changes" show_stripe_button="false" />
100 |       <window_info anchor="bottom" id="Version Control" />
101 |       <window_info anchor="bottom" id="Python Console" />
102 |       <window_info anchor="bottom" id="Terminal" />
103 |       <window_info anchor="bottom" id="Event Log" side_tool="true" />
104 |       <window_info anchor="bottom" id="Message" order="0" />
105 |       <window_info anchor="bottom" id="Find" order="1" />
106 |       <window_info anchor="bottom" id="Run" order="2" />
107 |       <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
108 |       <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
109 |       <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
110 |       <window_info anchor="bottom" id="TODO" order="6" />
111 |       <window_info anchor="right" id="SciView" />
112 |       <window_info anchor="right" id="Database" />
113 |       <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
114 |       <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
115 |       <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
116 |     </layout>
117 |   </component>
118 |   <component name="TypeScriptGeneratedFilesManager">
119 |     <option name="version" value="1" />
120 |   </component>
121 |   <component name="VcsContentAnnotationSettings">
122 |     <option name="myLimit" value="2678400000" />
123 |   </component>
124 | </project>


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/.idea/DaiLi.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="projectConfiguration" value="Twisted Trial" />
10 |     <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/DaiLi.iml" filepath="$PROJECT_DIR$/.idea/DaiLi.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/__init__.py


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/__pycache__/items.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/__pycache__/pipelines.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class DailiItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     proxy = scrapy.Field()
14 | 


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | from scrapy import signals
  9 | 
 10 | 
 11 | class DailiSpiderMiddleware(object):
 12 |     # Not all methods need to be defined. If a method is not defined,
 13 |     # scrapy acts as if the spider middleware does not modify the
 14 |     # passed objects.
 15 | 
 16 |     @classmethod
 17 |     def from_crawler(cls, crawler):
 18 |         # This method is used by Scrapy to create your spiders.
 19 |         s = cls()
 20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 21 |         return s
 22 | 
 23 |     def process_spider_input(self, response, spider):
 24 |         # Called for each response that goes through the spider
 25 |         # middleware and into the spider.
 26 | 
 27 |         # Should return None or raise an exception.
 28 |         return None
 29 | 
 30 |     def process_spider_output(self, response, result, spider):
 31 |         # Called with the results returned from the Spider, after
 32 |         # it has processed the response.
 33 | 
 34 |         # Must return an iterable of Request, dict or Item objects.
 35 |         for i in result:
 36 |             yield i
 37 | 
 38 |     def process_spider_exception(self, response, exception, spider):
 39 |         # Called when a spider or process_spider_input() method
 40 |         # (from other spider middleware) raises an exception.
 41 | 
 42 |         # Should return either None or an iterable of Response, dict
 43 |         # or Item objects.
 44 |         pass
 45 | 
 46 |     def process_start_requests(self, start_requests, spider):
 47 |         # Called with the start requests of the spider, and works
 48 |         # similarly to the process_spider_output() method, except
 49 |         # that it doesn’t have a response associated.
 50 | 
 51 |         # Must return only requests (not items).
 52 |         for r in start_requests:
 53 |             yield r
 54 | 
 55 |     def spider_opened(self, spider):
 56 |         spider.logger.info('Spider opened: %s' % spider.name)
 57 | 
 58 | 
 59 | class DailiDownloaderMiddleware(object):
 60 |     # Not all methods need to be defined. If a method is not defined,
 61 |     # scrapy acts as if the downloader middleware does not modify the
 62 |     # passed objects.
 63 | 
 64 |     @classmethod
 65 |     def from_crawler(cls, crawler):
 66 |         # This method is used by Scrapy to create your spiders.
 67 |         s = cls()
 68 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 69 |         return s
 70 | 
 71 |     def process_request(self, request, spider):
 72 |         # Called for each request that goes through the downloader
 73 |         # middleware.
 74 | 
 75 |         # Must either:
 76 |         # - return None: continue processing this request
 77 |         # - or return a Response object
 78 |         # - or return a Request object
 79 |         # - or raise IgnoreRequest: process_exception() methods of
 80 |         #   installed downloader middleware will be called
 81 |         return None
 82 | 
 83 |     def process_response(self, request, response, spider):
 84 |         # Called with the response returned from the downloader.
 85 | 
 86 |         # Must either;
 87 |         # - return a Response object
 88 |         # - return a Request object
 89 |         # - or raise IgnoreRequest
 90 |         return response
 91 | 
 92 |     def process_exception(self, request, exception, spider):
 93 |         # Called when a download handler or a process_request()
 94 |         # (from other downloader middleware) raises an exception.
 95 | 
 96 |         # Must either:
 97 |         # - return None: continue processing this exception
 98 |         # - return a Response object: stops process_exception() chain
 99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 | 
102 |     def spider_opened(self, spider):
103 |         spider.logger.info('Spider opened: %s' % spider.name)
104 | 


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class DailiPipeline(object):
10 | 
11 |     def __init__(self):
12 |         self.file = open('proxy.txt', 'w')
13 | 
14 |     def process_item(self, item, spider):
15 |         self.file.write(str(item['proxy']) + '\n')
16 |         return item
17 | 
18 |     def close_spider(self, spider):
19 |         self.file.close()
20 | 


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for DaiLi project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'DaiLi'
13 | 
14 | SPIDER_MODULES = ['DaiLi.spiders']
15 | NEWSPIDER_MODULE = 'DaiLi.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
20 | 
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 1
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'DaiLi.middlewares.DailiSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'DaiLi.middlewares.DailiDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 |    'DaiLi.pipelines.DailiPipeline': 300,
69 | }
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/spiders/__pycache__/xici.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/DaiLi/DaiLi/spiders/__pycache__/xici.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/spiders/proxy.txt:
--------------------------------------------------------------------------------
1 | {'http': 'http://106.75.9.39:8080', 'https': 'http://106.75.9.39:8080'}
2 | {'http': 'http://61.135.217.7:80', 'https': 'http://61.135.217.7:80'}
3 | {'http': 'http://118.190.95.35:9001', 'https': 'http://118.190.95.35:9001'}
4 | {'http': 'http://139.224.118.25:3128', 'https': 'http://139.224.118.25:3128'}
5 | {'http': 'http://182.38.14.237:8118', 'https': 'http://182.38.14.237:8118'}
6 | 


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/DaiLi/spiders/xici.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from DaiLi.items import DailiItem
 4 | import requests
 5 | 
 6 | 
 7 | class XiciSpider(scrapy.Spider):
 8 |     name = 'xici'
 9 |     allowed_domains = ['xicidaili.com']
10 |     start_urls = []
11 |     for i in range(1, 6):
12 |         start_urls.append('http://www.xicidaili.com/nn/' + str(i))
13 | 
14 |     def parse(self, response):
15 |         ip = response.xpath('//tr[@class]/td[2]/text()').extract()
16 |         port = response.xpath('//tr[@class]/td[3]/text()').extract()
17 |         agreement_type = response.xpath('//tr[@class]/td[6]/text()').extract()
18 |         proxies = zip(ip, port, agreement_type)
19 |         # print(proxies)
20 | 
21 |         # 验证代理是否可用
22 |         for ip, port, agreement_type in proxies:
23 |             proxy = {'http': agreement_type.lower() + '://' + ip + ':' + port,
24 |                      'https': agreement_type.lower() + '://' + ip + ':' + port}
25 |             try:
26 |                 # 设置代理链接  如果状态码为200 则表示该代理可以使用
27 |                 print(proxy)
28 |                 resp = requests.get('http://icanhazip.com', proxies=proxy, timeout=2)
29 |                 print(resp.status_code)
30 |                 if resp.status_code == 200:
31 |                     print(resp.text)
32 |                     # print('success %s' % ip)
33 |                     item = DailiItem()
34 |                     item['proxy'] = proxy #agreement_type + '://' + ip + ':' + port
35 |                     #print(item['proxy'])
36 |                     yield item
37 |             except:
38 |                 print('fail %s' % ip)
39 | 
40 | 


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/proxy.txt:
--------------------------------------------------------------------------------
1 | {'http': 'http://106.75.9.39:8080', 'https': 'http://106.75.9.39:8080'}
2 | {'http': 'http://61.135.217.7:80', 'https': 'http://61.135.217.7:80'}
3 | {'http': 'http://118.190.95.43:9001', 'https': 'http://118.190.95.43:9001'}
4 | {'http': 'http://118.190.95.35:9001', 'https': 'http://118.190.95.35:9001'}
5 | 


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = DaiLi.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = DaiLi
12 | 


--------------------------------------------------------------------------------
/ScrapyTest08/DaiLi/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | 
3 | cmdline.execute("scrapy crawl xici".split())
4 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/.idea/dytt_redis_master.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="projectConfiguration" value="Twisted Trial" />
10 |     <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/dytt_redis_master.iml" filepath="$PROJECT_DIR$/.idea/dytt_redis_master.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/__init__.py


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/items.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/pipelines.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class DyttRedisMasterItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     url = scrapy.Field()
14 | 
15 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | from scrapy import signals
  9 | 
 10 | 
 11 | class DyttRedisMasterSpiderMiddleware(object):
 12 |     # Not all methods need to be defined. If a method is not defined,
 13 |     # scrapy acts as if the spider middleware does not modify the
 14 |     # passed objects.
 15 | 
 16 |     @classmethod
 17 |     def from_crawler(cls, crawler):
 18 |         # This method is used by Scrapy to create your spiders.
 19 |         s = cls()
 20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 21 |         return s
 22 | 
 23 |     def process_spider_input(self, response, spider):
 24 |         # Called for each response that goes through the spider
 25 |         # middleware and into the spider.
 26 | 
 27 |         # Should return None or raise an exception.
 28 |         return None
 29 | 
 30 |     def process_spider_output(self, response, result, spider):
 31 |         # Called with the results returned from the Spider, after
 32 |         # it has processed the response.
 33 | 
 34 |         # Must return an iterable of Request, dict or Item objects.
 35 |         for i in result:
 36 |             yield i
 37 | 
 38 |     def process_spider_exception(self, response, exception, spider):
 39 |         # Called when a spider or process_spider_input() method
 40 |         # (from other spider middleware) raises an exception.
 41 | 
 42 |         # Should return either None or an iterable of Response, dict
 43 |         # or Item objects.
 44 |         pass
 45 | 
 46 |     def process_start_requests(self, start_requests, spider):
 47 |         # Called with the start requests of the spider, and works
 48 |         # similarly to the process_spider_output() method, except
 49 |         # that it doesn’t have a response associated.
 50 | 
 51 |         # Must return only requests (not items).
 52 |         for r in start_requests:
 53 |             yield r
 54 | 
 55 |     def spider_opened(self, spider):
 56 |         spider.logger.info('Spider opened: %s' % spider.name)
 57 | 
 58 | 
 59 | class DyttRedisMasterDownloaderMiddleware(object):
 60 |     # Not all methods need to be defined. If a method is not defined,
 61 |     # scrapy acts as if the downloader middleware does not modify the
 62 |     # passed objects.
 63 | 
 64 |     @classmethod
 65 |     def from_crawler(cls, crawler):
 66 |         # This method is used by Scrapy to create your spiders.
 67 |         s = cls()
 68 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 69 |         return s
 70 | 
 71 |     def process_request(self, request, spider):
 72 |         # Called for each request that goes through the downloader
 73 |         # middleware.
 74 | 
 75 |         # Must either:
 76 |         # - return None: continue processing this request
 77 |         # - or return a Response object
 78 |         # - or return a Request object
 79 |         # - or raise IgnoreRequest: process_exception() methods of
 80 |         #   installed downloader middleware will be called
 81 |         return None
 82 | 
 83 |     def process_response(self, request, response, spider):
 84 |         # Called with the response returned from the downloader.
 85 | 
 86 |         # Must either;
 87 |         # - return a Response object
 88 |         # - return a Request object
 89 |         # - or raise IgnoreRequest
 90 |         return response
 91 | 
 92 |     def process_exception(self, request, exception, spider):
 93 |         # Called when a download handler or a process_request()
 94 |         # (from other downloader middleware) raises an exception.
 95 | 
 96 |         # Must either:
 97 |         # - return None: continue processing this exception
 98 |         # - return a Response object: stops process_exception() chain
 99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 | 
102 |     def spider_opened(self, spider):
103 |         spider.logger.info('Spider opened: %s' % spider.name)
104 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | import redis
 9 | 
10 | class DyttRedisMasterPipeline(object):
11 |     def __init__(self):
12 |         # 初始化连接数据的变量
13 |         self.REDIS_HOST = '127.0.0.1'
14 |         self.REDIS_PORT = 6379
15 |         # 链接redis
16 |         self.r = redis.Redis(host=self.REDIS_HOST, port=self.REDIS_PORT)
17 | 
18 |     def process_item(self, item, spider):
19 |         # 向redis中插入需要爬取的链接地址
20 |         self.r.lpush('dytt:start_urls', item['url'])
21 |         return item
22 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for dytt_redis_master project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'dytt_redis_master'
13 | 
14 | SPIDER_MODULES = ['dytt_redis_master.spiders']
15 | NEWSPIDER_MODULE = 'dytt_redis_master.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'dytt_redis_master (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | # ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'dytt_redis_master.middlewares.DyttRedisMasterSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'dytt_redis_master.middlewares.DyttRedisMasterDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 |    'dytt_redis_master.pipelines.DyttRedisMasterPipeline': 300,
69 | }
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/__pycache__/dytt_master.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/__pycache__/dytt_master.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/dytt_redis_master/spiders/dytt_master.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from scrapy.linkextractors import LinkExtractor
 4 | from scrapy.spiders import CrawlSpider, Rule
 5 | from dytt_redis_master.items import DyttRedisMasterItem
 6 | 
 7 | class DyttMasterSpider(CrawlSpider):
 8 |     name = 'dytt_master'
 9 |     allowed_domains = ['dy2018.com']
10 |     start_urls = ['https://www.dy2018.com/0/']
11 | 
12 |     rules = (
13 |         Rule(LinkExtractor(allow=r'/\d{1,2}/$'), callback='parse_item'),
14 |     )
15 | 
16 |     def parse_item(self, response):
17 |         # print(response.url)
18 |         items = DyttRedisMasterItem()
19 |         items['url'] = response.url
20 |         yield items
21 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = dytt_redis_master.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dytt_redis_master
12 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_master/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | 
3 | cmdline.execute("scrapy crawl dytt_master".split())
4 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/.idea/dytt_redis_slaver.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="projectConfiguration" value="Twisted Trial" />
10 |     <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/dytt_redis_slaver.iml" filepath="$PROJECT_DIR$/.idea/dytt_redis_slaver.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__init__.py


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/items.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/middlewares.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/pipelines.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class DyttRedisSlaverItem(scrapy.Item):
12 |     # 片名
13 |     name = scrapy.Field()
14 |     # 年代
15 |     year = scrapy.Field()
16 |     # 语言
17 |     language = scrapy.Field()
18 |     # 类别
19 |     movie_type = scrapy.Field()
20 |     # 上映日期
21 |     release_date = scrapy.Field()
22 |     # 评分
23 |     score = scrapy.Field()
24 |     # 文件大小
25 |     file_size = scrapy.Field()
26 |     # 片长
27 |     film_time = scrapy.Field()
28 |     # 简介
29 |     introduction = scrapy.Field()
30 |     # 海报
31 |     posters = scrapy.Field()
32 |     # 下载链接
33 |     download_link = scrapy.Field()
34 |     # utc时间
35 |     crawled = scrapy.Field()
36 |     # 爬虫名
37 |     spider = scrapy.Field()
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | from scrapy import signals
  9 | 
 10 | import random
 11 | import base64
 12 | from dytt_redis_slaver.settings import USER_AGENTS
 13 | from dytt_redis_slaver.settings import PROXIES
 14 | 
 15 | # 随机的User-Agent
 16 | class RandomUserAgent(object):
 17 |     def process_request(self, request, spider):
 18 |         useragent = random.choice(USER_AGENTS)
 19 |         #print useragent
 20 |         request.headers.setdefault("User-Agent", useragent)
 21 | 
 22 | # 随机的代理ip
 23 | class RandomProxy(object):
 24 |     def process_request(self, request, spider):
 25 |         proxy = random.choice(PROXIES)
 26 |         if proxy['user_passwd'] is None:
 27 |             # 没有代理账户验证的代理使用方式
 28 |             request.meta['proxy'] = "http://" + proxy['ip_port']
 29 | 
 30 |         else:
 31 |             # 对账户密码进行base64编码转换
 32 |             base64_userpasswd = base64.b64encode(proxy['user_passwd'])
 33 |             # 对应到代理服务器的信令格式里
 34 |             request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
 35 |             request.meta['proxy'] = "http://" + proxy['ip_port']
 36 | 
 37 | class DyttRedisSlaverSpiderMiddleware(object):
 38 |     # Not all methods need to be defined. If a method is not defined,
 39 |     # scrapy acts as if the spider middleware does not modify the
 40 |     # passed objects.
 41 | 
 42 |     @classmethod
 43 |     def from_crawler(cls, crawler):
 44 |         # This method is used by Scrapy to create your spiders.
 45 |         s = cls()
 46 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 47 |         return s
 48 | 
 49 |     def process_spider_input(self, response, spider):
 50 |         # Called for each response that goes through the spider
 51 |         # middleware and into the spider.
 52 | 
 53 |         # Should return None or raise an exception.
 54 |         return None
 55 | 
 56 |     def process_spider_output(self, response, result, spider):
 57 |         # Called with the results returned from the Spider, after
 58 |         # it has processed the response.
 59 | 
 60 |         # Must return an iterable of Request, dict or Item objects.
 61 |         for i in result:
 62 |             yield i
 63 | 
 64 |     def process_spider_exception(self, response, exception, spider):
 65 |         # Called when a spider or process_spider_input() method
 66 |         # (from other spider middleware) raises an exception.
 67 | 
 68 |         # Should return either None or an iterable of Response, dict
 69 |         # or Item objects.
 70 |         pass
 71 | 
 72 |     def process_start_requests(self, start_requests, spider):
 73 |         # Called with the start requests of the spider, and works
 74 |         # similarly to the process_spider_output() method, except
 75 |         # that it doesn’t have a response associated.
 76 | 
 77 |         # Must return only requests (not items).
 78 |         for r in start_requests:
 79 |             yield r
 80 | 
 81 |     def spider_opened(self, spider):
 82 |         spider.logger.info('Spider opened: %s' % spider.name)
 83 | 
 84 | 
 85 | class DyttRedisSlaverDownloaderMiddleware(object):
 86 |     # Not all methods need to be defined. If a method is not defined,
 87 |     # scrapy acts as if the downloader middleware does not modify the
 88 |     # passed objects.
 89 | 
 90 |     @classmethod
 91 |     def from_crawler(cls, crawler):
 92 |         # This method is used by Scrapy to create your spiders.
 93 |         s = cls()
 94 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 95 |         return s
 96 | 
 97 |     def process_request(self, request, spider):
 98 |         # Called for each request that goes through the downloader
 99 |         # middleware.
100 | 
101 |         # Must either:
102 |         # - return None: continue processing this request
103 |         # - or return a Response object
104 |         # - or return a Request object
105 |         # - or raise IgnoreRequest: process_exception() methods of
106 |         #   installed downloader middleware will be called
107 |         return None
108 | 
109 |     def process_response(self, request, response, spider):
110 |         # Called with the response returned from the downloader.
111 | 
112 |         # Must either;
113 |         # - return a Response object
114 |         # - return a Request object
115 |         # - or raise IgnoreRequest
116 |         return response
117 | 
118 |     def process_exception(self, request, exception, spider):
119 |         # Called when a download handler or a process_request()
120 |         # (from other downloader middleware) raises an exception.
121 | 
122 |         # Must either:
123 |         # - return None: continue processing this exception
124 |         # - return a Response object: stops process_exception() chain
125 |         # - return a Request object: stops process_exception() chain
126 |         pass
127 | 
128 |     def spider_opened(self, spider):
129 |         spider.logger.info('Spider opened: %s' % spider.name)
130 | 
131 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | import json
 9 | from datetime import datetime
10 | 
11 | class DyttRedisSlaverPipeline(object):
12 |     def __init__(self):
13 |         self.file = open('movie.json', 'w')
14 | 
15 |     def process_item(self, item, spider):
16 |         content = json.dumps(dict(item), ensure_ascii=False) + "\n"
17 |         self.file.write(content)
18 |         return item
19 | 
20 |     def close_spider(self, spider):
21 |         self.file.close()
22 | 
23 | 
24 | class InfoPipeline(object):
25 | 
26 |     def process_item(self, item, spider):
27 |         #utcnow() 是获取UTC时间
28 |         item["crawled"] = datetime.utcnow()
29 |         # 爬虫名
30 |         item["spider"] = spider.name
31 |         return item


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/settings.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Scrapy settings for dytt_redis_slaver project
  4 | #
  5 | # For simplicity, this file contains only settings considered important or
  6 | # commonly used. You can find more settings consulting the documentation:
  7 | #
  8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
  9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 11 | 
 12 | BOT_NAME = 'dytt_redis_slaver'
 13 | 
 14 | SPIDER_MODULES = ['dytt_redis_slaver.spiders']
 15 | NEWSPIDER_MODULE = 'dytt_redis_slaver.spiders'
 16 | 
 17 | 
 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
 19 | # USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
 20 | 
 21 | USER_AGENTS = [
 22 |     'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
 23 |     'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
 24 |     'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
 25 |     'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
 26 |     'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
 27 |     'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
 28 | 
 29 | ]
 30 | 
 31 | PROXIES = [
 32 |     {'ip_port': '118.190.95.43:9001', "user_passwd": None},
 33 |     {'ip_port': '61.135.217.7:80', "user_passwd": None},
 34 |     {'ip_port': '118.190.95.35:9001', "user_passwd": None},
 35 | ]
 36 | 
 37 | 
 38 | 
 39 | # 指定使用scrapy-redis的调度器
 40 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
 41 | # 指定使用scrapy-redis的去重
 42 | DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
 43 | 
 44 | # 指定排序爬取地址时使用的队列，
 45 | # 默认的 按优先级排序(Scrapy默认)，由sorted set实现的一种非FIFO、LIFO方式。
 46 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
 47 | # 可选的 按先进先出排序（FIFO）
 48 | # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
 49 | # 可选的 按后进先出排序（LIFO）
 50 | # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'
 51 | 
 52 | # 在redis中保持scrapy-redis用到的各个队列，从而允许暂停和暂停后恢复，也就是不清理redis queues
 53 | SCHEDULER_PERSIST = True
 54 | 
 55 | # 只在使用SpiderQueue或者SpiderStack是有效的参数，指定爬虫关闭的最大间隔时间
 56 | # SCHEDULER_IDLE_BEFORE_CLOSE = 10
 57 | 
 58 | 
 59 | 
 60 | # Obey robots.txt rules
 61 | # ROBOTSTXT_OBEY = True
 62 | 
 63 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
 64 | # CONCURRENT_REQUESTS = 32
 65 | 
 66 | # Configure a delay for requests for the same website (default: 0)
 67 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 68 | # See also autothrottle settings and docs
 69 | DOWNLOAD_DELAY = 2
 70 | # The download delay setting will honor only one of:
 71 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 72 | #CONCURRENT_REQUESTS_PER_IP = 16
 73 | 
 74 | # Disable cookies (enabled by default)
 75 | #COOKIES_ENABLED = False
 76 | 
 77 | # Disable Telnet Console (enabled by default)
 78 | #TELNETCONSOLE_ENABLED = False
 79 | 
 80 | # Override the default request headers:
 81 | #DEFAULT_REQUEST_HEADERS = {
 82 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 83 | #   'Accept-Language': 'en',
 84 | #}
 85 | 
 86 | # Enable or disable spider middlewares
 87 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 88 | #SPIDER_MIDDLEWARES = {
 89 | #    'dytt_redis_slaver.middlewares.DyttRedisSlaverSpiderMiddleware': 543,
 90 | #}
 91 | 
 92 | # Enable or disable downloader middlewares
 93 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 94 | DOWNLOADER_MIDDLEWARES = {
 95 |     'dytt_redis_slaver.middlewares.RandomUserAgent': 543,
 96 |     # 'dytt_redis_slaver.middlewares.RandomProxy': 553,
 97 | }
 98 | 
 99 | # Enable or disable extensions
100 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
101 | #EXTENSIONS = {
102 | #    'scrapy.extensions.telnet.TelnetConsole': None,
103 | #}
104 | 
105 | # Configure item pipelines
106 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
107 | # 通过配置RedisPipeline将item写入key为 spider.name : items 的redis的list中，供后面的分布式处理item
108 | ITEM_PIPELINES = {
109 |    # 'dytt_redis_slaver.pipelines.DyttRedisSlaverPipeline': 300,
110 |    'dytt_redis_slaver.pipelines.InfoPipeline':350,
111 |    'scrapy_redis.pipelines.RedisPipeline': 400
112 | }
113 | 
114 | # 指定redis数据库的连接参数
115 | REDIS_HOST = '127.0.0.1'
116 | REDIS_PORT = 6379
117 | 
118 | #默认情况下,RFPDupeFilter只记录第一个重复请求。将DUPEFILTER_DEBUG设置为True会记录所有重复的请求。
119 | DUPEFILTER_DEBUG =True
120 | 
121 | 
122 | # Enable and configure the AutoThrottle extension (disabled by default)
123 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
124 | #AUTOTHROTTLE_ENABLED = True
125 | # The initial download delay
126 | #AUTOTHROTTLE_START_DELAY = 5
127 | # The maximum download delay to be set in case of high latencies
128 | #AUTOTHROTTLE_MAX_DELAY = 60
129 | # The average number of requests Scrapy should be sending in parallel to
130 | # each remote server
131 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
132 | # Enable showing throttling stats for every response received:
133 | #AUTOTHROTTLE_DEBUG = False
134 | 
135 | # Enable and configure HTTP caching (disabled by default)
136 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
137 | #HTTPCACHE_ENABLED = True
138 | #HTTPCACHE_EXPIRATION_SECS = 0
139 | #HTTPCACHE_DIR = 'httpcache'
140 | #HTTPCACHE_IGNORE_HTTP_CODES = []
141 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
142 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/__pycache__/dytt_slaver.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/__pycache__/dytt_slaver.cpython-36.pyc


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/dytt_redis_slaver/spiders/dytt_slaver.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from scrapy.linkextractors import LinkExtractor
 4 | #from scrapy.spiders import CrawlSpider, Rule
 5 | 
 6 | # 1. 导入RedisCrawlSpider类，不使用CrawlSpider
 7 | from scrapy_redis.spiders import RedisCrawlSpider
 8 | from scrapy.spiders import Rule
 9 | 
10 | from dytt_redis_slaver.items import DyttRedisSlaverItem
11 | import re
12 | from selenium import webdriver
13 | #import time
14 | 
15 | # class DyttSlaverSpider(CrawlSpider):
16 | # 2. 修改父类 RedisCrawlSpider
17 | class DyttSlaverSpider(RedisCrawlSpider):
18 |     name = 'dytt_slaver'
19 | 
20 |     # 3. 取消 allowed_domains() 和 start_urls
21 |     # allowed_domains = ['dy2018.com']
22 |     # start_urls = ['https://www.dy2018.com/2/index.html']
23 | 
24 |     # 4. 增加redis-key
25 |     redis_key = 'dytt:start_urls'
26 | 
27 |     #page_links = LinkExtractor(allow=r'/index_\d*.html')
28 |     movie_links = LinkExtractor(allow=r'/i/\d*.html', restrict_xpaths=('//div[@class="co_content8"]'))
29 | 
30 |     rules = (
31 |         # 翻页规则
32 |         #Rule(page_links),
33 |         Rule(movie_links, callback='parse_item'),
34 |     )
35 | 
36 |     # # 5. 增加__init__()方法，动态获取allowed_domains()
37 |     # def __init__(self, *args, **kwargs):
38 |     #     domain = kwargs.pop('domain', '')
39 |     #     self.allowed_domains = filter(None, domain.split(','))
40 |     #     super(DyttSlaverSpider, self).__init__(*args, **kwargs)
41 | 
42 |     def parse_item(self, response):
43 |         items = DyttRedisSlaverItem()
44 | 
45 |         str_resp = response.body.decode('gb2312', errors='ignore')
46 |         rep_chars = ['&nbsp;', '&middot;', '&ldquo;', '&rdquo;', '&hellip;']
47 |         for rep in rep_chars:
48 |             str_resp = str_resp.replace(rep, '')
49 | 
50 |         title = re.search(r'◎片　　名(.*?)</.+>', str_resp).group(1).replace(u'\u3000', '')
51 |         try:
52 |             translation = re.search(r'◎译　　名(.*?)</.+>', str_resp).group(1).replace(u'\u3000', '')
53 |         except:
54 |             translation = ''
55 |         # 名字
56 |         items['name'] = title + "|" + translation
57 |         # 年代
58 |         items['year'] = re.search(r'◎年　　代(.*?)</.+>', str_resp).group(1).replace(u'\u3000', '')
59 |         # 评分
60 |         try:
61 |             items['score'] = response.xpath("//strong[@class='rank']/text()").extract()[0].replace(u'\u3000', '')
62 |         except:
63 |             items['score'] = '无评分'
64 |         # 语言
65 |         items['language'] = re.search(r'◎语　　言(.*?)</.+>', str_resp).group(1).replace(u'\u3000', '')
66 |         # 类别
67 |         items['movie_type'] = re.search(r'◎类　　别(.*?)</.+>', str_resp).group(1).replace(u'\u3000', '')
68 |         # 上映日期
69 |         items['release_date'] = re.search(r'◎上映日期(.*?)</.+>', str_resp).group(1).replace(u'\u3000', '')
70 |         # 文件大小
71 |         items['file_size'] = re.search(r'◎文件大小(.*?)</.+>', str_resp).group(1).replace(u'\u3000', '')
72 |         # 片长
73 |         items['film_time'] = re.search(r'◎片　　长(.*?)</.+>', str_resp).group(1).replace(u'\u3000', '')
74 |         # 简介
75 |         items['introduction'] = re.search(r'◎简　　介</.+>\r\n<.+>(.*?)</.+>', str_resp).group(1).replace(u'\u3000', '')
76 |         # 海报
77 |         items['posters'] = response.xpath("//div[@id='Zoom']/*[1]/img/@src").extract()[0]
78 |         # 下载链接
79 |         items['download_link'] = self.get_download_link(response.url)
80 | 
81 |         # print(items)
82 |         yield items
83 | 
84 |     def get_download_link(self, url):
85 |         chrome_options = webdriver.ChromeOptions()
86 |         chrome_options.add_argument('--headless')
87 |         chrome_options.add_argument('--disable-gpu')
88 |         driver = webdriver.Chrome(chrome_options=chrome_options)
89 |         driver.get(url)
90 |         #time.sleep(1)
91 |         link = re.search(r'\"(thunder:.*?)\"',  driver.page_source).group(1)
92 |         driver.close()
93 |         return link
94 | 
95 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/movie.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhiqiKou/Scrapy_notes/40ff93d5773e184f6fdfef0cc6078f245733683d/ScrapyTest08/dytt_redis_slaver/movie.json


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = dytt_redis_slaver.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dytt_redis_slaver
12 | 


--------------------------------------------------------------------------------
/ScrapyTest08/dytt_redis_slaver/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | 
3 | cmdline.execute("scrapy crawl dytt_slaver".split())
4 | 


--------------------------------------------------------------------------------
/ScrapyTest08/redis-mysql.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import json
 4 | import redis
 5 | import pymysql
 6 | 
 7 | def main():
 8 |     # 指定redis数据库信息
 9 |     rediscli = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
10 |     # 指定mysql数据库
11 |     mysqlcli = pymysql.connect(host='127.0.0.1', user='root', passwd='zhiqi', db='Scrapy', port=3306, use_unicode=True)
12 | 
13 |     while True:
14 |         # FIFO模式为 blpop，LIFO模式为 brpop，获取键值
15 |         source, data = rediscli.blpop(["dytt_slaver:items"])
16 |         item = json.loads(data)
17 | 
18 |         try:
19 |             # 使用cursor()方法获取操作游标
20 |             cur = mysqlcli.cursor()
21 |             # 使用execute方法执行SQL INSERT语句
22 |             cur.execute("INSERT INTO dytt (name, year, language, "
23 |                         "movie_type, release_date, score, file_size, "
24 |                         "film_time, introduction, posters, download_link) VALUES "
25 |                         "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )",
26 |                         [item['name'], item['year'], item['language'],
27 |                          item['movie_type'], item['release_date'], item['score'],
28 |                          item['file_size'], item['film_time'], item['introduction'],
29 |                          item['posters'], item['download_link']])
30 |             # 提交sql事务
31 |             mysqlcli.commit()
32 |             #关闭本次操作
33 |             cur.close()
34 |             print ("inserted %s" % item['name'])
35 |         except pymysql.Error as e:
36 |             print ("Mysql Error %d: %s" % (e.args[0], e.args[1]))
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     main()


--------------------------------------------------------------------------------