├── .gitignore ├── Coding.gif ├── DouBan ├── DouBan │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── douban_booklist.py └── scrapy.cfg ├── IpProxy ├── IpProxy │ ├── __init__.py │ ├── checkproxy.py │ ├── commands │ │ ├── __init__.py │ │ └── crawlall.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── setup.py │ └── spiders │ │ ├── 66ip.py │ │ ├── Daili.txt │ │ ├── __init__.py │ │ ├── cnproxy.py │ │ └── xici.py └── scrapy.cfg ├── JanDan ├── JanDan │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── jandan_ooxx.py └── scrapy.cfg ├── QiuBai ├── README.md ├── qiubai │ ├── __init__.py │ ├── __init__.pyc │ ├── items.py │ ├── middlewares.py │ ├── middlewares.pyc │ ├── pipelines.py │ ├── pipelines.pyc │ ├── qiubai.xlsx │ ├── settings.py │ ├── settings.pyc │ └── spiders │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── indexpage.py │ │ └── indexpage.pyc ├── qiubai_pachong_01.png └── scrapy.cfg ├── README.md ├── WeatherReport ├── README.md ├── WeatherReport │ ├── __init__.py │ ├── __init__.pyc │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ ├── settings.pyc │ └── spiders │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── weather.py │ │ └── weather.pyc ├── scrapy.cfg └── weather.jpg ├── dyly ├── dyly │ ├── __init__.py │ ├── __init__.pyc │ ├── items.py │ ├── middlewares.py │ ├── middlewares.pyc │ ├── pipelines.py │ ├── settings.py │ ├── settings.pyc │ └── spiders │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── news_dyly.py │ │ └── news_dyly.pyc └── scrapy.cfg ├── kelagirl ├── kelagirl │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── kela_pic.py └── scrapy.cfg └── rosiok ├── rosiok ├── __init__.py ├── items.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── rosiok_pic.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.jpg 3 | *.jpeg 4 | *.png 5 | 6 | -------------------------------------------------------------------------------- /Coding.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/Coding.gif -------------------------------------------------------------------------------- /DouBan/DouBan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/DouBan/DouBan/__init__.py -------------------------------------------------------------------------------- /DouBan/DouBan/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /DouBan/DouBan/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline 
to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /DouBan/DouBan/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for DouBan project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'DouBan' 13 | 14 | SPIDER_MODULES = ['DouBan.spiders'] 15 | NEWSPIDER_MODULE = 'DouBan.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'DouBan (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'DouBan.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'DouBan.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'DouBan.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure 
HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /DouBan/DouBan/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /DouBan/DouBan/spiders/douban_booklist.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | class QiuBaiItem(scrapy.Item): 5 | author = scrapy.Field() 6 | content = scrapy.Field() 7 | 8 | 9 | class QiuBaiSpider(scrapy.Spider): 10 | name="qiubai" 11 | url = "https://www.douban.com/doulist/1264675/" 12 | 13 | def parse(self,response): 14 | item = QiuBaiItem() 15 | item['author'] = response.xpath('//div[@class="author clearfix"]//h2/text()').extract() 16 | item['content'] = response.xpath('//div[@class="content"]/span/text()').extract() 17 | ''' 18 | print "===============================================" *3 19 | print "author:",item['author'] 20 | print "content:",item['content'] 21 | print "===============================================" *3 22 | ''' 23 | yield item 24 | 25 | -------------------------------------------------------------------------------- /DouBan/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = DouBan.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = DouBan 12 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/IpProxy/IpProxy/__init__.py -------------------------------------------------------------------------------- /IpProxy/IpProxy/checkproxy.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import requests 4 | ''' 5 | 测试代理IP可用脚本,仅测试作用 6 | ''' 7 | 8 | ''' 9 | proxies = [ 10 | {"http":"183.78.183.156:82"}, 11 | {"http":"180.76.154.5:8888"}, 12 | {"http":"124.88.67.24:80"}, 13 | {"http":"115.29.2.139:80"}, 14 | {"http":"115.29.2.139:8011"}, 15 | {"http":"121.204.165.80:8118"} 16 | ] 17 | ''' 18 | 19 | proxies = [ 20 | {'http':'183.78.183.156:82'}, 21 | {'http':'180.76.154.5:8888'}, 22 | {'http':'115.29.2.139:80'}, 23 | {'http':'121.204.165.80:8118'} 24 | ] 25 | 26 | 27 | 28 | proxypool = [] 29 | 30 | for index in range(len(proxies)): 31 | print proxies[index] 32 | try: 33 | result = requests.get('http://ip.cn/',proxies=proxies[index],timeout=3) 34 | proxypool.append(proxies[index]) 35 | except Exception as e: 36 | continue 37 | 38 | print proxypool 39 | 
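# --- A hedged, self-contained Python 3 sketch of the same availability check
# --- (not part of the original script): it keeps http://ip.cn/ as the probe
# --- URL and the 3-second timeout used above; the proxy list here is only a
# --- placeholder.
import requests

example_proxies = [
    {'http': 'http://183.78.183.156:82'},
    {'http': 'http://115.29.2.139:80'},
]

usable = []
for proxy in example_proxies:
    try:
        # Any response that comes back within the timeout counts as a working proxy.
        requests.get('http://ip.cn/', proxies=proxy, timeout=3)
        usable.append(proxy)
    except requests.RequestException:
        continue

print(usable)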
-------------------------------------------------------------------------------- /IpProxy/IpProxy/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/IpProxy/IpProxy/commands/__init__.py -------------------------------------------------------------------------------- /IpProxy/IpProxy/commands/crawlall.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #增加自定义命令,一次爬取所有Spiders 3 | from scrapy.commands import ScrapyCommand 4 | from scrapy.crawler import CrawlerRunner 5 | from scrapy.utils.conf import arglist_to_dict 6 | 7 | class Command(ScrapyCommand): 8 | 9 | requires_project = True 10 | 11 | def syntax(self): 12 | return '[options]' 13 | 14 | def short_desc(self): 15 | return 'Runs all of the spiders' 16 | 17 | def add_options(self, parser): 18 | ScrapyCommand.add_options(self, parser) 19 | parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", 20 | help="set spider argument (may be repeated)") 21 | parser.add_option("-o", "--output", metavar="FILE", 22 | help="dump scraped items into FILE (use - for stdout)") 23 | parser.add_option("-t", "--output-format", metavar="FORMAT", 24 | help="format to use for dumping items with -o") 25 | 26 | def process_options(self, args, opts): 27 | ScrapyCommand.process_options(self, args, opts) 28 | try: 29 | opts.spargs = arglist_to_dict(opts.spargs) 30 | except ValueError: 31 | raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False) 32 | 33 | def run(self, args, opts): 34 | #settings = get_project_settings() 35 | 36 | spider_loader = self.crawler_process.spider_loader 37 | for spidername in args or spider_loader.list(): 38 | print "*********cralall spidername************" + spidername 39 | self.crawler_process.crawl(spidername, **opts.spargs) 40 | 41 | self.crawler_process.start() 42 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class IpProxyItem(scrapy.Item): 12 | proxy_ipaddr = scrapy.Field() 13 | proxy_port = scrapy.Field() 14 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | USER_AGENTS = [ 5 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 6 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 7 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 8 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 9 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 10 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; 
.NET CLR 1.0.3705; .NET CLR 1.1.4322)", 11 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 12 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 13 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 15 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 16 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 17 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 20 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 21 | ] 22 | 23 | class RandomUAMiddleware(object): 24 | def process_request(self,request,spider): 25 | request.headers["User-Agent"] = random.choice(USER_AGENTS) 26 | 27 | 28 | class PrintUAMiddleware(object): 29 | def process_request(self,request,spider): 30 | print request.headers["User-Agent"] 31 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import requests 9 | 10 | class IpproxyPipeline(object): 11 | def process_item(self, item, spider): 12 | 13 | proxy_ip = item['proxy_ipaddr'] 14 | proxy_port = item['proxy_port'] 15 | proxypool = [] 16 | 17 | for i,j in zip(proxy_ip,proxy_port): 18 | proxies = [ 19 | {"http":"%s:%s" %(i,j)} 20 | ] 21 | 22 | for index in range(len(proxies)): 23 | print proxies[index] 24 | try: 25 | result = requests.get('http://ip.cn/',proxies=proxies[index],timeout=3) 26 | proxypool.append(proxies[index]) 27 | except Exception as e: 28 | print e 29 | print "---" * 10 30 | print proxypool 31 | 32 | return item 33 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for IpProxy project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'IpProxy' 13 | 14 | SPIDER_MODULES = ['IpProxy.spiders'] 15 | NEWSPIDER_MODULE = 'IpProxy.spiders' 16 | 17 | COMMANDS_MODULE = 'IpProxy.commands' 18 | 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'IpProxy (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 2 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'IpProxy.middlewares.MyCustomSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'IpProxy.middlewares.RandomUAMiddleware': 543, 58 | # 'IpProxy.middlewares.PrintUAMiddleware': 544, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'IpProxy.pipelines.IpproxyPipeline': 300, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='scrapy-mymodule', 4 | entry_points={ 5 | 'scrapy.commands': [ 6 | 'crawlall=cnblogs.commands:crawlall', 7 | ], 8 | }, 9 | ) 10 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/66ip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from IpProxy.items import IpProxyItem 5 | ''' 6 | 相关网站整理 7 | http://www.kuaidaili.com/free/ 8 | http://www.xicidaili.com/ 9 | http://www.proxy360.cn/default.aspx 10 | http://ip.zdaye.com/ 端口号是图片需要OCR 11 | http://www.cz88.net/proxy 12 | http://cn-proxy.com 13 | http://www.66ip.cn 14 | ''' 15 | 16 | 17 | class sixsixip(scrapy.Spider): 18 | name="66ip" 19 | start_urls = [ 20 | "http://www.66ip.cn/areaindex_1/1.html", 21 | "http://www.66ip.cn/areaindex_1/2.html", 22 | "http://www.66ip.cn/areaindex_1/3.html", 23 | "http://www.66ip.cn/areaindex_1/4.html" 24 | ] 25 | 26 | 27 | def parse(self,response): 28 | item = IpProxyItem() 29 | item['proxy_ipaddr'] = response.xpath('//*[@id="footer"]/div/table//tr/td[1]/text()').extract()[1:-1] 30 | item['proxy_port'] = response.xpath('//*[@id="footer"]/div/table//tr/td[2]/text()').extract()[1:-1] 31 | ''' 32 | print "=" * 15 33 | print "66ip ipaddr:" 34 | print "ipaddr:",item['proxy_ipaddr']," port:",item['proxy_port'] 35 | print "=" * 15 36 | ''' 37 | yield item 38 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/Daili.txt: -------------------------------------------------------------------------------- 1 | 相关网站整理 2 | xici http://www.xicidaili.com/ Done 3 | proxy360 http://www.proxy360.cn/default.aspx 代理IP数太少 4 | zdaye http://ip.zdaye.com/ 端口号是图片需要OCR 5 | cz88 http://www.cz88.net/proxy 代理IP数太少 6 | cn-proxy http://cn-proxy.com 代理IP数太少 7 | 66ip http://www.66ip.cn Done 8 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/cnproxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from IpProxy.items import IpProxyItem 5 | ''' 6 | 相关网站整理 7 | http://www.kuaidaili.com/free/ 8 | http://www.xicidaili.com/ 9 | http://www.proxy360.cn/default.aspx 10 | http://ip.zdaye.com/ 端口号是图片需要OCR 11 | http://www.cz88.net/proxy 12 | http://cn-proxy.com 13 | http://www.66ip.cn 14 | ''' 15 | 16 | 17 | class sixsixip(scrapy.Spider): 18 | name="cnproxy" 19 | start_urls = [ 20 | "http://cn-proxy.com/", 21 | ] 22 | 23 | def parse(self,response): 24 | item = IpProxyItem() 25 | item['proxy_ipaddr']=response.xpath('//div[@class="table-container"]//tr/td[1]/text()').extract() 26 | item['proxy_port']=response.xpath('//div[@class="table-container"]//tr/td[2]/text()').extract() 27 | ''' 28 | print "=" * 15 29 | print "cn-proxy ipaddr:" 30 | print "ipaddr:",item['proxy_ipaddr']," port:",item['proxy_port'] 31 | print "=" * 15 32 | ''' 33 | yield item 34 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/xici.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from IpProxy.items import IpProxyItem 5 | ''' 6 | 相关网站整理 7 | http://www.kuaidaili.com/free/ 8 | http://www.xicidaili.com/ 9 | http://www.proxy360.cn/default.aspx 10 | http://ip.zdaye.com/ 端口号是图片需要OCR 11 | http://www.cz88.net/proxy 12 | http://cn-proxy.com 13 | http://www.66ip.cn 14 | ''' 15 | 16 | 17 | class xicidaili(scrapy.Spider): 18 | name="xici" 19 | url = "http://www.xicidaili.com/nn/" 20 | start_urls = [ 21 | url + str(i) for i in range(1,5) 22 | ] 23 | 24 | 25 | def parse(self,response): 26 | item = IpProxyItem() 27 | item['proxy_ipaddr'] = response.xpath('//*[@id="ip_list"]//tr/td[2]/text()').extract() 28 | item['proxy_port'] = response.xpath('//*[@id="ip_list"]//tr/td[3]/text()').extract() 29 | ''' 30 | print "=" * 15 31 | print "xici ipaddr:" 32 | print "ipaddr:",item['proxy_ipaddr']," port:",item['proxy_port'] 33 | print "=" * 15 34 | ''' 35 | yield item 36 | -------------------------------------------------------------------------------- /IpProxy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = IpProxy.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = IpProxy 12 | -------------------------------------------------------------------------------- /JanDan/JanDan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/JanDan/JanDan/__init__.py -------------------------------------------------------------------------------- /JanDan/JanDan/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JandanItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = 
scrapy.Field() 14 | image_urls = scrapy.Field() 15 | images = scrapy.Field() 16 | -------------------------------------------------------------------------------- /JanDan/JanDan/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | USER_AGENTS = [ 5 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 6 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 7 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 8 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 9 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 10 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 11 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 12 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 13 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 15 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 16 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 17 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 20 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 21 | ] 22 | 23 | class RandomUAMiddleware(object): 24 | def process_request(self,request,spider): 25 | request.headers["User-Agent"] = random.choice(USER_AGENTS) 26 | 27 | -------------------------------------------------------------------------------- /JanDan/JanDan/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import scrapy 9 | from scrapy.pipelines.images import ImagesPipeline 10 | from scrapy.exceptions import DropItem 11 | 12 | class JandanPipeline(object): 13 | def get_media_requests(self, item, info): 14 | for image_url in item['image_urls']: 15 | yield scrapy.Request(image_url) 16 | 17 | def item_completed(self, results, item, info): 18 | image_paths = [x['path'] for ok, x in results if ok] 19 | if not image_paths: 20 | raise DropItem("Item contains no images") 21 | item['image_paths'] = image_paths 22 | return item 23 | 24 | 25 | -------------------------------------------------------------------------------- /JanDan/JanDan/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for JanDan project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'JanDan' 13 | 14 | SPIDER_MODULES = ['JanDan.spiders'] 15 | NEWSPIDER_MODULE = 'JanDan.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'JanDan (+http://www.yourdomain.com)' 20 | 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = True 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | DOWNLOAD_DELAY = 2 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'JanDan.middlewares.MyCustomSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | DOWNLOADER_MIDDLEWARES = { 58 | 'JanDan.middlewares.RandomUAMiddleware': 300, 59 | } 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'scrapy.contrib.pipeline.images.ImagesPipeline': 300, 71 | } 72 | 73 | IMAGES_STORE = '.' 
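# Side note: 'scrapy.contrib.pipeline.images.ImagesPipeline' is the pre-1.0
# import path; on Scrapy 1.0+ the same pipeline lives at
# 'scrapy.pipelines.images.ImagesPipeline'. The JandanPipeline in pipelines.py
# overrides get_media_requests()/item_completed() but is not registered here
# (and does not subclass ImagesPipeline), so as written it is never invoked.
# One possible way to wire it up, assuming it is changed to subclass
# ImagesPipeline, would be:
#ITEM_PIPELINES = {
#    'JanDan.pipelines.JandanPipeline': 300,
#}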
74 | # Enable and configure the AutoThrottle extension (disabled by default) 75 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 76 | #AUTOTHROTTLE_ENABLED = True 77 | # The initial download delay 78 | #AUTOTHROTTLE_START_DELAY = 5 79 | # The maximum download delay to be set in case of high latencies 80 | #AUTOTHROTTLE_MAX_DELAY = 60 81 | # The average number of requests Scrapy should be sending in parallel to 82 | # each remote server 83 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 84 | # Enable showing throttling stats for every response received: 85 | #AUTOTHROTTLE_DEBUG = False 86 | 87 | # Enable and configure HTTP caching (disabled by default) 88 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 89 | #HTTPCACHE_ENABLED = True 90 | #HTTPCACHE_EXPIRATION_SECS = 0 91 | #HTTPCACHE_DIR = 'httpcache' 92 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 93 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 94 | -------------------------------------------------------------------------------- /JanDan/JanDan/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /JanDan/JanDan/spiders/jandan_ooxx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy.spiders import CrawlSpider, Rule 3 | from scrapy.linkextractors import LinkExtractor 4 | from JanDan.items import JandanItem 5 | 6 | class JandanSpider(CrawlSpider): 7 | name = "JanDan" 8 | allowed_domains = ["jandan.net"] 9 | start_urls = [ 10 | "http://jandan.net/ooxx", 11 | ] 12 | 13 | 14 | rules = ( 15 | Rule(LinkExtractor(allow=('http://jandan.net/ooxx/page-\d+#comments', )), callback='parse_item', follow=True), 16 | ) 17 | 18 | 19 | def parse_item(self, response): 20 | for href in response.xpath('//a[@class="view_img_link"]/@href').extract(): 21 | pic_url = "http:" + href 22 | item = JandanItem(image_urls=[pic_url]) 23 | yield item 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /JanDan/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = JanDan.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = JanDan 12 | -------------------------------------------------------------------------------- /QiuBai/README.md: -------------------------------------------------------------------------------- 1 | #【爬虫】糗事百科 2 | ###突然觉得爬虫很好玩,就写了写 3 | ###这个版本是用Scrapy把糗百的内容抓下来放到了Excel里 4 | 5 | ![](qiubai_pachong_01.png) -------------------------------------------------------------------------------- /QiuBai/qiubai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/__init__.py -------------------------------------------------------------------------------- /QiuBai/qiubai/__init__.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/__init__.pyc -------------------------------------------------------------------------------- /QiuBai/qiubai/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class QiubaiItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /QiuBai/qiubai/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | USER_AGENTS = [ 5 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 6 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 7 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 8 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 9 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 10 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 11 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 12 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 13 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 15 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 16 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 17 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 20 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 21 | ] 22 | 23 | class RandomUAMiddleware(object): 24 | def process_request(self,request,spider): 25 | request.headers["User-Agent"] = random.choice(USER_AGENTS) 26 | 27 | 28 | class PrintUAMiddleware(object): 29 | def process_request(self,request,spider): 30 | print request.headers["User-Agent"] 31 | -------------------------------------------------------------------------------- /QiuBai/qiubai/middlewares.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/middlewares.pyc 
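# A small variant of the PrintUAMiddleware above, shown only as a sketch: it
# routes the chosen User-Agent through Scrapy's logging instead of a bare
# Python 2 print statement, so it also works on Python 3.
class LogUAMiddleware(object):
    def process_request(self, request, spider):
        spider.logger.debug("User-Agent: %s", request.headers.get("User-Agent"))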
-------------------------------------------------------------------------------- /QiuBai/qiubai/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from openpyxl import Workbook 9 | 10 | class QiubaiPipeline(object): 11 | def __init__(self): 12 | self.wb = Workbook() 13 | self.ws = self.wb.active 14 | self.ws.append(['Author','Contents']) 15 | 16 | 17 | def process_item(self, item, spider): 18 | line = item['author'] 19 | line2 = item['content'] 20 | 21 | for i,j in zip(line,line2): 22 | self.ws.append([i,j]) 23 | self.wb.save('qiubai.xlsx') 24 | return item 25 | -------------------------------------------------------------------------------- /QiuBai/qiubai/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/pipelines.pyc -------------------------------------------------------------------------------- /QiuBai/qiubai/qiubai.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/qiubai.xlsx -------------------------------------------------------------------------------- /QiuBai/qiubai/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for qiubai project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'qiubai' 13 | 14 | SPIDER_MODULES = ['qiubai.spiders'] 15 | NEWSPIDER_MODULE = 'qiubai.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'qiubai (+http://www.yourdomain.com)' 20 | #USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = True 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | #DOWNLOAD_DELAY = 3 33 | DOWNLOAD_DELAY = 1 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | #CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | #COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | #DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | #} 49 | 50 | # Enable or disable spider middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'qiubai.middlewares.MyCustomSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | #DOWNLOADER_MIDDLEWARES = { 59 | # 'qiubai.middlewares.MyCustomDownloaderMiddleware': 543, 60 | #} 61 | 62 | DOWNLOADER_MIDDLEWARES = { 63 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware' : None, 64 | 'qiubai.middlewares.RandomUAMiddleware' : 400, 65 | 'qiubai.middlewares.PrintUAMiddleware' : 410, 66 | 67 | } 68 | 69 | # Enable or disable extensions 70 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 71 | #EXTENSIONS = { 72 | # 'scrapy.extensions.telnet.TelnetConsole': None, 73 | #} 74 | 75 | # Configure item pipelines 76 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 77 | #ITEM_PIPELINES = { 78 | # 'qiubai.pipelines.SomePipeline': 300, 79 | #} 80 | 81 | 82 | ITEM_PIPELINES = { 83 | 'qiubai.pipelines.QiubaiPipeline': 300, 84 | } 85 | 86 | # Enable and configure the AutoThrottle extension (disabled by default) 87 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 88 | #AUTOTHROTTLE_ENABLED = True 89 | # The initial download delay 90 | #AUTOTHROTTLE_START_DELAY = 5 91 | # The maximum download delay to be set in case of high latencies 92 | #AUTOTHROTTLE_MAX_DELAY = 60 93 | # The average number of requests Scrapy should be sending in parallel to 94 | # each remote server 95 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 96 | # Enable showing throttling stats for every response received: 97 | #AUTOTHROTTLE_DEBUG = False 98 | 99 | # Enable and configure HTTP caching (disabled by default) 100 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 101 | #HTTPCACHE_ENABLED = True 102 | #HTTPCACHE_EXPIRATION_SECS = 0 103 | #HTTPCACHE_DIR = 'httpcache' 104 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 105 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 106 | -------------------------------------------------------------------------------- /QiuBai/qiubai/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/settings.pyc -------------------------------------------------------------------------------- /QiuBai/qiubai/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /QiuBai/qiubai/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/spiders/__init__.pyc -------------------------------------------------------------------------------- /QiuBai/qiubai/spiders/indexpage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | class QiuBaiItem(scrapy.Item): 5 | author = scrapy.Field() 6 | content = scrapy.Field() 7 | 8 | 9 | class QiuBaiSpider(scrapy.Spider): 10 | name="qiubai" 11 | url = "http://www.qiushibaike.com/8hr/page/" 12 | start_urls = [ url+str(i) for i in range(1,5)] 13 | 14 | def parse(self,response): 15 | item = QiuBaiItem() 16 | item['author'] = response.xpath('//div[@class="author clearfix"]//h2/text()').extract() 17 | item['content'] = response.xpath('//div[@class="content"]/span/text()').extract() 18 | ''' 19 | print "===============================================" *3 20 | print "author:",item['author'] 21 | print "content:",item['content'] 22 | print "===============================================" *3 23 | ''' 24 | yield item 25 | 26 | -------------------------------------------------------------------------------- /QiuBai/qiubai/spiders/indexpage.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/spiders/indexpage.pyc -------------------------------------------------------------------------------- /QiuBai/qiubai_pachong_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai_pachong_01.png -------------------------------------------------------------------------------- /QiuBai/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = qiubai.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = qiubai 12 | 
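A per-post variant of the indexpage.py spider, offered only as a sketch (the post-container XPath is an assumption): yielding one item per post keeps each author paired with its content, so the pipeline no longer has to zip() two parallel lists and misalignment from a missing field is avoided.

# -*- coding: utf-8 -*-
import scrapy

class QiuBaiItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()

class QiuBaiPerPostSpider(scrapy.Spider):
    name = "qiubai_perpost"
    start_urls = ["http://www.qiushibaike.com/8hr/page/" + str(i) for i in range(1, 5)]

    def parse(self, response):
        # One item per post keeps author/content aligned even if a field is missing.
        for post in response.xpath('//div[contains(@class, "article")]'):
            item = QiuBaiItem()
            item['author'] = post.xpath('.//div[@class="author clearfix"]//h2/text()').extract_first()
            item['content'] = post.xpath('.//div[@class="content"]/span/text()').extract_first()
            yield item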
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapy-Project 2 | 3 | ## 《目录》 4 | 5 | 1. 糗事百科 QiuBai 6 | 7 | 2. 天气预报 WeatherReport 8 | 9 | 3. 第一路演 dyly 10 | 11 | 4. 克拉女神 kelagirl 12 | 13 | 5. 代理IP IpProxy 14 | 15 | 6. 肉丝写真图 rosiok 16 | 17 | 7. 煎蛋美女图 JianDan 18 | 19 | 8. 豆瓣高分书单 DouBan 20 | 21 | ![coding](Coding.gif) 22 | 23 | ## QiuBai 24 | 抓取糗事百科的作者和糗事内容,最后都放到EXCEL中保存。 25 | 26 | ## WeatherReport 27 | 魔都的天气变换太快,爬一下天气预报把每天和第二天的天气在下班前用邮件发出来。 28 | 29 | ## dyly 30 | 第一路演新闻抓取,从AJAX中获取对应news_id回调爬取函数。 31 | 32 | 33 | ## kelagirl 34 | 克拉女神专辑图片爬取。 35 | 36 | [DONE] 完成专辑图片爬取功能; 37 | 38 | 39 | ## IpProxy 40 | 爬取免费代理ip地址,验证后存库。 41 | 42 | [DONE] 收集整体免费代理网站地址; 43 | 44 | [DONE] 完成代理ip爬取; 45 | 46 | [DONE] 完成代理ip验证; 47 | 48 | [TODO] 完成代理ip存库工作; 49 | 50 | ## rosiok 51 | 爬取rosiok上的写真图片,借鉴别人代码; 52 | 53 | [DONE] 爬取写真图片; 54 | 55 | ## JianDan 56 | 爬取JianDan上的写真图片。 57 | 58 | [DONE] 爬取煎蛋网妹子图片,使用CrawlSpider进行深层爬取; 59 | 60 | ## DouBan 61 | 爬取豆瓣高分书单信息。 62 | [TODO] 爬取豆瓣信息; 63 | -------------------------------------------------------------------------------- /WeatherReport/README.md: -------------------------------------------------------------------------------- 1 | #【爬虫】上海天气 2 | ##魔都天气随机播放!! 3 | 每天下班前收到邮件看下外面天气顺便提醒下明天天气。 4 | 5 | ![](weather.jpg) 6 | -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/WeatherReport/__init__.py -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/WeatherReport/__init__.pyc -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WeatherreportItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class WeatherreportPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for WeatherReport project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'WeatherReport' 13 | 14 | SPIDER_MODULES = ['WeatherReport.spiders'] 15 | NEWSPIDER_MODULE = 'WeatherReport.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'WeatherReport (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'WeatherReport.middlewares.MyCustomSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'WeatherReport.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'WeatherReport.pipelines.SomePipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
-------------------------------------------------------------------------------- /WeatherReport/WeatherReport/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/WeatherReport/settings.pyc -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/WeatherReport/spiders/__init__.pyc -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/spiders/weather.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | import smtplib 5 | from email.mime.text import MIMEText 6 | 7 | class WeatherSpider(scrapy.Spider): 8 | name="weather" 9 | start_urls = [ 10 | "http://www.weather.com.cn/weather1d/101020100.shtml", 11 | ] 12 | 13 | 14 | def parse(self,response): 15 | Tod_Weather_Date = response.xpath('//*[@id="today"]/div[1]/ul/li[1]/h1/text()').extract() 16 | Tod_Weather_Wea = response.xpath('//*[@id="today"]/div[1]/ul/li[1]/p[1]/text()').extract() 17 | Tod_Weather_Tem = response.xpath('//*[@id="today"]/div[1]/ul/li[1]/p[2]/span/text()').extract() 18 | 19 | Tom_Weather_Date = response.xpath('//*[@id="today"]/div[1]/ul/li[2]/h1/text()').extract() 20 | Tom_Weather_Wea = response.xpath('//*[@id="today"]/div[1]/ul/li[2]/p[1]/text()').extract() 21 | Tom_Weather_Tem = response.xpath('//*[@id="today"]/div[1]/ul/li[2]/p[2]/span/text()').extract() 22 | 23 | lst = ['今天日期:' + Tod_Weather_Date[0].encode('utf-8'),"\n",'今天天气情况:'+ Tod_Weather_Wea[0].encode('utf-8'),"\n",'今天温度:' + Tod_Weather_Tem[0].encode('utf-8') + '℃',"\n","\n",'明天日期:' + Tom_Weather_Date[0].encode('utf-8'),"\n",'明天天气情况:'+ Tom_Weather_Wea[0].encode('utf-8'),"\n",'明天温度:' + Tom_Weather_Tem[0].encode('utf-8') + '℃'] 24 | 25 | mailto_list="liudehua@wenba100.com" 26 | mail_host="smtp.exmail.qq.com" 27 | mail_user="ldy@wenba100.com" 28 | mail_pass="XXXXXXXXXXXX" 29 | 30 | content = ''.join(lst) 31 | msg = MIMEText(content,_subtype='plain',_charset='utf-8') 32 | msg['Subject'] = "Two day's weather" 33 | msg['From'] = mail_user 34 | msg['To'] = mailto_list 35 | 36 | s = smtplib.SMTP() 37 | s.connect(mail_host) 38 | s.login(mail_user,mail_pass) 39 | s.sendmail(mail_user,mailto_list,msg.as_string()) 40 | s.close() 41 | -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/spiders/weather.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/WeatherReport/spiders/weather.pyc -------------------------------------------------------------------------------- /WeatherReport/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = WeatherReport.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = WeatherReport 12 | -------------------------------------------------------------------------------- /WeatherReport/weather.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/weather.jpg -------------------------------------------------------------------------------- /dyly/dyly/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/__init__.py -------------------------------------------------------------------------------- /dyly/dyly/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/__init__.pyc -------------------------------------------------------------------------------- /dyly/dyly/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DylyItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /dyly/dyly/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | USER_AGENTS = [ 5 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 6 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 7 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 8 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 9 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 10 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 11 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 12 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 13 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 15 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 16 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 
Kazehakase/0.4.5", 17 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 20 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 21 | ] 22 | 23 | class RandomUAMiddleware(object): 24 | def process_request(self,request,spider): 25 | request.headers["User-Agent"] = random.choice(USER_AGENTS) 26 | 27 | -------------------------------------------------------------------------------- /dyly/dyly/middlewares.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/middlewares.pyc -------------------------------------------------------------------------------- /dyly/dyly/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from openpyxl import Workbook 9 | 10 | class DylyPipeline(object): 11 | def __init__(self): 12 | self.wb = Workbook() 13 | self.ws = self.wb.active 14 | self.ws.append(['Titles','Contents']) 15 | 16 | 17 | def process_item(self, item, spider): 18 | line = item['title'] 19 | line2 = item['content'] 20 | 21 | for i,j in zip(line,line2): 22 | self.ws.append([i,j]) 23 | self.wb.save('dyly.xlsx') 24 | return item 25 | 26 | -------------------------------------------------------------------------------- /dyly/dyly/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dyly project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dyly' 13 | 14 | SPIDER_MODULES = ['dyly.spiders'] 15 | NEWSPIDER_MODULE = 'dyly.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'dyly (+http://www.yourdomain.com)' 20 | #USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 1.5 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'dyly.middlewares.MyCustomSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | DOWNLOADER_MIDDLEWARES = { 57 | 'dyly.middlewares.MyCustomDownloaderMiddleware': None, 58 | 'dyly.middlewares.RandomUAMiddleware' : 543, 59 | } 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'dyly.pipelines.DylyPipeline': 300, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 
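# Note: DOWNLOADER_MIDDLEWARES above enables dyly.middlewares.RandomUAMiddleware at priority 543 and
# excludes the unused MyCustomDownloaderMiddleware entry by mapping it to None. A minimal sketch for
# sanity-checking the middleware outside a full crawl, assuming Scrapy is installed and the dyly
# package is importable:
from scrapy.http import Request
from dyly.middlewares import RandomUAMiddleware

mw = RandomUAMiddleware()
req = Request('https://news.dyly.com/')
mw.process_request(req, spider=None)    # the middleware mutates the request in place
print(req.headers.get('User-Agent'))    # one of the USER_AGENTS strings, returned as bytes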
-------------------------------------------------------------------------------- /dyly/dyly/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/settings.pyc -------------------------------------------------------------------------------- /dyly/dyly/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /dyly/dyly/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/spiders/__init__.pyc -------------------------------------------------------------------------------- /dyly/dyly/spiders/news_dyly.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | import scrapy 4 | import json 5 | 6 | class dydlItem(scrapy.Item): 7 | title = scrapy.Field() 8 | content = scrapy.Field() 9 | 10 | 11 | class dydlSpider(scrapy.Spider): 12 | name = "dyly" 13 | 14 | start_urls = [ 15 | "https://news.dyly.com/", 16 | ] 17 | 18 | 19 | def start_requests(self): 20 | # get the news_id list via an AJAX POST; the callback then requests each article 21 | return [scrapy.http.FormRequest('https://news.dyly.com/getAppNewsList.do',method="POST",formdata={'ajax':'ajax','type':'news_primary','loginMethod':'wap','pageNo':'2'})] 22 | 23 | 24 | def parse(self,response): 25 | response_html = response.body 26 | result = json.loads(response_html) 27 | result_newsid = result['newsContent'] 28 | urls = ["https://news.dyly.com/news/detail/",] 29 | for i in result_newsid: 30 | # build the article URL from its objectId 31 | url = urls[0] + i['objectId'] + ".html" 32 | yield scrapy.Request(url,callback=self.parse_item) 33 | 34 | def parse_item(self,response): 35 | item = dydlItem() 36 | item['title'] = response.xpath('//article//section//header/text()').extract() 37 | content = response.xpath('//div//p//span/text()').extract() 38 | if content: 39 | item['content'] = content 40 | else: 41 | item['content'] = response.xpath('//div//p/text()').extract() 42 | 43 | ''' 44 | print "===============================================" *3 45 | print "title:",item['title'] 46 | print "content:",item['content'] 47 | print "===============================================" *3 48 | ''' 49 | yield item 50 | -------------------------------------------------------------------------------- /dyly/dyly/spiders/news_dyly.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/spiders/news_dyly.pyc -------------------------------------------------------------------------------- /dyly/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = dyly.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dyly 12 | 
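# Note: news_dyly.py above only ever POSTs pageNo=2 to the getAppNewsList.do AJAX endpoint and then
# follows each returned objectId to the article page. If more list pages were wanted, the same
# request could be yielded once per page; a minimal drop-in sketch for start_requests (the page
# range is an assumption, the endpoint may not serve that many pages):
def start_requests(self):
    for page in range(1, 6):            # illustrative range, not taken from the project
        yield scrapy.http.FormRequest(
            'https://news.dyly.com/getAppNewsList.do',
            method='POST',
            formdata={'ajax': 'ajax', 'type': 'news_primary',
                      'loginMethod': 'wap', 'pageNo': str(page)},
            callback=self.parse,
        )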
-------------------------------------------------------------------------------- /kelagirl/kelagirl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/kelagirl/kelagirl/__init__.py -------------------------------------------------------------------------------- /kelagirl/kelagirl/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class KelagirlItem(scrapy.Item): 12 | image_urls = scrapy.Field() 13 | -------------------------------------------------------------------------------- /kelagirl/kelagirl/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | USER_AGENTS = [ 5 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 6 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 7 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 8 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 9 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 10 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 11 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 12 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 13 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 15 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 16 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 17 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 20 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 21 | ] 22 | 23 | class RandomUAMiddleware(object): 24 | def process_request(self,request,spider): 25 | request.headers["User-Agent"] = random.choice(USER_AGENTS) 26 | 27 | -------------------------------------------------------------------------------- /kelagirl/kelagirl/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | 
import scrapy 10 | from scrapy.pipelines.images import ImagesPipeline 11 | from scrapy.exceptions import DropItem 12 | class KelagirlPipeline(ImagesPipeline): 13 | 14 | def get_media_requests(self, item, info): 15 | for image_url in item['image_urls']: 16 | yield scrapy.Request(image_url) 17 | 18 | def item_completed(self, results, item, info): 19 | image_paths = [x['path'] for ok, x in results if ok] 20 | if not image_paths: 21 | raise DropItem("Item contains no images") 22 | item['image_paths'] = image_paths 23 | return item 24 | -------------------------------------------------------------------------------- /kelagirl/kelagirl/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for kelagirl project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'kelagirl' 13 | 14 | SPIDER_MODULES = ['kelagirl.spiders'] 15 | NEWSPIDER_MODULE = 'kelagirl.spiders' 16 | 17 | IMAGES_STORE = '.' 18 | 19 | USERNAME = 'zhujialin' 20 | PASSWORD = '123jkluio' 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | #USER_AGENT = 'kelagirl (+http://www.yourdomain.com)' 24 | 25 | # Obey robots.txt rules 26 | ROBOTSTXT_OBEY = True 27 | 28 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 29 | #CONCURRENT_REQUESTS = 32 30 | 31 | # Configure a delay for requests for the same website (default: 0) 32 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 33 | # See also autothrottle settings and docs 34 | DOWNLOAD_DELAY = 1 35 | # The download delay setting will honor only one of: 36 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 37 | #CONCURRENT_REQUESTS_PER_IP = 16 38 | 39 | # Disable cookies (enabled by default) 40 | #COOKIES_ENABLED = False 41 | 42 | # Disable Telnet Console (enabled by default) 43 | #TELNETCONSOLE_ENABLED = False 44 | 45 | # Override the default request headers: 46 | #DEFAULT_REQUEST_HEADERS = { 47 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | # 'Accept-Language': 'en', 49 | #} 50 | 51 | # Enable or disable spider middlewares 52 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 53 | #SPIDER_MIDDLEWARES = { 54 | # 'kelagirl.middlewares.MyCustomSpiderMiddleware': 543, 55 | #} 56 | 57 | # Enable or disable downloader middlewares 58 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 59 | DOWNLOADER_MIDDLEWARES = { 60 | 'kelagirl.middlewares.RandomUAMiddleware' :543, 61 | 'kelagirl.middlewares.MyCustomDownloaderMiddleware': None, 62 | } 63 | 64 | # Enable or disable extensions 65 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 72 | ITEM_PIPELINES = { 73 | 'kelagirl.pipelines.KelagirlPipeline': 3, 74 | } 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | 
#AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | -------------------------------------------------------------------------------- /kelagirl/kelagirl/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /kelagirl/kelagirl/spiders/kela_pic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from kelagirl.items import KelagirlItem 5 | 6 | class KeLaGirlSpider(scrapy.Spider): 7 | name="kelagirl" 8 | start_urls = [ 9 | "http://www.kelagirls.com/zhuanji!findForIndexMoreTag.action?tagId=0&page=1", 10 | "http://www.kelagirls.com/zhuanji!findForIndexMoreTag.action?tagId=0&page=2", 11 | "http://www.kelagirls.com/zhuanji!findForIndexMoreTag.action?tagId=0&page=3", 12 | ] 13 | 14 | def parse(self,response): 15 | uid_list = response.xpath('//div[@class="zhuanjimorewrap"]/div/@pid').extract() 16 | urls = ["http://www.kelagirls.com/zhuanji!findForDetail.action?pid=",] 17 | for i in uid_list: 18 | url = urls[0] + i 19 | yield scrapy.Request(url,callback=self.parse_item) 20 | 21 | def parse_item(self,response): 22 | item = KelagirlItem() 23 | # continue on the detail page and collect the image URLs 24 | image_url = response.xpath('//div[@class="smallwrap"]/img/@src').extract() 25 | image_url_full = map(lambda x: 'http://www.kelagirls.com/'+x,image_url) 26 | 27 | item['image_urls'] = image_url_full 28 | yield item 29 | -------------------------------------------------------------------------------- /kelagirl/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = kelagirl.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = kelagirl 12 | -------------------------------------------------------------------------------- /rosiok/rosiok/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/rosiok/rosiok/__init__.py -------------------------------------------------------------------------------- /rosiok/rosiok/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped 
items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class RosiokItem(scrapy.Item): 12 | img_url = scrapy.Field() 13 | image_paths = scrapy.Field() 14 | -------------------------------------------------------------------------------- /rosiok/rosiok/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import scrapy 9 | from scrapy.exceptions import DropItem 10 | from scrapy.pipelines.images import ImagesPipeline 11 | 12 | class RosiooPipeline(ImagesPipeline): 13 | def get_media_requests(self, item, spider): 14 | for image_url in item['img_url']: 15 | yield scrapy.Request(image_url) 16 | 17 | def item_completed(self, results, item, spider): 18 | image_paths = [x['path'] for ok, x in results if ok] 19 | if not image_paths: 20 | raise DropItem("Item contains no images") 21 | item['image_paths'] = image_paths 22 | return results 23 | 24 | 25 | -------------------------------------------------------------------------------- /rosiok/rosiok/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for rosiok project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'rosiok' 13 | 14 | SPIDER_MODULES = ['rosiok.spiders'] 15 | NEWSPIDER_MODULE = 'rosiok.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'rosiok.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 
'rosiok.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'rosiok.pipelines.RosiooPipeline': 300, 69 | } 70 | 71 | IMAGES_STORE = '.' 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /rosiok/rosiok/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /rosiok/rosiok/spiders/rosiok_pic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from rosiok.items import RosiokItem 4 | 5 | class RosiokSpider(scrapy.Spider): 6 | name = "rosiok" 7 | allowed_domains = ["rosiok.com"] 8 | start_urls = ( 9 | 'http://www.rosiok.com/x/list_1_1.html', 10 | ) 11 | 12 | def parse2(self,response): 13 | for i in response.xpath('//div[@class="photo"]'): 14 | item = RosiokItem() 15 | item['img_url'] = i.xpath('img/@src').extract() 16 | yield item 17 | 18 | def parse(self, response): 19 | 20 | urls = response.xpath('//*[@id="imgBox"]/li/a/@href').extract() 21 | if urls: 22 | for url in urls: 23 | yield scrapy.Request(url=r"http://www.rosiok.com"+url, callback=self.parse2) 24 | 25 | next_page = response.xpath('//*[@class="cPage"]/li[last()]/a/@href') # next page 26 | if next_page: 27 | url = response.urljoin(next_page[0].extract()) 28 | yield scrapy.Request(url, self.parse) 29 | 30 | -------------------------------------------------------------------------------- /rosiok/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = rosiok.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = rosiok 12 | --------------------------------------------------------------------------------
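# Note: kelagirl and rosiok both feed image URLs to subclasses of Scrapy's ImagesPipeline and store
# the downloads under IMAGES_STORE = '.'. If thumbnails, size filtering or cache expiry were wanted,
# the stock pipeline already supports them through settings; a minimal sketch with illustrative
# values (not taken from the project):
IMAGES_STORE = './images'               # keep downloads out of the project root
IMAGES_THUMBS = {'small': (128, 128)}   # additionally write 128x128 thumbnails
IMAGES_MIN_WIDTH = 200                  # skip icons and other tiny images
IMAGES_EXPIRES = 30                     # re-download files older than 30 days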