├── .gitignore ├── Coding.gif ├── DouBan ├── DouBan │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── douban_booklist.py └── scrapy.cfg ├── IpProxy ├── IpProxy │ ├── __init__.py │ ├── checkproxy.py │ ├── commands │ │ ├── __init__.py │ │ └── crawlall.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── setup.py │ └── spiders │ │ ├── 66ip.py │ │ ├── Daili.txt │ │ ├── __init__.py │ │ ├── cnproxy.py │ │ └── xici.py └── scrapy.cfg ├── JanDan ├── JanDan │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── jandan_ooxx.py └── scrapy.cfg ├── QiuBai ├── README.md ├── qiubai │ ├── __init__.py │ ├── __init__.pyc │ ├── items.py │ ├── middlewares.py │ ├── middlewares.pyc │ ├── pipelines.py │ ├── pipelines.pyc │ ├── qiubai.xlsx │ ├── settings.py │ ├── settings.pyc │ └── spiders │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── indexpage.py │ │ └── indexpage.pyc ├── qiubai_pachong_01.png └── scrapy.cfg ├── README.md ├── WeatherReport ├── README.md ├── WeatherReport │ ├── __init__.py │ ├── __init__.pyc │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ ├── settings.pyc │ └── spiders │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── weather.py │ │ └── weather.pyc ├── scrapy.cfg └── weather.jpg ├── dyly ├── dyly │ ├── __init__.py │ ├── __init__.pyc │ ├── items.py │ ├── middlewares.py │ ├── middlewares.pyc │ ├── pipelines.py │ ├── settings.py │ ├── settings.pyc │ └── spiders │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── news_dyly.py │ │ └── news_dyly.pyc └── scrapy.cfg ├── kelagirl ├── kelagirl │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── kela_pic.py └── scrapy.cfg └── rosiok ├── rosiok ├── __init__.py ├── items.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── rosiok_pic.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.jpg 3 | *.jpeg 4 | *.png 5 | 6 | -------------------------------------------------------------------------------- /Coding.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/Coding.gif -------------------------------------------------------------------------------- /DouBan/DouBan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/DouBan/DouBan/__init__.py -------------------------------------------------------------------------------- /DouBan/DouBan/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /DouBan/DouBan/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline 
to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /DouBan/DouBan/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for DouBan project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'DouBan' 13 | 14 | SPIDER_MODULES = ['DouBan.spiders'] 15 | NEWSPIDER_MODULE = 'DouBan.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'DouBan (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'DouBan.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'DouBan.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'DouBan.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure 
HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /DouBan/DouBan/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /DouBan/DouBan/spiders/douban_booklist.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | class QiuBaiItem(scrapy.Item): 5 | author = scrapy.Field() 6 | content = scrapy.Field() 7 | 8 | 9 | class QiuBaiSpider(scrapy.Spider): 10 | name="qiubai" 11 | url = "https://www.douban.com/doulist/1264675/" 12 | 13 | def parse(self,response): 14 | item = QiuBaiItem() 15 | item['author'] = response.xpath('//div[@class="author clearfix"]//h2/text()').extract() 16 | item['content'] = response.xpath('//div[@class="content"]/span/text()').extract() 17 | ''' 18 | print "===============================================" *3 19 | print "author:",item['author'] 20 | print "content:",item['content'] 21 | print "===============================================" *3 22 | ''' 23 | yield item 24 | 25 | -------------------------------------------------------------------------------- /DouBan/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = DouBan.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = DouBan 12 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/IpProxy/IpProxy/__init__.py -------------------------------------------------------------------------------- /IpProxy/IpProxy/checkproxy.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import requests 4 | ''' 5 | 测试代理IP可用脚本,仅测试作用 6 | ''' 7 | 8 | ''' 9 | proxies = [ 10 | {"http":"183.78.183.156:82"}, 11 | {"http":"180.76.154.5:8888"}, 12 | {"http":"124.88.67.24:80"}, 13 | {"http":"115.29.2.139:80"}, 14 | {"http":"115.29.2.139:8011"}, 15 | {"http":"121.204.165.80:8118"} 16 | ] 17 | ''' 18 | 19 | proxies = [ 20 | {'http':'183.78.183.156:82'}, 21 | {'http':'180.76.154.5:8888'}, 22 | {'http':'115.29.2.139:80'}, 23 | {'http':'121.204.165.80:8118'} 24 | ] 25 | 26 | 27 | 28 | proxypool = [] 29 | 30 | for index in range(len(proxies)): 31 | print proxies[index] 32 | try: 33 | result = requests.get('http://ip.cn/',proxies=proxies[index],timeout=3) 34 | proxypool.append(proxies[index]) 35 | except Exception as e: 36 | continue 37 | 38 | print proxypool 39 | 
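# --- A hedged, self-contained Python 3 sketch of the same availability check
# --- (not part of the original script): it keeps http://ip.cn/ as the probe
# --- URL and the 3-second timeout used above; the proxy list here is only a
# --- placeholder.
import requests

example_proxies = [
    {'http': 'http://183.78.183.156:82'},
    {'http': 'http://115.29.2.139:80'},
]

usable = []
for proxy in example_proxies:
    try:
        # Any response that comes back within the timeout counts as a working proxy.
        requests.get('http://ip.cn/', proxies=proxy, timeout=3)
        usable.append(proxy)
    except requests.RequestException:
        continue

print(usable)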
-------------------------------------------------------------------------------- /IpProxy/IpProxy/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/IpProxy/IpProxy/commands/__init__.py -------------------------------------------------------------------------------- /IpProxy/IpProxy/commands/crawlall.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #增加自定义命令,一次爬取所有Spiders 3 | from scrapy.commands import ScrapyCommand 4 | from scrapy.crawler import CrawlerRunner 5 | from scrapy.utils.conf import arglist_to_dict 6 | 7 | class Command(ScrapyCommand): 8 | 9 | requires_project = True 10 | 11 | def syntax(self): 12 | return '[options]' 13 | 14 | def short_desc(self): 15 | return 'Runs all of the spiders' 16 | 17 | def add_options(self, parser): 18 | ScrapyCommand.add_options(self, parser) 19 | parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", 20 | help="set spider argument (may be repeated)") 21 | parser.add_option("-o", "--output", metavar="FILE", 22 | help="dump scraped items into FILE (use - for stdout)") 23 | parser.add_option("-t", "--output-format", metavar="FORMAT", 24 | help="format to use for dumping items with -o") 25 | 26 | def process_options(self, args, opts): 27 | ScrapyCommand.process_options(self, args, opts) 28 | try: 29 | opts.spargs = arglist_to_dict(opts.spargs) 30 | except ValueError: 31 | raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False) 32 | 33 | def run(self, args, opts): 34 | #settings = get_project_settings() 35 | 36 | spider_loader = self.crawler_process.spider_loader 37 | for spidername in args or spider_loader.list(): 38 | print "*********cralall spidername************" + spidername 39 | self.crawler_process.crawl(spidername, **opts.spargs) 40 | 41 | self.crawler_process.start() 42 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class IpProxyItem(scrapy.Item): 12 | proxy_ipaddr = scrapy.Field() 13 | proxy_port = scrapy.Field() 14 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | USER_AGENTS = [ 5 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 6 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 7 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 8 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 9 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 10 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; 
.NET CLR 1.0.3705; .NET CLR 1.1.4322)", 11 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 12 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 13 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 15 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 16 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 17 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 20 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 21 | ] 22 | 23 | class RandomUAMiddleware(object): 24 | def process_request(self,request,spider): 25 | request.headers["User-Agent"] = random.choice(USER_AGENTS) 26 | 27 | 28 | class PrintUAMiddleware(object): 29 | def process_request(self,request,spider): 30 | print request.headers["User-Agent"] 31 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import requests 9 | 10 | class IpproxyPipeline(object): 11 | def process_item(self, item, spider): 12 | 13 | proxy_ip = item['proxy_ipaddr'] 14 | proxy_port = item['proxy_port'] 15 | proxypool = [] 16 | 17 | for i,j in zip(proxy_ip,proxy_port): 18 | proxies = [ 19 | {"http":"%s:%s" %(i,j)} 20 | ] 21 | 22 | for index in range(len(proxies)): 23 | print proxies[index] 24 | try: 25 | result = requests.get('http://ip.cn/',proxies=proxies[index],timeout=3) 26 | proxypool.append(proxies[index]) 27 | except Exception as e: 28 | print e 29 | print "---" * 10 30 | print proxypool 31 | 32 | return item 33 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for IpProxy project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'IpProxy' 13 | 14 | SPIDER_MODULES = ['IpProxy.spiders'] 15 | NEWSPIDER_MODULE = 'IpProxy.spiders' 16 | 17 | COMMANDS_MODULE = 'IpProxy.commands' 18 | 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'IpProxy (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 2 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'IpProxy.middlewares.MyCustomSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'IpProxy.middlewares.RandomUAMiddleware': 543, 58 | # 'IpProxy.middlewares.PrintUAMiddleware': 544, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'IpProxy.pipelines.IpproxyPipeline': 300, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='scrapy-mymodule', 4 | entry_points={ 5 | 'scrapy.commands': [ 6 | 'crawlall=cnblogs.commands:crawlall', 7 | ], 8 | }, 9 | ) 10 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/66ip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from IpProxy.items import IpProxyItem 5 | ''' 6 | 相关网站整理 7 | http://www.kuaidaili.com/free/ 8 | http://www.xicidaili.com/ 9 | http://www.proxy360.cn/default.aspx 10 | http://ip.zdaye.com/ 端口号是图片需要OCR 11 | http://www.cz88.net/proxy 12 | http://cn-proxy.com 13 | http://www.66ip.cn 14 | ''' 15 | 16 | 17 | class sixsixip(scrapy.Spider): 18 | name="66ip" 19 | start_urls = [ 20 | "http://www.66ip.cn/areaindex_1/1.html", 21 | "http://www.66ip.cn/areaindex_1/2.html", 22 | "http://www.66ip.cn/areaindex_1/3.html", 23 | "http://www.66ip.cn/areaindex_1/4.html" 24 | ] 25 | 26 | 27 | def parse(self,response): 28 | item = IpProxyItem() 29 | item['proxy_ipaddr'] = response.xpath('//*[@id="footer"]/div/table//tr/td[1]/text()').extract()[1:-1] 30 | item['proxy_port'] = response.xpath('//*[@id="footer"]/div/table//tr/td[2]/text()').extract()[1:-1] 31 | ''' 32 | print "=" * 15 33 | print "66ip ipaddr:" 34 | print "ipaddr:",item['proxy_ipaddr']," port:",item['proxy_port'] 35 | print "=" * 15 36 | ''' 37 | yield item 38 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/Daili.txt: -------------------------------------------------------------------------------- 1 | 相关网站整理 2 | xici http://www.xicidaili.com/ Done 3 | proxy360 http://www.proxy360.cn/default.aspx 代理IP数太少 4 | zdaye http://ip.zdaye.com/ 端口号是图片需要OCR 5 | cz88 http://www.cz88.net/proxy 代理IP数太少 6 | cn-proxy http://cn-proxy.com 代理IP数太少 7 | 66ip http://www.66ip.cn Done 8 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/cnproxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from IpProxy.items import IpProxyItem 5 | ''' 6 | 相关网站整理 7 | http://www.kuaidaili.com/free/ 8 | http://www.xicidaili.com/ 9 | http://www.proxy360.cn/default.aspx 10 | http://ip.zdaye.com/ 端口号是图片需要OCR 11 | http://www.cz88.net/proxy 12 | http://cn-proxy.com 13 | http://www.66ip.cn 14 | ''' 15 | 16 | 17 | class sixsixip(scrapy.Spider): 18 | name="cnproxy" 19 | start_urls = [ 20 | "http://cn-proxy.com/", 21 | ] 22 | 23 | def parse(self,response): 24 | item = IpProxyItem() 25 | item['proxy_ipaddr']=response.xpath('//div[@class="table-container"]//tr/td[1]/text()').extract() 26 | item['proxy_port']=response.xpath('//div[@class="table-container"]//tr/td[2]/text()').extract() 27 | ''' 28 | print "=" * 15 29 | print "cn-proxy ipaddr:" 30 | print "ipaddr:",item['proxy_ipaddr']," port:",item['proxy_port'] 31 | print "=" * 15 32 | ''' 33 | yield item 34 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/xici.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from IpProxy.items import IpProxyItem 5 | ''' 6 | 相关网站整理 7 | http://www.kuaidaili.com/free/ 8 | http://www.xicidaili.com/ 9 | http://www.proxy360.cn/default.aspx 10 | http://ip.zdaye.com/ 端口号是图片需要OCR 11 | http://www.cz88.net/proxy 12 | http://cn-proxy.com 13 | http://www.66ip.cn 14 | ''' 15 | 16 | 17 | class xicidaili(scrapy.Spider): 18 | name="xici" 19 | url = "http://www.xicidaili.com/nn/" 20 | start_urls = [ 21 | url + str(i) for i in range(1,5) 22 | ] 23 | 24 | 25 | def parse(self,response): 26 | item = IpProxyItem() 27 | item['proxy_ipaddr'] = response.xpath('//*[@id="ip_list"]//tr/td[2]/text()').extract() 28 | item['proxy_port'] = response.xpath('//*[@id="ip_list"]//tr/td[3]/text()').extract() 29 | ''' 30 | print "=" * 15 31 | print "xici ipaddr:" 32 | print "ipaddr:",item['proxy_ipaddr']," port:",item['proxy_port'] 33 | print "=" * 15 34 | ''' 35 | yield item 36 | -------------------------------------------------------------------------------- /IpProxy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = IpProxy.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = IpProxy 12 | -------------------------------------------------------------------------------- /JanDan/JanDan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/JanDan/JanDan/__init__.py -------------------------------------------------------------------------------- /JanDan/JanDan/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JandanItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = 
scrapy.Field() 14 | image_urls = scrapy.Field() 15 | images = scrapy.Field() 16 | -------------------------------------------------------------------------------- /JanDan/JanDan/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | USER_AGENTS = [ 5 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 6 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 7 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 8 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 9 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 10 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 11 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 12 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 13 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 15 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 16 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 17 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 20 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 21 | ] 22 | 23 | class RandomUAMiddleware(object): 24 | def process_request(self,request,spider): 25 | request.headers["User-Agent"] = random.choice(USER_AGENTS) 26 | 27 | -------------------------------------------------------------------------------- /JanDan/JanDan/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import scrapy 9 | from scrapy.pipelines.images import ImagesPipeline 10 | from scrapy.exceptions import DropItem 11 | 12 | class JandanPipeline(object): 13 | def get_media_requests(self, item, info): 14 | for image_url in item['image_urls']: 15 | yield scrapy.Request(image_url) 16 | 17 | def item_completed(self, results, item, info): 18 | image_paths = [x['path'] for ok, x in results if ok] 19 | if not image_paths: 20 | raise DropItem("Item contains no images") 21 | item['image_paths'] = image_paths 22 | return item 23 | 24 | 25 | -------------------------------------------------------------------------------- /JanDan/JanDan/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for JanDan project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'JanDan' 13 | 14 | SPIDER_MODULES = ['JanDan.spiders'] 15 | NEWSPIDER_MODULE = 'JanDan.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'JanDan (+http://www.yourdomain.com)' 20 | 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = True 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | DOWNLOAD_DELAY = 2 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'JanDan.middlewares.MyCustomSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | DOWNLOADER_MIDDLEWARES = { 58 | 'JanDan.middlewares.RandomUAMiddleware': 300, 59 | } 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'scrapy.contrib.pipeline.images.ImagesPipeline': 300, 71 | } 72 | 73 | IMAGES_STORE = '.' 
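# Side note: 'scrapy.contrib.pipeline.images.ImagesPipeline' is the pre-1.0
# import path; on Scrapy 1.0+ the same pipeline lives at
# 'scrapy.pipelines.images.ImagesPipeline'. The JandanPipeline in pipelines.py
# overrides get_media_requests()/item_completed() but is not registered here
# (and does not subclass ImagesPipeline), so as written it is never invoked.
# One possible way to wire it up, assuming it is changed to subclass
# ImagesPipeline, would be:
#ITEM_PIPELINES = {
#    'JanDan.pipelines.JandanPipeline': 300,
#}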
74 | # Enable and configure the AutoThrottle extension (disabled by default) 75 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 76 | #AUTOTHROTTLE_ENABLED = True 77 | # The initial download delay 78 | #AUTOTHROTTLE_START_DELAY = 5 79 | # The maximum download delay to be set in case of high latencies 80 | #AUTOTHROTTLE_MAX_DELAY = 60 81 | # The average number of requests Scrapy should be sending in parallel to 82 | # each remote server 83 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 84 | # Enable showing throttling stats for every response received: 85 | #AUTOTHROTTLE_DEBUG = False 86 | 87 | # Enable and configure HTTP caching (disabled by default) 88 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 89 | #HTTPCACHE_ENABLED = True 90 | #HTTPCACHE_EXPIRATION_SECS = 0 91 | #HTTPCACHE_DIR = 'httpcache' 92 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 93 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 94 | -------------------------------------------------------------------------------- /JanDan/JanDan/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /JanDan/JanDan/spiders/jandan_ooxx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy.spiders import CrawlSpider, Rule 3 | from scrapy.linkextractors import LinkExtractor 4 | from JanDan.items import JandanItem 5 | 6 | class JandanSpider(CrawlSpider): 7 | name = "JanDan" 8 | allowed_domains = ["jandan.net"] 9 | start_urls = [ 10 | "http://jandan.net/ooxx", 11 | ] 12 | 13 | 14 | rules = ( 15 | Rule(LinkExtractor(allow=('http://jandan.net/ooxx/page-\d+#comments', )), callback='parse_item', follow=True), 16 | ) 17 | 18 | 19 | def parse_item(self, response): 20 | for href in response.xpath('//a[@class="view_img_link"]/@href').extract(): 21 | pic_url = "http:" + href 22 | item = JandanItem(image_urls=[pic_url]) 23 | yield item 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /JanDan/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = JanDan.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = JanDan 12 | -------------------------------------------------------------------------------- /QiuBai/README.md: -------------------------------------------------------------------------------- 1 | #【爬虫】糗事百科 2 | ###突然觉得爬虫很好玩,就写了写 3 | ###这个版本是用Scrapy把糗百的内容抓下来放到了Excel里 4 | 5 | ![](qiubai_pachong_01.png) -------------------------------------------------------------------------------- /QiuBai/qiubai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/__init__.py -------------------------------------------------------------------------------- /QiuBai/qiubai/__init__.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/__init__.pyc -------------------------------------------------------------------------------- /QiuBai/qiubai/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class QiubaiItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /QiuBai/qiubai/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | USER_AGENTS = [ 5 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 6 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 7 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 8 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 9 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 10 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 11 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 12 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 13 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 15 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 16 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 17 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 20 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 21 | ] 22 | 23 | class RandomUAMiddleware(object): 24 | def process_request(self,request,spider): 25 | request.headers["User-Agent"] = random.choice(USER_AGENTS) 26 | 27 | 28 | class PrintUAMiddleware(object): 29 | def process_request(self,request,spider): 30 | print request.headers["User-Agent"] 31 | -------------------------------------------------------------------------------- /QiuBai/qiubai/middlewares.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/middlewares.pyc 
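# A small variant of the PrintUAMiddleware above, shown only as a sketch: it
# routes the chosen User-Agent through Scrapy's logging instead of a bare
# Python 2 print statement, so it also works on Python 3.
class LogUAMiddleware(object):
    def process_request(self, request, spider):
        spider.logger.debug("User-Agent: %s", request.headers.get("User-Agent"))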
-------------------------------------------------------------------------------- /QiuBai/qiubai/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from openpyxl import Workbook 9 | 10 | class QiubaiPipeline(object): 11 | def __init__(self): 12 | self.wb = Workbook() 13 | self.ws = self.wb.active 14 | self.ws.append(['Author','Contents']) 15 | 16 | 17 | def process_item(self, item, spider): 18 | line = item['author'] 19 | line2 = item['content'] 20 | 21 | for i,j in zip(line,line2): 22 | self.ws.append([i,j]) 23 | self.wb.save('qiubai.xlsx') 24 | return item 25 | -------------------------------------------------------------------------------- /QiuBai/qiubai/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/pipelines.pyc -------------------------------------------------------------------------------- /QiuBai/qiubai/qiubai.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/qiubai.xlsx -------------------------------------------------------------------------------- /QiuBai/qiubai/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for qiubai project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'qiubai' 13 | 14 | SPIDER_MODULES = ['qiubai.spiders'] 15 | NEWSPIDER_MODULE = 'qiubai.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'qiubai (+http://www.yourdomain.com)' 20 | #USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = True 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | #DOWNLOAD_DELAY = 3 33 | DOWNLOAD_DELAY = 1 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | #CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | #COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | #DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | #} 49 | 50 | # Enable or disable spider middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'qiubai.middlewares.MyCustomSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | #DOWNLOADER_MIDDLEWARES = { 59 | # 'qiubai.middlewares.MyCustomDownloaderMiddleware': 543, 60 | #} 61 | 62 | DOWNLOADER_MIDDLEWARES = { 63 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware' : None, 64 | 'qiubai.middlewares.RandomUAMiddleware' : 400, 65 | 'qiubai.middlewares.PrintUAMiddleware' : 410, 66 | 67 | } 68 | 69 | # Enable or disable extensions 70 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 71 | #EXTENSIONS = { 72 | # 'scrapy.extensions.telnet.TelnetConsole': None, 73 | #} 74 | 75 | # Configure item pipelines 76 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 77 | #ITEM_PIPELINES = { 78 | # 'qiubai.pipelines.SomePipeline': 300, 79 | #} 80 | 81 | 82 | ITEM_PIPELINES = { 83 | 'qiubai.pipelines.QiubaiPipeline': 300, 84 | } 85 | 86 | # Enable and configure the AutoThrottle extension (disabled by default) 87 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 88 | #AUTOTHROTTLE_ENABLED = True 89 | # The initial download delay 90 | #AUTOTHROTTLE_START_DELAY = 5 91 | # The maximum download delay to be set in case of high latencies 92 | #AUTOTHROTTLE_MAX_DELAY = 60 93 | # The average number of requests Scrapy should be sending in parallel to 94 | # each remote server 95 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 96 | # Enable showing throttling stats for every response received: 97 | #AUTOTHROTTLE_DEBUG = False 98 | 99 | # Enable and configure HTTP caching (disabled by default) 100 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 101 | #HTTPCACHE_ENABLED = True 102 | #HTTPCACHE_EXPIRATION_SECS = 0 103 | #HTTPCACHE_DIR = 'httpcache' 104 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 105 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 106 | -------------------------------------------------------------------------------- /QiuBai/qiubai/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/settings.pyc -------------------------------------------------------------------------------- /QiuBai/qiubai/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /QiuBai/qiubai/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/spiders/__init__.pyc -------------------------------------------------------------------------------- /QiuBai/qiubai/spiders/indexpage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | class QiuBaiItem(scrapy.Item): 5 | author = scrapy.Field() 6 | content = scrapy.Field() 7 | 8 | 9 | class QiuBaiSpider(scrapy.Spider): 10 | name="qiubai" 11 | url = "http://www.qiushibaike.com/8hr/page/" 12 | start_urls = [ url+str(i) for i in range(1,5)] 13 | 14 | def parse(self,response): 15 | item = QiuBaiItem() 16 | item['author'] = response.xpath('//div[@class="author clearfix"]//h2/text()').extract() 17 | item['content'] = response.xpath('//div[@class="content"]/span/text()').extract() 18 | ''' 19 | print "===============================================" *3 20 | print "author:",item['author'] 21 | print "content:",item['content'] 22 | print "===============================================" *3 23 | ''' 24 | yield item 25 | 26 | -------------------------------------------------------------------------------- /QiuBai/qiubai/spiders/indexpage.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai/spiders/indexpage.pyc -------------------------------------------------------------------------------- /QiuBai/qiubai_pachong_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/QiuBai/qiubai_pachong_01.png -------------------------------------------------------------------------------- /QiuBai/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = qiubai.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = qiubai 12 | 
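A per-post variant of the indexpage.py spider, offered only as a sketch (the post-container XPath is an assumption): yielding one item per post keeps each author paired with its content, so the pipeline no longer has to zip() two parallel lists and misalignment from a missing field is avoided.

# -*- coding: utf-8 -*-
import scrapy

class QiuBaiItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()

class QiuBaiPerPostSpider(scrapy.Spider):
    name = "qiubai_perpost"
    start_urls = ["http://www.qiushibaike.com/8hr/page/" + str(i) for i in range(1, 5)]

    def parse(self, response):
        # One item per post keeps author/content aligned even if a field is missing.
        for post in response.xpath('//div[contains(@class, "article")]'):
            item = QiuBaiItem()
            item['author'] = post.xpath('.//div[@class="author clearfix"]//h2/text()').extract_first()
            item['content'] = post.xpath('.//div[@class="content"]/span/text()').extract_first()
            yield item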
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapy-Project 2 | 3 | ## 《目录》 4 | 5 | 1. 糗事百科 QiuBai 6 | 7 | 2. 天气预报 WeatherReport 8 | 9 | 3. 第一路演 dyly 10 | 11 | 4. 克拉女神 kelagirl 12 | 13 | 5. 代理IP IpProxy 14 | 15 | 6. 肉丝写真图 rosiok 16 | 17 | 7. 煎蛋美女图 JianDan 18 | 19 | 8. 豆瓣高分书单 DouBan 20 | 21 | ![coding](Coding.gif) 22 | 23 | ## QiuBai 24 | 抓取糗事百科的作者和糗事内容,最后都放到EXCEL中保存。 25 | 26 | ## WeatherReport 27 | 魔都的天气变换太快,爬一下天气预报把每天和第二天的天气在下班前用邮件发出来。 28 | 29 | ## dyly 30 | 第一路演新闻抓取,从AJAX中获取对应news_id回调爬取函数。 31 | 32 | 33 | ## kelagirl 34 | 克拉女神专辑图片爬取。 35 | 36 | [DONE] 完成专辑图片爬取功能; 37 | 38 | 39 | ## IpProxy 40 | 爬取免费代理ip地址,验证后存库。 41 | 42 | [DONE] 收集整体免费代理网站地址; 43 | 44 | [DONE] 完成代理ip爬取; 45 | 46 | [DONE] 完成代理ip验证; 47 | 48 | [TODO] 完成代理ip存库工作; 49 | 50 | ## rosiok 51 | 爬取rosiok上的写真图片,借鉴别人代码; 52 | 53 | [DONE] 爬取写真图片; 54 | 55 | ## JianDan 56 | 爬取JianDan上的写真图片。 57 | 58 | [DONE] 爬取煎蛋网妹子图片,使用CrawlSpider进行深层爬取; 59 | 60 | ## DouBan 61 | 爬取豆瓣高分书单信息。 62 | [TODO] 爬取豆瓣信息; 63 | -------------------------------------------------------------------------------- /WeatherReport/README.md: -------------------------------------------------------------------------------- 1 | #【爬虫】上海天气 2 | ##魔都天气随机播放!! 3 | 每天下班前收到邮件看下外面天气顺便提醒下明天天气。 4 | 5 | ![](weather.jpg) 6 | -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/WeatherReport/__init__.py -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/WeatherReport/__init__.pyc -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WeatherreportItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class WeatherreportPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for WeatherReport project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'WeatherReport' 13 | 14 | SPIDER_MODULES = ['WeatherReport.spiders'] 15 | NEWSPIDER_MODULE = 'WeatherReport.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'WeatherReport (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'WeatherReport.middlewares.MyCustomSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'WeatherReport.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'WeatherReport.pipelines.SomePipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
-------------------------------------------------------------------------------- /WeatherReport/WeatherReport/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/WeatherReport/settings.pyc -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/WeatherReport/spiders/__init__.pyc -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/spiders/weather.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | import smtplib 5 | from email.mime.text import MIMEText 6 | 7 | class WeatherSpider(scrapy.Spider): 8 | name="weather" 9 | start_urls = [ 10 | "http://www.weather.com.cn/weather1d/101020100.shtml", 11 | ] 12 | 13 | 14 | def parse(self,response): 15 | Tod_Weather_Date = response.xpath('//*[@id="today"]/div[1]/ul/li[1]/h1/text()').extract() 16 | Tod_Weather_Wea = response.xpath('//*[@id="today"]/div[1]/ul/li[1]/p[1]/text()').extract() 17 | Tod_Weather_Tem = response.xpath('//*[@id="today"]/div[1]/ul/li[1]/p[2]/span/text()').extract() 18 | 19 | Tom_Weather_Date = response.xpath('//*[@id="today"]/div[1]/ul/li[2]/h1/text()').extract() 20 | Tom_Weather_Wea = response.xpath('//*[@id="today"]/div[1]/ul/li[2]/p[1]/text()').extract() 21 | Tom_Weather_Tem = response.xpath('//*[@id="today"]/div[1]/ul/li[2]/p[2]/span/text()').extract() 22 | 23 | lst = ['今天日期:' + Tod_Weather_Date[0].encode('utf-8'),"\n",'今天天气情况:'+ Tod_Weather_Wea[0].encode('utf-8'),"\n",'今天温度:' + Tod_Weather_Tem[0].encode('utf-8') + '℃',"\n","\n",'明天日期:' + Tom_Weather_Date[0].encode('utf-8'),"\n",'明天天气情况:'+ Tom_Weather_Wea[0].encode('utf-8'),"\n",'明天温度:' + Tom_Weather_Tem[0].encode('utf-8') + '℃'] 24 | 25 | mailto_list="liudehua@wenba100.com" 26 | mail_host="smtp.exmail.qq.com" 27 | mail_user="ldy@wenba100.com" 28 | mail_pass="XXXXXXXXXXXX" 29 | 30 | content = ''.join(lst) 31 | msg = MIMEText(content,_subtype='plain',_charset='utf-8') 32 | msg['Subject'] = "Two day's weather" 33 | msg['From'] = mail_user 34 | msg['To'] = mailto_list 35 | 36 | s = smtplib.SMTP() 37 | s.connect(mail_host) 38 | s.login(mail_user,mail_pass) 39 | s.sendmail(mail_user,mailto_list,msg.as_string()) 40 | s.close() 41 | -------------------------------------------------------------------------------- /WeatherReport/WeatherReport/spiders/weather.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/WeatherReport/spiders/weather.pyc -------------------------------------------------------------------------------- /WeatherReport/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = WeatherReport.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = WeatherReport 12 | -------------------------------------------------------------------------------- /WeatherReport/weather.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/WeatherReport/weather.jpg -------------------------------------------------------------------------------- /dyly/dyly/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/__init__.py -------------------------------------------------------------------------------- /dyly/dyly/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/__init__.pyc -------------------------------------------------------------------------------- /dyly/dyly/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DylyItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /dyly/dyly/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | USER_AGENTS = [ 5 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 6 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 7 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 8 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 9 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 10 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 11 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 12 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 13 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 15 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 16 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 
Kazehakase/0.4.5", 17 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 20 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 21 | ] 22 | 23 | class RandomUAMiddleware(object): 24 | def process_request(self,request,spider): 25 | request.headers["User-Agent"] = random.choice(USER_AGENTS) 26 | 27 | -------------------------------------------------------------------------------- /dyly/dyly/middlewares.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/middlewares.pyc -------------------------------------------------------------------------------- /dyly/dyly/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from openpyxl import Workbook 9 | 10 | class DylyPipeline(object): 11 | def __init__(self): 12 | self.wb = Workbook() 13 | self.ws = self.wb.active 14 | self.ws.append(['Titles','Contents']) 15 | 16 | 17 | def process_item(self, item, spider): 18 | line = item['title'] 19 | line2 = item['content'] 20 | 21 | for i,j in zip(line,line2): 22 | self.ws.append([i,j]) 23 | self.wb.save('dyly.xlsx') 24 | return item 25 | 26 | -------------------------------------------------------------------------------- /dyly/dyly/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dyly project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dyly' 13 | 14 | SPIDER_MODULES = ['dyly.spiders'] 15 | NEWSPIDER_MODULE = 'dyly.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'dyly (+http://www.yourdomain.com)' 20 | #USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 1.5 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'dyly.middlewares.MyCustomSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | DOWNLOADER_MIDDLEWARES = { 57 | 'dyly.middlewares.MyCustomDownloaderMiddleware': None, 58 | 'dyly.middlewares.RandomUAMiddleware' : 543, 59 | } 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'dyly.pipelines.DylyPipeline': 300, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 
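# Note: DOWNLOADER_MIDDLEWARES above enables dyly.middlewares.RandomUAMiddleware at priority 543 and
# excludes the unused MyCustomDownloaderMiddleware entry by mapping it to None. A minimal sketch for
# sanity-checking the middleware outside a full crawl, assuming Scrapy is installed and the dyly
# package is importable:
from scrapy.http import Request
from dyly.middlewares import RandomUAMiddleware

mw = RandomUAMiddleware()
req = Request('https://news.dyly.com/')
mw.process_request(req, spider=None)    # the middleware mutates the request in place
print(req.headers.get('User-Agent'))    # one of the USER_AGENTS strings, returned as bytes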
-------------------------------------------------------------------------------- /dyly/dyly/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/settings.pyc -------------------------------------------------------------------------------- /dyly/dyly/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /dyly/dyly/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/spiders/__init__.pyc -------------------------------------------------------------------------------- /dyly/dyly/spiders/news_dyly.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | import scrapy 4 | import json 5 | 6 | class dydlItem(scrapy.Item): 7 | title = scrapy.Field() 8 | content = scrapy.Field() 9 | 10 | 11 | class dydlSpider(scrapy.Spider): 12 | name = "dyly" 13 | 14 | start_urls = [ 15 | "https://news.dyly.com/", 16 | ] 17 | 18 | 19 | def start_requests(self): 20 | # get the news_id list via an AJAX POST; the callback then requests each article 21 | return [scrapy.http.FormRequest('https://news.dyly.com/getAppNewsList.do',method="POST",formdata={'ajax':'ajax','type':'news_primary','loginMethod':'wap','pageNo':'2'})] 22 | 23 | 24 | def parse(self,response): 25 | response_html = response.body 26 | result = json.loads(response_html) 27 | result_newsid = result['newsContent'] 28 | urls = ["https://news.dyly.com/news/detail/",] 29 | for i in result_newsid: 30 | # build the article URL from its objectId 31 | url = urls[0] + i['objectId'] + ".html" 32 | yield scrapy.Request(url,callback=self.parse_item) 33 | 34 | def parse_item(self,response): 35 | item = dydlItem() 36 | item['title'] = response.xpath('//article//section//header/text()').extract() 37 | content = response.xpath('//div//p//span/text()').extract() 38 | if content: 39 | item['content'] = content 40 | else: 41 | item['content'] = response.xpath('//div//p/text()').extract() 42 | 43 | ''' 44 | print "===============================================" *3 45 | print "title:",item['title'] 46 | print "content:",item['content'] 47 | print "===============================================" *3 48 | ''' 49 | yield item 50 | -------------------------------------------------------------------------------- /dyly/dyly/spiders/news_dyly.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/dyly/dyly/spiders/news_dyly.pyc -------------------------------------------------------------------------------- /dyly/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = dyly.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dyly 12 | 
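# Note: news_dyly.py above only ever POSTs pageNo=2 to the getAppNewsList.do AJAX endpoint and then
# follows each returned objectId to the article page. If more list pages were wanted, the same
# request could be yielded once per page; a minimal drop-in sketch for start_requests (the page
# range is an assumption, the endpoint may not serve that many pages):
def start_requests(self):
    for page in range(1, 6):            # illustrative range, not taken from the project
        yield scrapy.http.FormRequest(
            'https://news.dyly.com/getAppNewsList.do',
            method='POST',
            formdata={'ajax': 'ajax', 'type': 'news_primary',
                      'loginMethod': 'wap', 'pageNo': str(page)},
            callback=self.parse,
        )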
-------------------------------------------------------------------------------- /kelagirl/kelagirl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/kelagirl/kelagirl/__init__.py -------------------------------------------------------------------------------- /kelagirl/kelagirl/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class KelagirlItem(scrapy.Item): 12 | image_urls = scrapy.Field() 13 | -------------------------------------------------------------------------------- /kelagirl/kelagirl/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | USER_AGENTS = [ 5 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 6 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 7 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 8 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 9 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 10 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 11 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 12 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 13 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 15 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 16 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 17 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 20 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 21 | ] 22 | 23 | class RandomUAMiddleware(object): 24 | def process_request(self,request,spider): 25 | request.headers["User-Agent"] = random.choice(USER_AGENTS) 26 | 27 | -------------------------------------------------------------------------------- /kelagirl/kelagirl/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | 
import scrapy 10 | from scrapy.pipelines.images import ImagesPipeline 11 | from scrapy.exceptions import DropItem 12 | class KelagirlPipeline(ImagesPipeline): 13 | 14 | def get_media_requests(self, item, info): 15 | for image_url in item['image_urls']: 16 | yield scrapy.Request(image_url) 17 | 18 | def item_completed(self, results, item, info): 19 | image_paths = [x['path'] for ok, x in results if ok] 20 | if not image_paths: 21 | raise DropItem("Item contains no images") 22 | item['image_paths'] = image_paths 23 | return item 24 | -------------------------------------------------------------------------------- /kelagirl/kelagirl/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for kelagirl project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'kelagirl' 13 | 14 | SPIDER_MODULES = ['kelagirl.spiders'] 15 | NEWSPIDER_MODULE = 'kelagirl.spiders' 16 | 17 | IMAGES_STORE = '.' 18 | 19 | USERNAME = 'zhujialin' 20 | PASSWORD = '123jkluio' 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | #USER_AGENT = 'kelagirl (+http://www.yourdomain.com)' 24 | 25 | # Obey robots.txt rules 26 | ROBOTSTXT_OBEY = True 27 | 28 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 29 | #CONCURRENT_REQUESTS = 32 30 | 31 | # Configure a delay for requests for the same website (default: 0) 32 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 33 | # See also autothrottle settings and docs 34 | DOWNLOAD_DELAY = 1 35 | # The download delay setting will honor only one of: 36 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 37 | #CONCURRENT_REQUESTS_PER_IP = 16 38 | 39 | # Disable cookies (enabled by default) 40 | #COOKIES_ENABLED = False 41 | 42 | # Disable Telnet Console (enabled by default) 43 | #TELNETCONSOLE_ENABLED = False 44 | 45 | # Override the default request headers: 46 | #DEFAULT_REQUEST_HEADERS = { 47 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | # 'Accept-Language': 'en', 49 | #} 50 | 51 | # Enable or disable spider middlewares 52 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 53 | #SPIDER_MIDDLEWARES = { 54 | # 'kelagirl.middlewares.MyCustomSpiderMiddleware': 543, 55 | #} 56 | 57 | # Enable or disable downloader middlewares 58 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 59 | DOWNLOADER_MIDDLEWARES = { 60 | 'kelagirl.middlewares.RandomUAMiddleware' :543, 61 | 'kelagirl.middlewares.MyCustomDownloaderMiddleware': None, 62 | } 63 | 64 | # Enable or disable extensions 65 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 72 | ITEM_PIPELINES = { 73 | 'kelagirl.pipelines.KelagirlPipeline': 3, 74 | } 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | 
#AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | -------------------------------------------------------------------------------- /kelagirl/kelagirl/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /kelagirl/kelagirl/spiders/kela_pic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from kelagirl.items import KelagirlItem 5 | 6 | class KeLaGirlSpider(scrapy.Spider): 7 | name="kelagirl" 8 | start_urls = [ 9 | "http://www.kelagirls.com/zhuanji!findForIndexMoreTag.action?tagId=0&page=1", 10 | "http://www.kelagirls.com/zhuanji!findForIndexMoreTag.action?tagId=0&page=2", 11 | "http://www.kelagirls.com/zhuanji!findForIndexMoreTag.action?tagId=0&page=3", 12 | ] 13 | 14 | def parse(self,response): 15 | uid_list = response.xpath('//div[@class="zhuanjimorewrap"]/div/@pid').extract() 16 | urls = ["http://www.kelagirls.com/zhuanji!findForDetail.action?pid=",] 17 | for i in uid_list: 18 | url = urls[0] + i 19 | yield scrapy.Request(url,callback=self.parse_item) 20 | 21 | def parse_item(self,response): 22 | item = KelagirlItem() 23 | # continue on the detail page and collect the image URLs 24 | image_url = response.xpath('//div[@class="smallwrap"]/img/@src').extract() 25 | image_url_full = map(lambda x: 'http://www.kelagirls.com/'+x,image_url) 26 | 27 | item['image_urls'] = image_url_full 28 | yield item 29 | -------------------------------------------------------------------------------- /kelagirl/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = kelagirl.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = kelagirl 12 | -------------------------------------------------------------------------------- /rosiok/rosiok/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atiger77/ScrapyProject/19c1f477843fc51570a7f341b86e37a75d2e9143/rosiok/rosiok/__init__.py -------------------------------------------------------------------------------- /rosiok/rosiok/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped 
items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class RosiokItem(scrapy.Item): 12 | img_url = scrapy.Field() 13 | image_paths = scrapy.Field() 14 | -------------------------------------------------------------------------------- /rosiok/rosiok/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import scrapy 9 | from scrapy.exceptions import DropItem 10 | from scrapy.pipelines.images import ImagesPipeline 11 | 12 | class RosiooPipeline(ImagesPipeline): 13 | def get_media_requests(self, item, spider): 14 | for image_url in item['img_url']: 15 | yield scrapy.Request(image_url) 16 | 17 | def item_completed(self, results, item, spider): 18 | image_paths = [x['path'] for ok, x in results if ok] 19 | if not image_paths: 20 | raise DropItem("Item contains no images") 21 | item['image_paths'] = image_paths 22 | return results 23 | 24 | 25 | -------------------------------------------------------------------------------- /rosiok/rosiok/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for rosiok project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'rosiok' 13 | 14 | SPIDER_MODULES = ['rosiok.spiders'] 15 | NEWSPIDER_MODULE = 'rosiok.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'rosiok.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 
'rosiok.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'rosiok.pipelines.RosiooPipeline': 300, 69 | } 70 | 71 | IMAGES_STORE = '.' 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /rosiok/rosiok/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /rosiok/rosiok/spiders/rosiok_pic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from rosiok.items import RosiokItem 4 | 5 | class RosiokSpider(scrapy.Spider): 6 | name = "rosiok" 7 | allowed_domains = ["rosiok.com"] 8 | start_urls = ( 9 | 'http://www.rosiok.com/x/list_1_1.html', 10 | ) 11 | 12 | def parse2(self,response): 13 | for i in response.xpath('//div[@class="photo"]'): 14 | item = RosiokItem() 15 | item['img_url'] = i.xpath('img/@src').extract() 16 | yield item 17 | 18 | def parse(self, response): 19 | 20 | urls = response.xpath('//*[@id="imgBox"]/li/a/@href').extract() 21 | if urls: 22 | for url in urls: 23 | yield scrapy.Request(url=r"http://www.rosiok.com"+url, callback=self.parse2) 24 | 25 | next_page = response.xpath('//*[@class="cPage"]/li[last()]/a/@href') # next page 26 | if next_page: 27 | url = response.urljoin(next_page[0].extract()) 28 | yield scrapy.Request(url, self.parse) 29 | 30 | -------------------------------------------------------------------------------- /rosiok/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = rosiok.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = rosiok 12 | --------------------------------------------------------------------------------
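# Note: kelagirl and rosiok both feed image URLs to subclasses of Scrapy's ImagesPipeline and store
# the downloads under IMAGES_STORE = '.'. If thumbnails, size filtering or cache expiry were wanted,
# the stock pipeline already supports them through settings; a minimal sketch with illustrative
# values (not taken from the project):
IMAGES_STORE = './images'               # keep downloads out of the project root
IMAGES_THUMBS = {'small': (128, 128)}   # additionally write 128x128 thumbnails
IMAGES_MIN_WIDTH = 200                  # skip icons and other tiny images
IMAGES_EXPIRES = 30                     # re-download files older than 30 days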