├── MyNews
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── qqnews.py
│   │   ├── EastmoneyNews.py
│   │   ├── wangyinews.py
│   │   ├── peoplenews.py
│   │   ├── sinanews.py
│   │   ├── ifengnews.py
│   │   └── sohunews.py
│   ├── headers.py
│   ├── pipelines.py
│   ├── items.py
│   ├── settings.py
│   └── middlewares.py
├── example
│   ├── example
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   ├── myspider_redis.py
│   │   │   ├── dmoz.py
│   │   │   └── mycrawler_redis.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── items.py
│   ├── requirements.txt
│   ├── Dockerfile
│   ├── test.py
│   ├── docker-compose.yml
│   ├── .idea
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── NewsSpider.iml
│   │   └── workspace.xml
│   ├── scrapy.cfg
│   ├── README.rst
│   └── process_items.py
├── .idea
│   ├── vcs.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── MyNews.iml
├── scrapy.cfg
├── push.py
├── test.py
└── save2mysql.py
/MyNews/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/example/example/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/example/requirements.txt:
--------------------------------------------------------------------------------
1 | scrapy
2 | scrapy-redis
3 |
--------------------------------------------------------------------------------
/example/Dockerfile:
--------------------------------------------------------------------------------
1 | #@IgnoreInspection BashAddShebang
2 | FROM python:2.7-onbuild
3 |
4 | ENTRYPOINT ["scrapy"]
5 | CMD ["crawl", "dmoz"]
6 |
--------------------------------------------------------------------------------
/example/test.py:
--------------------------------------------------------------------------------
1 | import redis
2 |
3 | r = redis.Redis(host='10.36.131.52',port=6379)
4 |
5 | r.set('name','value')
6 | print(r.get('name'))
--------------------------------------------------------------------------------
/MyNews/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/example/docker-compose.yml:
--------------------------------------------------------------------------------
1 | redis:
2 | image: redis
3 | ports:
4 | - "6379:6379" # added port for external db provisioning
5 |
6 | crawler:
7 | build: .
8 | links:
9 | - redis:localhost
10 |
--------------------------------------------------------------------------------
/example/example/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # To create the first spider for your project use this command:
4 | #
5 | # scrapy genspider myspider myspider-domain.com
6 | #
7 | # For more info see:
8 | # http://doc.scrapy.org/topics/spiders.html
9 |
--------------------------------------------------------------------------------
/example/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # http://doc.scrapy.org/topics/scrapyd.html
5 |
6 | [settings]
7 | default = example.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = example
12 |
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = MyNews.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = MyNews
12 |
--------------------------------------------------------------------------------
/MyNews/headers.py:
--------------------------------------------------------------------------------
1 |
2 | qqheaders = {
3 | 'Host':'roll.news.qq.com',
4 | 'Connection':'keep-alive',
5 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
6 | 'Accept':'*/*',
7 | 'Referer':'http://www.qq.com/',
8 | 'Accept-Language':'zh-CN,zh;q=0.8',
9 | }
--------------------------------------------------------------------------------
/push.py:
--------------------------------------------------------------------------------
1 | import redis
2 |
3 |
4 | r = redis.Redis(host='10.36.131.52',port=6379)
5 |
6 | r.lpush('qqnews:start_urls','https://news.qq.com')
7 | r.lpush('wangyinews:start_urls','https://news.163.com')
8 | r.lpush('ifengnews:start_urls','https://news.ifeng.com')
9 | r.lpush('sohunews:start_urls','https://news.sohu.com')
10 | r.lpush('EastmoneyNews:start_urls','http://stock.eastmoney.com')
11 | r.lpush('sinanews:start_urls','http://news.sina.com.cn/roll/#pageid=153')
12 | r.lpush('peoplenews:start_urls','http://news.people.com.cn/')
--------------------------------------------------------------------------------
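
push.py above seeds one start URL per spider; each key matches that spider's ``redis_key``, and the RedisSpiders block on those lists until something is pushed. A minimal sketch (reusing the redis host from push.py, which is an assumption about your environment) for checking that the seeds actually landed in redis before the spiders are started:

    import redis

    r = redis.Redis(host='10.36.131.52', port=6379)

    # every RedisSpider pops its start URL from the matching <spidername>:start_urls list
    for key in ['qqnews:start_urls', 'wangyinews:start_urls', 'ifengnews:start_urls',
                'sohunews:start_urls', 'EastmoneyNews:start_urls',
                'sinanews:start_urls', 'peoplenews:start_urls']:
        print(key, r.llen(key), r.lrange(key, 0, -1))
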
/MyNews/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from datetime import datetime
8 |
9 |
10 | class MynewsPipeline(object):
11 | def process_item(self, item, spider):
12 | item["crawled"] = datetime.utcnow()
13 | item["spider"] = spider.name
14 |         item["body"] = item["body"].strip()
15 |         # pubtime may be None when a spider could not extract it, so guard before cleaning it
16 |         if item["pubtime"]:
17 |             item["pubtime"] = item["pubtime"].replace('来源: ', '').strip()
18 |         return item
--------------------------------------------------------------------------------
/example/example/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: http://doc.scrapy.org/topics/item-pipeline.html
5 | from datetime import datetime
6 |
7 | # class ExamplePipeline(object):
8 | # def process_item(self, item, spider):
9 | # item["crawled"] = datetime.utcnow()
10 | # item["spider"] = spider.name
11 | # return item
12 |
13 | class ExamplePipeline(object):
14 | def process_item(self, item, spider):
15 | item["crawled"] = datetime.utcnow()
16 | item["spider"] = spider.name
17 | return item
--------------------------------------------------------------------------------
/example/example/spiders/myspider_redis.py:
--------------------------------------------------------------------------------
1 | from scrapy_redis.spiders import RedisSpider
2 |
3 |
4 | class MySpider(RedisSpider):
5 | """Spider that reads urls from redis queue (myspider:start_urls)."""
6 | name = 'myspider_redis'
7 | redis_key = 'myspider:start_urls'
8 |
9 | def __init__(self, *args, **kwargs):
10 | # Dynamically define the allowed domains list.
11 | domain = kwargs.pop('domain', '')
12 | self.allowed_domains = filter(None, domain.split(','))
13 | super(MySpider, self).__init__(*args, **kwargs)
14 |
15 | def parse(self, response):
16 | return {
17 | 'name': response.css('title::text').extract_first(),
18 | 'url': response.url,
19 | }
20 |
--------------------------------------------------------------------------------
/MyNews/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 | from scrapy.item import Item, Field
8 | from scrapy.loader import ItemLoader
9 | from scrapy.loader.processors import MapCompose, TakeFirst, Join
10 |
11 |
12 |
13 | class NewsItem(Item):
14 | # define the fields for your item here like:
15 |     title = Field()      # headline
16 |     body = Field()       # article body
17 | url = Field()
18 | refer = Field()
19 | tag = Field()
20 |     pubtime = Field()    # publish time
21 | crawled = Field()
22 | spider = Field()
23 |
24 |
25 |
26 |
27 | class ExampleItem(Item):
28 | name = Field()
29 | description = Field()
30 | link = Field()
31 | crawled = Field()
32 | spider = Field()
33 | url = Field()
34 |
35 |
36 | class ExampleLoader(ItemLoader):
37 | default_item_class = ExampleItem
38 | default_input_processor = MapCompose(lambda s: s.strip())
39 | default_output_processor = TakeFirst()
40 | description_out = Join()
41 |
--------------------------------------------------------------------------------
/example/example/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for example project
2 | #
3 | # For simplicity, this file contains only the most important settings by
4 | # default. All the other settings are documented here:
5 | #
6 | # http://doc.scrapy.org/topics/settings.html
7 | #
8 | SPIDER_MODULES = ['example.spiders']
9 | NEWSPIDER_MODULE = 'example.spiders'
10 |
11 | USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'
12 |
13 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
14 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
15 | SCHEDULER_PERSIST = True
16 | #SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
17 | #SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
18 | #SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
19 |
20 | ITEM_PIPELINES = {
21 | 'example.pipelines.ExamplePipeline': 300,
22 | 'scrapy_redis.pipelines.RedisPipeline': 400,
23 | }
24 |
25 | LOG_LEVEL = 'DEBUG'
26 |
27 | # Introduce an artificial delay to make use of parallelism and to speed up
28 | # the crawl.
29 | DOWNLOAD_DELAY = 1
30 |
31 | # REDIS_HOST = '10.36.131.52'
32 | # REDIS_PORT = 6379
33 |
34 |
35 | REDIS_URL = 'redis://10.36.131.52:6379'
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | import lxml
5 | import lxml.etree
6 | import requests
7 |
8 |
9 | headers = {
10 | 'Host':'roll.news.qq.com',
11 | 'Connection':'keep-alive',
12 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
13 | 'Accept':'*/*',
14 | 'Referer':'http://www.qq.com/',
15 | # 'Accept-Encoding':'gzip, deflate, sdch',
16 | 'Accept-Language':'zh-CN,zh;q=0.8',
17 | }
18 |
19 | response = requests.get('http://yule.sohu.com/_scroll_newslist/20180420/news.inc')
20 | response.encoding = 'utf8'
21 | html = response.text
22 | # html = html.replace('var newsJason = ','')
23 | re_str1 = re.compile(r"item:\[(.*)\]")
24 | info = re_str1.findall(html)[0]
25 | # info = info.split(',')
26 | re_str2 = re.compile(r"\[(.*?),\"(.*?)\",\"(.*?)\",\"(.*?)\"]")
27 | infolist = re_str2.findall(info)
28 | for info in infolist:
29 | print(info[0])
30 | print(info[1])
31 | print(info[2])
32 | print(info[3])
33 | # info = dict(html)
34 | # print(info)
35 | # info = json.loads(html)
36 |
37 | # html = lxml.etree.HTML(response.text)
38 | # print(response.text)
39 | # info = html.xpath("//div[@id='c06']/table/tr")
40 | # print(info)
41 |
42 |
--------------------------------------------------------------------------------
/example/example/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # http://doc.scrapy.org/topics/items.html
5 |
6 | from scrapy.item import Item, Field
7 | from scrapy.loader import ItemLoader
8 | from scrapy.loader.processors import MapCompose, TakeFirst, Join
9 |
10 |
11 | class ExampleItem(Item):
12 | name = Field()
13 | description = Field()
14 | link = Field()
15 | crawled = Field()
16 | spider = Field()
17 | url = Field()
18 |
19 | class LengzhishiItem(Item):
20 | # define the fields for your item here like:
21 | # name = scrapy.Field()
22 | content = Field()
23 | crawled = Field()
24 | spider = Field()
25 |
26 |
27 | class JdspiderItem(Item):
28 | # define the fields for your item here like:
29 | # name = scrapy.Field()
30 | name = Field()
31 | content = Field()
32 | url = Field()
33 | crawled = Field()
34 | spider = Field()
35 |
36 |
37 | class LengzhishiLoader(ItemLoader):
38 | default_item_class = LengzhishiItem
39 | default_input_processor = MapCompose(lambda s: s.strip())
40 | default_output_processor = TakeFirst()
41 | description_out = Join()
42 |
43 | class ExampleLoader(ItemLoader):
44 | default_item_class = ExampleItem
45 | default_input_processor = MapCompose(lambda s: s.strip())
46 | default_output_processor = TakeFirst()
47 | description_out = Join()
48 |
--------------------------------------------------------------------------------
/example/example/spiders/dmoz.py:
--------------------------------------------------------------------------------
1 | from scrapy.linkextractors import LinkExtractor
2 | from scrapy.spiders import CrawlSpider, Rule
3 | from ..items import LengzhishiItem
4 |
5 |
6 | class DmozSpider(CrawlSpider):
7 | """Follow categories and extract links."""
8 | name = 'dmoz'
9 | # allowed_domains = ['dmoz.org']
10 | # start_urls = ['http://www.dmoz.org/']
11 | #
12 | # rules = [
13 | # Rule(LinkExtractor(
14 | # restrict_css=('.top-cat', '.sub-cat', '.cat-item')
15 | # ), callback='parse_directory', follow=True),
16 | # ]
17 | #
18 | # def parse_directory(self, response):
19 | # for div in response.css('.title-and-desc'):
20 | # yield {
21 | # 'name': div.css('.site-title::text').extract_first(),
22 | # 'description': div.css('.site-descr::text').extract_first().strip(),
23 | # 'link': div.css('a::attr(href)').extract_first(),
24 | # }
25 | allowed_domains = ['lengdou.net']
26 | start_urls = ['http://lengdou.net/']
27 |
28 | linkextractor = LinkExtractor(allow=(r'/topic/\d+',))
29 | rules = [
30 | Rule(linkextractor,callback="parseContent",follow=True)
31 | ]
32 |
33 | def parseContent(self, response):
34 | content = response.xpath("//p[@class='topic-content']/text()").extract()[0]
35 | item = LengzhishiItem()
36 | item['content'] = content
37 | yield item
38 |
--------------------------------------------------------------------------------
/example/README.rst:
--------------------------------------------------------------------------------
1 | ============================
2 | Scrapy Redis Example Project
3 | ============================
4 |
5 |
6 | This directory contains an example Scrapy project integrated with scrapy-redis.
7 | By default, all items are sent to redis (key ``<spider>:items``). All spiders
8 | schedule requests through redis, so you can start additional spiders to speed
9 | up the crawling.
10 |
11 | Spiders
12 | -------
13 |
14 | * **dmoz**
15 |
16 | This spider simply scrapes dmoz.org.
17 |
18 | * **myspider_redis**
19 |
20 | This spider uses redis as a shared requests queue and uses
21 | ``myspider:start_urls`` as start URLs seed. For each URL, the spider outputs
22 | one item.
23 |
24 | * **mycrawler_redis**
25 |
26 | This spider uses redis as a shared requests queue and uses
27 | ``mycrawler:start_urls`` as start URLs seed. For each URL, the spider follows
28 | all links.
29 |
30 |
31 | .. note::
32 |
33 | All requests are persisted by default. You can clear the queue by using the
34 | ``SCHEDULER_FLUSH_ON_START`` setting. For example: ``scrapy crawl dmoz -s
35 | SCHEDULER_FLUSH_ON_START=1``.
36 |
37 |
38 | Processing items
39 | ----------------
40 |
41 | The ``process_items.py`` script provides an example of consuming the items queue::
42 |
43 | python process_items.py --help
44 |
45 |
46 | Run via Docker
47 | --------------
48 |
49 | You require the following applications:
50 |
51 | * docker (https://docs.docker.com/installation/)
52 | * docker-compose (https://docs.docker.com/compose/install/)
53 |
54 | For implementation details see `Dockerfile` and `docker-compose.yml` and read
55 | official docker documentation.
56 |
57 | 1. To start sample `example-project` (`-d` for daemon)::
58 |
59 | docker-compose up
60 |
61 | 2. To scale `crawler` (4 instances for example)::
62 |
63 | docker-compose scale crawler=4
64 |
--------------------------------------------------------------------------------
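
To complement the README above, a minimal sketch of seeding the two redis-backed example spiders (assuming the redis instance from example/example/settings.py, redis://10.36.131.52:6379, is reachable; any URL works as a seed, lengdou.net is simply the site the dmoz spider already targets):

    import redis

    r = redis.Redis(host='10.36.131.52', port=6379)

    # myspider_redis and mycrawler_redis block on these keys until a URL is pushed
    r.lpush('myspider:start_urls', 'http://lengdou.net/')
    r.lpush('mycrawler:start_urls', 'http://lengdou.net/')
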
/save2mysql.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import pymysql
4 | import redis
5 |
6 |
7 |
8 | def saveitem(r, conn, key):
9 |     # pop one serialized item from the given <spider>:items list, if there is any
10 |     ret = r.blpop(key, timeout=1)
11 |     if ret is None:
12 |         return
13 |     source, data = ret
14 |     item = json.loads(data)
15 |     try:
16 |         with conn.cursor() as cur:
17 |             # parameterized values, so quotes inside the body do not break the insert
18 |             cur.execute("insert into mynews (title,pubtime,url,tag,refer,body) VALUES (%s,%s,%s,%s,%s,%s)",
19 |                         (item['title'], item['pubtime'], item['url'], item['tag'], item['refer'], item['body']))
20 |         conn.commit()
21 |         print("inserted %s" % item['title'])
22 |     except Exception as e:
23 |         print(e)
24 |
25 |
26 |
27 | conn = pymysql.connect(host='127.0.0.1',port=3306,user='root',password='123456',database='news',charset='utf8')
28 |
29 | r = redis.Redis(host='10.36.131.52',port=6379)
30 |
31 |
32 |
33 | sql = '''
34 | drop table if exists mynews;
35 | '''
36 | with conn.cursor() as cur:
37 | cur.execute(sql)
38 | conn.commit()
39 |
40 | sql = '''
41 | create table mynews(
42 | id int PRIMARY KEY not null AUTO_INCREMENT,
43 | title VARCHAR(200),
44 | pubtime VARCHAR(130),
45 | body Text,
46 | url VARCHAR(250),
47 | tag VARCHAR(30),
48 | refer VARCHAR(30)
49 | );
50 | '''
51 |
52 | with conn.cursor() as cur:
53 | cur.execute(sql)
54 | conn.commit()
55 |
56 | conn.close()
57 |
58 | while True:
59 | conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', database='news', charset='utf8')
60 | saveitem(r, conn, 'qqnews:items')
61 | saveitem(r, conn, 'ifengnews:items')
62 | saveitem(r, conn, 'sohunews:items')
63 | saveitem(r, conn, 'wangyinews:items')
64 | saveitem(r, conn, 'EastmoneyNews:items')
65 | saveitem(r, conn, 'sinanews:items')
66 | saveitem(r, conn, 'peoplenews:items')
67 | conn.close()
68 |
69 |
70 |
--------------------------------------------------------------------------------
/MyNews/spiders/qqnews.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from scrapy_redis.spiders import RedisSpider
4 | from MyNews.items import NewsItem
5 | from scrapy import Request
6 | from ..headers import qqheaders
7 |
8 | class qqnewsSpider(RedisSpider):
9 | """Spider that reads urls from redis queue (qqnews:start_urls)."""
10 |
11 | '''
12 | start_url = "http://ent.qq.com/articleList/rolls/"
13 | '''
14 | name = 'qqnews'
15 | redis_key = 'qqnews:start_urls'
16 | taglist = ['ent','sports','finance','tech']
17 |
18 | def __init__(self, *args, **kwargs):
19 |         # Dynamically define the allowed domains list (override with -a domain=...).
20 |         domain = kwargs.pop('domain', 'news.qq.com')
21 |         self.allowed_domains = list(filter(None, domain.split(',')))
22 | super(qqnewsSpider, self).__init__(*args, **kwargs)
23 |
24 | def parse(self,response):
25 | for tag in self.taglist:
26 | url = 'http://roll.news.qq.com/interface/cpcroll.php?site='+tag+'&mode=1&cata=&date=2018-04-20&page=1'
27 | yield Request(url,callback=self.parsepage,headers=qqheaders,meta={'tag':tag},dont_filter=True)
28 |
29 | def parsepage(self,response):
30 | tag = response.meta['tag']
31 | newsjson = json.loads(response.text)
32 | newslist = newsjson['data']['article_info']
33 | for news in newslist:
34 | url = news['url']
35 | meta = {
36 | 'tag':tag,
37 | 'title':news['title'],
38 | 'pubtime':news['time'],
39 | }
40 | yield Request(url,callback=self.parsebody,dont_filter=True,meta=meta)
41 |
42 |
43 | def parsebody(self,response):
44 | meta = response.meta
45 |
46 | item = NewsItem()
47 | item['tag'] = meta['tag']
48 | item['title']= meta['title']
49 | item['url'] = response.url
50 | item['body'] = '\n'.join(response.xpath("//div[@id='Cnt-Main-Article-QQ']/p[@class='text']/text()").extract())
51 | item['pubtime'] = meta['pubtime']
52 | item['refer'] = '腾讯新闻'
53 | if not item['body'] == '':
54 | yield item
55 |
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/MyNews/spiders/EastmoneyNews.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | import json
5 |
6 | from scrapy_redis.spiders import RedisSpider
7 | from MyNews.items import NewsItem
8 | from scrapy import Request
9 | from ..headers import qqheaders
10 | import requests
11 | from lxml import etree
12 |
13 | class EastmoneyNewsSpider(RedisSpider):
14 | """Spider that reads urls from redis queue (qqnews:start_urls)."""
15 |
16 | '''
17 | start_url = "http://stock.eastmoney.com/news/cgszb.html"
18 | '''
19 | name = 'EastmoneyNews'
20 | redis_key = 'EastmoneyNews:start_urls'
21 | taglist = ['ent','sports','finance','tech']
22 |
23 | def __init__(self, *args, **kwargs):
24 |         # Dynamically define the allowed domains list (override with -a domain=...).
25 |         domain = kwargs.pop('domain', 'stock.eastmoney.com')
26 |         self.allowed_domains = list(filter(None, domain.split(',')))
27 | super(EastmoneyNewsSpider, self).__init__(*args, **kwargs)
28 |
29 | def parse(self,response):
30 |         # get the total number of pages
31 | r = requests.get('http://stock.eastmoney.com/news/cgszb.html')
32 | html = etree.HTML(r.text)
33 | list = html.xpath("//div[@id='pagerNoDiv']//a/text()")
34 | page_total = int(list[-2])
35 |         # loop over every page
36 | for i in range(1,page_total+1):
37 |             # build the url of each listing page
38 | url = 'http://stock.eastmoney.com/news/cgszb_'+str(i)+'.html'
39 | yield Request(url,callback=self.parsepage,dont_filter=True)
40 |
41 | def parsepage(self,response):
42 |         # collect the urls of the article detail pages
43 | post_url_list = response.xpath("//ul[@id='newsListContent']//p[@class='title']/a/@href").extract()
44 |         # follow every article url
45 | for post_url in post_url_list:
46 | yield Request(post_url, callback=self.parsebody, meta={'tag':'finance'},dont_filter=True)
47 |
48 |     # extract the article data from the detail page
49 | def parsebody(self,response):
50 | meta = response.meta
51 |
52 | item = NewsItem()
53 | item['tag'] = meta['tag']
54 | item['title']= response.xpath("//div[@class='main_left']//div[@class='newsContent']/h1/text()").extract()[0]
55 | item['url'] = response.url
56 | item['body'] = '\n'.join(response.xpath("//div[@class='newsContent']//div[@id='ContentBody']//p").xpath('string(.)').extract()).strip()
57 | item['pubtime'] = response.xpath("//div[@class='time-source']//div[@class='time']/text()").extract()[0]
58 | item['refer'] = '东方财富'
59 | yield item
--------------------------------------------------------------------------------
/MyNews/spiders/wangyinews.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from scrapy_redis.spiders import RedisSpider
4 | from MyNews.items import NewsItem
5 | from scrapy import Request
6 | # from ..headers import qqheaders
7 |
8 | class wangyinewsSpider(RedisSpider):
9 | """Spider that reads urls from redis queue (qqnews:start_urls)."""
10 |
11 | name = 'wangyinews'
12 | redis_key = 'wangyinews:start_urls'
13 | taglist = ['ent','sports','tech']
14 |
15 | def __init__(self, *args, **kwargs):
16 |         # Dynamically define the allowed domains list (override with -a domain=...).
17 |         domain = kwargs.pop('domain', 'news.163.com')
18 |         self.allowed_domains = list(filter(None, domain.split(',')))
19 | super(wangyinewsSpider, self).__init__(*args, **kwargs)
20 |
21 | def parse(self,response):
22 | monenyurl = 'http://money.163.com/special/002526BH/rank.html'
23 | yield Request(monenyurl,callback=self.parsepage,meta={'tag':'finance'},dont_filter=True)
24 |
25 | for tag in self.taglist:
26 | url = 'http://news.163.com/special/0001386F/rank_'+tag+'.html'
27 | yield Request(url,callback=self.parsepage,meta={'tag':tag},dont_filter=True)
28 |
29 | def parsepage(self,response):
30 |
31 | tag = response.meta['tag']
32 | # newsjson = json.loads(response.text)
33 | newslist = response.xpath("//div[@class='tabContents active']/table/tr/td/a")
34 | for news in newslist:
35 | url = news.xpath("./@href").extract()[0]
36 | meta = {
37 | 'tag':tag,
38 | 'title':news.xpath("./text()").extract()[0],
39 | }
40 |
41 |
42 | yield Request(url,callback=self.parsebody,dont_filter=True,meta=meta)
43 |
44 |
45 | def parsebody(self,response):
46 | meta = response.meta
47 |
48 | item = NewsItem()
49 | item['tag'] = meta['tag']
50 | item['title']= meta['title']
51 | item['url'] = response.url
52 | item['refer'] = '网易新闻'
53 | item['body'] = '\n'.join(response.xpath("//div[@id='epContentLeft']/div[@class='post_body']/div[@id='endText']/p/text()").extract())
54 | pubtime = response.xpath("//div[@id='epContentLeft']/div[@class='post_time_source']/text()[1]").extract()
55 | if pubtime:
56 | item['pubtime'] = pubtime[0]
57 | else:
58 | item['pubtime'] = None
59 | if not item['body'] == '':
60 | yield item
61 |
62 |
63 |
64 |
65 |
66 |
--------------------------------------------------------------------------------
/MyNews/spiders/peoplenews.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import time
4 | from scrapy_redis.spiders import RedisSpider
5 | from MyNews.items import NewsItem
6 | from scrapy import Request
7 | from ..headers import qqheaders
8 |
9 | class PeopleNewsSpider(RedisSpider):
10 | """Spider that reads urls from redis queue (qqnews:start_urls)."""
11 |
12 | '''
13 | start_url = "http://news.people.com.cn/"
14 | '''
15 | name = 'peoplenews'
16 | redis_key = 'peoplenews:start_urls'
17 | tags_list = ['ent','sports','finance','tech']
18 |
19 | def __init__(self, *args, **kwargs):
20 |         # Dynamically define the allowed domains list (override with -a domain=...).
21 |         domain = kwargs.pop('domain', 'news.people.com.cn')
22 |         self.allowed_domains = list(filter(None, domain.split(',')))
23 | super(PeopleNewsSpider, self).__init__(*args, **kwargs)
24 |
25 | def parse(self,response):
26 |         # get the total number of pages:
27 | # page_total = int(response.xpath('//div[@id="Pagination"]//span[last()-1]').extract())
28 |         # current unix timestamp, used as a query parameter
29 | now = int(time.time())
30 | url = 'http://news.people.com.cn/210801/211150/index.js?_='+str(now)
31 | yield Request(url, callback=self.parsepage_url, dont_filter=True)
32 |
33 |
34 | def parsepage_url(self,response):
35 | newsjson = json.loads(response.text)
36 | newslist = newsjson['items']
37 | for news in newslist:
38 | post_title = news['title']
39 | pub_time = news['date']
40 | post_url = news['url']
41 | if 'ent' in post_url:
42 | yield Request(post_url, callback=self.parsebody, meta={'tag':'ent','title':post_title,'pubtime':pub_time},dont_filter=True)
43 | if 'sports' in post_url:
44 |                 yield Request(post_url, callback=self.parsebody, meta={'tag':'sports','title':post_title,'pubtime':pub_time},dont_filter=True)
45 | if 'finance' in post_url:
46 | yield Request(post_url, callback=self.parsebody, meta={'tag':'finance','title':post_title,'pubtime':pub_time},dont_filter=True)
47 | if 'it' in post_url:
48 | yield Request(post_url, callback=self.parsebody, meta={'tag':'tech','title':post_title,'pubtime':pub_time},dont_filter=True)
49 |
50 |
51 | def parsebody(self,response):
52 | meta = response.meta
53 |
54 | item = NewsItem()
55 | item['tag'] = meta['tag']
56 | item['title']= meta['title']
57 | item['url'] = response.url
58 | item['body'] = ''.join(response.xpath('//div[@class="fl text_con_left"]//div[@id="rwb_zw"]//p/text()').extract()).replace('\t','')
59 | item['pubtime'] = meta['pubtime']
60 | item['refer'] = '人民网'
61 | yield item
62 |
63 |
64 |
--------------------------------------------------------------------------------
/MyNews/spiders/sinanews.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import time
4 | from scrapy_redis.spiders import RedisSpider
5 | from MyNews.items import NewsItem
6 | from scrapy import Request
7 | from ..headers import qqheaders
8 |
9 | class SinaNewsSpider(RedisSpider):
10 | """Spider that reads urls from redis queue (sinanews:start_urls)."""
11 |
12 | '''
13 | start_url = "http://feed.mix.sina.com.cn/api/roll/get?pageid=153"
14 | '''
15 | name = 'sinanews'
16 | redis_key = 'sinanews:start_urls'
17 | tags_list = ['ent','sports','finance','tech']
18 | tagnum_list = [2513,2512,2516,2515]
19 |
20 | def __init__(self, *args, **kwargs):
21 |         # Dynamically define the allowed domains list (override with -a domain=...).
22 |         domain = kwargs.pop('domain', 'news.sina.com.cn')
23 |         self.allowed_domains = list(filter(None, domain.split(',')))
24 | super(SinaNewsSpider, self).__init__(*args, **kwargs)
25 |
26 | def parse(self,response):
27 | for j in range(10):
28 | for i in self.tagnum_list:
29 | if i == 2513:
30 | url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid='+str(i)+'&k=&num=50&page='+str(j)
31 | yield Request(url, callback=self.parsepage, meta={'tag':'ent'}, dont_filter=True)
32 | if i == 2512:
33 | url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=' + str(i) + '&k=&num=50&page=' + str(j)
34 | yield Request(url, callback=self.parsepage, meta={'tag': 'sports'}, dont_filter=True)
35 | if i == 2516:
36 | url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=' + str(
37 | i) + '&k=&num=50&page=' + str(j)
38 | yield Request(url, callback=self.parsepage, meta={'tag': 'finance'}, dont_filter=True)
39 | if i == 2515:
40 | url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=' + str(
41 | i) + '&k=&num=50&page=' + str(j)
42 | yield Request(url, callback=self.parsepage, meta={'tag': 'tech'}, dont_filter=True)
43 |
44 | def parsepage(self,response):
45 | tag = response.meta['tag']
46 | newsjson = json.loads(response.text)
47 | newslist = newsjson['result']['data']
48 | for news in newslist:
49 | post_url = news['url']
50 |             # convert the ctime timestamp to local time
51 | time_local = time.localtime(int(news['ctime']))
52 |             # reformat as e.g. 2016-05-05 20:28:54
53 | dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
54 | meta = {
55 | 'tag': tag,
56 | 'title': news['title'],
57 | 'pubtime': dt,
58 | }
59 | yield Request(post_url, callback=self.parsebody, meta=meta,dont_filter=True)
60 |
61 | def parsebody(self,response):
62 | meta = response.meta
63 |
64 | item = NewsItem()
65 | item['tag'] = meta['tag']
66 | item['title']= meta['title']
67 | item['url'] = response.url
68 | item['body'] = '\n'.join(response.xpath("//div[@class='article']//p[position()>4]/text()").extract())
69 | item['pubtime'] = meta['pubtime']
70 | item['refer'] = '新浪新闻'
71 | yield item
72 |
73 |
74 |
--------------------------------------------------------------------------------
/example/process_items.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # -*- coding: utf-8 -*-
4 | """A script to process items from a redis queue."""
5 | from __future__ import print_function, unicode_literals
6 |
7 | import argparse
8 | import json
9 | import logging
10 | import pprint
11 | import sys
12 | import time
13 |
14 | from scrapy_redis import get_redis
15 |
16 |
17 | logger = logging.getLogger('process_items')
18 |
19 |
20 | def process_items(r, keys, timeout, limit=0, log_every=1000, wait=.1):
21 | """Process items from a redis queue.
22 |
23 | Parameters
24 | ----------
25 | r : Redis
26 | Redis connection instance.
27 | keys : list
28 | List of keys to read the items from.
29 | timeout: int
30 | Read timeout.
31 |
32 | """
33 | limit = limit or float('inf')
34 | processed = 0
35 | while processed < limit:
36 | # Change ``blpop`` to ``brpop`` to process as LIFO.
37 | ret = r.blpop(keys, timeout)
38 |         # If nothing was popped before the timeout, wait a bit and retry.
39 | if ret is None:
40 | time.sleep(wait)
41 | continue
42 |
43 | source, data = ret
44 | try:
45 | item = json.loads(data)
46 | except Exception:
47 | logger.exception("Failed to load item:\n%r", pprint.pformat(data))
48 | continue
49 |
50 | try:
51 | name = item.get('name') or item.get('title')
52 | url = item.get('url') or item.get('link')
53 | logger.debug("[%s] Processing item: %s <%s>", source, name, url)
54 | except KeyError:
55 | logger.exception("[%s] Failed to process item:\n%r",
56 | source, pprint.pformat(item))
57 | continue
58 |
59 | processed += 1
60 | if processed % log_every == 0:
61 | logger.info("Processed %s items", processed)
62 |
63 |
64 | def main():
65 | parser = argparse.ArgumentParser(description=__doc__)
66 | parser.add_argument('key', help="Redis key where items are stored")
67 | parser.add_argument('--host')
68 | parser.add_argument('--port')
69 | parser.add_argument('--timeout', type=int, default=5)
70 | parser.add_argument('--limit', type=int, default=0)
71 | parser.add_argument('--progress-every', type=int, default=100)
72 | parser.add_argument('-v', '--verbose', action='store_true')
73 |
74 | args = parser.parse_args()
75 |
76 | params = {}
77 | if args.host:
78 | params['host'] = args.host
79 | if args.port:
80 | params['port'] = args.port
81 |
82 | logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
83 |
84 | r = get_redis(**params)
85 | host = r.connection_pool.get_connection('info').host
86 | logger.info("Waiting for items in '%s' (server: %s)", args.key, host)
87 | kwargs = {
88 | 'keys': [args.key],
89 | 'timeout': args.timeout,
90 | 'limit': args.limit,
91 | 'log_every': args.progress_every,
92 | }
93 | try:
94 | process_items(r, **kwargs)
95 | retcode = 0 # ok
96 | except KeyboardInterrupt:
97 | retcode = 0 # ok
98 | except Exception:
99 | logger.exception("Unhandled exception")
100 | retcode = 2
101 |
102 | return retcode
103 |
104 |
105 | if __name__ == '__main__':
106 | sys.exit(main())
107 |
--------------------------------------------------------------------------------
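
As hinted in the README, process_items.py can drain any of the ``<spider>:items`` lists produced by the RedisPipeline; for example (the key name and host below are taken from this project's settings.py and save2mysql.py, so adjust them to your setup)::

    python process_items.py qqnews:items --host 10.36.131.52 -v
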
/MyNews/spiders/ifengnews.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from scrapy_redis.spiders import RedisSpider
4 | from MyNews.items import NewsItem
5 | from scrapy import Request
6 | # from ..headers import qqheaders
7 |
8 | class ifengnewsSpider(RedisSpider):
9 | """Spider that reads urls from redis queue (qqnews:start_urls)."""
10 |
11 | name = 'ifengnews'
12 | redis_key = 'ifengnews:start_urls'
13 |
14 | def __init__(self, *args, **kwargs):
15 |         # Dynamically define the allowed domains list (override with -a domain=...).
16 |         domain = kwargs.pop('domain', 'news.ifeng.com')
17 |         self.allowed_domains = list(filter(None, domain.split(',')))
18 | super(ifengnewsSpider, self).__init__(*args, **kwargs)
19 |
20 | def parse(self,response):
21 | baseurl = 'http://news.ifeng.com/hotnews/'
22 | yield Request(baseurl,callback=self.parsepage,dont_filter=True)
23 |
24 |
25 | def parsepage(self,response):
26 | sportslist = response.xpath("//div[@id='c06']/table/tr")[1:-1]
27 | entlist = response.xpath("//div[@id='c10']/table/tr")[1:-1]
28 | financelist = response.xpath("//div[@id='c07']/table/tr")[1:-1]
29 |
30 | for news in sportslist:
31 | url = news.xpath("./td/h3/a/@href").extract()[0]
32 | pubtime = news.xpath("./td[4]/text()").extract()
33 | if pubtime:
34 | pubtime = pubtime[0]
35 | else:
36 | pubtime = None
37 | meta = {
38 | 'title':news.xpath("./td/h3/a/text()").extract()[0],
39 | 'pubtime':pubtime,
40 | 'tag':'sports'
41 | }
42 | yield Request(url,callback=self.parsebody,dont_filter=True,meta=meta)
43 |
44 | for news in entlist:
45 | url = news.xpath("./td/h3/a/@href").extract()[0]
46 | pubtime = news.xpath("./td[4]/text()").extract()
47 | if pubtime:
48 | pubtime = pubtime[0]
49 | else:
50 | pubtime = None
51 | meta = {
52 | 'title':news.xpath("./td/h3/a/text()").extract()[0],
53 | 'pubtime':pubtime,
54 | 'tag':'ent'
55 | }
56 | yield Request(url,callback=self.parsebody,dont_filter=True,meta=meta)
57 |
58 | for news in financelist:
59 | url = news.xpath("./td/h3/a/@href").extract()[0]
60 | pubtime = news.xpath("./td[4]/text()").extract()
61 | if pubtime:
62 | pubtime = pubtime[0]
63 | else:
64 | pubtime = None
65 | meta = {
66 | 'title':news.xpath("./td/h3/a/text()").extract()[0],
67 | 'pubtime':pubtime,
68 | 'tag':'finance'
69 | }
70 | yield Request(url,callback=self.parsebody,dont_filter=True,meta=meta)
71 |
72 |
73 | def parsebody(self,response):
74 | meta = response.meta
75 |
76 | if not meta['pubtime']:
77 | pubtime = response.xpath("//div[@id='artical_sth']/p[@class='p_time']/span[1]/text()").extract()
78 | if pubtime:
79 | meta['pubtime'] = pubtime[0]
80 | else:
81 | meta['pubtime'] = None
82 |
83 | item = NewsItem()
84 | item['tag'] = meta['tag']
85 | item['title']= meta['title']
86 | item['pubtime'] = meta['pubtime']
87 | item['url'] = response.url
88 | item['body'] = '\n'.join(response.xpath("//div[@id='main_content']/p/text()").extract())
89 | item['refer'] = '凤凰新闻'
90 | if not item['body'] == '':
91 | yield item
92 |
93 |
94 |
95 |
96 |
97 |
--------------------------------------------------------------------------------
/MyNews/spiders/sohunews.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | from scrapy_redis.spiders import RedisSpider
5 | from MyNews.items import NewsItem
6 | from scrapy import Request
7 | # from ..headers import qqheaders
8 |
9 | class sohunewsSpider(RedisSpider):
10 | """Spider that reads urls from redis queue (qqnews:start_urls)."""
11 |
12 | name = 'sohunews'
13 | redis_key = 'sohunews:start_urls'
14 |
15 |     re_str1 = re.compile(r"item:\[(.*)\]")
16 |     re_str2 = re.compile(r"\[(.*?),\"(.*?)\",\"(.*?)\",\"(.*?)\"]")
17 |
18 | # taglist = ['ent','sports','tech']
19 |
20 | def __init__(self, *args, **kwargs):
21 |         # Dynamically define the allowed domains list (override with -a domain=...).
22 |         domain = kwargs.pop('domain', 'news.sohu.com')
23 |         self.allowed_domains = list(filter(None, domain.split(',')))
24 | super(sohunewsSpider, self).__init__(*args, **kwargs)
25 |
26 | def parse(self,response):
27 | sportsurl = 'http://sports.sohu.com/_scroll_newslist/20180420/news.inc'
28 | enturl = 'http://yule.sohu.com/_scroll_newslist/20180420/news.inc'
29 | yield Request(sportsurl,callback=self.parsepage1,meta={'tag':'sports'},dont_filter=True)
30 | yield Request(enturl, callback=self.parsepage1, meta={'tag': 'ent'}, dont_filter=True)
31 |
32 | techurl = "http://v2.sohu.com/public-api/feed?scene=CHANNEL&sceneId=30&page=1&size=40"
33 | financeurl= "http://v2.sohu.com/public-api/feed?scene=CHANNEL&sceneId=15&page=1&size=40"
34 | yield Request(techurl,callback=self.parsepage2,meta={'tag':'tech'},dont_filter=True)
35 | yield Request(financeurl, callback=self.parsepage2, meta={'tag': 'finance'}, dont_filter=True)
36 |
37 |
38 | def parsepage1(self,response):
39 | tag = response.meta['tag']
40 | html = response.text
41 | # html = html.replace('var newsJason = ','')
42 | info = self.re_str1.findall(html)[0]
43 | # info = info.split(',')
44 | infolist = self.re_str2.findall(info)
45 | for info in infolist:
46 | url = info[2]
47 | meta = {
48 | 'tag':tag,
49 | 'title':info[1],
50 | 'pubtime':info[3]
51 | }
52 | yield Request(url,callback=self.parsebody,dont_filter=True,meta=meta)
53 |
54 |
55 | def parsepage2(self,response):
56 | tag = response.meta['tag']
57 | datalist = json.loads(response.text)
58 | for data in datalist:
59 | id = data['id']
60 | authorId = data['authorId']
61 | url = 'http://www.sohu.com/a/'+ str(id) +'_'+ str(authorId)
62 | title = data['title']
63 |
64 | meta = {
65 | 'tag': tag,
66 | 'title': title,
67 | 'pubtime':None
68 | }
69 | yield Request(url, callback=self.parsebody, dont_filter=True, meta=meta)
70 |
71 | def parsebody(self,response):
72 | meta = response.meta
73 |
74 | item = NewsItem()
75 | item['tag'] = meta['tag']
76 | item['title']= meta['title']
77 | if not meta['pubtime']:
78 | pubtime = response.xpath("//div[@class='article-info']/span[@id='news-time']/text()")
79 | if pubtime:
80 | meta['pubtime'] = pubtime.extract()[0]
81 | else:
82 | meta['pubtime'] = None
83 | item['pubtime'] = meta['pubtime']
84 | item['url'] = response.url
85 | item['body'] = '\n'.join(response.xpath("//article[@id='mp-editor']/p/text()").extract())
86 | item['refer'] = '搜狐新闻'
87 |
88 | if not item['body'] == '':
89 | yield item
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/MyNews/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for MyNews project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'MyNews'
13 |
14 | SPIDER_MODULES = ['MyNews.spiders']
15 | NEWSPIDER_MODULE = 'MyNews.spiders'
16 |
17 |
18 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
19 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
20 | SCHEDULER_PERSIST = True
21 |
22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
23 | #USER_AGENT = 'MyNews (+http://www.yourdomain.com)'
24 |
25 | # Obey robots.txt rules
26 | ROBOTSTXT_OBEY = False
27 |
28 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
29 | #CONCURRENT_REQUESTS = 32
30 |
31 | # Configure a delay for requests for the same website (default: 0)
32 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
33 | # See also autothrottle settings and docs
34 | #DOWNLOAD_DELAY = 3
35 | # The download delay setting will honor only one of:
36 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
37 | #CONCURRENT_REQUESTS_PER_IP = 16
38 |
39 | # Disable cookies (enabled by default)
40 | #COOKIES_ENABLED = False
41 |
42 | # Disable Telnet Console (enabled by default)
43 | #TELNETCONSOLE_ENABLED = False
44 |
45 | # Override the default request headers:
46 | #DEFAULT_REQUEST_HEADERS = {
47 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
48 | # 'Accept-Language': 'en',
49 | #}
50 |
51 | # Enable or disable spider middlewares
52 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
53 | #SPIDER_MIDDLEWARES = {
54 | # 'MyNews.middlewares.MynewsSpiderMiddleware': 543,
55 | #}
56 |
57 | # Enable or disable downloader middlewares
58 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
59 | # DOWNLOADER_MIDDLEWARES = {
60 | # # 'MyNews.middlewares.MynewsDownloaderMiddleware': 543,
61 | # 'MyNews.middlewares.Headersmiddleware': 543,
62 | # }
63 |
64 | # Enable or disable extensions
65 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
66 | #EXTENSIONS = {
67 | # 'scrapy.extensions.telnet.TelnetConsole': None,
68 | #}
69 |
70 | # Configure item pipelines
71 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
72 | ITEM_PIPELINES = {
73 | 'scrapy_redis.pipelines.RedisPipeline': 400,
74 | 'MyNews.pipelines.MynewsPipeline': 300,
75 | }
76 |
77 | # Enable and configure the AutoThrottle extension (disabled by default)
78 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
79 | #AUTOTHROTTLE_ENABLED = True
80 | # The initial download delay
81 | #AUTOTHROTTLE_START_DELAY = 5
82 | # The maximum download delay to be set in case of high latencies
83 | #AUTOTHROTTLE_MAX_DELAY = 60
84 | # The average number of requests Scrapy should be sending in parallel to
85 | # each remote server
86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
87 | # Enable showing throttling stats for every response received:
88 | #AUTOTHROTTLE_DEBUG = False
89 |
90 | # Enable and configure HTTP caching (disabled by default)
91 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
92 | #HTTPCACHE_ENABLED = True
93 | #HTTPCACHE_EXPIRATION_SECS = 0
94 | #HTTPCACHE_DIR = 'httpcache'
95 | #HTTPCACHE_IGNORE_HTTP_CODES = []
96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
97 |
98 | REDIS_URL = 'redis://10.36.131.52:6379'
99 |
100 | LOG_LEVEL = 'DEBUG'
101 |
102 | DOWNLOAD_DELAY = 1
--------------------------------------------------------------------------------
/example/example/spiders/mycrawler_redis.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import lxml
4 | from scrapy.spiders import Rule
5 | from scrapy.linkextractors import LinkExtractor
6 |
7 | from scrapy_redis.spiders import RedisCrawlSpider, RedisMixin
8 | from example.items import LengzhishiItem, JdspiderItem
9 | from scrapy.spiders import CrawlSpider
10 | import requests
11 |
12 |
13 | class MyCrawler(RedisCrawlSpider):
14 | """Spider that reads urls from redis queue (myspider:start_urls)."""
15 | name = 'mycrawler_redis'
16 | redis_key = 'mycrawler:start_urls'
17 |
18 | # rules = (
19 | # # follow all links
20 | # Rule(LinkExtractor(), callback='parse_page', follow=True),
21 | # )
22 | #
23 | # def __init__(self, *args, **kwargs):
24 | # # Dynamically define the allowed domains list.
25 | # domain = kwargs.pop('domain', '')
26 | # self.allowed_domains = filter(None, domain.split(','))
27 | # super(MyCrawler, self).__init__(*args, **kwargs)
28 | #
29 | # def parse_page(self, response):
30 | # return {
31 | # 'name': response.css('title::text').extract_first(),
32 | # 'url': response.url,
33 | # }
34 |
35 | rules = [
36 | Rule(LinkExtractor(allow=(r'jd\.com/\d+\.html',)),callback="parseContent",follow=True)
37 | ]
38 |
39 |
40 |     def set_crawler(self, crawler):
41 |         CrawlSpider.set_crawler(self, crawler)  # attach the default crawler
42 |         RedisMixin.setup_redis(self)  # start urls are read from redis
43 |
44 | def parseContent(self, response):
45 | # content = response.xpath("//p[@class='topic-content']/text()").extract()[0]
46 | # item = LengzhishiItem()
47 | # item['content'] = content
48 | # yield item
49 | # return {
50 | # 'content': response.xpath("//p[@class='topic-content']/text()").extract()[0],
51 | # 'url': response.url,
52 | # }
53 | namelist = []
54 | contentlist = []
55 |
56 | item = JdspiderItem()
57 | url = response.url
58 | response = requests.get(url)
59 |
60 | html = lxml.etree.HTML(response.text)
61 |
62 | infolist = html.xpath("//*[@id=\"detail\"]/div[2]/div//dl")
63 |
64 | name = html.xpath("//div[@class='item ellipsis']/text()")[0].strip()
65 |
66 | # print("商品名称:", name)
67 | namelist.append("商品名称")
68 | contentlist.append(name)
69 | # item['name'] = name
70 |
71 | try:
72 | baozhuang = html.xpath("//div[@class='package-list']/p/text()")[0].strip().replace("\n", '、')
73 | except:
74 | baozhuang = "未列明"
75 | # print("包装清单:", baozhuang)
76 | namelist.append("包装清单")
77 | contentlist.append(baozhuang)
78 |
79 | # jieshao = html.xpath("//div[@class='item hide']/text()")[0]
80 | # print("商品简介:",jieshao)
81 |
82 |         # JD loads prices via ajax, and frequent requests from one IP can trigger a captcha, which is a pain
83 |         # if the captcha appears the price cannot be fetched; no good workaround yet, so wrap it in try/except
84 | try:
85 | number = re.findall(r"com/(\d+)\.html", url)[0]
86 | # print(number)
87 |
88 | ajaxUrl = r"https://p.3.cn/prices/mgets?pdtk=&skuIds=J_" + number
89 |
90 | ajaxResponse = requests.get(ajaxUrl)
91 | # print(ajaxResponse.text)
92 | prices = re.findall('"p":"(.*?)"', ajaxResponse.text)[0].strip()
93 | # print("价格:", prices)
94 |
95 | except:
96 | prices = "获取失败"
97 |
98 | namelist.append("价格")
99 | contentlist.append(prices)
100 |
101 | for info in infolist:
102 | titles = info.xpath("./dt/text()")
103 | contents = info.xpath("./dd/text()")
104 | for title, content in zip(titles, contents):
105 | # print(title, ':', content)
106 | namelist.append(title.strip())
107 | contentlist.append(content.strip())
108 |
109 | item['name'] = namelist
110 | item['content'] = contentlist
111 | item['url'] = response.url
112 |
113 | yield item
--------------------------------------------------------------------------------
/MyNews/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | from scrapy.downloadermiddlewares.defaultheaders import DefaultHeadersMiddleware
10 |
11 | from MyNews.headers import qqheaders
12 |
13 |
14 | class MynewsSpiderMiddleware(object):
15 | # Not all methods need to be defined. If a method is not defined,
16 | # scrapy acts as if the spider middleware does not modify the
17 | # passed objects.
18 |
19 | @classmethod
20 | def from_crawler(cls, crawler):
21 | # This method is used by Scrapy to create your spiders.
22 | s = cls()
23 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
24 | return s
25 |
26 | def process_spider_input(self, response, spider):
27 | # Called for each response that goes through the spider
28 | # middleware and into the spider.
29 |
30 | # Should return None or raise an exception.
31 | return None
32 |
33 | def process_spider_output(self, response, result, spider):
34 | # Called with the results returned from the Spider, after
35 | # it has processed the response.
36 |
37 | # Must return an iterable of Request, dict or Item objects.
38 | for i in result:
39 | yield i
40 |
41 | def process_spider_exception(self, response, exception, spider):
42 | # Called when a spider or process_spider_input() method
43 | # (from other spider middleware) raises an exception.
44 |
45 | # Should return either None or an iterable of Response, dict
46 | # or Item objects.
47 | pass
48 |
49 | def process_start_requests(self, start_requests, spider):
50 | # Called with the start requests of the spider, and works
51 | # similarly to the process_spider_output() method, except
52 | # that it doesn’t have a response associated.
53 |
54 | # Must return only requests (not items).
55 | for r in start_requests:
56 | yield r
57 |
58 | def spider_opened(self, spider):
59 | spider.logger.info('Spider opened: %s' % spider.name)
60 |
61 |
62 | class MynewsDownloaderMiddleware(object):
63 | # Not all methods need to be defined. If a method is not defined,
64 | # scrapy acts as if the downloader middleware does not modify the
65 | # passed objects.
66 |
67 | @classmethod
68 | def from_crawler(cls, crawler):
69 | # This method is used by Scrapy to create your spiders.
70 | s = cls()
71 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
72 | return s
73 |
74 | def process_request(self, request, spider):
75 | # Called for each request that goes through the downloader
76 | # middleware.
77 |
78 | # Must either:
79 | # - return None: continue processing this request
80 | # - or return a Response object
81 | # - or return a Request object
82 | # - or raise IgnoreRequest: process_exception() methods of
83 | # installed downloader middleware will be called
84 | return None
85 |
86 | def process_response(self, request, response, spider):
87 | # Called with the response returned from the downloader.
88 |
89 | # Must either;
90 | # - return a Response object
91 | # - return a Request object
92 | # - or raise IgnoreRequest
93 | return response
94 |
95 | def process_exception(self, request, exception, spider):
96 | # Called when a download handler or a process_request()
97 | # (from other downloader middleware) raises an exception.
98 |
99 | # Must either:
100 | # - return None: continue processing this exception
101 | # - return a Response object: stops process_exception() chain
102 | # - return a Request object: stops process_exception() chain
103 | pass
104 |
105 | def spider_opened(self, spider):
106 | spider.logger.info('Spider opened: %s' % spider.name)
107 |
108 |
109 |
110 | # class Headersmiddleware(DefaultHeadersMiddleware):
111 | # def process_request(self, request, spider):
112 | # if spider.name == 'qqnews':
113 | # request.headers = qqheaders
114 | # super(Headersmiddleware, self).process_request(request, spider)
--------------------------------------------------------------------------------