├── .idea
│   ├── NewsSpider-master.iml
│   ├── misc.xml
│   ├── modules.xml
│   └── workspace.xml
├── README.md
├── __init__.py
├── debug_163.py
├── debug_ifeng.py
├── debug_pengpai.py
├── debug_qq.py
├── debug_sohu.py
├── scrapy.cfg
└── scrapyspider
    ├── __init__.py
    ├── csv_process.py
    ├── items.py
    ├── items.pyc
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    ├── settings.pyc
    └── spiders
        ├── __init__.py
        ├── news_163.py
        ├── news_ifeng.py
        ├── news_pengpai.py
        ├── news_qq.py
        └── news_sohu.py
/README.md:
--------------------------------------------------------------------------------
1 | # NewsSpider
2 | A Python news crawler built on the Scrapy framework. It crawls news from NetEase (163), Sohu, ifeng and The Paper (Pengpai), and saves the title, body, comments, publication time and other fields to local files.
3 |
4 | Project requirements
5 | 1. Crawl articles and comments from the NetEase, Sohu, ifeng and The Paper news sites
6 | 2. Cover at least 100,000 news pages
7 | 3. Refresh every news page and its comments within one day
8 |
9 | Project approach
10 | 1. Design a web crawler that can fetch all pages of the target sites and extract the articles and comments from them
11 | 2. Run the crawler on a schedule so that the data is updated daily
12 |
13 | Starting from the initial URLs, the Scheduler hands requests to the Downloader; the downloaded responses are passed to the Spider for parsing, and the spiders are the core of this project. A Spider produces two kinds of results: links that need further crawling, which are sent back to the Scheduler through the middleware, and data that needs to be saved, which goes into the Item Pipeline for processing and storage. Finally all data is exported and saved to files.
14 |
15 | Project structure
16 | scrapyspider contains the Scrapy project files
17 | spiders holds the core spiders that crawl the news sites
18 | debug_xxx.py are the entry scripts used to run the spiders
19 |
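20 | How to run
21 | A minimal sketch, assuming Scrapy and requests are installed and the command is run from the project root (next to scrapy.cfg). The file name run_163.py and the output file name are placeholders; the repository's debug_163.py does the same thing:
22 |
23 | ```python
24 | # run_163.py -- start the NetEase spider and export the scraped items to a CSV file
25 | from scrapy import cmdline
26 |
27 | # "163news" is the spider name defined in scrapyspider/spiders/news_163.py.
28 | # The other spiders are "sohunews", "ifengnews", "pengpainews" and "qqnews".
29 | cmdline.execute("scrapy crawl 163news -o news_163.csv".split())
30 | ```
31 |
32 | The same crawl can also be started from a shell with `scrapy crawl 163news -o news_163.csv`. For the daily update requirement, debug_ifeng.py and debug_qq.py wrap this call in a simple polling loop that waits for a configured hour and minute before starting the crawl.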
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/debug_163.py:
--------------------------------------------------------------------------------
1 |
2 | from scrapy import cmdline
3 | cmdline.execute("scrapy crawl 163news -o 网易新闻0831.csv".split())
4 |
--------------------------------------------------------------------------------
/debug_ifeng.py:
--------------------------------------------------------------------------------
1 | import time
2 | import datetime
3 | from scrapy import cmdline
4 | #
5 | def runifengnews(h, m):
6 |     '''h is the scheduled hour, m the scheduled minute'''
7 |     while True:
8 |         # Check whether the scheduled time has been reached, e.g. 0:00
9 |         while True:
10 |             now = datetime.datetime.now()
11 |             # Scheduled time reached: leave the inner loop
12 |             if (now.hour == h and now.minute >= m) or (now.hour > h):
13 |                 break
14 |             # Not yet reached: wait 10 seconds and check again
15 | time.sleep(10)
16 |
17 | cmdline.execute("scrapy crawl ifengnews -o 凤凰新闻0106_1.csv".split())
18 |
19 | runifengnews(8, 38)
20 |
--------------------------------------------------------------------------------
/debug_pengpai.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | #
3 | cmdline.execute("scrapy crawl pengpainews -o 澎湃新闻0101_1.csv".split())
4 |
--------------------------------------------------------------------------------
/debug_qq.py:
--------------------------------------------------------------------------------
1 | import time
2 | import datetime
3 | from scrapy import cmdline
4 | #
5 | def runqqnews(h, m):
6 |     '''h is the scheduled hour, m the scheduled minute'''
7 |     while True:
8 |         # Check whether the scheduled time has been reached, e.g. 0:00
9 |         while True:
10 |             now = datetime.datetime.now()
11 |             # Scheduled time reached: leave the inner loop
12 |             if (now.hour == h and now.minute >= m) or (now.hour > h):
13 |                 break
14 |             # Not yet reached: wait 10 seconds and check again
15 | time.sleep(10)
16 |
17 | cmdline.execute("scrapy crawl qqnews -o 腾讯新闻0106_1.csv".split())
18 |
19 | runqqnews(8, 0)
20 |
--------------------------------------------------------------------------------
/debug_sohu.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | #from scrapy import cmdline
3 | cmdline.execute("scrapy crawl sohunews -o 搜狐新闻0111_1.csv".split())
4 |
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapyspider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapyspider
12 |
--------------------------------------------------------------------------------
/scrapyspider/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | '''
3 | 20171203 database (unused leftover snippet)
4 |
5 | def __init__(self, dbpool):
6 | self.dbpool = dbpool
7 | '''
8 |
9 |
--------------------------------------------------------------------------------
/scrapyspider/csv_process.py:
--------------------------------------------------------------------------------
1 | # Extract the comments embedded in the crawled news CSV and save them to a separate CSV file
2 | # -*- coding:utf-8 -*-
3 | import csv, re
4 | import pandas
5 | import sys
6 | maxInt = sys.maxsize
7 | decrement = True
8 |
9 |
10 | # CSV file produced by the crawler
11 | csv_path = r'E:\Python\以前的项目\NewsSpider-master\网易新闻0831.csv'
12 | # Output path for the extracted comments
13 | out_path = r'E:\Python\以前的项目\NewsSpider-master\网易新闻0831评论.csv'
14 |
15 | # Parse the crawled news CSV and collect its comment fields
16 | def csv_process():
17 | id = []
18 | username = []
19 | date_time = []
20 | content = []
21 | news_id = []
22 |
23 |     # Read the crawled CSV from the path configured above
24 | news_dict = csv.reader(open(csv_path, encoding='ANSI'))
25 | count = 0
26 | for items in news_dict:
27 | try:
28 | newsid = items[5]
29 | comment_data = items[1]
30 | keyword_start =u'{'
31 | keyword_end = u'}'
32 |
33 |             # Find the start and end offset of each comment
34 | comment_start = [m.start() for m in re.finditer(keyword_start, comment_data)]
35 | comment_end = [n.start() for n in re.finditer(keyword_end, comment_data)]
36 |
37 |             # Extract each comment
38 | for i in range(0,len(comment_end)):
39 | comments = comment_data[comment_start[i]:comment_end[i]]
40 | id_start = comments.find("'id':")
41 | id_end = comments.find(", 'username'")
42 | id.append(comments[id_start + 5 : id_end])
43 |
44 | username_start = comments.find("'username':")
45 | username_end = comments.find(", 'date_time'")
46 | username.append(comments[username_start + 13 : username_end-1])
47 |
48 | datetime_start = comments.find("'date_time':")
49 | datetime_end = comments.find(", 'content'")
50 | date_time.append(comments[datetime_start + 14 : datetime_end-1])
51 |
52 | content_start = comments.find("'content':")
53 | content_end = comments.find("}]")
54 | content.append(comments[content_start + 12: content_end])
55 |
56 | news_id.append(newsid)
57 | count += 1
58 | except:
59 | continue
60 |
61 |     # The dict keys become the CSV column names
62 | dataframe = pandas.DataFrame(
63 | {'id':id,
64 | 'username':username,
65 | 'date_time':date_time,
66 | 'content':content,
67 | 'news_id':news_id}
68 | )
69 |
70 |     # Write the DataFrame to csv; index controls whether the row index is written (default True)
71 | dataframe.to_csv(
72 | out_path,
73 | index=False,
74 | encoding='utf_8_sig'
75 | )
76 |
77 | while decrement:
78 |     # Whenever an OverflowError occurs, shrink maxInt by a factor of 10
79 | decrement = False
80 | try:
81 | csv.field_size_limit(maxInt)
82 | except OverflowError:
83 | maxInt = int(maxInt / 10)
84 | decrement = True
85 | csv_process()
86 |
--------------------------------------------------------------------------------
/scrapyspider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item, Field
9 |
10 | class newsItem(Item): # A news item has the following fields
11 |     # Article title
12 |     title = Field()
13 |     # Publication time
14 |     date = Field()
15 |     # Article body
16 |     content = Field()
17 |     # Abstract (short summary taken from the body)
18 |     abstract = Field()
19 |     # Article heat (number of participants in the comments)
20 |     heat = Field()
21 |     # ID
22 |     id = Field()
23 |     # URL
24 |     url = Field()
25 |     # Dict of comments
26 |     comments = Field()
27 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/scrapyspider/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/F-debug/NewsSpider/63b40bfadad677290bd516237152a3ce3dfd176f/scrapyspider/items.pyc
--------------------------------------------------------------------------------
/scrapyspider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | '''The middleware can be left unchanged.'''
8 | from scrapy import signals
9 |
10 |
11 | class ScrapyspiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/scrapyspider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import json
8 | import codecs
9 |
10 | '''Choose here whether to generate a JSON file or a CSV file'''
11 | # class ScrapyspiderPipeline(object):
12 | # def __init__(self):
13 | # self.file = codecs.open('网易新闻1225_1.json', 'w', encoding='utf-8')
14 | # def process_item(self, item, spider):
15 | # line = json.dumps(dict(item), ensure_ascii=False) + "\n"
16 | # self.file.write(line)
17 | # return item
18 | # def spider_closed(self, spider):
19 | # self.file.close()
20 | class ScrapyspiderPipeline(object):
21 | def process_item(self, item, spider):
22 | return item
23 |
24 |
--------------------------------------------------------------------------------
/scrapyspider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for scrapyspider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | import sys
13 | # Change this to the absolute path of the crawler project to avoid module lookup problems
14 | sys.path.append(r'E:\Python\以前的项目\NewsSpider-master\scrapyspider')
15 |
16 | BOT_NAME = 'scrapyspider'
17 |
18 | SPIDER_MODULES = ['scrapyspider.spiders']
19 | NEWSPIDER_MODULE = 'scrapyspider.spiders'
20 |
21 |
22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
23 | #USER_AGENT = 'scrapyspider (+http://www.yourdomain.com)'
24 |
25 | # Obey robots.txt rules
26 | # Whether to obey the site's robots.txt rules
27 | ROBOTSTXT_OBEY = True
28 |
29 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
30 | # Number of concurrent requests; larger is faster but puts more load on the target site
31 | CONCURRENT_REQUESTS = 32
32 |
33 | # Configure a delay for requests for the same website (default: 0)
34 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
35 | # See also autothrottle settings and docs
36 | #DOWNLOAD_DELAY = 3
37 | # The download delay setting will honor only one of:
38 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
39 | #CONCURRENT_REQUESTS_PER_IP = 16
40 |
41 | # Disable cookies (enabled by default)
42 | # Disable cookies to reduce the chance of being banned
43 | COOKIES_ENABLED = False
44 |
45 | # Disable Telnet Console (enabled by default)
46 | #TELNETCONSOLE_ENABLED = False
47 |
48 | # Override the default request headers:
49 | #DEFAULT_REQUEST_HEADERS = {
50 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
51 | # 'Accept-Language': 'en',
52 | #}
53 |
54 | # Enable or disable spider middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
56 | #SPIDER_MIDDLEWARES = {
57 | # 'scrapyspider.middlewares.ScrapyspiderSpiderMiddleware': 543,
58 | #}
59 |
60 | # Enable or disable downloader middlewares
61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
62 | #DOWNLOADER_MIDDLEWARES = {
63 | # 'scrapyspider.middlewares.MyCustomDownloaderMiddleware': 543,
64 | #}
65 |
66 | # Enable or disable extensions
67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
68 | #EXTENSIONS = {
69 | # 'scrapy.extensions.telnet.TelnetConsole': None,
70 | #}
71 |
72 | # Configure item pipelines
73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
74 | #ITEM_PIPELINES = {
75 | # 'scrapyspider.pipelines.ScrapyspiderPipeline': 300,
76 | #}
77 |
78 | # Enable and configure the AutoThrottle extension (disabled by default)
79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
80 | #AUTOTHROTTLE_ENABLED = True
81 | # The initial download delay
82 | #AUTOTHROTTLE_START_DELAY = 5
83 | # The maximum download delay to be set in case of high latencies
84 | #AUTOTHROTTLE_MAX_DELAY = 60
85 | # The average number of requests Scrapy should be sending in parallel to
86 | # each remote server
87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
88 | # Enable showing throttling stats for every response received:
89 | #AUTOTHROTTLE_DEBUG = False
90 |
91 | # Enable and configure HTTP caching (disabled by default)
92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
93 | #HTTPCACHE_ENABLED = True
94 | #HTTPCACHE_EXPIRATION_SECS = 0
95 | #HTTPCACHE_DIR = 'httpcache'
96 | #HTTPCACHE_IGNORE_HTTP_CODES = []
97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
98 |
99 | # Export encoding; Excel opens CSV as ANSI by default, so keep it consistent
100 | # Change to utf-8 or another encoding if needed
101 | FEED_EXPORT_ENCODING = 'ANSI'
102 |
103 | # Add a small download delay to reduce the load on the target servers
104 | DOWNLOAD_DELAY = 0.01
105 |
106 | # Upper limit on the number of scraped news items; the spider closes after reaching it
107 | CLOSESPIDER_ITEMCOUNT = 500
108 |
109 | # Download timeout: give up on a URL after 100 seconds without a response
110 | DOWNLOAD_TIMEOUT = 100
111 | ITEM_PIPELINES = {
112 |     'scrapyspider.pipelines.ScrapyspiderPipeline': 300,  # class name defined in pipelines.py
113 | }
114 |
115 |
--------------------------------------------------------------------------------
/scrapyspider/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/F-debug/NewsSpider/63b40bfadad677290bd516237152a3ce3dfd176f/scrapyspider/settings.pyc
--------------------------------------------------------------------------------
/scrapyspider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
6 |
--------------------------------------------------------------------------------
/scrapyspider/spiders/news_163.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from scrapy.spiders import CrawlSpider, Rule
3 | from ..items import newsItem
4 | from scrapy.linkextractors import LinkExtractor
5 | import re, requests, json
6 | from scrapy.selector import Selector
7 | count = 0
8 |
9 | class news163_Spider(CrawlSpider):
10 |     # NetEase news spider name
11 |     name = "163news"
12 |     # Pretend to be a browser
13 |     headers = {
14 |         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
15 |     }
16 |     # Whole 163.com domain
17 |     allowed_domains = [
18 |         "163.com"
19 |     ]
20 |     # News channel start page
21 |     start_urls = [
22 |         'http://news.163.com/'
23 |     ]
24 |     # URL patterns the crawler may follow, e.g. http://news.163.com/\d\d\d\d\d(/([\w\._+-])*)*$
25 | rules = [
26 | Rule(LinkExtractor(
27 | allow=(
28 | ('http://news\.163\.com/.*$')
29 | # ('http://ent\.163\.com/.*$'),
30 | # ('http://money\.163\.com/.*$'),
31 | # ('http://war\.163\.com/.*$'),
32 | # ('http://sport\.163\.com/.*$'),
33 | # ('http://tech\.163\.com/.*$'),
34 | # ('http://fashion\.163\.com/.*$'),
35 | # ('http://auto\.163\.com/.*$'),
36 | # ('http://jiankang\.163\.com/.*$')
37 | ),
38 | deny = ('http://.*.163.com/photo.*$')
39 | ),
40 | callback="parse_item",
41 | follow=True)
42 | ]
43 | def parse_item(self, response):
44 |         # response is the response for the current URL
45 | article = Selector(response)
46 | article_url = response.url
47 | global count
48 |         # Determine the page layout type
49 |         # Recent NetEase news pages, e.g. http://news.163.com/05-17/
50 | if get_category(article) == 1:
51 | articleXpath = '//*[@id="epContentLeft"]'
52 | if article.xpath(articleXpath):
53 | titleXpath = '//*[@id="epContentLeft"]/h1/text()'
54 | dateXpath = '//*[@id="epContentLeft"]/div[1]/text()'
55 | contentXpath = '//*[@id="endText"]'
56 | news_infoXpath ='//*[@id="post_comment_area"]/script[3]/text()'
57 |
58 | # 标题
59 | if article.xpath(titleXpath):
60 | news_item = newsItem()
61 | news_item['url'] = article_url
62 | get_title(article, titleXpath, news_item)
63 | # 日期
64 | if article.xpath(dateXpath):
65 | get_date(article, dateXpath, news_item)
66 | # 内容
67 | if article.xpath(contentXpath):
68 | get_content(article, contentXpath, news_item)
69 | count = count + 1
70 | news_item['id'] = count
71 | # 尝试寻找评论
72 | try:
73 | comment_url = get_comment_url(article, news_infoXpath)
74 | # 评论处理
75 | comments = get_comment(comment_url, news_item)[1]
76 | news_item['comments'] = comments
77 | except:
78 | news_item['comments'] = ' '
79 | news_item['heat'] = 0
80 | yield news_item
81 |
82 | # http://news.163.com/40706/
83 | if get_category(article) == 2:
84 | articleXpath = '/html/body/table[9]/tr/td[1]'
85 | if article.xpath(articleXpath):
86 | titleXpath = '/html/body/table[9]/tr/td[1]/table[1]/tr[1]/td/text()'
87 | dateXpath = '/html/body/table[9]/tr/td[1]/table[1]/tr[2]/td[2]/table/tbody/tr[2]/td[1]/text()[1]'
88 | contentXpath = '//*[@id="content"]'
89 | news_item = newsItem()
90 | news_item['url'] = article_url
91 | # 获取标题
92 | if article.xpath(titleXpath):
93 | get_title(article, titleXpath, news_item)
94 | # 获取日期
95 | if article.xpath(dateXpath):
96 | get_date(article, dateXpath, news_item)
97 | # 内容
98 | if article.xpath(contentXpath):
99 | get_content(article, contentXpath, news_item)
100 | count = count + 1
101 | news_item['id'] = count
102 | news_item['heat'] = 0
103 | news_item['comments'] = ' '
104 | yield news_item
105 |
106 | # http://news.163.com/2004w03/
107 | if get_category(article) == 3:
108 | articleXpath = '/html/body/table[7]/tr/td[1]'
109 | if article.xpath(articleXpath):
110 | titleXpath = '/html/body/table[7]/tr/td[1]/b/span/text()'
111 | dateXpath = '//html/body/table[7]/tr/td[1]/table[1]/tr/td[1]/div/span/text()'
112 | dateXpath2 = '/html/body/table[7]/tr/td[1]/table[1]/tr/td[1]/div/span/text()'
113 | contentXpath = '/html/body/table[7]/tbody/tr/td[1]/table[1]/tbody/tr[1]/td'
114 | contentXpath2 = '/html/body/table[7]/tr/td[1]/table[2]/tr[1]/td'
115 | news_item = newsItem()
116 | news_item['url'] = article_url
117 | # 标题
118 | if article.xpath(titleXpath):
119 | get_title(article, titleXpath, news_item)
120 | # 日期
121 | if article.xpath(dateXpath):
122 | get_date(article, dateXpath, news_item)
123 | elif article.xpath(dateXpath2):
124 | get_date(article, dateXpath2, news_item)
125 | # 内容
126 | if article.xpath(contentXpath):
127 | get_content(article, contentXpath, news_item)
128 | count = count + 1
129 | news_item['id'] = count
130 | news_item['heat'] = 0
131 | news_item['comments'] = ' '
132 | elif article.xpath(contentXpath2):
133 | get_content(article, contentXpath2, news_item)
134 | count = count + 1
135 | news_item['id'] = count
136 | news_item['heat'] = 0
137 | news_item['comments'] = ' '
138 | yield news_item
139 |
140 |
141 | '''Shared title extraction helper'''
142 | def get_title(article, titleXpath, news_item):
143 |     # Title
144 | try:
145 | article_title = article.xpath(titleXpath).extract()[0]
146 | article_title = article_title.replace('\n', '')
147 | article_title = article_title.replace('\r', '')
148 | article_title = article_title.replace('\t', '')
149 | article_title = article_title.replace(' ', '')
150 | news_item['title'] = article_title
151 | except:
152 | news_item['title'] = ' '
153 |
154 |
155 | '''Shared date extraction helper'''
156 | def get_date(article, dateXpath, news_item):
157 |     # Date
158 | try:
159 | article_date = article.xpath(dateXpath).extract()[0]
160 | pattern = re.compile("(\d.*\d)") # 正则匹配新闻时间
161 | article_datetime = pattern.findall(article_date)[0]
162 | #article_datetime = datetime.datetime.strptime(article_datetime, "%Y-%m-%d %H:%M:%S")
163 | news_item['date'] = article_datetime
164 | except:
165 | news_item['date'] = '2010-10-01 17:00:00'
166 | '''Page layout classification helper'''
167 | def get_category(article):
168 |     if article.xpath('//*[@id="epContentLeft"]'):
169 |         case = 1 # recent NetEase news layout
170 |         return case
171 |
172 |     elif article.xpath('/html/body/table[9]/tr/td[1]'):
173 |         case = 2 # early-2000s NetEase news layout
174 |         return case
175 |     # elif article.xpath('/html/body/table[7]/tr/td[1]'):
176 |     #     case = 3 # early-2000s NetEase news, URLs starting with five digits
177 |     #     return case
178 |
179 | '''Character filtering helper'''
180 | def str_replace(content):
181 | # article_content = ' '.join(content)
182 | # rule = re.compile('\w')
183 | try:
184 | article_content = re.sub('[\sa-zA-Z\[\]!/*(^)$%~@#…&¥—+=_<>.{}\'\-:;"‘’|]', '', content)
185 | return article_content
186 | except:
187 | return content
188 |
189 | '''Shared body-text extraction helper'''
190 | def get_content(article, contentXpath, news_item):
191 | try:
192 | content_data = article.xpath(contentXpath )
193 | article_content = content_data.xpath('string(.)').extract()[0]
194 | article_content = str_replace(article_content)
195 | news_item['content'] = article_content
196 |         # Abstract: the first 100 characters of the body
197 |         try:
198 |             abstract = article_content[0:100]
199 |             news_item['abstract'] = abstract
200 |         except:
201 |             news_item['abstract'] = article_content
202 | # except 2:
203 | # index = article_content.find('。')
204 | # abstract = article_content[0:index]
205 | # news_item['abstract'] = abstract
206 | except:
207 | news_item['content'] = ' '
208 | news_item['abstract'] = ' '
209 |
210 | '''Comment URL construction helper'''
211 | def get_comment_url(article, news_infoXpath):
212 |     # Extract the comment metadata of the article
213 |     news_info = article.xpath(news_infoXpath)
214 |     news_info_text = news_info.extract()[0]
215 |     # Locate productKey and docId with regexes
216 | pattern_productKey = re.compile("\"productKey\" :.*")
217 | productKey_text = pattern_productKey.findall(news_info_text)[0]
218 | productKey = re.findall(r"\"productKey\".*\"(.*)\"", productKey_text)
219 | pattern_docId = re.compile("\"docId\" :.*")
220 | docId_text = pattern_docId.findall(news_info_text)[0]
221 | docId = re.findall(r"\"docId\".*\"(.*)\"", docId_text)
222 | comment_url = 'http://comment.news.163.com/api/v1/products/' + productKey[0] + '/threads/' + docId[0] + '/comments/newList?offset=0'
223 | return comment_url
224 |
225 | '''Comment fetching and parsing helper'''
226 | def get_comment(comment_url, news_item):
227 | comments = []
228 | comment_id = 0
229 | try:
230 | comment_data = requests.get(comment_url).text
231 | js_comment = json.loads(comment_data)
232 | try:
233 | heat = js_comment['newListSize']
234 | news_item['heat'] = heat
235 | js_comments = js_comment['comments']
236 | for each,value in js_comments.items():
237 | comment_id += 1
238 | comments_dict = {}
239 | # 评论id
240 | comments_dict['id'] = comment_id
241 | # 评论用户名
242 | try:
243 | comments_dict['username'] = value['user']['nickname']
244 | except:
245 | comments_dict['username'] = '匿名用户'
246 | try:
247 | # 评论时间,datetime格式
248 | date_time = value['createTime']
249 | #date_time = datetime.datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
250 | comments_dict['date_time'] = date_time
251 | except:
252 | comments_dict['date_time'] = news_item['date']
253 | # 评论内容
254 | ori_content = value['content']
255 | content = str_replace(ori_content)
256 | comments_dict['content'] = content
257 | comments.append(comments_dict)
258 | if comments:
259 | return heat, comments
260 | else:
261 | return 0,''
262 | except:
263 | return 0, ''
264 | except:
265 | return 0, ''
266 |
267 |
268 |
269 |
270 |
--------------------------------------------------------------------------------
/scrapyspider/spiders/news_ifeng.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from scrapy.spiders import CrawlSpider, Rule
3 | from ..items import newsItem
4 | from scrapy.linkextractors import LinkExtractor
5 | import re
6 | import requests
7 | import json
8 | import time
9 | from scrapy.selector import Selector
10 |
11 |
12 | count = 49687
13 |
14 |
15 | class NewsifengSpider(CrawlSpider):
16 | # 爬虫名称
17 | name = "ifengnews"
18 | # 伪装成浏览器
19 | headers = {
20 | 'User-Agent':
21 | 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
22 | 'AppleWebKit/537.36 (KHTML, like Gecko) '
23 | 'Chrome/53.0.2785.143 '
24 | 'Safari/537.36'
25 | }
26 | # 全网域名
27 | allowed_domains = [
28 | "ifeng.com"
29 | ]
30 | # 新闻版
31 | start_urls = [
32 | 'http://www.ifeng.com/'
33 | # 'http://news.ifeng.com/a/20160411/48422258_0.shtml'
34 | ]
35 | # 可以继续访问的url规则,http://news.163.com/\d\d\d\d\d(/([\w\._+-])*)*$
36 | rules = [
37 | Rule(LinkExtractor(
38 | allow='http://.*.ifeng.com/a(/([\w\._+-])*)*$'
39 | # deny=('http://news.ifeng.com/snapshots(/([\w\._+-])*)*$')
40 | ),
41 | callback="parse_item",
42 | follow=True)
43 | ]
44 | def parse_item(self, response):
45 | global count
46 | # response是当前url的响应
47 | article = Selector(response)
48 | url = response.url
49 |
50 | # http://news.ifeng.com/a/20171228/54623520_0.shtml
51 | if get_category(article) == 1:
52 | articleXpath = '//*[@id="artical"]'
53 | if article.xpath(articleXpath):#如果文章页面存在
54 | titleXpath = '//*[@id="artical_topic"]/text()'
55 | dateXpath = '//*[@id="artical_sth"]/p/span[1]/text()'
56 | contentXpath = '//*[@id="main_content"]'
57 | news_infoXpath = '/html/head/script[10]/text()'
58 | # news_infoXpath2 = '/html/body/div[24]/script[1]/text()'
59 | # 标题
60 | if article.xpath(titleXpath):
61 | news_item = newsItem()# 实例化条目
62 | get_title(article, titleXpath, news_item)
63 | news_item['url'] = url
64 | # 日期
65 | if article.xpath(dateXpath):
66 | get_date(article, dateXpath, news_item)
67 | # 内容
68 | if article.xpath(contentXpath):
69 | try:
70 | get_content(article, contentXpath, news_item)
71 | count += 1
72 | news_item['id'] = count
73 | except:
74 | return
75 | # 评论
76 | try:
77 | comment_url = get_comment_url(article,url)
78 | # 评论处理
79 | comments = get_comment(comment_url, news_item)[1]
80 | news_item['comments'] = comments
81 | except:
82 | news_item['comments'] = ' '
83 | news_item['heat'] = 0
84 | yield news_item
85 |
86 | # http://news.ifeng.com/a/20171228/54620295_0.shtml
87 | if get_category(article) == 2:
88 | articleXpath = '/html/body/div[3]'
89 | if article.xpath(articleXpath): # 如果文章页面存在
90 | titleXpath = '/html/body/div[3]/div[1]/h1/text()'
91 | dateXpath = '/html/body/div[3]/div[1]/p/span/text()'
92 | contentXpath = '/html/body/div[3]/div[2]/div[1]/div[1]'
93 | contentXpath2 = '/html/body/div[3]/div[2]/div[1]'
94 | contentXpath3 = '//*[@id="yc_con_txt"]'
95 | # news_infoXpath = '/html/head/script[6]/text()'
96 |
97 | # 标题
98 | if article.xpath(titleXpath):
99 | news_item = newsItem()# 实例化条目
100 | get_title(article, titleXpath, news_item)
101 | news_item['url'] = url
102 | # 日期
103 | if article.xpath(dateXpath):
104 | get_date(article, dateXpath, news_item)
105 |                 # Content: try the primary XPath first, then the fallbacks
106 |                 if article.xpath(contentXpath):
107 |                     get_content(article, contentXpath, news_item)
108 |                     count += 1
109 |                     news_item['id'] = count
110 |                 elif article.xpath(contentXpath2):
111 |                     get_content(article, contentXpath2, news_item)
112 |                     count += 1
113 |                     news_item['id'] = count
114 |                 elif article.xpath(contentXpath3):
115 |                     get_content(article, contentXpath3, news_item)
116 |                     count += 1
117 |                     news_item['id'] = count
118 |
119 |
120 | # 评论
121 | try:
122 | comment_url = get_comment_url2(article, url)
123 | # 评论处理
124 | comments = get_comment(comment_url, news_item)[1]
125 | news_item['comments'] = comments
126 | except:
127 | news_item['comments'] = ' '
128 | news_item['heat'] = 0
129 | yield news_item
130 |
131 | if get_category(article) == 3:
132 | articleXpath = '/html/body/div[4]'
133 | if article.xpath(articleXpath): # 如果文章页面存在
134 | titleXpath = '/html/body/div[4]/div[2]/h1/text()'
135 | dateXpath = '//*[@id="artical_sth"]/p/span[1]/text()'
136 | contentXpath = '//*[@id="main_content"]'
137 | contentXpath2 = '/html/body/div[3]/div[2]/div[1]'
138 | contentXpath3 = '//*[@id="yc_con_txt"]'
139 | # news_infoXpath = '/html/head/script[6]/text()'
140 |
141 | # 标题
142 | if article.xpath(titleXpath):
143 | news_item = newsItem()# 实例化条目
144 | get_title(article, titleXpath, news_item)
145 | news_item['url'] = url
146 | # 日期
147 | if article.xpath(dateXpath):
148 | get_date(article, dateXpath, news_item)
149 |                     # Content: try the primary XPath first, then the fallbacks
150 |                     if article.xpath(contentXpath):
151 |                         get_content(article, contentXpath, news_item)
152 |                         count += 1
153 |                         news_item['id'] = count
154 |                     elif article.xpath(contentXpath2):
155 |                         get_content(article, contentXpath2, news_item)
156 |                         count += 1
157 |                         news_item['id'] = count
158 |                     elif article.xpath(contentXpath3):
159 |                         get_content(article, contentXpath3, news_item)
160 |                         count += 1
161 |                         news_item['id'] = count
162 |
163 |
164 | # 评论
165 | try:
166 | comment_url = get_comment_url2(article, url)
167 | # 评论处理
168 | comments = get_comment(comment_url, news_item)[1]
169 | news_item['comments'] = comments
170 | except:
171 | news_item['comments'] = ' '
172 | news_item['heat'] = 0
173 | yield news_item
174 |
175 | if get_category(article) == 4:
176 | articleXpath = '/html/body/div[2]/div/div[3]'
177 | if article.xpath(articleXpath): # 如果文章页面存在
178 | titleXpath = '/html/body/div[2]/div/div[3]/h1/text()'
179 | dateXpath = '/html/body/div[2]/div/div[3]/div[1]/div[1]/div/div[2]/p[2]/text()'
180 | contentXpath = '/html/body/div[2]/div/div[3]/div[7]'
181 | contentXpath2 = '/html/body/div[3]/div[2]/div[1]'
182 | contentXpath3 = '//*[@id="yc_con_txt"]'
183 | # news_infoXpath = '/html/head/script[6]/text()'
184 |
185 | # 标题
186 | if article.xpath(titleXpath):
187 | news_item = newsItem()# 实例化条目
188 | get_title(article, titleXpath, news_item)
189 | news_item['url'] = url
190 | # 日期
191 | if article.xpath(dateXpath):
192 | get_date(article, dateXpath, news_item)
193 |                     # Content: try the primary XPath first, then the fallbacks
194 |                     if article.xpath(contentXpath):
195 |                         get_content(article, contentXpath, news_item)
196 |                         count += 1
197 |                         news_item['id'] = count
198 |                     elif article.xpath(contentXpath2):
199 |                         get_content(article, contentXpath2, news_item)
200 |                         count += 1
201 |                         news_item['id'] = count
202 |                     elif article.xpath(contentXpath3):
203 |                         get_content(article, contentXpath3, news_item)
204 |                         count += 1
205 |                         news_item['id'] = count
206 |
207 | # 评论
208 | try:
209 | comment_url = get_comment_url2(article, url)
210 | # 评论处理
211 | comments = get_comment(comment_url, news_item)[1]
212 | news_item['comments'] = comments
213 | except:
214 | news_item['comments'] = ' '
215 | news_item['heat'] = 0
216 | yield news_item
217 | '''Page layout classification helper'''
218 | def get_category(article):
219 | if article.xpath('//*[@id="artical"]'):
220 | case = 1#最近的凤凰新闻
221 | return case
222 | elif article.xpath('/html/body/div[3]'):
223 | case = 2 #
224 | return case
225 | elif article.xpath('/html/body/div[4]'):
226 | case = 3
227 | return case
228 | # elif article.xpath('/html/body/div[2]'):
229 | # case = 4
230 | # return case
231 | '''Shared title extraction helper'''
232 | def get_title(article, titleXpath, news_item):
233 | #标题
234 | try:
235 | article_title = article.xpath(titleXpath).extract()[0]
236 | article_title = article_title.replace('\n', ' ')
237 | article_title = article_title.replace('\r', ' ')
238 | article_title = article_title.replace('\t', ' ')
239 | article_title = article_title.replace(' ', '')
240 | news_item['title'] = article_title
241 | except:
242 | news_item['title'] = ' '
243 |
244 | '''Shared date extraction helper'''
245 | def get_date(article, dateXpath, news_item):
246 | # 时间
247 | try:
248 | article_date = article.xpath(dateXpath).extract()[0]
249 | pattern = re.compile("(\d.*\d)") # 正则匹配新闻时间
250 | article_datetime = pattern.findall(article_date)[0]
251 |         # Replace the Chinese characters in the date
252 | try:
253 | article_datetime = article_datetime.replace('年', '-')
254 | article_datetime = article_datetime.replace('月', '-')
255 | article_datetime = article_datetime.replace('日', '')
256 | except:
257 | pass
258 | #article_datetime = datetime.datetime.strptime(article_datetime, "%Y-%m-%d %H:%M:%S")
259 | news_item['date'] = article_datetime
260 | except:
261 | news_item['date'] = '2010-10-01 17:00:00'
262 |
263 | '''Shared body-text extraction helper'''
264 | def get_content(article, contentXpath, news_item):
265 | try:
266 | content_data = article.xpath(contentXpath )
267 | article_content = content_data.xpath('string(.)').extract()
268 |
269 | article_content = ' '.join(article_content)
270 | article_content = article_content.replace(' ', '')
271 | article_content = article_content.replace('\t', '')
272 | article_content = article_content.replace('\n', '')
273 | article_content = article_content.replace('\r', '')
274 | for ch in article_content:
275 | if (u'\u4e00' <= ch <= u'\u9fff'):
276 | pass
277 | news_item['content'] = article_content
278 |         # Build the abstract (text before the first full stop)
279 | index = article_content.find('。')
280 | abstract = article_content[0:index]
281 | news_item['abstract'] = abstract
282 | except:
283 | news_item['content'] = ' '
284 | news_item['abstract'] = ' '
285 |
286 | '''Comment URL construction helper'''
287 | def get_comment_url(article, url):
288 | try:
289 | comment_url = 'http://comment.ifeng.com/get.php?job=1&doc_url=' + url
290 | return comment_url
291 | except:
292 | return
293 | def get_comment_url2(article, news_infoXpath):
294 | news_info = article.xpath(news_infoXpath) # 包含评论信息的变量
295 | news_info_text = news_info.extract()[0]
296 | pattern = re.compile('"commentUrl":(.*)') # 正则匹配新闻id
297 | commentUrl_text = pattern.findall(news_info_text)[0]
298 | commentUrl = commentUrl_text.replace('"', '')
299 | commentUrl = commentUrl.replace(',', '')
300 | comment_url = 'http://comment.ifeng.com/get.php?job=1&doc_url=' + commentUrl + '&job=1'
301 | return comment_url
302 |
303 | '''Comment fetching and parsing helper'''
304 | def get_comment(comment_url, news_item):
305 | comments = []
306 | comment_id = 0
307 | try:
308 | comment_data = requests.get(comment_url).text
309 | js_comment = json.loads(comment_data)
310 | try:
311 | heat = js_comment['count']
312 | news_item['heat'] = heat
313 | js_comments = js_comment['comments']
314 | for each in js_comments:
315 | comment_id += 1
316 | comments_dict = {}
317 | # 评论id
318 | comments_dict['id'] = comment_id
319 | # 评论用户名
320 | try:
321 | comments_dict['username'] = each['uname']
322 | except:
323 | comments_dict['username'] = '匿名用户'
324 | # 评论时间,datetime格式
325 | timestamp = int(each['add_time'])
326 | timeArray = time.localtime(timestamp)
327 | date_time = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
328 | comments_dict['date_time'] = date_time
329 | # 评论内容
330 | comments_dict['content'] = each['comment_contents']
331 | comments.append(comments_dict)
332 | #a = 1
333 | return heat, comments
334 | except:
335 | return 0, ' '
336 | except:
337 | return 0, ' '
338 |
339 |
340 |
--------------------------------------------------------------------------------
/scrapyspider/spiders/news_pengpai.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from scrapy.spiders import CrawlSpider, Rule
3 | from ..items import newsItem
4 | from scrapy.linkextractors import LinkExtractor
5 | from scrapy.selector import Selector
6 | import re, requests, time
7 | import json
8 |
9 | count = 77980
10 |
11 | class NewspengpaiSpider(CrawlSpider):
12 | # 爬虫名称
13 | name = "pengpainews"
14 | # 伪装成浏览器
15 | headers = {
16 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
17 | 'AppleWebKit/537.36 (KHTML, like Gecko) '
18 | 'Chrome/53.0.2785.143 '
19 | 'Safari/537.36'
20 | }
21 | #网易全网
22 | allowed_domains = [
23 | "www.thepaper.cn"
24 | ]
25 | #新闻版
26 | start_urls = [
27 | "http://www.thepaper.cn/"
28 | ]
29 | #可以继续访问的url规则,http://news.163.com/\d\d\d\d\d(/([\w\._+-])*)*$
30 | rules = [
31 | Rule(LinkExtractor(
32 | allow=('https://www.thepaper.cn(/([\w\._+-])*)*$')),
33 | callback="parse_item",
34 | follow=True)
35 | ]
36 |
37 |
38 | def parse_item(self, response):
39 | global count
40 | # response是当前url的响应
41 | article = Selector(response)
42 | url = response.url
43 | # 分析网页类型
44 |
45 | if get_category(article) == 1:
46 | articleXpath = '/html/body/div/div[2]/div[2]/div[1]'
47 | a = article.xpath(articleXpath)
48 | if article.xpath(articleXpath):#如果文章页面存在
49 | titleXpath = '/html/body/div/div[2]/div[2]/div[1]/h1/text()'
50 | dateXpath = '/html/body/div[3]/div[1]/div[1]/div[2]/p[2]/text()'
51 | contentXpath = '/html/body/div/div[2]/div[2]/div[1]/div[2]'
52 | news_infoXpath = '/html/body/script[5]/text()'
53 | # 实例化条目
54 | news_item = newsItem()
55 | news_item['url'] = url
56 | # 标题
57 | if article.xpath(titleXpath):
58 | get_title(article, titleXpath, news_item)
59 | # 日期
60 | if article.xpath(dateXpath):
61 | get_date(article, dateXpath, news_item)
62 | # 内容
63 | if article.xpath(contentXpath):
64 | get_content(article, contentXpath, news_item)
65 | count += 1
66 | news_item['id'] = count
67 | # 评论
68 | try:
69 | comment_url = get_comment_url(article, news_infoXpath)
70 | # 评论处理
71 | comments = get_comment(comment_url, news_item)[1]
72 | news_item['comments'] = comments
73 | except:
74 | news_item['comments'] = ' '
75 | news_item['heat'] = 0
76 | yield news_item
77 |
78 |
79 | '''Shared title extraction helper'''
80 | def get_title(article, titleXpath, news_item):
81 | #标题
82 | try:
83 | article_title = article.xpath(titleXpath).extract()[0]
84 | article_title = article_title.replace('\n', '')
85 | article_title = article_title.replace('\r', '')
86 | article_title = article_title.replace('\t', '')
87 | article_title = article_title.replace(' ', '')
88 | news_item['title'] = article_title
89 | except:
90 | news_item['title'] = ' '
91 |
92 |
93 | '''Shared date extraction helper'''
94 | def get_date(article, dateXpath, news_item):
95 | # 时间
96 | try:
97 | article_date = article.xpath(dateXpath).extract()[0]
98 | pattern = re.compile("(\d.*\d)") # 正则匹配新闻时间
99 | article_datetime = pattern.findall(article_date)[0]
100 | #article_datetime = datetime.datetime.strptime(article_datetime, "%Y-%m-%d %H:%M:%S")
101 | news_item['date'] = article_datetime
102 | except:
103 | news_item['date'] = '2010-10-01 17:00:00'
104 |
105 | '''Page layout classification helper'''
106 | def get_category(article):
107 |     # xpath() returns an empty list rather than raising, so test the result directly
108 |     if article.xpath('/html/body/div/div[2]/div[2]/div[1]'):
109 |         case = 1 # domestic news page
110 |         return case
111 |     else:
112 |         return
113 |
114 | '''Shared body-text extraction helper'''
115 | def get_content(article, contentXpath, news_item):
116 | try:
117 | content_data = article.xpath(contentXpath )
118 | article_content = content_data.xpath('string(.)').extract()
119 |
120 | article_content = ' '.join(article_content)
121 | article_content = article_content.replace('\n', ' ')
122 | article_content = article_content.replace('\t', ' ')
123 | article_content = article_content.replace('\r', ' ')
124 | article_content = article_content.replace(' ', ' ')
125 | news_item['content'] = article_content
126 | # 匹配新闻简介
127 | index = article_content.find('。')
128 | abstract = article_content[0:index]
129 | news_item['abstract'] = abstract
130 | except:
131 | news_item['content'] = ' '
132 | news_item['abstract'] = ' '
133 |
134 | '''Comment URL construction helper'''
135 | def get_comment_url(article, news_infoXpath):
136 |     news_info = article.xpath(news_infoXpath)  # node containing the comment info
137 |     news_info_text = news_info.extract()[0]
138 |     pattern = re.compile("news_id:(.*)")  # regex for the news id
139 | news_id_text = pattern.findall(news_info_text)[0]
140 | news_id = re.findall(r"\"(.*)\"", news_id_text)
141 | comment_url = 'http://apiv2.sohu.com/api/comment/list?page_size&topic_id=1&source_id=mp_' + news_id[0]
142 | return comment_url
143 |
144 | '''Comment fetching and parsing helper'''
145 | def get_comment(comment_url, news_item):
146 | comments = []
147 | comment_id = 0
148 | try:
149 | comment_data = requests.get(comment_url).text
150 | js_comment = json.loads(comment_data)
151 | try:
152 | jsObj = js_comment['jsonObject']
153 | heat = jsObj['participation_sum']
154 | news_item['heat'] = heat
155 | js_comments = jsObj['comments']
156 | for each in js_comments:
157 | comment_id += 1
158 | comments_dict = {}
159 | #评论id
160 | comments_dict['id'] = comment_id
161 | #评论用户名
162 | comments_dict['username'] = each['passport']['nickname']
163 | #评论时间,datetime格式
164 | timestamp = int(each['create_time']/1000)
165 | timeArray = time.localtime(timestamp)
166 | date_time = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
167 | comments_dict['date_time'] = date_time
168 | # 评论内容
169 | comments_dict['content'] = each['content']
170 | comments.append(comments_dict)
171 | return heat, comments
172 | except:
173 | return 0, ' '
174 | except:
175 | return 0, ' '
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
--------------------------------------------------------------------------------
/scrapyspider/spiders/news_qq.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from scrapy.spiders import CrawlSpider, Rule
3 | from ..items import newsItem
4 | from scrapy.linkextractors import LinkExtractor
5 | from scrapy.selector import Selector
6 | import re, requests
7 | import json
8 |
9 | count = 49687
10 |
11 | class NewsqqSpider(CrawlSpider):
12 |
13 | # 爬虫名称
14 | name = "qqnews"
15 | # 伪装成浏览器
16 | headers = {
17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
18 | }
19 | #网易全网
20 | allowed_domains = [
21 | "qq.com"
22 | ]
23 | #新闻版
24 | start_urls = [
25 | "http://news.qq.com/"
26 | ]
27 | #可以继续访问的url规则,http://(\a)*.sina.com.cn(/([\w\._+-])*)*$
28 | rules = [
29 | Rule(LinkExtractor(allow=('http://news.qq.com/a(/([\w\._+-])*)*$')), callback="parse_item", follow=True),
30 | ]
31 | def parse_item(self, response):
32 | global count
33 | # response是当前url的响应
34 | article = Selector(response)
35 | url = response.url
36 |
37 |
38 | # 分析网页类型
39 | if get_category(article) == 1:
40 | titleXpath = '//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()'
41 | dateXpath = '//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[3]/text()'
42 | contentXpath = '//*[@id="Cnt-Main-Article-QQ"]'
43 | news_infoXpath = '//*[@id="Main-Article-QQ"]/div/div[1]/div[2]/script/text()'
44 | # 标题
45 |
46 | if article.xpath(titleXpath):
47 | news_item = newsItem()
48 | get_title(article, titleXpath, news_item)
49 | news_item['url'] = url
50 | # 日期
51 | if article.xpath(dateXpath):
52 | get_date(article, dateXpath, news_item)
53 | # 内容
54 | try:
55 | get_content(article, contentXpath, news_item)
56 | count += 1
57 | news_item['id'] = count
58 | except:
59 | return
60 | # 评论
61 | # try:
62 | # comment_url = get_comment_url(article, news_infoXpath)
63 | # # 评论处理
64 | # comments = get_comment(comment_url, news_item)[1]
65 | # news_item['comments'] = comments
66 | # except:
67 | news_item['comments'] = ' '
68 | news_item['heat'] = 0
69 | yield news_item
70 |
71 |
72 | '''Page layout classification helper'''
73 | def get_category(article):
74 |     # xpath() returns an empty list rather than raising, so test the result directly
75 |     if article.xpath('/html/body/div[3]/div[1]/div[1]'):
76 |         case = 1
77 |         return case
78 |     else:
79 |         return 0
80 |
81 |
82 | '''Shared title extraction helper'''
83 | def get_title(article, titleXpath, news_item):
84 |     # Title
85 | article_title = article.xpath(titleXpath).extract()[0]
86 | article_title = article_title.replace('\n', '')
87 | article_title = article_title.replace('\r', '')
88 | article_title = article_title.replace('\t', '')
89 | article_title = article_title.replace(' ', '')
90 | news_item['title'] = article_title
91 |
92 |
93 |
94 | '''Shared date extraction helper'''
95 | def get_date(article, dateXpath, news_item):
96 |     # Date
97 | try:
98 | article_date = article.xpath(dateXpath).extract()[0]
99 | pattern = re.compile("(\d.*\d)") # 正则匹配新闻时间
100 | article_datetime = pattern.findall(article_date)[0]
101 | #article_datetime = datetime.datetime.strptime(article_datetime, "%Y-%m-%d %H:%M:%S")
102 | news_item['date'] = article_datetime
103 | except:
104 | news_item['date'] = '2010-10-01 17:00:00'
105 |
106 | '''Shared body-text extraction helper'''
107 | def get_content(article, contentXpath, news_item):
108 | try:
109 | content_data = article.xpath(contentXpath )
110 | article_content = content_data.xpath('string(.)').extract()
111 |
112 | article_content = ' '.join(article_content)
113 | article_content = article_content.replace('\n', ' ')
114 | article_content = article_content.replace('\t', ' ')
115 | article_content = article_content.replace('\r', ' ')
116 | article_content = article_content.replace(' ', ' ')
117 | news_item['content'] = article_content
118 | # 匹配新闻简介
119 | index = article_content.find('。')
120 | abstract = article_content[0:index]
121 | news_item['abstract'] = abstract
122 | except:
123 | news_item['content'] = ' '
124 | news_item['abstract'] = ' '
125 |
126 | '''Comment URL construction helper'''
127 | def get_comment_url(article, news_infoXpath):
128 | news_info = article.xpath(news_infoXpath)
129 | news_info_text = news_info.extract()[0]
130 | pattern = re.compile("cmt_id = (.*);")
131 | news_id_text = pattern.findall(news_info_text)[0]
132 | # news_id = re.findall(r"(\d.*)", news_id_text)
133 | comment_url = 'http://coral.qq.com/article/'+ news_id_text+'/comment/v2?'
134 | return comment_url
135 |
136 | '''Comment fetching and parsing helper'''
137 | def get_comment(comment_url, news_item):
138 | comments = []
139 | comment_id = 0
140 | try:
141 | comment_data = requests.get(comment_url).text
142 | js_comment = json.loads(comment_data)
143 | try:
144 | jsObj = js_comment['jsonObject']
145 | heat = jsObj['participation_sum']
146 | news_item['heat'] = heat
147 | comments = jsObj['comments']
148 | return heat, comments
149 | except:
150 | return 0, ' '
151 | except:
152 | return 0, ' '
153 |
--------------------------------------------------------------------------------
/scrapyspider/spiders/news_sohu.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from scrapy.spiders import CrawlSpider, Rule
3 | from ..items import newsItem
4 | from scrapy.linkextractors import LinkExtractor
5 | from scrapy.selector import Selector
6 | import re, requests, time
7 | import json
8 |
9 |
10 | count = 50000
11 |
12 | class newssohu_Spider(CrawlSpider):
13 | # 爬虫名称
14 | name = "sohunews"
15 | # 伪装成浏览器
16 | headers = {
17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
18 | }
19 | #网易全网
20 | allowed_domains = [
21 | "sohu.com"
22 | ]
23 | #新闻版
24 | start_urls = [
25 | "http://news.sohu.com/"
26 | ]
27 | #可以继续访问的url规则,http://news.163.com/\d\d\d\d\d(/([\w\._+-])*)*$
28 | rules = [
29 | Rule(LinkExtractor(
30 | allow=('http://www.sohu.com(/([\w\._+-])*)*$')),
31 | callback="parse_item",
32 | follow=True)
33 | ]
34 |
35 |
36 | def parse_item(self, response):
37 | global count
38 | # response是当前url的响应
39 | article = Selector(response)
40 | url = response.url
41 | # 分析网页类型
42 | # http://www.sohu.com/a/207094395_162758?_f=index_chan08news_0
43 | if get_category(article) == 1:
44 | articleXpath = '//*[@id="article-container"]/div[2]'
45 | if article.xpath(articleXpath):#如果文章页面存在
46 | titleXpath = '//*[@id="article-container"]/div[2]/div[1]/div[1]/h1/text()'
47 | dateXpath = '//*[@id="news-time"]/text()'
48 | contentXpath = '//*[@id="article-container"]/div[2]/div[1]/article'
49 | news_infoXpath = '/html/body/script[5]/text()'
50 | # 实例化条目
51 |
52 | # 标题
53 | if article.xpath(titleXpath):
54 | news_item = newsItem()
55 | get_title(article, titleXpath, news_item)
56 | news_item['url'] = url
57 | # 日期
58 | if article.xpath(dateXpath):
59 | get_date(article, dateXpath, news_item)
60 | # 内容
61 | if article.xpath(contentXpath):
62 | get_content(article, contentXpath, news_item)
63 | count += 1
64 | news_item['id'] = count
65 | # 评论
66 | try:
67 | comment_url = get_comment_url(article, news_infoXpath)
68 | # 评论处理
69 | comments = get_comment(comment_url, news_item)[1]
70 | news_item['comments'] = comments
71 | except:
72 | news_item['comments'] = ' '
73 | news_item['heat'] = 0
74 | yield news_item
75 |
76 |
77 | '''Shared title extraction helper'''
78 | def get_title(article, titleXpath, news_item):
79 |     # Title
80 | try:
81 | article_title = article.xpath(titleXpath).extract()[0]
82 | article_title = article_title.replace('\n', '')
83 | article_title = article_title.replace('\r', '')
84 | article_title = article_title.replace('\t', '')
85 | article_title = article_title.replace(' ', '')
86 | news_item['title'] = article_title
87 | except:
88 | news_item['title'] = ' '
89 |
90 |
91 | '''Shared date extraction helper'''
92 | def get_date(article, dateXpath, news_item):
93 |     # Date
94 | try:
95 | article_date = article.xpath(dateXpath).extract()[0]
96 | pattern = re.compile("(\d.*\d)") # 正则匹配新闻时间
97 | article_datetime = pattern.findall(article_date)[0]
98 | #article_datetime = datetime.datetime.strptime(article_datetime, "%Y-%m-%d %H:%M:%S")
99 | news_item['date'] = article_datetime
100 | except:
101 | news_item['date'] = '2010-10-01 17:00:00'
102 |
103 | '''Page layout classification helper'''
104 | def get_category(article):
105 |     # xpath() returns an empty list rather than raising, so test the result directly
106 |     if article.xpath('//*[@id="article-container"]/div[2]'):
107 |         case = 1 # domestic news page
108 |         return case
109 |     else:
110 |         return
111 |
112 | '''Character filtering helper'''
113 | def str_replace(content):
114 | # article_content = ' '.join(content)
115 | # rule = re.compile('\w')
116 | try:
117 | article_content = re.sub('[\sa-zA-Z\[\]!/*(^)$%~@#…&¥—+=_<>.{}\'\-:;"‘’|]', '', content)
118 | return article_content
119 | except:
120 | return content
121 |
122 | '''Shared body-text extraction helper'''
123 | def get_content(article, contentXpath, news_item):
124 | try:
125 | content_data = article.xpath(contentXpath )
126 | article_content = content_data.xpath('string(.)').extract()[0]
127 | article_content = str_replace(article_content)
128 | news_item['content'] = article_content
129 |         # Abstract: the first 100 characters of the body
130 |         try:
131 |             abstract = article_content[0:100]
132 |             news_item['abstract'] = abstract
133 |         except:
134 |             news_item['abstract'] = article_content
135 | # except 2:
136 | # index = article_content.find('。')
137 | # abstract = article_content[0:index]
138 | # news_item['abstract'] = abstract
139 | except:
140 | news_item['content'] = ' '
141 | news_item['abstract'] = ' '
142 |
143 |
144 | '''Comment URL construction helper'''
145 | def get_comment_url(article, news_infoXpath):
146 | news_info = article.xpath(news_infoXpath)#包含评论信息的变量
147 | news_info_text = news_info.extract()[0]
148 | pattern = re.compile("news_id:(.*)")#正则匹配新闻id
149 | news_id_text = pattern.findall(news_info_text)[0]
150 | news_id = re.findall(r"\"(.*)\"", news_id_text)
151 | comment_url = 'http://apiv2.sohu.com/api/comment/list?page_size&topic_id=1&source_id=mp_' + news_id[0]
152 | return comment_url
153 |
154 | '''Comment fetching and parsing helper'''
155 | def get_comment(comment_url, news_item):
156 | comments = []
157 | comment_id = 0
158 | try:
159 | comment_data = requests.get(comment_url).text
160 | js_comment = json.loads(comment_data)
161 | try:
162 | jsObj = js_comment['jsonObject']
163 | heat = jsObj['participation_sum']
164 | news_item['heat'] = heat
165 | js_comments = jsObj['comments']
166 | for each in js_comments:
167 | comment_id += 1
168 | comments_dict = {}
169 | #评论id
170 | comments_dict['id'] = comment_id
171 | #评论用户名
172 | comments_dict['username'] = each['passport']['nickname']
173 | try:
174 | #评论时间,datetime格式
175 | timestamp = int(each['create_time']/1000)
176 | timeArray = time.localtime(timestamp)
177 | date_time = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
178 | comments_dict['date_time'] = date_time
179 | except:
180 | comments_dict['date_time'] = news_item['date']
181 |
182 | # 评论内容
183 | ori_content = each['content']
184 | comments_dict['content'] = str_replace(ori_content)
185 | comments.append(comments_dict)
186 | if comments:
187 | return heat, comments
188 | else:
189 | return 0, ''
190 | except:
191 | return 0, ''
192 | except:
193 | return 0, ''
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
--------------------------------------------------------------------------------