├── .gitignore
├── README.md
├── newscrawler
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── newsspider.py
├── runspiders.py
├── save2xml.py
└── scrapy.cfg
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.log

docs/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# newscrawler

[![Join the chat at https://gitter.im/tankle/newscrawler](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/tankle/newscrawler?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

A crawler for news websites. It currently crawls news pages from four sites: NetEase (news.163.com), Sina (news.sina.com.cn), Tencent QQ (news.qq.com) and Sohu (www.sohu.com).

## Usage

    python runspiders.py

## JSON file

Each news article is saved as a JSON file with the following fields:

- newsId: the id of the article
- source: the source of the news, such as news.163.com, news.sina.com.cn or news.qq.com
- date: the creation date of the news, e.g. 20150529
- contents:
    - link: the link of the news page
    - title: the title of the news
    - passage: the body text of the news

The title and passage are stored as Unicode strings, so handle the encoding accordingly when you load the file.

## Other

save2xml.py converts the saved JSON files into XML.

The XML files can then be tagged by [TemporaliaChTagger](https://github.com/ntcirtemporalia/TemporaliaChTagger.git).

### Reference

[news-combinator](https://github.com/fanfank/news-combinator.git)
--------------------------------------------------------------------------------
/newscrawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tankle/newscrawler/a9dc40c9bf02f2e82781d6148cef6fa9f7a58d3b/newscrawler/__init__.py
--------------------------------------------------------------------------------
/newscrawler/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class NewsItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    source = Field()
    date = Field()
    newsId = Field()
    contents = Field()

    def __str__(self):
        return "news downloading ... "
--------------------------------------------------------------------------------
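Note: the NewsItem fields above are exactly what the pipeline serializes into the JSON files described in the README. A minimal sketch of reading one saved article back; the path and newsId below are hypothetical placeholders, not real crawl output:

    # -*- coding: utf-8 -*-
    # Sketch: load one article written by NewscrawlerPipeline and access its fields.
    # Replace the hypothetical path with a real file from your docs/ directory.
    import io
    import json

    news_path = 'docs/news.163.com/20150529/EXAMPLE_ID.json'  # hypothetical example

    with io.open(news_path, 'r', encoding='utf-8') as f:
        article = json.load(f)

    print(article['source'])                     # e.g. news.163.com
    print(article['date'])                       # e.g. 20150529
    print(article['contents']['title'])          # Unicode title
    print(article['contents']['passage'][:100])  # first 100 characters of the body

--------------------------------------------------------------------------------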
" 21 | -------------------------------------------------------------------------------- /newscrawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import os 9 | import json 10 | import codecs 11 | 12 | class NewscrawlerPipeline(object): 13 | def __init__(self): 14 | self.current_dir = os.getcwd() 15 | 16 | def process_item(self, item, spider): 17 | dir_path = self.current_dir + '/docs/' + item['source'] + '/' + item['date'] 18 | if not os.path.exists(dir_path): 19 | os.makedirs(dir_path) 20 | 21 | news_file_path = dir_path + '/' + item['newsId'] + '.json' 22 | if os.path.exists(news_file_path) and os.path.isfile(news_file_path): 23 | print '---------------------------------------' 24 | print item['newsId'] + '.json exists, not overriden' 25 | print '---------------------------------------' 26 | return item 27 | 28 | news_file = codecs.open(news_file_path, 'w', 'utf-8') 29 | line = json.dumps(dict(item)) 30 | news_file.write(line) 31 | news_file.close() 32 | return item 33 | -------------------------------------------------------------------------------- /newscrawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for newscrawler project 4 | # 5 | # For simplicity, this file contains only the most important settings by 6 | # default. All the other settings are documented here: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # 10 | 11 | BOT_NAME = 'newscrawler' 12 | 13 | SPIDER_MODULES = ['newscrawler.spiders'] 14 | NEWSPIDER_MODULE = 'newscrawler.spiders' 15 | 16 | 17 | ITEM_PIPELINES = { 18 | 'newscrawler.pipelines.NewscrawlerPipeline': 1, 19 | } 20 | 21 | 22 | DOWNLOAD_DELAY = 5 23 | 24 | LOG_FILE = "news.crawl.log" 25 | 26 | COOKIES_ENABLED = False 27 | 28 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 29 | #USER_AGENT = 'newscrawler (+http://www.yourdomain.com)' 30 | -------------------------------------------------------------------------------- /newscrawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
/newscrawler/spiders/newsspider.py:
--------------------------------------------------------------------------------

from newscrawler.items import NewsItem

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector

import re


def ListCombiner(lst):
    return ''.join(lst)


class NeteaseNewsSpider(CrawlSpider):
    name = 'netease_news_spider'
    allowed_domains = ['news.163.com']
    start_urls = ['http://news.163.com']
    url_pattern = r'(http://news\.163\.com)/(\d{2})/(\d{4})/\d+/(\w+)\.html'
    rules = [Rule(LxmlLinkExtractor(allow=[url_pattern]), 'parse_news', follow=True)]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))

        item = NewsItem()
        item['source'] = 'news.163.com'  # pattern.group(1)
        item['date'] = '20' + pattern.group(2) + pattern.group(3)
        item['newsId'] = pattern.group(4)
        item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
        item['contents']['title'] = sel.xpath("//h1[@id='h1title']/text()").extract()[0]
        item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
        return item


class SinaNewsSpider(CrawlSpider):
    name = 'sina_news_spider'
    allowed_domains = ['news.sina.com.cn']
    start_urls = ['http://news.sina.com.cn']
    url_pattern = r'(http://(?:\w+\.)*news\.sina\.com\.cn)/.*/(\d{4}-\d{2}-\d{2})/\d{4}(\d{8})\.(?:s)html'
    rules = [Rule(LxmlLinkExtractor(allow=[url_pattern]), 'parse_news', follow=True)]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))

        item = NewsItem()
        item['source'] = 'news.sina.com.cn'  # pattern.group(1)
        item['date'] = ListCombiner(str(pattern.group(2)).split('-'))
        item['newsId'] = sel.re(r'comment_id:(\d-\d-\d+)')[0]
        item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
        item['contents']['title'] = sel.xpath("//h1[@id='artibodyTitle']/text()").extract()[0]
        item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
        return item


class TencentNewsSpider(CrawlSpider):
    name = 'tencent_news_spider'
    allowed_domains = ['news.qq.com']
    start_urls = ['http://news.qq.com']
    url_pattern = r'(.*)/a/(\d{8})/(\d+)\.htm'
    rules = [Rule(LxmlLinkExtractor(allow=[url_pattern]), 'parse_news', follow=True)]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))
        item = NewsItem()
        item['source'] = 'news.qq.com'  # pattern.group(1)
        item['date'] = pattern.group(2)
        item['newsId'] = pattern.group(3)

        item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
        title = sel.xpath('//h1/text()').extract()
        if len(title) > 0:
            item['contents']['title'] = title[0]
        else:
            title = sel.xpath("//div[@id='ArticleTit']/text()").extract()
            item['contents']['title'] = title[0]

        item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
        return item


class SohuNewsSpider(CrawlSpider):
    name = 'sohu_news_spider'
    allowed_domains = ['sohu.com']
    start_urls = ['http://www.sohu.com']
    url_pattern = r'(http://.*?\.sohu\.com)/(\d{8})/(\w+)\.shtml'
    rules = [Rule(LxmlLinkExtractor(allow=[url_pattern]), 'parse_news', follow=True)]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))

        item = NewsItem()
        item['source'] = 'www.sohu.com'  # pattern.group(1)
        item['date'] = pattern.group(2)
        item['newsId'] = pattern.group(3)
        item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
        title = sel.xpath("//h1[@itemprop='headline']/text()").extract()
        if len(title) == 0:
            return None
        item['contents']['title'] = title[0]
        item['contents']['passage'] = ListCombiner(sel.xpath('//div[@id="contentText"]/p/text()').extract())
        return item
--------------------------------------------------------------------------------
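Note: each spider derives date and newsId from the article URL by running re.match against its url_pattern. A small sketch of how the NetEase pattern's groups become item fields; the URL below is a made-up example shaped to match the pattern, not a real article:

    # -*- coding: utf-8 -*-
    # Sketch: how NeteaseNewsSpider.url_pattern turns a URL into date and newsId.
    import re

    url_pattern = r'(http://news\.163\.com)/(\d{2})/(\d{4})/\d+/(\w+)\.html'
    sample_url = 'http://news.163.com/15/0529/10/EXAMPLEID.html'  # hypothetical URL

    m = re.match(url_pattern, sample_url)
    date = '20' + m.group(2) + m.group(3)   # '20' + '15' + '0529' -> '20150529'
    news_id = m.group(4)                    # -> 'EXAMPLEID'
    print('%s %s' % (date, news_id))

--------------------------------------------------------------------------------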
/runspiders.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# runspiders.py

import os

os.system('scrapy crawl tencent_news_spider &')
os.system('scrapy crawl netease_news_spider &')
os.system('scrapy crawl sina_news_spider &')
os.system('scrapy crawl sohu_news_spider &')
--------------------------------------------------------------------------------
/save2xml.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import json
import codecs
import os
import sys
from dateutil.parser import parse


# Template for one news document in the output XML, filled in by change2XML().
# NOTE: the tag names below are reconstructed to follow the TemporaliaChTagger-style
# layout implied by the argument order (id, source, date, link, title, encoding, passage);
# adjust them if the tagger expects a different schema.
STR = '''
<doc id="%s">
<meta-info>
<tag name="host">%s</tag>
<tag name="date">%s</tag>
<tag name="url">%s</tag>
<tag name="title">%s</tag>
<tag name="source-encoding">%s</tag>
</meta-info>
<text>
%s
</text>
</doc>
'''


def change2XML(inname, outfile):
    news_file = open(inname, 'r')
    js = json.load(news_file)
    news_file.close()
    newid = js["date"] + "-" + js["newsId"]
    source = js["source"]
    date = parse(js["date"]).date()
    link = js["contents"]["link"]
    title = unicode(js["contents"]["title"])
    passage = unicode(js["contents"]["passage"])
    outfile.write(STR % (newid, source, date, link, title, "UTF-8", passage))


def changeFiles(dirname, outname):
    names = os.listdir(dirname)
    outfile = codecs.open(outname, "w", "utf-8")
    for name in names:
        inname = dirname + os.sep + name
        change2XML(inname, outfile)
    print("total %d file(s) saved into %s" % (len(names), outname))
    outfile.close()


def changeDir(indirname, outdirname):
    '''
    Example: by default the crawled files are laid out as

        docs/
            news.163.com/
                20150529/
                    2244534.json

    :param indirname: root directory of the saved JSON files
    :param outdirname: root directory for the XML output
    :return:
    '''
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)
    names = os.listdir(indirname)
    for news_name in names:
        print(news_name)
        dir_path = indirname + os.sep + news_name
        if os.path.isdir(dir_path):
            time_names = os.listdir(dir_path)
            for time_name in time_names:
                dir_path2 = indirname + os.sep + news_name + os.sep + time_name
                if os.path.isdir(dir_path2):
                    outname = outdirname + os.sep + news_name + "." + time_name + ".xml"
                    changeFiles(dir_path2, outname)


if __name__ == "__main__":
    #changeDir("docs/", "outdir")
    if len(sys.argv) != 3:
        print("Usage:\n python save2xml.py indirname outdirname")
        exit(0)
    indirname = sys.argv[1]
    outdirname = sys.argv[2]
    print("saving into dir %s " % outdirname)
    changeDir(indirname, outdirname)
--------------------------------------------------------------------------------
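Note: save2xml.py walks the docs/ tree produced by the pipeline and writes one XML file per source and per day. The same conversion can also be driven from Python directly; "docs" and "outdir" below are example directory names:

    # -*- coding: utf-8 -*-
    # Sketch: equivalent to running "python save2xml.py docs outdir" from the project root.
    from save2xml import changeDir

    # docs/news.163.com/20150529/<newsId>.json  ->  outdir/news.163.com.20150529.xml
    changeDir('docs', 'outdir')

--------------------------------------------------------------------------------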
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = newscrawler.settings

[deploy]
#url = http://localhost:6800/
project = newscrawler
--------------------------------------------------------------------------------