├── .gitignore
├── README.md
├── newscrawler
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── newsspider.py
├── runspiders.py
├── save2xml.py
└── scrapy.cfg
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.log

docs/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# newscrawler

[![Join the chat at https://gitter.im/tankle/newscrawler](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/tankle/newscrawler?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

A crawler for news websites. It currently crawls news pages from four sites: NetEase (news.163.com), Sina (news.sina.com.cn), Tencent QQ (news.qq.com) and Sohu (www.sohu.com).

## Usage

    python runspiders.py

## JSON file

Each news article is saved as a JSON file with the following fields:

- newsId: the id of the article
- source: the source of the news, such as news.163.com, news.sina.com.cn or news.qq.com
- date: the creation date of the news, e.g. 20150529
- contents:
    - link: the link of the news page
    - title: the title of the news
    - passage: the body text of the news

The title and passage are stored as Unicode strings, so handle the encoding accordingly when you load the file.

## Other

save2xml.py converts the saved JSON files into XML.

The XML files can then be tagged by [TemporaliaChTagger](https://github.com/ntcirtemporalia/TemporaliaChTagger.git).

### Reference

[news-combinator](https://github.com/fanfank/news-combinator.git)
--------------------------------------------------------------------------------
/newscrawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tankle/newscrawler/a9dc40c9bf02f2e82781d6148cef6fa9f7a58d3b/newscrawler/__init__.py
--------------------------------------------------------------------------------
/newscrawler/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class NewsItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    source = Field()
    date = Field()
    newsId = Field()
    contents = Field()

    def __str__(self):
        return "news downloading ... "
--------------------------------------------------------------------------------
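Note: the NewsItem fields above are exactly what the pipeline serializes into the JSON files described in the README. A minimal sketch of reading one saved article back; the path and newsId below are hypothetical placeholders, not real crawl output:

    # -*- coding: utf-8 -*-
    # Sketch: load one article written by NewscrawlerPipeline and access its fields.
    # Replace the hypothetical path with a real file from your docs/ directory.
    import io
    import json

    news_path = 'docs/news.163.com/20150529/EXAMPLE_ID.json'  # hypothetical example

    with io.open(news_path, 'r', encoding='utf-8') as f:
        article = json.load(f)

    print(article['source'])                     # e.g. news.163.com
    print(article['date'])                       # e.g. 20150529
    print(article['contents']['title'])          # Unicode title
    print(article['contents']['passage'][:100])  # first 100 characters of the body

--------------------------------------------------------------------------------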
" 21 | -------------------------------------------------------------------------------- /newscrawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import os 9 | import json 10 | import codecs 11 | 12 | class NewscrawlerPipeline(object): 13 | def __init__(self): 14 | self.current_dir = os.getcwd() 15 | 16 | def process_item(self, item, spider): 17 | dir_path = self.current_dir + '/docs/' + item['source'] + '/' + item['date'] 18 | if not os.path.exists(dir_path): 19 | os.makedirs(dir_path) 20 | 21 | news_file_path = dir_path + '/' + item['newsId'] + '.json' 22 | if os.path.exists(news_file_path) and os.path.isfile(news_file_path): 23 | print '---------------------------------------' 24 | print item['newsId'] + '.json exists, not overriden' 25 | print '---------------------------------------' 26 | return item 27 | 28 | news_file = codecs.open(news_file_path, 'w', 'utf-8') 29 | line = json.dumps(dict(item)) 30 | news_file.write(line) 31 | news_file.close() 32 | return item 33 | -------------------------------------------------------------------------------- /newscrawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for newscrawler project 4 | # 5 | # For simplicity, this file contains only the most important settings by 6 | # default. All the other settings are documented here: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # 10 | 11 | BOT_NAME = 'newscrawler' 12 | 13 | SPIDER_MODULES = ['newscrawler.spiders'] 14 | NEWSPIDER_MODULE = 'newscrawler.spiders' 15 | 16 | 17 | ITEM_PIPELINES = { 18 | 'newscrawler.pipelines.NewscrawlerPipeline': 1, 19 | } 20 | 21 | 22 | DOWNLOAD_DELAY = 5 23 | 24 | LOG_FILE = "news.crawl.log" 25 | 26 | COOKIES_ENABLED = False 27 | 28 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 29 | #USER_AGENT = 'newscrawler (+http://www.yourdomain.com)' 30 | -------------------------------------------------------------------------------- /newscrawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
/newscrawler/spiders/newsspider.py:
--------------------------------------------------------------------------------

from newscrawler.items import NewsItem

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector

import re


def ListCombiner(lst):
    return ''.join(lst)


class NeteaseNewsSpider(CrawlSpider):
    name = 'netease_news_spider'
    allowed_domains = ['news.163.com']
    start_urls = ['http://news.163.com']
    url_pattern = r'(http://news\.163\.com)/(\d{2})/(\d{4})/\d+/(\w+)\.html'
    rules = [Rule(LxmlLinkExtractor(allow=[url_pattern]), 'parse_news', follow=True)]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))

        item = NewsItem()
        item['source'] = 'news.163.com'  # pattern.group(1)
        item['date'] = '20' + pattern.group(2) + pattern.group(3)
        item['newsId'] = pattern.group(4)
        item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
        item['contents']['title'] = sel.xpath("//h1[@id='h1title']/text()").extract()[0]
        item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
        return item


class SinaNewsSpider(CrawlSpider):
    name = 'sina_news_spider'
    allowed_domains = ['news.sina.com.cn']
    start_urls = ['http://news.sina.com.cn']
    url_pattern = r'(http://(?:\w+\.)*news\.sina\.com\.cn)/.*/(\d{4}-\d{2}-\d{2})/\d{4}(\d{8})\.(?:s)html'
    rules = [Rule(LxmlLinkExtractor(allow=[url_pattern]), 'parse_news', follow=True)]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))

        item = NewsItem()
        item['source'] = 'news.sina.com.cn'  # pattern.group(1)
        item['date'] = ListCombiner(str(pattern.group(2)).split('-'))
        item['newsId'] = sel.re(r'comment_id:(\d-\d-\d+)')[0]
        item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
        item['contents']['title'] = sel.xpath("//h1[@id='artibodyTitle']/text()").extract()[0]
        item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
        return item


class TencentNewsSpider(CrawlSpider):
    name = 'tencent_news_spider'
    allowed_domains = ['news.qq.com']
    start_urls = ['http://news.qq.com']
    url_pattern = r'(.*)/a/(\d{8})/(\d+)\.htm'
    rules = [Rule(LxmlLinkExtractor(allow=[url_pattern]), 'parse_news', follow=True)]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))
        item = NewsItem()
        item['source'] = 'news.qq.com'  # pattern.group(1)
        item['date'] = pattern.group(2)
        item['newsId'] = pattern.group(3)

        item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
        title = sel.xpath('//h1/text()').extract()
        if len(title) > 0:
            item['contents']['title'] = title[0]
        else:
            title = sel.xpath("//div[@id='ArticleTit']/text()").extract()
            item['contents']['title'] = title[0]

        item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
        return item


class SohuNewsSpider(CrawlSpider):
    name = 'sohu_news_spider'
    allowed_domains = ['sohu.com']
    start_urls = ['http://www.sohu.com']
    url_pattern = r'(http://.*?\.sohu\.com)/(\d{8})/(\w+)\.shtml'
    rules = [Rule(LxmlLinkExtractor(allow=[url_pattern]), 'parse_news', follow=True)]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))

        item = NewsItem()
        item['source'] = 'www.sohu.com'  # pattern.group(1)
        item['date'] = pattern.group(2)
        item['newsId'] = pattern.group(3)
        item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
        title = sel.xpath("//h1[@itemprop='headline']/text()").extract()
        if len(title) == 0:
            return None
        item['contents']['title'] = title[0]
        item['contents']['passage'] = ListCombiner(sel.xpath('//div[@id="contentText"]/p/text()').extract())
        return item
--------------------------------------------------------------------------------
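Note: each spider derives date and newsId from the article URL by running re.match against its url_pattern. A small sketch of how the NetEase pattern's groups become item fields; the URL below is a made-up example shaped to match the pattern, not a real article:

    # -*- coding: utf-8 -*-
    # Sketch: how NeteaseNewsSpider.url_pattern turns a URL into date and newsId.
    import re

    url_pattern = r'(http://news\.163\.com)/(\d{2})/(\d{4})/\d+/(\w+)\.html'
    sample_url = 'http://news.163.com/15/0529/10/EXAMPLEID.html'  # hypothetical URL

    m = re.match(url_pattern, sample_url)
    date = '20' + m.group(2) + m.group(3)   # '20' + '15' + '0529' -> '20150529'
    news_id = m.group(4)                    # -> 'EXAMPLEID'
    print('%s %s' % (date, news_id))

--------------------------------------------------------------------------------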
/runspiders.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# runspiders.py

import os

os.system('scrapy crawl tencent_news_spider &')
os.system('scrapy crawl netease_news_spider &')
os.system('scrapy crawl sina_news_spider &')
os.system('scrapy crawl sohu_news_spider &')
--------------------------------------------------------------------------------
/save2xml.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import json
import codecs
import os
import sys
from dateutil.parser import parse


# Template for one news document in the output XML, filled in by change2XML().
# NOTE: the tag names below are reconstructed to follow the TemporaliaChTagger-style
# layout implied by the argument order (id, source, date, link, title, encoding, passage);
# adjust them if the tagger expects a different schema.
STR = '''
<doc id="%s">
<meta-info>
<tag name="host">%s</tag>
<tag name="date">%s</tag>
<tag name="url">%s</tag>
<tag name="title">%s</tag>
<tag name="source-encoding">%s</tag>
</meta-info>
<text>
%s
</text>
</doc>
'''


def change2XML(inname, outfile):
    news_file = open(inname, 'r')
    js = json.load(news_file)
    news_file.close()
    newid = js["date"] + "-" + js["newsId"]
    source = js["source"]
    date = parse(js["date"]).date()
    link = js["contents"]["link"]
    title = unicode(js["contents"]["title"])
    passage = unicode(js["contents"]["passage"])
    outfile.write(STR % (newid, source, date, link, title, "UTF-8", passage))


def changeFiles(dirname, outname):
    names = os.listdir(dirname)
    outfile = codecs.open(outname, "w", "utf-8")
    for name in names:
        inname = dirname + os.sep + name
        change2XML(inname, outfile)
    print("total %d file(s) saved into %s" % (len(names), outname))
    outfile.close()


def changeDir(indirname, outdirname):
    '''
    Example: by default the crawled files are laid out as

        docs/
            news.163.com/
                20150529/
                    2244534.json

    :param indirname: root directory of the saved JSON files
    :param outdirname: root directory for the XML output
    :return:
    '''
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)
    names = os.listdir(indirname)
    for news_name in names:
        print(news_name)
        dir_path = indirname + os.sep + news_name
        if os.path.isdir(dir_path):
            time_names = os.listdir(dir_path)
            for time_name in time_names:
                dir_path2 = indirname + os.sep + news_name + os.sep + time_name
                if os.path.isdir(dir_path2):
                    outname = outdirname + os.sep + news_name + "." + time_name + ".xml"
                    changeFiles(dir_path2, outname)


if __name__ == "__main__":
    #changeDir("docs/", "outdir")
    if len(sys.argv) != 3:
        print("Usage:\n python save2xml.py indirname outdirname")
        exit(0)
    indirname = sys.argv[1]
    outdirname = sys.argv[2]
    print("saving into dir %s " % outdirname)
    changeDir(indirname, outdirname)
--------------------------------------------------------------------------------
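Note: save2xml.py walks the docs/ tree produced by the pipeline and writes one XML file per source and per day. The same conversion can also be driven from Python directly; "docs" and "outdir" below are example directory names:

    # -*- coding: utf-8 -*-
    # Sketch: equivalent to running "python save2xml.py docs outdir" from the project root.
    from save2xml import changeDir

    # docs/news.163.com/20150529/<newsId>.json  ->  outdir/news.163.com.20150529.xml
    changeDir('docs', 'outdir')

--------------------------------------------------------------------------------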
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = newscrawler.settings

[deploy]
#url = http://localhost:6800/
project = newscrawler
--------------------------------------------------------------------------------