├── .gitignore
├── README.md
├── index.js
├── lib
└── rss.js
├── package.json
└── test
└── index.js
/.gitignore:
--------------------------------------------------------------------------------
1 | */node-log.log
2 | *.log
3 | !logs/
4 | !.gitignore
5 | node_modules/*
6 | .idea/*
7 |
8 |
9 | .idea/workspace.xml
10 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # rssSpider
2 |
3 | Design and coding with all the love in the world by ShaneLau.
4 |
5 |
6 |
7 | > The simplest way to use rssspider to fetch an RSS post list and site info.
8 | > Fetches each post's content and gives you a clean view of it.
9 | >rss 爬虫,快速抓取站点信息和文章列表,文章的正文抓取
10 |
11 | This project is based on [feedparser](https://github.com/kballard/feedparser) and [node-readability](https://github.com/luin/node-readability).
12 |
13 |
14 |
15 | ## Usage
16 |
17 | ```
18 | npm install rssspider
19 | ```
20 | Then:
21 |
22 | ```
23 | var spide = require('rssspider');
24 | var url = 'http://www.bigertech.com/rss';
25 | spide.fetchRss(url).then(function(data){
26 | console.log(data); // rss post list
27 | });
28 | ```
29 |
30 | ## API Documentation
31 |
32 | ### 1. fetchRss(url,[options])
33 |
34 | Gets the post list of an RSS site, such as [www.bigertech.com/rss](http://www.bigertech.com/rss).
35 |
36 | * **url**: the website's RSS URL
37 | * **options**: which fields to return for each post. Default value:
38 |
39 | ```
40 | ['title','description','summary','date','link','guid','author','comments','origlink','image','source','categories','enclosures']
41 | ```
42 | response data
43 | **Array**
44 |
45 | ```
46 | [{ title: '一个营销人员的自我修养',
47 | description: '
siteInfo(url,[options])
65 | Gets the website info.
66 |
67 | * **url**: the website's RSS URL
68 | * **options**: which fields to return. Default value:
69 |
70 | ```
71 | ['title','description','date','link','xmlurl','author','favicon','copyright','generator','image']
72 |
73 | ```
74 | response data **Object**
75 |
76 | ```
77 | { title: '笔戈科技',
78 | description: '简单、有趣、有价值',
79 | date: Thu Oct 09 2014 18:15:14 GMT+0800 (CST),
80 | link: 'http://www.bigertech.com/',
81 | xmlurl: 'http://www.bigertech.com/rss/',
82 | author: null,
83 | favicon: null,
84 | copyright: null,
85 | generator: 'Ghost 0.5',
86 | image: {},
87 | feedurl: 'http://www.bigertech.com/rss' }
88 | ```
89 |
90 |
91 | **以下功能在 1.2.0 才能使用, readability 的库支持不是很好**
92 |
93 | ### 3. `getCleanBody(url)`
94 |
95 | Turn any web page into a clean view. This module is based on arc90's readability project.
96 |
97 | * **html** url or html code.
98 | * **options** is an optional options object
99 | * **callback** is the callback to run - `callback(error, article, meta)`
100 |
101 |
102 | ```
103 | var url = 'http://www.bigertech.com/learn-social-media-marketing/';
104 | spide.getCleanBody(url).then(function(article){
105 | console.log(article.content); //clean code view
106 | });
107 | ```
108 |
109 | ##### More info [node-readability](https://github.com/luin/node-readability)
110 |
111 |
112 | #### article.content is clean view
113 |
114 | The article content of the web page. Returns `false` on failure.
115 |
116 |
117 |
118 | ### 4. getAllByUrl(url,[options])
119 | This method is similar to **fetchRss**
120 | #### What's more, it fetches the clean page content.
121 | Turn any web page into a clean view. This module is based on arc90's readability project.
122 |
123 | * **url** the website's RSS URL
124 |
125 | * **Array** response data
126 |
127 | The clean view code is returned in each item's **content** field.
128 |
129 | ```
130 |
131 | [{ title: '一个营销人员的自我修养',
132 | content:'clean code view', // clean code view
133 | description: '',
134 | summary: '',
135 | date: Wed Oct 08 2014 17:14:26 GMT+0800 (CST),
136 | link: 'http://www.bigertech.com/learn-social-media-marketing/',
137 | guid: 'a623d78a-dae9-4915-9caa-0fd34fb3757c',
138 | author: '巴依老爷',
139 | comments: null,
140 | origlink: null,
141 | image: {},
142 | source: {},
143 | categories: [],
144 | enclosures: [] },
145 | ....... // more
146 | ]
147 |
148 | ```
149 |
150 | ## test 100%
151 | ```
152 | nodeunit test/index.js
153 |
154 | ```
155 |
156 | ## upgrade
157 | Add node 4.x support
158 |
159 |
160 | ### Any questions? Contact [shanelau](http://weibo.com/kissliux)
161 | or
162 | [shanelau1021@gmail.com](mailto:shanelau1021@gmail.com)
163 |
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by liuxing on 14-9-22.
3 | */
4 | module.exports = require('./lib/rss');
--------------------------------------------------------------------------------
/lib/rss.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2014 Meizu bigertech, All rights reserved.
3 | * http://www.bigertech.com/
4 | * @author liuxing
5 | * @date 15/3/16
6 | * @description
7 | *
8 | */
9 | var Promise = require('bluebird'),
10 | FeedParser = require('feedparser'),
11 | _ = require('lodash'),
12 | request = require('request'),
13 | // read = require('node-readability'),
14 | iconv = require('iconv-lite'),
15 | es = require('event-stream'),
16 | postOptions = ['title', 'description', 'summary', 'date', 'link',
17 | 'guid', 'author', 'comments', 'origlink', 'image', 'source', 'categories',
18 | 'enclosures'
19 | ],
20 | siteInfoOption = ['title', 'description', 'date', 'link', 'xmlurl', 'author',
21 | 'favicon', 'copyright', 'generator', 'image'
22 | ];
23 |
24 | /**
25 | * get all post info ,by rss url
26 | * @param url
27 | * @param options
28 | * @returns {Promise}
29 | */
30 | function fetchRss(url, options) {
31 | options = options || postOptions;
32 |
33 | return new Promise(function(resolve, reject) {
34 | var posts, encoding;
35 | var req = request(url, {
36 | timeout: 10000,
37 | pool: false
38 | });
39 | req.setMaxListeners(50);
40 | req.setHeader('user-agent',
41 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
42 | )
43 | req.setHeader('accept', 'text/html,application/xhtml+xml');
44 |
45 | var feedparser = new FeedParser();
46 |
47 | req.on('error', reject);
48 |
49 | req.on('response', function(res) {
50 | var stream = this;
51 | posts = [];
52 |
53 | if (res.statusCode !== 200) {
54 | return this.emit('error', new Error('Bad status code'));
55 | }
56 |
57 |
58 | }).pipe(es.through(function(data) {
59 |
60 | //get charset from