├── .gitignore
├── README.md
├── index.js
├── lib
│   └── rss.js
├── package.json
└── test
    └── index.js

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*/node-log.log
*.log
!logs/
!.gitignore
node_modules/*
.idea/*


.idea/workspace.xml

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# rssSpider

Designed and coded with all the love in the world by ShaneLau.

> The simplest way to use rssSpider to fetch an RSS post list and site info.
> It fetches each post's content and gives you a clean view of it.
> An RSS crawler: quickly fetch site info and article lists, and extract article bodies.

This project is based on [feedparser](https://github.com/kballard/feedparser) and [node-readability](https://github.com/luin/node-readability).

## Usage

```
npm install rssspider
```

Then:

```
var spide = require('rssspider');
var url = 'http://www.bigertech.com/rss';
spide.fetchRss(url).then(function(data){
  console.log(data); // rss post list
});
```

## API Documentation

### 1. fetchRss(url, [options])

Get an RSS site's post list, e.g. [www.bigertech.com/rss](http://www.bigertech.com/rss).

* **url**: the website's RSS URL
* **options**: the fields you want for each post. Default value:

```
['title','description','summary','date','link','guid','author','comments','origlink','image','source','categories','enclosures']
```

Response data: **Array**

```
[{ title: '一个营销人员的自我修养',
   description: '…',  // HTML body omitted here
   summary: '…',      // HTML summary omitted here
   date: Wed Oct 08 2014 17:14:26 GMT+0800 (CST),
   link: 'http://www.bigertech.com/learn-social-media-marketing/',
   guid: 'a623d78a-dae9-4915-9caa-0fd34fb3757c',
   author: '巴依老爷',
   comments: null,
   origlink: null,
   image: {},
   source: {},
   categories: [],
   enclosures: [] },
  ... // more items
]
```
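
If you only need some of those fields, pass them as the second argument; `fetchRss` then picks exactly the fields you list. A minimal sketch (the feed URL is only an illustration):

```
var spide = require('rssspider');

// Keep only title, link and date for each post, and handle request/parse errors.
spide.fetchRss('http://www.bigertech.com/rss', ['title', 'link', 'date'])
  .then(function(posts){
    posts.forEach(function(post){
      console.log(post.date, post.title, post.link);
    });
  })
  .catch(function(err){
    console.error('fetch failed:', err);
  });
```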

### 2. siteInfo(url, [options])

Get website info.

* **url**: the website's RSS URL
* **options**: the fields you want. Default value:

```
['title','description','date','link','xmlurl','author','favicon','copyright','generator','image']
```

Response data: **Object**

```
{ title: '笔戈科技',
  description: '简单、有趣、有价值',
  date: Thu Oct 09 2014 18:15:14 GMT+0800 (CST),
  link: 'http://www.bigertech.com/',
  xmlurl: 'http://www.bigertech.com/rss/',
  author: null,
  favicon: null,
  copyright: null,
  generator: 'Ghost 0.5',
  image: {},
  feedurl: 'http://www.bigertech.com/rss' }
```
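
For example (a minimal sketch; the feed URL is only an illustration):

```
var spide = require('rssspider');

spide.siteInfo('http://www.bigertech.com/rss').then(function(info){
  console.log(info.title);    // site title
  console.log(info.feedurl);  // the feed URL you passed in
}).catch(function(err){
  console.error(err);
});
```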

**The features below are only available since 1.2.0, and support from the readability library is not very good.** Note that in this repository's `lib/rss.js` they are currently commented out and not exported.

### 3. getCleanBody(url)

Turn any web page into a clean view. This feature is based on arc90's readability project, via node-readability.

* **url**: the page URL (or raw HTML).
* Returns a promise resolving to `article`; internally it wraps node-readability's `read(url, callback(error, article, meta))`.

```
var url = 'http://www.bigertech.com/learn-social-media-marketing/';
spide.getCleanBody(url).then(function(article){
  console.log(article.content); // clean view of the page
});
```

##### More info: [node-readability](https://github.com/luin/node-readability)

#### article.content is the clean view

The article content of the web page. It is `false` if extraction failed.

### 4. getAllByUrl(url, [options])

This method is similar to **fetchRss**.

#### What's more, it fetches the clean page content for every post.

Turn any web page into a clean view. This feature is based on arc90's readability project.

* **url**: the website's RSS URL
* Response data: **Array**

Each post carries an extra clean-view **content** field:

```
[{ title: '一个营销人员的自我修养',
   content: '…',      // clean HTML view of the post body
   description: '…',  // HTML body omitted here
   summary: '…',      // HTML summary omitted here
   date: Wed Oct 08 2014 17:14:26 GMT+0800 (CST),
   link: 'http://www.bigertech.com/learn-social-media-marketing/',
   guid: 'a623d78a-dae9-4915-9caa-0fd34fb3757c',
   author: '巴依老爷',
   comments: null,
   origlink: null,
   image: {},
   source: {},
   categories: [],
   enclosures: [] },
  ... // more items
]
```
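
A minimal usage sketch, assuming `getAllByUrl` is enabled and exported in the version you are running:

```
var spide = require('rssspider');

spide.getAllByUrl('http://www.bigertech.com/rss').then(function(posts){
  posts.forEach(function(post){
    // Each post has the same fields as fetchRss, plus the extracted content.
    console.log(post.title, post.content && post.content.length);
  });
}).catch(function(err){
  console.error(err);
});
```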
## Test

The test suite should pass 100%:

```
nodeunit test/index.js
```

## Upgrade

Added Node 4.x support.

### Any questions?

Contact [shanelau](http://weibo.com/kissliux) or [shanelau1021@gmail.com](mailto:shanelau1021@gmail.com).

--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
/**
 * Created by liuxing on 14-9-22.
 */
module.exports = require('./lib/rss');

--------------------------------------------------------------------------------
/lib/rss.js:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2014 Meizu bigertech, All rights reserved.
 * http://www.bigertech.com/
 * @author liuxing
 * @date 15/3/16
 * @description Fetch RSS post lists and site info from a feed URL.
 */
var Promise = require('bluebird'),
  FeedParser = require('feedparser'),
  _ = require('lodash'),
  request = require('request'),
  // read = require('node-readability'),
  iconv = require('iconv-lite'),
  es = require('event-stream'),
  postOptions = ['title', 'description', 'summary', 'date', 'link',
    'guid', 'author', 'comments', 'origlink', 'image', 'source', 'categories',
    'enclosures'
  ],
  siteInfoOption = ['title', 'description', 'date', 'link', 'xmlurl', 'author',
    'favicon', 'copyright', 'generator', 'image'
  ];

/**
 * Get all post info from an RSS url.
 * @param url
 * @param options  fields to pick for each post (defaults to postOptions)
 * @returns {Promise}  resolves to an array of posts
 */
function fetchRss(url, options) {
  options = options || postOptions;

  return new Promise(function(resolve, reject) {
    var posts, encoding;
    var req = request(url, {
      timeout: 10000,
      pool: false
    });
    req.setMaxListeners(50);
    // Some feeds do not respond without user-agent and accept headers.
    req.setHeader('user-agent',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
    );
    req.setHeader('accept', 'text/html,application/xhtml+xml');

    var feedparser = new FeedParser();

    req.on('error', reject);

    req.on('response', function(res) {
      var stream = this;
      posts = [];

      if (res.statusCode !== 200) {
        return this.emit('error', new Error('Bad status code'));
      }
    }).pipe(es.through(function(data) {
      // Read the charset from the XML declaration (<?xml ... encoding="..."?>),
      // then convert gb2312, gbk, big5 etc. to utf-8.
      var result = data.toString('utf-8');

      var meta = result.match(/<\?(.*?)\?>/g);
      if (meta !== null) {
        meta = meta[0].toString().match(/encoding="(.*?)"\?>/g);
        encoding = meta && meta.toString().split('"')[1];
      }

      // iconv-lite also supports the Windows charsets.
      try {
        result = iconv.decode(data, encoding);
      } catch (err) {
        result = data.toString('utf-8');
      }
      this.emit('data', result);
    })).pipe(feedparser);

    feedparser.on('error', reject);

    feedparser.on('end', function(err) {
      if (err) {
        return reject(err);
      }
      resolve(posts);
    });

    feedparser.on('readable', function() {
      var post;
      while ((post = this.read())) {
        posts.push(_.pick(post, options));
      }
    });
  });
}

/**
 * Get website info.
 * @param url
 * @param options  fields to pick from the feed meta (defaults to siteInfoOption)
 * @returns {Promise}  resolves to the site info object
 */
function siteInfo(url, options) {
  options = options || siteInfoOption;
  return new Promise(function(resolve, reject) {
    var rss, encoding;
    var req = request(url, {
      timeout: 10000,
      pool: false
    });
    req.setMaxListeners(50);
    // Some feeds do not respond without user-agent and accept headers.
    req.setHeader('user-agent',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
    );
    req.setHeader('accept', 'text/html,application/xhtml+xml');
    var feedparser = new FeedParser();
    req.on('error', reject);
    req.on('response', function(res) {
      var stream = this;
      if (res.statusCode !== 200) {
        return this.emit('error', new Error('Bad status code'));
      }
      //charset = getParams(res.headers['content-type'] || '').charset;
      // stream.pipe(feedparser);
    }).pipe(es.through(function(data) {
      // Read the charset from the XML declaration (<?xml ... encoding="..."?>),
      // then convert gb2312, gbk, big5 etc. to utf-8.
      var result = data.toString('utf-8');

      var meta = result.match(/<\?(.*?)\?>/g);
      if (meta !== null) {
        meta = meta[0].toString().match(/encoding="(.*?)"\?>/g);
        encoding = meta && meta.toString().split('"')[1];
      }

      // iconv-lite also supports the Windows charsets.
      try {
        result = iconv.decode(data, encoding);
      } catch (err) {
        result = data.toString('utf-8');
      }
      this.emit('data', result);
    })).pipe(feedparser);

    feedparser.on('error', reject);
    feedparser.on('end', function(err) {
      if (err) {
        return reject(err);
      }
      resolve(rss);
    });
    feedparser.on('readable', function() {
      var post;
      if ((post = this.read())) {
        rss = _.pick(post.meta, options);
        rss.feedurl = url; // the feed url that was requested
        resolve(rss);
      }
    });
  });
}

// /**
//  * Get every post's body content for a list of posts.
//  * @param posts
//  * @returns {*|Promise}
//  */
// function fetchAllContent(posts) {
//   return Promise.reduce(posts, function(total, post) {
//     return getCleanBody(post.link).then(function(article) {
//       post.content = article.content ? article.content : post.description ||
//         post.summary;
//       return post;
//     });
//   }, []).then(function(total) {
//     return posts;
//   });
// }

/**
 * Get all posts and their clean content from an RSS url.
 * @param url
 * @returns {*}
 */
// function getAllByUrl(url) {
//   return fetchRss(url).then(function(posts) {
//     return fetchAllContent(posts);
//   });
// }

// /**
//  * Get the clean body content of a link.
//  * @param link
//  * @returns {Promise}
//  */
// function getCleanBody(link) {
//   return new Promise(function (resolve, reject) {
//     read(link, function (err, article, meta) {
//       if (err) {
//         return reject(err);
//       }
//       resolve(article);
//     });
//   });
// }

module.exports = {
  fetchRss: fetchRss,
  siteInfo: siteInfo
  //fetchAllContent: fetchAllContent,
  //getCleanBody: getCleanBody,
  //getAllByUrl: getAllByUrl
};

--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "rssspider",
  "version": "1.4.3",
  "author": "Liu Xing (http://kissliux.github.io)",
  "description": "The simplest way to fetch an RSS post list and site info. Fetches each post's content and gives you a clean view.",
  "email": "shanelau1021@gmail.com",
  "homepage": "https://github.com/shanelau/rssSpider",
  "main": "index.js",
  "repository": {},
  "engines": {
    "node": ">=0.12.0"
  },
  "scripts": {
    "test": "nodeunit test/index.js"
  },
  "dependencies": {
    "bluebird": "^3.1.1",
    "event-stream": "^3.3.2",
    "feedparser": "^1.1.4",
    "iconv-lite": "^0.4.13",
    "lodash": "^3.10.1",
    "request": "^2.67.0"
  },
  "devDependencies": {
    "nodeunit": "^0.9.1"
  },
  "license": "BSD"
}

--------------------------------------------------------------------------------
/test/index.js:
--------------------------------------------------------------------------------
/**
 * Created by liuxing on 14-10-9.
 */
var spide = require('../index'),
  url = 'http://news.baidu.com/ns?word=%F7%C8%CD%E6%B0%EF&tn=newsrss&from=news&cl=2&rn=20&ct=1';

exports.fetchRSS = function(test) {
  spide.fetchRss(url).then(function(data) {
    test.ok(data.length > 0, "this assertion should pass");
    test.done();
  }).catch(test.done);
};

exports.fetchSiteInfo = function(test) {
  spide.siteInfo(url).then(function(data) {
    test.ok(data.title !== '', "this assertion should pass");
    test.done();
  }).catch(function(err) {
    console.error(err);
    // Finish the test with the error instead of letting nodeunit time out.
    test.done(err);
  });
};

--------------------------------------------------------------------------------