├── .gitignore
├── README.md
├── index.js
├── lib
│   └── rss.js
├── package.json
└── test
    └── index.js

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*/node-log.log
*.log
!logs/
!.gitignore
node_modules/*
.idea/*


.idea/workspace.xml

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# rssSpider

Designed and coded with all the love in the world by ShaneLau.

> The simplest way to use rssSpider to fetch an RSS post list and site info.
> It fetches each post's content and gives you a clean view of it.
> An RSS crawler: quickly fetch site info and article lists, and extract article bodies.

This project is based on [feedparser](https://github.com/kballard/feedparser) and [node-readability](https://github.com/luin/node-readability).

## Usage

```
npm install rssspider
```

Then:

```
var spide = require('rssspider');
var url = 'http://www.bigertech.com/rss';
spide.fetchRss(url).then(function(data){
  console.log(data); // rss post list
});
```

## API Documentation

### 1. fetchRss(url, [options])

Get an RSS site's post list, e.g. [www.bigertech.com/rss](http://www.bigertech.com/rss).

* **url**: the website's RSS URL
* **options**: the fields you want for each post. Default value:

```
['title','description','summary','date','link','guid','author','comments','origlink','image','source','categories','enclosures']
```

Response data: **Array**

```
[{ title: '一个营销人员的自我修养',
   description: '…',  // HTML body omitted here
   summary: '…',      // HTML summary omitted here
   date: Wed Oct 08 2014 17:14:26 GMT+0800 (CST),
   link: 'http://www.bigertech.com/learn-social-media-marketing/',
   guid: 'a623d78a-dae9-4915-9caa-0fd34fb3757c',
   author: '巴依老爷',
   comments: null,
   origlink: null,
   image: {},
   source: {},
   categories: [],
   enclosures: [] },
  ... // more items
]
```
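
If you only need some of those fields, pass them as the second argument; `fetchRss` then picks exactly the fields you list. A minimal sketch (the feed URL is only an illustration):

```
var spide = require('rssspider');

// Keep only title, link and date for each post, and handle request/parse errors.
spide.fetchRss('http://www.bigertech.com/rss', ['title', 'link', 'date'])
  .then(function(posts){
    posts.forEach(function(post){
      console.log(post.date, post.title, post.link);
    });
  })
  .catch(function(err){
    console.error('fetch failed:', err);
  });
```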

### 2. siteInfo(url, [options])

Get website info.

* **url**: the website's RSS URL
* **options**: the fields you want. Default value:

```
['title','description','date','link','xmlurl','author','favicon','copyright','generator','image']
```

Response data: **Object**

```
{ title: '笔戈科技',
  description: '简单、有趣、有价值',
  date: Thu Oct 09 2014 18:15:14 GMT+0800 (CST),
  link: 'http://www.bigertech.com/',
  xmlurl: 'http://www.bigertech.com/rss/',
  author: null,
  favicon: null,
  copyright: null,
  generator: 'Ghost 0.5',
  image: {},
  feedurl: 'http://www.bigertech.com/rss' }
```
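
For example (a minimal sketch; the feed URL is only an illustration):

```
var spide = require('rssspider');

spide.siteInfo('http://www.bigertech.com/rss').then(function(info){
  console.log(info.title);    // site title
  console.log(info.feedurl);  // the feed URL you passed in
}).catch(function(err){
  console.error(err);
});
```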

**The features below are only available since 1.2.0, and support from the readability library is not very good.** Note that in this repository's `lib/rss.js` they are currently commented out and not exported.

### 3. getCleanBody(url)

Turn any web page into a clean view. This feature is based on arc90's readability project, via node-readability.

* **url**: the page URL (or raw HTML).
* Returns a promise resolving to `article`; internally it wraps node-readability's `read(url, callback(error, article, meta))`.

```
var url = 'http://www.bigertech.com/learn-social-media-marketing/';
spide.getCleanBody(url).then(function(article){
  console.log(article.content); // clean view of the page
});
```

##### More info: [node-readability](https://github.com/luin/node-readability)

#### article.content is the clean view

The article content of the web page. It is `false` if extraction failed.

### 4. getAllByUrl(url, [options])

This method is similar to **fetchRss**.

#### What's more, it fetches the clean page content for every post.

Turn any web page into a clean view. This feature is based on arc90's readability project.

* **url**: the website's RSS URL
* Response data: **Array**

Each post carries an extra clean-view **content** field:

```
[{ title: '一个营销人员的自我修养',
   content: '…',      // clean HTML view of the post body
   description: '…',  // HTML body omitted here
   summary: '…',      // HTML summary omitted here
   date: Wed Oct 08 2014 17:14:26 GMT+0800 (CST),
   link: 'http://www.bigertech.com/learn-social-media-marketing/',
   guid: 'a623d78a-dae9-4915-9caa-0fd34fb3757c',
   author: '巴依老爷',
   comments: null,
   origlink: null,
   image: {},
   source: {},
   categories: [],
   enclosures: [] },
  ... // more items
]
```
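
A minimal usage sketch, assuming `getAllByUrl` is enabled and exported in the version you are running:

```
var spide = require('rssspider');

spide.getAllByUrl('http://www.bigertech.com/rss').then(function(posts){
  posts.forEach(function(post){
    // Each post has the same fields as fetchRss, plus the extracted content.
    console.log(post.title, post.content && post.content.length);
  });
}).catch(function(err){
  console.error(err);
});
```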
## Test

The test suite should pass 100%:

```
nodeunit test/index.js
```

## Upgrade

Added Node 4.x support.

### Any questions?

Contact [shanelau](http://weibo.com/kissliux) or [shanelau1021@gmail.com](mailto:shanelau1021@gmail.com).

--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
/**
 * Created by liuxing on 14-9-22.
 */
module.exports = require('./lib/rss');

--------------------------------------------------------------------------------
/lib/rss.js:
--------------------------------------------------------------------------------
/**
 * Copyright (c) 2014 Meizu bigertech, All rights reserved.
 * http://www.bigertech.com/
 * @author liuxing
 * @date 15/3/16
 * @description Fetch RSS post lists and site info from a feed URL.
 */
var Promise = require('bluebird'),
  FeedParser = require('feedparser'),
  _ = require('lodash'),
  request = require('request'),
  // read = require('node-readability'),
  iconv = require('iconv-lite'),
  es = require('event-stream'),
  postOptions = ['title', 'description', 'summary', 'date', 'link',
    'guid', 'author', 'comments', 'origlink', 'image', 'source', 'categories',
    'enclosures'
  ],
  siteInfoOption = ['title', 'description', 'date', 'link', 'xmlurl', 'author',
    'favicon', 'copyright', 'generator', 'image'
  ];

/**
 * Get all post info from an RSS url.
 * @param url
 * @param options  fields to pick for each post (defaults to postOptions)
 * @returns {Promise}  resolves to an array of posts
 */
function fetchRss(url, options) {
  options = options || postOptions;

  return new Promise(function(resolve, reject) {
    var posts, encoding;
    var req = request(url, {
      timeout: 10000,
      pool: false
    });
    req.setMaxListeners(50);
    // Some feeds do not respond without user-agent and accept headers.
    req.setHeader('user-agent',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
    );
    req.setHeader('accept', 'text/html,application/xhtml+xml');

    var feedparser = new FeedParser();

    req.on('error', reject);

    req.on('response', function(res) {
      var stream = this;
      posts = [];

      if (res.statusCode !== 200) {
        return this.emit('error', new Error('Bad status code'));
      }
    }).pipe(es.through(function(data) {
      // Read the charset from the XML declaration (<?xml ... encoding="..."?>),
      // then convert gb2312, gbk, big5 etc. to utf-8.
      var result = data.toString('utf-8');

      var meta = result.match(/<\?(.*?)\?>/g);
      if (meta !== null) {
        meta = meta[0].toString().match(/encoding="(.*?)"\?>/g);
        encoding = meta && meta.toString().split('"')[1];
      }

      // iconv-lite also supports the Windows charsets.
      try {
        result = iconv.decode(data, encoding);
      } catch (err) {
        result = data.toString('utf-8');
      }
      this.emit('data', result);
    })).pipe(feedparser);

    feedparser.on('error', reject);

    feedparser.on('end', function(err) {
      if (err) {
        return reject(err);
      }
      resolve(posts);
    });

    feedparser.on('readable', function() {
      var post;
      while ((post = this.read())) {
        posts.push(_.pick(post, options));
      }
    });
  });
}

/**
 * Get website info.
 * @param url
 * @param options  fields to pick from the feed meta (defaults to siteInfoOption)
 * @returns {Promise}  resolves to the site info object
 */
function siteInfo(url, options) {
  options = options || siteInfoOption;
  return new Promise(function(resolve, reject) {
    var rss, encoding;
    var req = request(url, {
      timeout: 10000,
      pool: false
    });
    req.setMaxListeners(50);
    // Some feeds do not respond without user-agent and accept headers.
    req.setHeader('user-agent',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
    );
    req.setHeader('accept', 'text/html,application/xhtml+xml');
    var feedparser = new FeedParser();
    req.on('error', reject);
    req.on('response', function(res) {
      var stream = this;
      if (res.statusCode !== 200) {
        return this.emit('error', new Error('Bad status code'));
      }
      //charset = getParams(res.headers['content-type'] || '').charset;
      // stream.pipe(feedparser);
    }).pipe(es.through(function(data) {
      // Read the charset from the XML declaration (<?xml ... encoding="..."?>),
      // then convert gb2312, gbk, big5 etc. to utf-8.
      var result = data.toString('utf-8');

      var meta = result.match(/<\?(.*?)\?>/g);
      if (meta !== null) {
        meta = meta[0].toString().match(/encoding="(.*?)"\?>/g);
        encoding = meta && meta.toString().split('"')[1];
      }

      // iconv-lite also supports the Windows charsets.
      try {
        result = iconv.decode(data, encoding);
      } catch (err) {
        result = data.toString('utf-8');
      }
      this.emit('data', result);
    })).pipe(feedparser);

    feedparser.on('error', reject);
    feedparser.on('end', function(err) {
      if (err) {
        return reject(err);
      }
      resolve(rss);
    });
    feedparser.on('readable', function() {
      var post;
      if ((post = this.read())) {
        rss = _.pick(post.meta, options);
        rss.feedurl = url; // the feed url that was requested
        resolve(rss);
      }
    });
  });
}

// /**
//  * Get every post's body content for a list of posts.
//  * @param posts
//  * @returns {*|Promise}
//  */
// function fetchAllContent(posts) {
//   return Promise.reduce(posts, function(total, post) {
//     return getCleanBody(post.link).then(function(article) {
//       post.content = article.content ? article.content : post.description ||
//         post.summary;
//       return post;
//     });
//   }, []).then(function(total) {
//     return posts;
//   });
// }

/**
 * Get all posts and their clean content from an RSS url.
 * @param url
 * @returns {*}
 */
// function getAllByUrl(url) {
//   return fetchRss(url).then(function(posts) {
//     return fetchAllContent(posts);
//   });
// }

// /**
//  * Get the clean body content of a link.
//  * @param link
//  * @returns {Promise}
//  */
// function getCleanBody(link) {
//   return new Promise(function (resolve, reject) {
//     read(link, function (err, article, meta) {
//       if (err) {
//         return reject(err);
//       }
//       resolve(article);
//     });
//   });
// }

module.exports = {
  fetchRss: fetchRss,
  siteInfo: siteInfo
  //fetchAllContent: fetchAllContent,
  //getCleanBody: getCleanBody,
  //getAllByUrl: getAllByUrl
};

--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "rssspider",
  "version": "1.4.3",
  "author": "Liu Xing (http://kissliux.github.io)",
  "description": "The simplest way to fetch an RSS post list and site info. Fetches each post's content and gives you a clean view.",
  "email": "shanelau1021@gmail.com",
  "homepage": "https://github.com/shanelau/rssSpider",
  "main": "index.js",
  "repository": {},
  "engines": {
    "node": ">=0.12.0"
  },
  "scripts": {
    "test": "nodeunit test/index.js"
  },
  "dependencies": {
    "bluebird": "^3.1.1",
    "event-stream": "^3.3.2",
    "feedparser": "^1.1.4",
    "iconv-lite": "^0.4.13",
    "lodash": "^3.10.1",
    "request": "^2.67.0"
  },
  "devDependencies": {
    "nodeunit": "^0.9.1"
  },
  "license": "BSD"
}

--------------------------------------------------------------------------------
/test/index.js:
--------------------------------------------------------------------------------
/**
 * Created by liuxing on 14-10-9.
 */
var spide = require('../index'),
  url = 'http://news.baidu.com/ns?word=%F7%C8%CD%E6%B0%EF&tn=newsrss&from=news&cl=2&rn=20&ct=1';

exports.fetchRSS = function(test) {
  spide.fetchRss(url).then(function(data) {
    test.ok(data.length > 0, "this assertion should pass");
    test.done();
  }).catch(test.done);
};

exports.fetchSiteInfo = function(test) {
  spide.siteInfo(url).then(function(data) {
    test.ok(data.title !== '', "this assertion should pass");
    test.done();
  }).catch(function(err) {
    console.error(err);
    // Finish the test with the error instead of letting nodeunit time out.
    test.done(err);
  });
};

--------------------------------------------------------------------------------