├── .gitignore ├── .npmignore ├── LICENSE ├── README.md ├── doc ├── API.md ├── CONFIG.md └── EXAMPLE.md ├── example ├── parser │ ├── parser-baoliao5.js │ ├── parser-healthno1.js │ └── parser-qiushibaike.js └── test │ ├── baoliao5.js │ ├── healthno1.js │ └── qiushibaike.js ├── index.js ├── lib ├── crawler.js ├── helper.js └── parser.js ├── package-lock.json └── package.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | 8 | # Runtime data 9 | pids 10 | *.pid 11 | *.seed 12 | *.pid.lock 13 | 14 | # Directory for instrumented libs generated by jscoverage/JSCover 15 | lib-cov 16 | 17 | # Coverage directory used by tools like istanbul 18 | coverage 19 | 20 | # nyc test coverage 21 | .nyc_output 22 | 23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 24 | .grunt 25 | 26 | # Bower dependency directory (https://bower.io/) 27 | bower_components 28 | 29 | # node-waf configuration 30 | .lock-wscript 31 | 32 | # Compiled binary addons (http://nodejs.org/api/addons.html) 33 | build/Release 34 | 35 | # Dependency directories 36 | node_modules/ 37 | jspm_packages/ 38 | 39 | # Typescript v1 declaration files 40 | typings/ 41 | 42 | # Optional npm cache directory 43 | .npm 44 | 45 | # Optional eslint cache 46 | .eslintcache 47 | 48 | # Optional REPL history 49 | .node_repl_history 50 | 51 | # Output of 'npm pack' 52 | *.tgz 53 | 54 | # Yarn Integrity file 55 | .yarn-integrity 56 | 57 | # dotenv environment variables file 58 | .env 59 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | doc 2 | example 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 
MIT License 2 | 3 | Copyright (c) 2017 coolfish 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 页面爬虫解析器核心 2 | 此工具适用于 3 | 1. 对单独页面链接进行解析 4 | 2. 
配合队列进行多页面解析 5 | 6 | 7 | ## 解释说明 8 | 支持详情页下一页抓取,支持繁体转换,支持对字数统计,支持对图片数量统计。 9 | 目前主要针对静态页的解析,对json请求和jsonp请求的解析做了预留(暂不支持)。 10 | 11 | [![NPM](https://nodei.co/npm/almighty-parser-core.png?downloads=true&downloadRank=true&stars=true)](https://nodei.co/npm/almighty-parser-core/) 12 | 13 | [![npm](https://img.shields.io/npm/v/almighty-parser-core.svg)]() 14 | [![npm](https://img.shields.io/npm/dm/almighty-parser-core.svg)]() 15 | [![license](https://img.shields.io/github/license/coolfishstudio/almighty-parser-core.svg)]() 16 | 17 | ## 安装 18 | ``` 19 | npm i --save almighty-parser-core 20 | ``` 21 | 22 | ## api接口 23 | - [x] `getLinks` 获取待抓页链接 24 | - [x] `getContent` 获取详情页内容 25 | - [x] `parse` 解析获取内容[为`getLinks`与`getContent`的集合] 26 | - [x] `isArticleUrl` 检测链接是否是详情页 27 | - [x] `isListUrl` 检测链接是否是列表页 28 | - [x] `getIdFromArticleUrl` 获取页面链接的唯一标示 29 | 30 | ## 配置参数 31 | [文档说明](https://github.com/coolfishstudio/almighty-parser-core/blob/master/doc/CONFIG.md) 32 | 33 | ## 实例 34 | ### 解析器案例 35 | [糗事百科 - 基础](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-qiushibaike.js) 36 | [今日健康 - 繁体](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-healthno1.js) 37 | [爆料网 - 详情下一页](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-baoliao5.js) 38 | 39 | ### 定义网站规则 40 | ``` 41 | module.exports = { 42 | // 域名 网站域名,设置域名后只处理这些域名下的网页 43 | domains: 'https://www.qiushibaike.com/', 44 | // 列表页url的正则,符合这些正则的页面会被当作列表页处理 45 | listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/], 46 | // 内容页url的正则,符合这些正则的页面会被当作内容页处理 47 | contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/], 48 | // 从内容页中抽取需要的数据 49 | fields: [{ 50 | // 作者 51 | name: 'author', 52 | meta: { 53 | selector: ['.author h2'], 54 | format: 'text' 55 | } 56 | }, { 57 | // 标签 58 | name: 'tags', 59 | meta: { 60 | format: 'text', 61 | selector: ['.source a'], 62 | index: 0 63 | } 
64 | }, { 65 | // 网页关键字 66 | name: 'keywords', 67 | meta: { 68 | format: 'meta', 69 | selector: ['meta[name="keywords"]'] 70 | } 71 | }, { 72 | // 网页描述 73 | name: 'description', 74 | meta: { 75 | format: 'meta', 76 | selector: ['meta[name="description"]'] 77 | } 78 | }, { 79 | // 详情 80 | name: 'content', 81 | meta: { 82 | selector: ['.content', '.thumb'], 83 | format: 'html' 84 | }, 85 | required: true 86 | }, { 87 | name: 'imagesCount', 88 | meta: { 89 | selector: ['.thumb'], 90 | format: 'count', 91 | countType: 'image' 92 | }, 93 | defaultValue: 0 94 | }, { 95 | name: 'wordsCount', 96 | meta: { 97 | selector: ['.content'], 98 | format: 'count', 99 | countType: 'text' 100 | }, 101 | defaultValue: 0 102 | }, { 103 | name: 'comments', 104 | meta: { 105 | selector: ['.stats-comments .number'], 106 | format: 'text' 107 | }, 108 | defaultValue: 0 109 | }, { 110 | name: 'likes', 111 | meta: { 112 | selector: ['.stats-vote .number'], 113 | format: 'text' 114 | }, 115 | defaultValue: 0 116 | }], 117 | // 是否模拟用户请求 118 | userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', 119 | // 编码 默认utf-8 120 | charset: null, 121 | // 回调函数 对所有数据做处理 122 | afterExtractAll: function (data) { 123 | data.fields['hits'] = 0; 124 | return data; 125 | }, 126 | afterExtractField: function (fieldsName, data) { 127 | if (fieldsName === 'tags') { 128 | data = data ? 
data.split(',') : []; 129 | } 130 | if (fieldsName === 'comments') { 131 | data = +data; 132 | } 133 | if (fieldsName === 'likes') { 134 | data = +data; 135 | } 136 | return data; 137 | } 138 | }; 139 | ``` 140 | 141 | ### 引入 142 | ``` 143 | const Crawler = require('almighty-parser-core') 144 | const options = require('../parser/parser-qiushibaike.js') 145 | const parser = new Crawler(options) 146 | ``` 147 | 148 | ### API测试 149 | [测试案例](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/test/qiushibaike.js) 150 | #### parse 151 | ``` 152 | { fields: 153 | { author: '草莓、牛奶巧克力', 154 | tags: [ '搞笑图片' ], 155 | keywords: '', 156 | description: '笑死我了', 157 | content: '
\n\n笑死我了\n\n
\n\n糗事#119095438\n\n
', 158 | imagesCount: 1, 159 | wordsCount: 4, 160 | comments: 0, 161 | likes: 457, 162 | from: 'https://www.qiushibaike.com/article/119095438', 163 | sourceId: 'com.qiushibaike.www-article-119095438', 164 | site: 'www.qiushibaike.com', 165 | hits: 0 }, 166 | urls: 167 | [ 'https://www.qiushibaike.com/', 168 | 'https://www.qiushibaike.com/hot/', 169 | 'https://www.qiushibaike.com/imgrank/', 170 | 'https://www.qiushibaike.com/text/', 171 | 'https://www.qiushibaike.com/history/', 172 | 'https://www.qiushibaike.com/pic/', 173 | 'https://www.qiushibaike.com/textnew/', 174 | 'https://www.qiushibaike.com/my', 175 | 'https://www.qiushibaike.com/article/116423562', 176 | 'https://www.qiushibaike.com/article/116424718', 177 | 'https://www.qiushibaike.com/article/116421669', 178 | 'https://www.qiushibaike.com/article/116423344', 179 | 'https://www.qiushibaike.com/article/116426229', 180 | 'https://www.qiushibaike.com/article/116423107', 181 | 'https://www.qiushibaike.com/article/104614784', 182 | 'https://www.qiushibaike.com/article/104590828', 183 | 'https://www.qiushibaike.com/article/104629666', 184 | 'https://www.qiushibaike.com/article/104599846', 185 | 'https://www.qiushibaike.com/article/104598154', 186 | 'https://www.qiushibaike.com/article/104619022', 187 | 'https://www.qiushibaike.com/article/118954381', 188 | 'https://www.qiushibaike.com/article/118491926', 189 | 'https://www.qiushibaike.com/article/118563113', 190 | 'https://www.qiushibaike.com/article/118806836', 191 | 'https://www.qiushibaike.com/article/118525804', 192 | 'https://www.qiushibaike.com/article/118770803', 193 | 'https://www.qiushibaike.com/article/119008939', 194 | 'https://www.qiushibaike.com/article/119033005', 195 | 'https://www.qiushibaike.com/article/119036209', 196 | 'https://www.qiushibaike.com/article/118922421', 197 | 'https://www.qiushibaike.com/article/119014594', 198 | 'https://www.qiushibaike.com/article/119009873', 199 | 'https://www.qiushibaike.com/article/118934286', 200 | 
 'https://www.qiushibaike.com/joke/', 201 | 'https://www.qiushibaike.com/article/' ] } 202 | ``` 203 | 204 | 其余接口测试请下载后运行 205 | ``` 206 | npm run test:qiushibaike 207 | ``` 208 | 209 | ## License 210 | 211 | [MIT License](https://opensource.org/licenses/MIT) 212 | -------------------------------------------------------------------------------- /doc/API.md: -------------------------------------------------------------------------------- 1 | ## API 文档 2 | 3 | - [x] `getLinks` 获取待抓页链接 4 | - [x] `getContent` 获取详情页内容 5 | - [x] `parse` 解析获取内容[为`getLinks`与`getContent`的集合] 6 | - [x] `isArticleUrl` 检测链接是否是详情页 7 | - [x] `isListUrl` 检测链接是否是列表页 8 | - [x] `getIdFromArticleUrl` 获取页面链接的唯一标识 9 | -------------------------------------------------------------------------------- /doc/CONFIG.md: -------------------------------------------------------------------------------- 1 | ## 配置参数 2 | 3 | 针对不同的网站,需要定义各自的配置 4 | 5 | 注意 目前只支持html静态页的内容抓取 6 | 7 | 配置 | 描述 | 是否必填 | 类型 8 | ------------- | ------------- | ------------- | ------------- 9 | domains | 网站域名 | 必填 | 字符串 10 | listUrlRegexes | 列表页url的正则,符合这些正则的页面会被当作列表页处理 | 必填 | 数组 11 | contentUrlRegexes | 内容页url的正则,符合这些正则的页面会被当作内容页处理 | 必填 | 数组 12 | fields | 从内容页中抽取需要的数据 | 必填 | fields示例 13 | userAgent | 模拟用户请求时使用的 User-Agent | 选填 | 字符串 14 | charset | 编码 默认utf-8 | 选填 | 字符串 15 | afterExtractField | 回调函数 对每一个抽取出来的数据进行处理 | 选填 | 方法 16 | afterExtractAll | 回调函数 对所有抽取出来的数据进行处理 | 选填 | 方法 17 | contentPage | 对详情页下一页内容处理 | 选填 | contentPage示例 18 | 19 | ## fields示例 20 | 字段 | 描述 | 类型 21 | ------------- | ------------- | ------------- 22 | name | 定义字段名字 | 字符串 必填 23 | meta | 选择器 | meta示例 必填 24 | defaultValue | 默认值 | 任意 选填 25 | 26 | ### meta示例 27 | 字段 | 描述 | 类型 28 | selector | 选择器(支持多个拼接) | 数组 必填 29 | format | 返回内容的格式[text/html/meta/count 默认text] | 字符串 选填 30 | index | 下标 | 数字 选填 31 | 32 | ## contentPage示例 33 | 字段 | 描述 | 类型 34 | ------------- | ------------- | ------------- 35 | urls | 下一页的正则 | 数组 必填 36 | selector | 选择器 | 数组 必填 37 | appendNode | 插入的位置 | 任意 必填 
-------------------------------------------------------------------------------- /doc/EXAMPLE.md: -------------------------------------------------------------------------------- 1 | ## 实例 2 | ### 定义网站规则 3 | ``` 4 | module.exports = { 5 | // 域名 网站域名,设置域名后只处理这些域名下的网页 6 | domains: 'https://www.qiushibaike.com/', 7 | // 列表页url的正则,符合这些正则的页面会被当作列表页处理 8 | listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/], 9 | // 内容页url的正则,符合这些正则的页面会被当作内容页处理 10 | contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/], 11 | // 从内容页中抽取需要的数据 12 | fields: [{ 13 | // 作者 14 | name: 'author', 15 | meta: { 16 | selector: ['.author h2'], 17 | format: 'text' 18 | } 19 | }, { 20 | // 标签 21 | name: 'tags', 22 | meta: { 23 | format: 'text', 24 | selector: ['.source a'], 25 | index: 0 26 | } 27 | }, { 28 | // 网页关键字 29 | name: 'keywords', 30 | meta: { 31 | format: 'meta', 32 | selector: ['meta[name="keywords"]'] 33 | } 34 | }, { 35 | // 网页描述 36 | name: 'description', 37 | meta: { 38 | format: 'meta', 39 | selector: ['meta[name="description"]'] 40 | } 41 | }, { 42 | // 详情 43 | name: 'content', 44 | meta: { 45 | selector: ['.content', '.thumb'], 46 | format: 'html' 47 | }, 48 | required: true 49 | }, { 50 | name: 'imagesCount', 51 | meta: { 52 | selector: ['.thumb'], 53 | format: 'count', 54 | countType: 'image' 55 | }, 56 | defaultValue: 0 57 | }, { 58 | name: 'wordsCount', 59 | meta: { 60 | selector: ['.content'], 61 | format: 'count', 62 | countType: 'text' 63 | }, 64 | defaultValue: 0 65 | }, { 66 | name: 'comments', 67 | meta: { 68 | selector: ['.stats-comments .number'], 69 | format: 'text' 70 | }, 71 | defaultValue: 0 72 | }, { 73 | name: 'likes', 74 | meta: { 75 | selector: ['.stats-vote .number'], 76 | format: 'text' 77 | }, 78 | defaultValue: 0 79 | }], 80 | // 是否模拟用户请求 81 | userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', 82 | // 编码 默认utf-8 83 | 
charset: null, 84 | // 回调函数 对所有数据做处理 85 | afterExtractAll: function (data) { 86 | data.fields['hits'] = 0; 87 | return data; 88 | }, 89 | afterExtractField: function (fieldsName, data) { 90 | if (fieldsName === 'tags') { 91 | data = data ? data.split(',') : []; 92 | } 93 | if (fieldsName === 'comments') { 94 | data = +data; 95 | } 96 | if (fieldsName === 'likes') { 97 | data = +data; 98 | } 99 | return data; 100 | } 101 | }; 102 | ``` 103 | 104 | ### 引入 105 | ``` 106 | const Crawler = require('almighty-parser-core') 107 | const options = require('../parser/parser-qiushibaike.js') 108 | const parser = new Crawler(options) 109 | ``` 110 | 111 | ### API测试 112 | #### parse 113 | ``` 114 | { fields: 115 | { author: '草莓、牛奶巧克力', 116 | tags: [ '搞笑图片' ], 117 | keywords: '', 118 | description: '笑死我了', 119 | content: '
\n\n笑死我了\n\n
\n\n糗事#119095438\n\n
', 120 | imagesCount: 1, 121 | wordsCount: 4, 122 | comments: 0, 123 | likes: 457, 124 | from: 'https://www.qiushibaike.com/article/119095438', 125 | sourceId: 'com.qiushibaike.www-article-119095438', 126 | site: 'www.qiushibaike.com', 127 | hits: 0 }, 128 | urls: 129 | [ 'https://www.qiushibaike.com/', 130 | 'https://www.qiushibaike.com/hot/', 131 | 'https://www.qiushibaike.com/imgrank/', 132 | 'https://www.qiushibaike.com/text/', 133 | 'https://www.qiushibaike.com/history/', 134 | 'https://www.qiushibaike.com/pic/', 135 | 'https://www.qiushibaike.com/textnew/', 136 | 'https://www.qiushibaike.com/my', 137 | 'https://www.qiushibaike.com/article/116423562', 138 | 'https://www.qiushibaike.com/article/116424718', 139 | 'https://www.qiushibaike.com/article/116421669', 140 | 'https://www.qiushibaike.com/article/116423344', 141 | 'https://www.qiushibaike.com/article/116426229', 142 | 'https://www.qiushibaike.com/article/116423107', 143 | 'https://www.qiushibaike.com/article/104614784', 144 | 'https://www.qiushibaike.com/article/104590828', 145 | 'https://www.qiushibaike.com/article/104629666', 146 | 'https://www.qiushibaike.com/article/104599846', 147 | 'https://www.qiushibaike.com/article/104598154', 148 | 'https://www.qiushibaike.com/article/104619022', 149 | 'https://www.qiushibaike.com/article/118954381', 150 | 'https://www.qiushibaike.com/article/118491926', 151 | 'https://www.qiushibaike.com/article/118563113', 152 | 'https://www.qiushibaike.com/article/118806836', 153 | 'https://www.qiushibaike.com/article/118525804', 154 | 'https://www.qiushibaike.com/article/118770803', 155 | 'https://www.qiushibaike.com/article/119008939', 156 | 'https://www.qiushibaike.com/article/119033005', 157 | 'https://www.qiushibaike.com/article/119036209', 158 | 'https://www.qiushibaike.com/article/118922421', 159 | 'https://www.qiushibaike.com/article/119014594', 160 | 'https://www.qiushibaike.com/article/119009873', 161 | 'https://www.qiushibaike.com/article/118934286', 162 | 
'https://www.qiushibaike.com/joke/', 163 | 'https://www.qiushibaike.com/article/' ] } 164 | ``` 165 | 166 | 其余接口测试请下载后运行 167 | ``` 168 | npm run test:qiushibaike 169 | ``` 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /example/parser/parser-baoliao5.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | /** 3 | * 爆料网 4 | * http://www.baoliao5.com/ 5 | */ 6 | module.exports = { 7 | // 域名 网站域名,设置域名后只处理这些域名下的网页 8 | domains: 'http://www.baoliao5.com/', 9 | // 列表页url的正则,符合这些正则的页面会被当作列表页处理 10 | listUrlRegexes: [/http:\/\/www\.baoliao5\.com\/((?!meitu)[a-z]+\/?)*$/, /http:\/\/www\.baoliao5\.com\/((?!meitu)[a-z]+\/?)+\/list[0-9_]+\.html*$/], 11 | // 内容页url的正则,符合这些正则的页面会被当作内容页处理 12 | contentUrlRegexes: [/http:\/\/www\.baoliao5\.com\/(?!meitu)[a-z]+\/[0-9]+\/[0-9]+\.html/], 13 | // 从内容页中抽取需要的数据 14 | fields: [{ 15 | // 标题 16 | name: 'title', 17 | meta: { 18 | // 默认 type 为 jquery/text/xpath 19 | selector: ['.t4Btit'], 20 | format: 'text' 21 | }, 22 | required: true 23 | }, { 24 | // 详情 25 | name: 'content', 26 | meta: { 27 | selector: ['#icontent'], 28 | format: 'html' 29 | }, 30 | required: true 31 | }, { 32 | // 作者 33 | name: 'author', 34 | meta: { 35 | selector: ['.t4Bexp'], 36 | format: 'text' 37 | } 38 | }, { 39 | // 标签 40 | name: 'tags', 41 | meta: { 42 | format: 'text', 43 | selector: ['.itj_lt .lc a'], 44 | index: 1 45 | } 46 | }, { 47 | // 网页关键字 48 | name: 'keywords', 49 | meta: { 50 | format: 'meta', 51 | selector: ['meta[name="keywords"]'] 52 | } 53 | }, { 54 | // 网页描述 55 | name: 'description', 56 | meta: { 57 | format: 'meta', 58 | selector: ['meta[name="description"]'] 59 | } 60 | }, { 61 | name: 'imagesCount', 62 | meta: { 63 | selector: ['#icontent'], 64 | format: 'count', 65 | countType: 'image' 66 | }, 67 | defaultValue: 0 68 | }, { 69 | name: 'wordsCount', 70 | meta: { 71 | selector: ['#icontent'], 72 | format: 'count', 73 | countType: 
'text' 74 | }, 75 | defaultValue: 0 76 | }, { 77 | name: 'publishedAt', 78 | meta: { 79 | format: 'text', 80 | selector: ['.t4Bexp'] 81 | } 82 | }], 83 | // 内容下一页 84 | contentPage: { 85 | urls: [/http:\/\/www\.baoliao5\.com\/(?!meitu)[a-z]+\/[0-9]+\/[0-9]+_[0-9]+\.html/], 86 | selector: ['#icontent'], 87 | appendNode: '#icontent' 88 | }, 89 | // 是否模拟用户请求 90 | userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', 91 | // 编码 默认utf-8 92 | charset: 'gb2312', 93 | // 回调函数 对所有数据做处理 94 | afterExtractAll: function (data) { 95 | data.fields['comments'] = 0; 96 | data.fields['hits'] = 0; 97 | data.fields['likes'] = 0; 98 | return data; 99 | }, 100 | afterExtractField: function (fieldsName, data) { 101 | if (fieldsName === 'author') { 102 | data = data.trim() 103 | if (data.indexOf('编辑:') >= 0) { 104 | var arr = data.split('编辑:'); 105 | data = arr[arr.length - 1]; 106 | } else { 107 | data = ''; 108 | } 109 | } 110 | if (fieldsName === 'publishedAt') { 111 | data = new Date(data.replace(/[^0-9\-\: ]+/img, '')).getTime() || new Date().getTime(); 112 | } 113 | if (fieldsName === 'tags') { 114 | data = (data !== '') ? 
[data] : []; 115 | } 116 | return data; 117 | } 118 | }; 119 | -------------------------------------------------------------------------------- /example/parser/parser-healthno1.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | /** 3 | * healthNo1 4 | * http://www.healthno1.com/ 5 | */ 6 | module.exports = { 7 | // 域名 网站域名,设置域名后只处理这些域名下的网页 8 | domains: 'http://www.healthno1.com/', 9 | // 列表页url的正则,符合这些正则的页面会被当作列表页处理 10 | listUrlRegexes: [/^http:\/\/www\.healthno1\.com(\/[a-z_]+(\.html)?)*(\/)?(\?start=[0-9]+)?$/], 11 | // 内容页url的正则,符合这些正则的页面会被当作内容页处理 12 | contentUrlRegexes: [/^http:\/\/www\.healthno1\.com\/([a-z_]+\/)*[0-9-]+\.html$/], 13 | // 从内容页中抽取需要的数据 14 | fields: [{ 15 | // 标题 16 | name: 'title', 17 | meta: { 18 | // 默认 type 为 jquery/text/xpath 19 | selector: ['#gkContentWrap .item-page header h1'], 20 | format: 'text' 21 | }, 22 | required: true 23 | }, { 24 | // 详情 25 | name: 'content', 26 | meta: { 27 | selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap .item-page .itemBody p'], 28 | format: 'html' 29 | }, 30 | required: true 31 | }, { 32 | // 作者 33 | name: 'author', 34 | meta: { 35 | format: 'meta', 36 | selector: ['meta[name="author"]'] 37 | } 38 | }, { 39 | // 标签 40 | name: 'tags', 41 | meta: { 42 | format: 'text', 43 | selector: ['.category-name a'], 44 | index: 0 45 | } 46 | }, { 47 | // 网页关键字 48 | name: 'keywords', 49 | meta: { 50 | format: 'meta', 51 | selector: ['meta[name="keywords"]'] 52 | } 53 | }, { 54 | // 网页描述 55 | name: 'description', 56 | meta: { 57 | format: 'meta', 58 | selector: ['meta[name="description"]'] 59 | } 60 | }, { 61 | name: 'imagesCount', 62 | meta: { 63 | selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap .item-page .itemBody p'], 64 | format: 'count', 65 | countType: 'image' 66 | }, 67 | defaultValue: 0 68 | }, { 69 | name: 'wordsCount', 70 | meta: { 71 | selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap 
.item-page .itemBody p'], 72 | format: 'count', 73 | countType: 'text' 74 | }, 75 | defaultValue: 0 76 | }, { 77 | name: 'publishedAt', 78 | meta: { 79 | format: 'text', 80 | selector: ['.created time'] 81 | } 82 | }, { 83 | name: 'hits', 84 | meta: { 85 | format: 'text', 86 | selector: ['.hits'], 87 | index: 0 88 | }, 89 | defaultValue: 0 90 | }], 91 | // 是否模拟用户请求 92 | userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', 93 | // 编码 默认utf-8 94 | charset: null, 95 | // 语言格式 96 | i18n: 'tw2s', 97 | // 回调函数 对所有数据做处理 98 | afterExtractAll: function (data) { 99 | data.fields['comments'] = 0; 100 | data.fields['likes'] = 0; 101 | return data; 102 | }, 103 | afterExtractField: function (fieldsName, data) { 104 | if (fieldsName === 'publishedAt') { 105 | data = new Date(data.replace(/[^0-9\- \:]+/img, '')).getTime() || new Date().getTime(); 106 | } 107 | if (fieldsName === 'tags') { 108 | data = (data !== '') ? [data] : []; 109 | } 110 | if (fieldsName === 'title') { 111 | data = data.trim(); 112 | } 113 | if (fieldsName === 'hits') { 114 | data = data.replace(/[^0-9]+/img, '') || 0; 115 | } 116 | return data; 117 | } 118 | }; 119 | -------------------------------------------------------------------------------- /example/parser/parser-qiushibaike.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | /** 3 | * 糗事百科 4 | * https://www.qiushibaike.com/ 5 | */ 6 | module.exports = { 7 | // 域名 网站域名,设置域名后只处理这些域名下的网页 8 | domains: 'https://www.qiushibaike.com/', 9 | // 列表页url的正则,符合这些正则的页面会被当作列表页处理 10 | listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/], 11 | // 内容页url的正则,符合这些正则的页面会被当作内容页处理 12 | contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/], 13 | // 从内容页中抽取需要的数据 14 | fields: [{ 15 | // 作者 16 | name: 'author', 17 | meta: { 18 | selector: ['.author h2'], 19 | format: 'text' 20 | } 21 | 
}, { 22 | // 标签 23 | name: 'tags', 24 | meta: { 25 | format: 'text', 26 | selector: ['.source a'], 27 | index: 0 28 | } 29 | }, { 30 | // 网页关键字 31 | name: 'keywords', 32 | meta: { 33 | format: 'meta', 34 | selector: ['meta[name="keywords"]'] 35 | } 36 | }, { 37 | // 网页描述 38 | name: 'description', 39 | meta: { 40 | format: 'meta', 41 | selector: ['meta[name="description"]'] 42 | } 43 | }, { 44 | // 详情 45 | name: 'content', 46 | meta: { 47 | selector: ['.content', '.thumb'], 48 | format: 'html' 49 | }, 50 | required: true 51 | }, { 52 | name: 'imagesCount', 53 | meta: { 54 | selector: ['.thumb'], 55 | format: 'count', 56 | countType: 'image' 57 | }, 58 | defaultValue: 0 59 | }, { 60 | name: 'wordsCount', 61 | meta: { 62 | selector: ['.content'], 63 | format: 'count', 64 | countType: 'text' 65 | }, 66 | defaultValue: 0 67 | }, { 68 | name: 'comments', 69 | meta: { 70 | selector: ['.stats-comments .number'], 71 | format: 'text' 72 | }, 73 | defaultValue: 0 74 | }, { 75 | name: 'likes', 76 | meta: { 77 | selector: ['.stats-vote .number'], 78 | format: 'text' 79 | }, 80 | defaultValue: 0 81 | }], 82 | // 是否模拟用户请求 83 | userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', 84 | // 编码 默认utf-8 85 | charset: null, 86 | // 回调函数 对所有数据做处理 87 | afterExtractAll: function (data) { 88 | data.fields['hits'] = 0; 89 | return data; 90 | }, 91 | afterExtractField: function (fieldsName, data) { 92 | if (fieldsName === 'tags') { 93 | data = data ? 
data.split(',') : []; 94 | } 95 | if (fieldsName === 'comments') { 96 | data = +data; 97 | } 98 | if (fieldsName === 'likes') { 99 | data = +data; 100 | } 101 | return data; 102 | } 103 | }; 104 | -------------------------------------------------------------------------------- /example/test/baoliao5.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const Crawler = require('../../index.js') 3 | const options = require('../parser/parser-baoliao5.js') 4 | const parser = new Crawler(options) 5 | 6 | // const url = 'http://www.baoliao5.com/' 7 | const url = 'http://www.baoliao5.com/yule/201701/1867.html' 8 | // const url = 'http://www.baoliao5.com/yingshi/201701/1835.html' 9 | // const url = 'http://www.baoliao5.com/yingshi/' 10 | // const url = 'http://www.baoliao5.com/yingshi/list_7_11.html' 11 | // const url = 'http://www.baoliao5.com/meitu/201701/1848.html' 12 | // const url = 'http://www.baoliao5.com/yule/neidi/' 13 | 14 | let errorItems = [] 15 | 16 | // 测试获取内容 17 | async function testParseDate () { 18 | try { 19 | const result = await parser.parse(url) 20 | console.log('获取数据内容为', result) 21 | } catch (e) { 22 | console.error('[抓取数据出错]', e.message) 23 | errorItems.push('testParseDate') 24 | } 25 | } 26 | // 检测链接是否是详情页 27 | function testIsArticleUrl () { 28 | try { 29 | const result = parser.isArticleUrl(url) 30 | console.log('获取数据内容为', result) 31 | } catch (e) { 32 | console.error('[抓取数据出错]', e.message) 33 | errorItems.push('testIsArticleUrl') 34 | } 35 | } 36 | // 测试页面链接的唯一标示 37 | function testGetIdFromArticleUrl () { 38 | try { 39 | const result = parser.getIdFromArticleUrl(url) 40 | console.log('获取数据内容为', result) 41 | } catch (e) { 42 | console.error('[抓取数据出错]', e.message) 43 | errorItems.push('testGetIdFromArticleUrl') 44 | } 45 | } 46 | 47 | // 获取详情页内容 48 | async function testGetContent () { 49 | try { 50 | const result = await parser.getContent(url) 51 | console.log('获取数据内容为', result) 52 | } catch (e) 
// Test entry point: runs each API check in order, then prints a summary.
async function start () {
  const divider = '------'
  // [step title, check to run]; a null check only prints the step banner
  const steps = [
    ['测试步骤1 获取内容', testParseDate],
    ['测试步骤2 校验链接是否为详情页', testIsArticleUrl],
    ['测试步骤3 获取页面链接的唯一标示', testGetIdFromArticleUrl],
    ['测试步骤4 获取详情页内容', null], // testGetContent is currently disabled
    ['测试步骤5 获取列表页内容', testGetLinks]
  ]

  console.log('测试开始')
  for (const [title, check] of steps) {
    console.log(divider)
    console.log(title)
    if (check) {
      // Awaiting a synchronous check is harmless; async checks must finish
      // before the closing banner is printed
      await check()
    }
    console.log(`${title} 结束`)
  }
  console.log(divider)
  console.log('所有接口均已测试结束')

  // errorItems holds the names of the checks that failed
  if (errorItems.length === 0) {
    console.log('测试结果: 所有接口都正常。')
    return
  }
  console.log('测试结果: ', errorItems.join(','), '异常。')
}
// Test entry point: runs each API check in order, then prints a summary.
async function start () {
  const divider = '------'
  // [step title, check to run]; a null check only prints the step banner
  const steps = [
    ['测试步骤1 获取内容', testParseDate],
    ['测试步骤2 校验链接是否为详情页', testIsArticleUrl],
    ['测试步骤3 获取页面链接的唯一标示', testGetIdFromArticleUrl],
    ['测试步骤4 获取详情页内容', null], // testGetContent is currently disabled
    ['测试步骤5 获取列表页内容', null] // testGetLinks is currently disabled
  ]

  console.log('测试开始')
  for (const [title, check] of steps) {
    console.log(divider)
    console.log(title)
    if (check) {
      // Awaiting a synchronous check is harmless; async checks must finish
      // before the closing banner is printed
      await check()
    }
    console.log(`${title} 结束`)
  }
  console.log(divider)
  console.log('所有接口均已测试结束')

  // errorItems holds the names of the checks that failed
  if (errorItems.length === 0) {
    console.log('测试结果: 所有接口都正常。')
    return
  }
  console.log('测试结果: ', errorItems.join(','), '异常。')
}
/**
 * Test entry point for the qiushibaike example.
 *
 * Walks through the six checks in order, wrapping each one in a start/end
 * banner, and finishes with a summary derived from `errorItems`.
 * Steps 5 and 6 print banners only: testGetContent and testGetLinks are
 * disabled in this script.
 */
async function start () {
  // Prints the separator and the step title, runs the check (if any), then
  // prints the matching "end" banner.
  const runStep = async (title, check) => {
    console.log('------')
    console.log(title)
    if (check) {
      await check()
    }
    console.log(title + ' 结束')
  }

  console.log('测试开始')
  await runStep('测试步骤1 获取内容', testParseDate)
  await runStep('测试步骤2 校验链接是否为详情页', testIsArticleUrl)
  await runStep('测试步骤3 校验链接是否为列表页', testIsListUrl)
  await runStep('测试步骤4 获取页面链接的唯一标示', testGetIdFromArticleUrl)
  await runStep('测试步骤5 获取详情页内容', null) // testGetContent disabled
  await runStep('测试步骤6 获取列表页内容', null) // testGetLinks disabled
  console.log('------')
  console.log('所有接口均已测试结束')

  if (errorItems.length === 0) {
    console.log('测试结果: 所有接口都正常。')
  } else {
    console.log('测试结果: ', errorItems.join(','), '异常。')
  }
}
/**
 * Crawler: holds one site's parsing configuration and exposes the parsing API.
 *
 * @param {Object} options Site configuration. At least one of `domains`,
 *   `listUrlRegexes`, `contentUrlRegexes` or `fields` must be present.
 * @throws {Error} 'options is invalid data format.' when none of them is.
 */
function Crawler (options) {
  options = options || {};
  var coreKeys = ['domains', 'listUrlRegexes', 'contentUrlRegexes', 'fields'];
  var hasCoreKey = coreKeys.some(function (key) {
    return key in options;
  });
  if (!hasCoreKey) {
    throw new Error('options is invalid data format.');
  }
  this._init(options);
}

/**
 * Copies the configuration onto the instance, applying defaults.
 *
 * The caller's `options` object is left untouched, and a missing `domains`
 * falls back to '' instead of being handed to `helper.formatUrl`.
 *
 * @param {Object} options Configuration passed to the constructor.
 */
Crawler.prototype._init = function (options) {
  // Core
  this.domains = options.domains ? helper.formatUrl(options.domains) : ''; // site home page
  this.listUrlRegexes = options.listUrlRegexes || []; // regexes matching list-page urls
  this.contentUrlRegexes = options.contentUrlRegexes || []; // regexes matching content-page urls
  this.fields = options.fields || []; // data to extract from a content page
  this.contentPage = options.contentPage || null; // "next page" handling
  this.sourceId = options.sourceId || [2, 5, 4]; // url regex groups that make up the unique id
  // Request settings
  this.userAgent = options.userAgent || null; // User-Agent header to send
  this.charset = options.charset || null; // page encoding
  this.format = options.format || 'html'; // response format: html | json | jsonp
  this.i18n = options.i18n || null; // conversion: s2t | t2s | s2tw | tw2s | s2hk | hk2s | t2tw | t2hk
  // Hooks
  this.afterExtractField = options.afterExtractField || null; // post-process each extracted field
  this.afterExtractAll = options.afterExtractAll || null; // post-process the complete result
  this.afterExtractUrls = options.afterExtractUrls || null; // post-process the extracted urls
  this.attachFields = options.attachFields || null; // extra data fetched from another url
};

/**
 * Classifies a url against the configured regexes.
 *
 * @param {string} url Url to classify.
 * @param {string} [type] Optional 'list' or 'post'.
 * @returns {string|boolean} Without `type`: 'post', 'list' or '' ('post' wins
 *   when both kinds match). With `type`: whether the url is of that type.
 */
Crawler.prototype._judge = function (url, type) {
  var matchesAny = function (regexes) {
    return regexes.some(function (urlRegex) {
      // Global/sticky regexes remember where the last match ended; reset so
      // the same url always yields the same answer.
      urlRegex.lastIndex = 0;
      return urlRegex.test(url);
    });
  };

  var result = '';
  if ((!type || type === 'list') && matchesAny(this.listUrlRegexes)) {
    result = 'list';
  }
  if ((!type || type === 'post') && matchesAny(this.contentUrlRegexes)) {
    result = 'post';
  }
  return type ? result === type : result;
};
/**
 * Fetches a page and, when it is a content ("post") page, extracts its fields.
 *
 * Steps, run in sequence:
 *   1. request the page body;
 *   2. if `contentPage` is configured, let the parser merge follow-up pages;
 *   3. if `attachFields.url` is configured, request it and extract the
 *      attached fields;
 *   4. extract the configured fields, add `from`, `sourceId` and `site`, merge
 *      the attached fields and run the `afterExtractAll` hook.
 *
 * Every failure, including exceptions thrown by the parser or by the
 * `afterExtractAll` hook, is reported through `callback` rather than thrown
 * from inside an asynchronous callback.
 *
 * @param {string} url Page url.
 * @param {function(?Error, Object=)} callback Receives `{ bodyData, fields }`;
 *   `fields` stays null for pages that are not content pages.
 */
Crawler.prototype._getContent = function (url, callback) {
  var self = this;
  self.url = url;
  var result = { bodyData: null, fields: null };
  var attachedFields = {};

  // A fresh object per request: helper.request may fill in defaults on it.
  var requestOptions = function () {
    return {
      format: self.format,
      charset: self.charset,
      userAgent: self.userAgent
    };
  };

  // Step 1: download the page itself.
  var fetchBody = function (done) {
    helper.request(url, requestOptions(), function (error, body) {
      if (error) {
        return done(error);
      }
      result.bodyData = body;
      done(null);
    });
  };

  // Step 2: merge the "next page" content into the body.
  var mergeContentPages = function (done) {
    if (!(self._judge(url, 'post') && !!result.bodyData && !!self.contentPage)) {
      return done(null);
    }
    parser.getContentPage(self, { body: result.bodyData, url }, function (error, body) {
      if (error) {
        return done(error);
      }
      if (body) {
        result.bodyData = body;
      }
      done(null);
    });
  };

  // Step 3: fetch the attached data.
  var fetchAttachedFields = function (done) {
    var attach = self.attachFields;
    if (!self._judge(url, 'post') || !attach || !attach.url) {
      return done(null);
    }
    parser.getAttachUrl({
      url: attach.url,
      meta: attach.meta,
      body: result.bodyData
    }, function (error, attachUrl) {
      if (error) {
        // Do not request a url that could not be built.
        return done(error);
      }
      helper.request(attachUrl, requestOptions(), function (error, body) {
        if (error) {
          return done(error);
        }
        try {
          attachedFields = parser.getFieldsBySelector(body, attach.fields);
        } catch (e) {
          return done(e);
        }
        done(null);
      });
    });
  };

  async.waterfall([fetchBody, mergeContentPages, fetchAttachedFields], function (error) {
    if (error) {
      return callback(error);
    }

    if (self._judge(url, 'post') && !!result.bodyData) {
      try {
        // Extract the data (throws when a required field is empty).
        result.fields = parser.getFields(result.bodyData, self);
        result.fields.from = url;
        // _getSourceId also records the host in self._site, so keep this order.
        result.fields.sourceId = self._getSourceId(url);
        result.fields.site = self._site;
        // Attached data
        Object.assign(result.fields, attachedFields);
        // Post-process the complete result
        if (self.afterExtractAll) {
          result = self.afterExtractAll(result);
        }
      } catch (e) {
        return callback(e);
      }
    }
    callback(null, result);
  });
};
/**
 * Runs a node-style task and exposes its outcome both as a Promise and,
 * optionally, through a node-style callback.
 *
 * The Promise always settles. When a callback is supplied a no-op rejection
 * handler is attached, so callback-only callers never trigger an
 * unhandled-rejection warning.
 *
 * @param {function(function(?Error, Object=))} run Starts the task.
 * @param {function(Object): *} pick Selects the public value from the result.
 * @param {function(?Error, *=)} [callback] Optional node-style callback.
 * @returns {Promise<*>}
 */
var _promiseWithCallback = function (run, pick, callback) {
  var promise = new Promise(function (resolve, reject) {
    run(function (error, result) {
      if (error) {
        console.error(error);
        reject(error);
        if (callback) {
          callback(error);
        }
        return;
      }
      var value = pick(result);
      resolve(value);
      if (callback) {
        callback(null, value);
      }
    });
  });
  if (callback) {
    promise.catch(function () {});
  }
  return promise;
};

/**
 * Collects the urls still to be crawled from a page.
 *
 * @param {string} url List or content page url.
 * @param {function(?Error, Array=)} [callback] Optional node-style callback.
 * @returns {?Promise<Array>} null (synchronously) when the url matches neither
 *   the list nor the content regexes; otherwise a Promise of the urls.
 */
Crawler.prototype.getLinks = function (url, callback) {
  url = helper.formatUrl(url);
  var self = this;
  if (!self._judge(url)) {
    return null;
  }
  return _promiseWithCallback(function (done) {
    self._getLinks(url, done);
  }, function (result) {
    return result.urls;
  }, callback);
};

/**
 * Extracts the configured fields from a content page.
 *
 * @param {string} url Content page url.
 * @param {function(?Error, Object=)} [callback] Optional node-style callback.
 * @returns {?Promise<Object>} null (synchronously) when the url is not a
 *   content page; otherwise a Promise of the extracted fields.
 */
Crawler.prototype.getContent = function (url, callback) {
  url = helper.formatUrl(url);
  var self = this;
  // A url of unknown type is not an article url either, so one check suffices.
  if (!self.isArticleUrl(url)) {
    return null;
  }
  return _promiseWithCallback(function (done) {
    self._getContent(url, done);
  }, function (result) {
    return result.fields;
  }, callback);
};

/**
 * Tells whether the url is a content (article) page.
 *
 * @param {string} url Url to check.
 * @returns {boolean}
 */
Crawler.prototype.isArticleUrl = function (url) {
  url = helper.formatUrl(url);
  return this._judge(url, 'post');
};
/**
 * Proxy lookup.
 *
 * @returns {?string} The proxy url from HTTP_PROXY, HTTPS_PROXY or ALL_PROXY
 *   (first one set wins), or null when none is configured.
 */
var _proxy = function () {
  return process.env.HTTP_PROXY || process.env.HTTPS_PROXY || process.env.ALL_PROXY || null;
};

/**
 * Request core: GETs a url and hands the (decoded) body to the callback.
 *
 * - The proxy is passed as a per-request option, so the shared `request`
 *   function is not re-wrapped on every call.
 * - For a charset other than 'utf-8' the body is requested as a raw Buffer
 *   (`encoding: null`) and decoded with iconv.
 * - Any status other than 200 is reported as an Error carrying `statusCode`,
 *   so callers never see a "successful" call without a body.
 *
 * @param {string} url Url to fetch.
 * @param {Object} options `charset` and `userAgent` are read here.
 * @param {function(?Error, (string|undefined))} callback Node-style callback.
 */
var _requestCore = function (url, options, callback) {
  var needsDecoding = !!options.charset && options.charset !== 'utf-8';
  var query = { url: url, headers: {} };

  var proxy = _proxy();
  if (proxy) {
    query.proxy = proxy;
  }
  if (needsDecoding) {
    query.encoding = null;
  }
  if (options.userAgent) {
    query.headers['User-Agent'] = options.userAgent;
  }

  request.get(query, function (err, res, body) {
    if (!err && (!res || res.statusCode !== 200)) {
      err = new Error('Request failed with status code ' + (res ? res.statusCode : 'unknown') + ': ' + url);
      if (res) {
        err.statusCode = res.statusCode;
      }
    }
    if (err) {
      console.error(err);
      return callback(err);
    }
    if (needsDecoding) {
      body = iconv.decode(body, options.charset); // transcode to a JS string
    }
    callback(null, body);
  });
};
/**
 * Request entry point. Supports the html, json and jsonp formats.
 *
 * @param {string} url Url to fetch.
 * @param {Object} options Request options; `format` defaults to 'html'.
 * @param {function(?Error, *=)} callback Node-style callback. It is also
 *   called (with an Error) for an unsupported format, so callers never hang.
 */
var requestUrl = function (url, options, callback) {
  options = options || {};
  var format = options.format || 'html';
  if (['html', 'json', 'jsonp'].indexOf(format) < 0) {
    console.error('The request format is error.');
    return callback(new Error('The request format is error.'));
  }
  _request[format](url, options, callback);
};

/**
 * Chinese script conversion (i18n).
 *
 * Supported types:
 *   s2t  Simplified -> Traditional       t2s  Traditional -> Simplified
 *   s2tw Simplified -> Taiwan standard   tw2s Taiwan standard -> Simplified
 *   s2hk Simplified -> Hong Kong         hk2s Hong Kong -> Simplified
 *   t2tw Traditional -> Taiwan standard  t2hk Traditional -> Hong Kong
 *
 * @param {string} str Text to convert.
 * @param {string} [type='tw2s'] Conversion type.
 * @returns {string} Converted text, or `str` unchanged for an unknown type.
 */
var translate = function (str, type) {
  type = type || 'tw2s';
  if (['s2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 't2tw', 't2hk'].indexOf(type) < 0) {
    console.error(type, 'in i18n is null');
    return str;
  }
  var opencc = new OpenCC(type + '.json');
  return opencc.convertSync(str);
};

/**
 * Appends the trailing '/' to a bare home-page url (one whose only slashes
 * are the two after the scheme, e.g. 'http://a.com').
 *
 * @param {*} url Url to normalise.
 * @returns {*} The normalised url; non-string input is returned unchanged
 *   instead of throwing.
 */
var formatUrl = function (url) {
  if (typeof url !== 'string') {
    return url;
  }
  if (url.split('/').length - 1 === 2) {
    url += '/';
  }
  return url;
};

/**
 * Encoding: rewrites every character outside U+0000-U+00FF via `escape`,
 * turning the resulting `%uXXXX` into `&#xXXXX`.
 *
 * @param {string} str Text to encode.
 * @returns {string}
 */
var encode = function (str) {
  return str.replace(/[^\u0000-\u00FF]/g, function ($0) {
    return escape($0).replace(/(%u)(\w{4})/gi, "&#x$2");
  });
};

/**
 * Reverse encoding: turns `&#xXXXX;` references back into characters.
 *
 * @param {string} str Text to decode.
 * @returns {string}
 */
var rencode = function (str) {
  // NOTE(review): the last replacement is assumed to normalise non-breaking
  // spaces, like the %uA0 replacement before it; confirm against the repo.
  return unescape(str.replace(/(&#x)(\w{4});/gi, "%u$2"))
    .replace(/%uA0/img, ' ')
    .replace(/[ \u00A0]/img, ' ');
};

/**
 * Removes duplicates from an array, keeping the first occurrence.
 *
 * Items are compared by their string form. A Set is used so that values such
 * as 'constructor' or 'toString' are not mistaken for already-seen entries.
 *
 * @param {Array} arr Input array (not modified).
 * @returns {Array} New array without duplicates.
 */
var deDuplication = function (arr) {
  var seen = new Set();
  return arr.filter(function (item) {
    var key = String(item);
    if (seen.has(key)) {
      return false;
    }
    seen.add(key);
    return true;
  });
};
// Context parser
/**
 * Returns the text between two literal markers in the raw body.
 *
 * The end marker is searched for after the start marker, so an end marker
 * that also occurs earlier in the body (or one identical to the start marker)
 * no longer produces a swapped or garbage substring.
 *
 * @param {string} body Raw page body.
 * @param {Object} item Field meta; `item.selector` is `[startMarker, endMarker]`.
 * @returns {?string} The enclosed text, or null when `selector` does not hold
 *   exactly two entries or either marker cannot be found.
 */
var _contextSelectorParser = function (body, item) {
  if (item.selector.length !== 2) {
    return null;
  }
  var startMarker = item.selector[0];
  var endMarker = item.selector[1];

  var startIndex = body.indexOf(startMarker);
  if (startIndex < 0) {
    return null;
  }
  var contentStart = startIndex + startMarker.length;
  var endIndex = body.indexOf(endMarker, contentStart);
  if (endIndex < 0) {
    return null;
  }
  return body.substring(contentStart, endIndex);
};

// XPath parser
/**
 * Concatenates the text content of the first node matched by each XPath
 * expression, evaluated against `$.doc`.
 *
 * @param {Object} $ Object carrying the parsed document as `$.doc`.
 * @param {Object} item Field meta; `item.selector` is a list of XPath strings.
 * @returns {string} Concatenated text ('' when nothing matches).
 */
var _xpathSelectorParser = function ($, item) {
  return item.selector.reduce(function (text, selector) {
    var nodes = xpath.select(selector, $.doc);
    if (nodes.length > 0) {
      return text + (nodes[0].textContent || '');
    }
    return text;
  }, '');
};
/**
 * Extracts the configured fields from a page body.
 *
 * For every entry of `self.fields` the value is produced by the parser
 * selected through `item.meta.type`:
 *   - none / 'jq' / '$' / 'jquery': jQuery-style selectors; with
 *     `countType: 'image'` the image list is stored under `imagesList` and the
 *     field holds the image count;
 *   - 'context': text between two literal markers of the raw body;
 *   - 'xpath': XPath expressions evaluated against the parsed document.
 * Then, in order: default value, i18n conversion, trimming, the
 * `afterExtractField` hook and the `required` check.
 *
 * @param {string} bodyHtml Page body.
 * @param {Object} self Crawler instance; reads `fields`, `domains`, `url`,
 *   `i18n` and `afterExtractField`.
 * @returns {Object} Map of field name to extracted value.
 * @throws {Error} 'fields value is emtly' when a required field ends up
 *   undefined, null or ''.
 */
var getFields = function (bodyHtml, self) {
  var $ = cheerio.load(bodyHtml);
  var ignore = function (err) {};
  // Parsed document used by the xpath parser; parser diagnostics are muted.
  $.doc = new dom({
    errorHandler: {
      warning: ignore,
      error: ignore,
      fatalError: ignore
    }
  }).parseFromString($.html());

  // Complete the image links.
  $('img').each(function () {
    var image = $(this);
    var url = _completionUrl(image.attr('src'), {
      domains: self.domains,
      url: self.url
    });
    if (!url) { return true; }
    if (image.attr('src').substr(0, 4) !== 'http') {
      image.attr('src', url);
    }
  });

  // The context parser works on the raw text of the page.
  var rawBody = typeof bodyHtml === 'string' ? bodyHtml : $.html();

  var result = {};
  self.fields.forEach(function (item) {
    var meta = item.meta;
    var type = meta.type;
    var value = '';

    // Parse
    if (!type || type === 'jq' || type === '$' || type === 'jquery') {
      value = _jquerySelectorParser($, meta);
      if (meta.countType === 'image') {
        result['imagesList'] = value;
        value = value.length;
      }
    } else if (type === 'context') {
      value = _contextSelectorParser(rawBody, meta);
    } else if (type === 'xpath') {
      value = _xpathSelectorParser($, meta);
    }
    // Default value
    if (!value && item.defaultValue !== undefined) {
      value = item.defaultValue;
    }
    // Translation
    if (typeof value === 'string' && self.i18n) {
      value = helper.translate(value, self.i18n);
    }
    // Strip leading/trailing whitespace
    if (typeof value === 'string') {
      value = value.trim();
    }
    // Per-field hook
    if (self.afterExtractField) {
      value = self.afterExtractField(item.name, value);
    }
    // Required check (null is what the context parser returns for "not found")
    if (item.required && (value === undefined || value === null || value === '')) {
      console.error('fields[', item.name, '] value is emtly');
      throw new Error('fields value is emtly');
    }
    result[item.name] = value;
  });
  return result;
};
function (self, options, callback) { 254 | if (!options.url || !options.body) { 255 | return callback(null, null); 256 | } 257 | if (!self.contentPage || !self.contentPage.urls || !self.contentPage.selector) { 258 | return callback(null, null); 259 | } 260 | var $ = cheerio.load(options.body); 261 | var urls = []; 262 | $('a').each(function (item) { 263 | var url = _completionUrl($(this).attr('href'), { 264 | domains: self.domains, 265 | url: self.url 266 | }); 267 | if (!url) { return true; } 268 | self.contentPage.urls.forEach(function (urlRegex) { 269 | if (urlRegex.test(url)) { 270 | urls.push(url); 271 | } 272 | }); 273 | }); 274 | // 数组去重 275 | urls = helper.deDuplication(urls); 276 | if (!urls.length) { 277 | return callback(null, null); 278 | } 279 | var regExpExplain = new RegExp('', 'img'); 280 | var data = ''; 281 | // 如果数组有数据 则拼接数据到指定位置 282 | async.mapSeries(urls, function (url, done) { 283 | helper.request(url, { 284 | format: self.format, 285 | charset: self.charset, 286 | userAgent: self.userAgent 287 | }, function (error, body) { 288 | var _$ = cheerio.load(body); 289 | self.contentPage.selector.forEach(function (selector) { 290 | _$(selector).each(function (_index) { 291 | var value = ($.html(this) || '').trim(); 292 | value = value.replace(regExpExplain, ''); 293 | data += `${helper.rencode(value)}`; 294 | }); 295 | }); 296 | done(error, null); 297 | }); 298 | }, function (err, result) { 299 | if (self.contentPage.prependNode) { 300 | $(self.contentPage.prependNode).prepend(data); 301 | } 302 | if (self.contentPage.appendNode) { 303 | $(self.contentPage.appendNode).append(data); 304 | } 305 | callback(err, $.html()); 306 | }); 307 | }; 308 | module.exports = { 309 | // 提取url 310 | getUrls: getUrls, 311 | // 提取数据 312 | getFields: getFields, 313 | // 下一页 314 | getContentPage: getContentPage, 315 | // 附加数据 316 | getAttachUrl: getAttachUrl, 317 | // 根据选择器获取数据 318 | getFieldsBySelector: getFieldsBySelector 319 | }; 320 | 
-------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "almighty-parser-core", 3 | "version": "0.0.1", 4 | "lockfileVersion": 1, 5 | "dependencies": { 6 | "@types/node": { 7 | "version": "6.0.74", 8 | "resolved": "https://registry.npmjs.org/@types/node/-/node-6.0.74.tgz", 9 | "integrity": "sha512-fjUDu2//vsHodfhWeo6bkJcY+YjHAnQSaOahcY6M3hvl7KIHf/5EosXLTJB8gTN2Yfsfaov+FpUtkR/gfgrQXA==" 10 | }, 11 | "ajv": { 12 | "version": "4.11.8", 13 | "resolved": "https://registry.npmjs.org/ajv/-/ajv-4.11.8.tgz", 14 | "integrity": "sha1-gv+wKynmYq5TvcIK8VlHcGc5xTY=" 15 | }, 16 | "asn1": { 17 | "version": "0.2.3", 18 | "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.3.tgz", 19 | "integrity": "sha1-2sh4dxPJlmhJ/IGAd36+nB3fO4Y=" 20 | }, 21 | "assert-plus": { 22 | "version": "0.2.0", 23 | "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-0.2.0.tgz", 24 | "integrity": "sha1-104bh+ev/A24qttwIfP+SBAasjQ=" 25 | }, 26 | "async": { 27 | "version": "2.4.1", 28 | "resolved": "https://registry.npmjs.org/async/-/async-2.4.1.tgz", 29 | "integrity": "sha1-YqVrJ5yYoR0JhwlqAcw+6463u9c=" 30 | }, 31 | "asynckit": { 32 | "version": "0.4.0", 33 | "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", 34 | "integrity": "sha1-x57Zf380y48robyXkLzDZkdLS3k=" 35 | }, 36 | "aws-sign2": { 37 | "version": "0.6.0", 38 | "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.6.0.tgz", 39 | "integrity": "sha1-FDQt0428yU0OW4fXY81jYSwOeU8=" 40 | }, 41 | "aws4": { 42 | "version": "1.6.0", 43 | "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.6.0.tgz", 44 | "integrity": "sha1-g+9cqGCysy5KDe7e6MdxudtXRx4=" 45 | }, 46 | "bcrypt-pbkdf": { 47 | "version": "1.0.1", 48 | "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.1.tgz", 49 | "integrity": "sha1-Y7xdy2EzG5K8Bf1SiVPDNGKgb40=", 50 | "optional": true 
51 | }, 52 | "boolbase": { 53 | "version": "1.0.0", 54 | "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", 55 | "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24=" 56 | }, 57 | "boom": { 58 | "version": "2.10.1", 59 | "resolved": "https://registry.npmjs.org/boom/-/boom-2.10.1.tgz", 60 | "integrity": "sha1-OciRjO/1eZ+D+UkqhI9iWt0Mdm8=" 61 | }, 62 | "buffer-shims": { 63 | "version": "1.0.0", 64 | "resolved": "https://registry.npmjs.org/buffer-shims/-/buffer-shims-1.0.0.tgz", 65 | "integrity": "sha1-mXjOMXOIxkmth5MCjDR37wRKi1E=" 66 | }, 67 | "caseless": { 68 | "version": "0.12.0", 69 | "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz", 70 | "integrity": "sha1-G2gcIf+EAzyCZUMJBolCDRhxUdw=" 71 | }, 72 | "cheerio": { 73 | "version": "1.0.0-rc.1", 74 | "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.1.tgz", 75 | "integrity": "sha1-KvNzOeq3E+9rcs3pjO+mcrh2Qf4=" 76 | }, 77 | "co": { 78 | "version": "4.6.0", 79 | "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", 80 | "integrity": "sha1-bqa989hTrlTMuOR7+gvz+QMfsYQ=" 81 | }, 82 | "combined-stream": { 83 | "version": "1.0.5", 84 | "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.5.tgz", 85 | "integrity": "sha1-k4NwpXtKUd6ix3wV1cX9+JUWQAk=" 86 | }, 87 | "core-util-is": { 88 | "version": "1.0.2", 89 | "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", 90 | "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" 91 | }, 92 | "cryptiles": { 93 | "version": "2.0.5", 94 | "resolved": "https://registry.npmjs.org/cryptiles/-/cryptiles-2.0.5.tgz", 95 | "integrity": "sha1-O9/s3GCBR8HGcgL6KR59ylnqo7g=" 96 | }, 97 | "css-select": { 98 | "version": "1.2.0", 99 | "resolved": "https://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz", 100 | "integrity": "sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg=" 101 | }, 102 | "css-what": { 103 | "version": "2.1.0", 104 | "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.0.tgz", 
105 | "integrity": "sha1-lGfQMsOM+u+58teVASUwYvh/ob0=" 106 | }, 107 | "dashdash": { 108 | "version": "1.14.1", 109 | "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", 110 | "integrity": "sha1-hTz6D3y+L+1d4gMmuN1YEDX24vA=", 111 | "dependencies": { 112 | "assert-plus": { 113 | "version": "1.0.0", 114 | "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", 115 | "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=" 116 | } 117 | } 118 | }, 119 | "delayed-stream": { 120 | "version": "1.0.0", 121 | "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", 122 | "integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk=" 123 | }, 124 | "dom-serializer": { 125 | "version": "0.1.0", 126 | "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz", 127 | "integrity": "sha1-BzxpdUbOB4DOI75KKOKT5AvDDII=", 128 | "dependencies": { 129 | "domelementtype": { 130 | "version": "1.1.3", 131 | "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.1.3.tgz", 132 | "integrity": "sha1-vSh3PiZCiBrsUVRJJCmcXNgiGFs=" 133 | } 134 | } 135 | }, 136 | "domelementtype": { 137 | "version": "1.3.0", 138 | "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.3.0.tgz", 139 | "integrity": "sha1-sXrtguirWeUt2cGbF1bg/BhyBMI=" 140 | }, 141 | "domhandler": { 142 | "version": "2.4.1", 143 | "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.4.1.tgz", 144 | "integrity": "sha1-iS5HAAqZvlW783dP/qBWHYh5wlk=" 145 | }, 146 | "domutils": { 147 | "version": "1.5.1", 148 | "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.5.1.tgz", 149 | "integrity": "sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=" 150 | }, 151 | "ecc-jsbn": { 152 | "version": "0.1.1", 153 | "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.1.tgz", 154 | "integrity": "sha1-D8c6ntXw1Tw4GTOYUj735UN3dQU=", 155 | "optional": true 156 | }, 157 | "entities": { 158 | "version": "1.1.1", 159 | "resolved": 
"https://registry.npmjs.org/entities/-/entities-1.1.1.tgz", 160 | "integrity": "sha1-blwtClYhtdra7O+AuQ7ftc13cvA=" 161 | }, 162 | "extend": { 163 | "version": "3.0.1", 164 | "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.1.tgz", 165 | "integrity": "sha1-p1Xqe8Gt/MWjHOfnYtuq3F5jZEQ=" 166 | }, 167 | "extsprintf": { 168 | "version": "1.0.2", 169 | "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.0.2.tgz", 170 | "integrity": "sha1-4QgOBljjALBilJkMxw4VAiNf1VA=" 171 | }, 172 | "forever-agent": { 173 | "version": "0.6.1", 174 | "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.6.1.tgz", 175 | "integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE=" 176 | }, 177 | "form-data": { 178 | "version": "2.1.4", 179 | "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.1.4.tgz", 180 | "integrity": "sha1-M8GDrPGTJ27KqYFDpp6Uv+4XUNE=" 181 | }, 182 | "getpass": { 183 | "version": "0.1.7", 184 | "resolved": "https://registry.npmjs.org/getpass/-/getpass-0.1.7.tgz", 185 | "integrity": "sha1-Xv+OPmhNVprkyysSgmBOi6YhSfo=", 186 | "dependencies": { 187 | "assert-plus": { 188 | "version": "1.0.0", 189 | "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", 190 | "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=" 191 | } 192 | } 193 | }, 194 | "har-schema": { 195 | "version": "1.0.5", 196 | "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-1.0.5.tgz", 197 | "integrity": "sha1-0mMTX0MwfALGAq/I/pWXDAFRNp4=" 198 | }, 199 | "har-validator": { 200 | "version": "4.2.1", 201 | "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-4.2.1.tgz", 202 | "integrity": "sha1-M0gdDxu/9gDdID11gSpqX7oALio=" 203 | }, 204 | "hawk": { 205 | "version": "3.1.3", 206 | "resolved": "https://registry.npmjs.org/hawk/-/hawk-3.1.3.tgz", 207 | "integrity": "sha1-B4REvXwWQLD+VA0sm3PVlnjo4cQ=" 208 | }, 209 | "hoek": { 210 | "version": "2.16.3", 211 | "resolved": "https://registry.npmjs.org/hoek/-/hoek-2.16.3.tgz", 212 
| "integrity": "sha1-ILt0A9POo5jpHcRxCo/xuCdKJe0=" 213 | }, 214 | "htmlparser2": { 215 | "version": "3.9.2", 216 | "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.9.2.tgz", 217 | "integrity": "sha1-G9+HrMoPP55T+k/M6w9LTLsAszg=" 218 | }, 219 | "http-signature": { 220 | "version": "1.1.1", 221 | "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.1.1.tgz", 222 | "integrity": "sha1-33LiZwZs0Kxn+3at+OE0qPvPkb8=" 223 | }, 224 | "iconv-lite": { 225 | "version": "0.4.17", 226 | "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.17.tgz", 227 | "integrity": "sha1-T9qjs4rLwsAxsEXQ7c3+HsqxjI0=" 228 | }, 229 | "inherits": { 230 | "version": "2.0.3", 231 | "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", 232 | "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" 233 | }, 234 | "is-typedarray": { 235 | "version": "1.0.0", 236 | "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz", 237 | "integrity": "sha1-5HnICFjfDBsR3dppQPlgEfzaSpo=" 238 | }, 239 | "isarray": { 240 | "version": "1.0.0", 241 | "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", 242 | "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" 243 | }, 244 | "isstream": { 245 | "version": "0.1.2", 246 | "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz", 247 | "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo=" 248 | }, 249 | "jodid25519": { 250 | "version": "1.0.2", 251 | "resolved": "https://registry.npmjs.org/jodid25519/-/jodid25519-1.0.2.tgz", 252 | "integrity": "sha1-BtSRIlUJNBlHfUJWM2BuDpB4KWc=", 253 | "optional": true 254 | }, 255 | "jsbn": { 256 | "version": "0.1.1", 257 | "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz", 258 | "integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM=", 259 | "optional": true 260 | }, 261 | "json-schema": { 262 | "version": "0.2.3", 263 | "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.2.3.tgz", 264 | "integrity": 
"sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM=" 265 | }, 266 | "json-stable-stringify": { 267 | "version": "1.0.1", 268 | "resolved": "https://registry.npmjs.org/json-stable-stringify/-/json-stable-stringify-1.0.1.tgz", 269 | "integrity": "sha1-mnWdOcXy/1A/1TAGRu1EX4jE+a8=" 270 | }, 271 | "json-stringify-safe": { 272 | "version": "5.0.1", 273 | "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", 274 | "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus=" 275 | }, 276 | "jsonify": { 277 | "version": "0.0.0", 278 | "resolved": "https://registry.npmjs.org/jsonify/-/jsonify-0.0.0.tgz", 279 | "integrity": "sha1-LHS27kHZPKUbe1qu6PUDYx0lKnM=" 280 | }, 281 | "jsprim": { 282 | "version": "1.4.0", 283 | "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.0.tgz", 284 | "integrity": "sha1-o7h+QCmNjDgFUtjMdiigu5WiKRg=", 285 | "dependencies": { 286 | "assert-plus": { 287 | "version": "1.0.0", 288 | "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", 289 | "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=" 290 | } 291 | } 292 | }, 293 | "lodash": { 294 | "version": "4.17.4", 295 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.4.tgz", 296 | "integrity": "sha1-eCA6TRwyiuHYbcpkYONptX9AVa4=" 297 | }, 298 | "mime-db": { 299 | "version": "1.27.0", 300 | "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.27.0.tgz", 301 | "integrity": "sha1-gg9XIpa70g7CXtVeW13oaeVDbrE=" 302 | }, 303 | "mime-types": { 304 | "version": "2.1.15", 305 | "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.15.tgz", 306 | "integrity": "sha1-pOv1BkCUVpI3uM9wBGd20J/JKu0=" 307 | }, 308 | "nan": { 309 | "version": "2.6.2", 310 | "resolved": "https://registry.npmjs.org/nan/-/nan-2.6.2.tgz", 311 | "integrity": "sha1-5P805slf37WuzAjeZZb0NgWn20U=" 312 | }, 313 | "nth-check": { 314 | "version": "1.0.1", 315 | "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.1.tgz", 316 | "integrity": 
"sha1-mSms32KPwsQQmN6rgqxYDPFJquQ=" 317 | }, 318 | "oauth-sign": { 319 | "version": "0.8.2", 320 | "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.8.2.tgz", 321 | "integrity": "sha1-Rqarfwrq2N6unsBWV4C31O/rnUM=" 322 | }, 323 | "opencc": { 324 | "version": "1.0.5", 325 | "resolved": "https://registry.npmjs.org/opencc/-/opencc-1.0.5.tgz", 326 | "integrity": "sha1-U6korCncNVehseBY02m0I2c6t+U=" 327 | }, 328 | "parse5": { 329 | "version": "3.0.2", 330 | "resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.2.tgz", 331 | "integrity": "sha1-Be/1fw70V3+xRKefi5qWemzERRA=" 332 | }, 333 | "performance-now": { 334 | "version": "0.2.0", 335 | "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-0.2.0.tgz", 336 | "integrity": "sha1-M+8wxcd9TqIcWlOGnZG1bY8lVeU=" 337 | }, 338 | "process-nextick-args": { 339 | "version": "1.0.7", 340 | "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", 341 | "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=" 342 | }, 343 | "punycode": { 344 | "version": "1.4.1", 345 | "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", 346 | "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4=" 347 | }, 348 | "qs": { 349 | "version": "6.4.0", 350 | "resolved": "https://registry.npmjs.org/qs/-/qs-6.4.0.tgz", 351 | "integrity": "sha1-E+JtKK1rD/qpExLNO/cI7TUecjM=" 352 | }, 353 | "readable-stream": { 354 | "version": "2.2.9", 355 | "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.2.9.tgz", 356 | "integrity": "sha1-z3jsb0ptHrQ9JkiMrJfwQudLf8g=" 357 | }, 358 | "request": { 359 | "version": "2.81.0", 360 | "resolved": "https://registry.npmjs.org/request/-/request-2.81.0.tgz", 361 | "integrity": "sha1-xpKJRqDgbF+Nb4qTM0af/aRimKA=" 362 | }, 363 | "safe-buffer": { 364 | "version": "5.0.1", 365 | "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.0.1.tgz", 366 | "integrity": "sha1-0mPKVGls2KMGtcplUekt5XkY++c=" 367 | }, 368 | 
"sntp": { 369 | "version": "1.0.9", 370 | "resolved": "https://registry.npmjs.org/sntp/-/sntp-1.0.9.tgz", 371 | "integrity": "sha1-ZUEYTMkK7qbG57NeJlkIJEPGYZg=" 372 | }, 373 | "sshpk": { 374 | "version": "1.13.0", 375 | "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.13.0.tgz", 376 | "integrity": "sha1-/yo+T9BEl1Vf7Zezmg/YL6+zozw=", 377 | "dependencies": { 378 | "assert-plus": { 379 | "version": "1.0.0", 380 | "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", 381 | "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=" 382 | } 383 | } 384 | }, 385 | "string_decoder": { 386 | "version": "1.0.1", 387 | "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.0.1.tgz", 388 | "integrity": "sha1-YuIA8DmVWmgQ2N8KM//A8BNmLZg=" 389 | }, 390 | "stringstream": { 391 | "version": "0.0.5", 392 | "resolved": "https://registry.npmjs.org/stringstream/-/stringstream-0.0.5.tgz", 393 | "integrity": "sha1-TkhM1N5aC7vuGORjB3EKioFiGHg=" 394 | }, 395 | "tough-cookie": { 396 | "version": "2.3.2", 397 | "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.3.2.tgz", 398 | "integrity": "sha1-8IH3bkyFcg5sN6X6ztc3FQ2EByo=" 399 | }, 400 | "tunnel-agent": { 401 | "version": "0.6.0", 402 | "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", 403 | "integrity": "sha1-J6XeoGs2sEoKmWZ3SykIaPD8QP0=" 404 | }, 405 | "tweetnacl": { 406 | "version": "0.14.5", 407 | "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz", 408 | "integrity": "sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q=", 409 | "optional": true 410 | }, 411 | "util-deprecate": { 412 | "version": "1.0.2", 413 | "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", 414 | "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" 415 | }, 416 | "uuid": { 417 | "version": "3.0.1", 418 | "resolved": "https://registry.npmjs.org/uuid/-/uuid-3.0.1.tgz", 419 | "integrity": "sha1-ZUS7ot/ajBzxfmKaOjBeK7H+5sE=" 420 | }, 421 | 
"verror": { 422 | "version": "1.3.6", 423 | "resolved": "https://registry.npmjs.org/verror/-/verror-1.3.6.tgz", 424 | "integrity": "sha1-z/XfEpRtKX0rqu+qJoniW+AcAFw=" 425 | }, 426 | "xmldom": { 427 | "version": "0.1.27", 428 | "resolved": "https://registry.npmjs.org/xmldom/-/xmldom-0.1.27.tgz", 429 | "integrity": "sha1-1QH5ezvbQDr4757MIFcxh6rawOk=" 430 | }, 431 | "xpath": { 432 | "version": "0.0.24", 433 | "resolved": "https://registry.npmjs.org/xpath/-/xpath-0.0.24.tgz", 434 | "integrity": "sha1-Gt4WLhzFI8jTn8fQavwW6iFvKfs=" 435 | } 436 | } 437 | } 438 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "almighty-parser-core", 3 | "version": "1.0.7", 4 | "description": "crawler prser core", 5 | "main": "index.js", 6 | "scripts": { 7 | "test:qiushibaike": "node --harmony-async-await ./example/test/qiushibaike.js", 8 | "test:healthno1": "node --harmony-async-await ./example/test/healthno1.js", 9 | "test:baoliao5": "node --harmony-async-await ./example/test/baoliao5.js" 10 | }, 11 | "repository": { 12 | "type": "git", 13 | "url": "git@github.com:coolfishstudio/almighty-parser-core.git" 14 | }, 15 | "keywords": "crawler, parser", 16 | "author": "Yves", 17 | "license": "MIT", 18 | "dependencies": { 19 | "async": "^2.4.1", 20 | "cheerio": "^1.0.0-rc.1", 21 | "iconv-lite": "^0.4.17", 22 | "opencc": "^1.0.5", 23 | "request": "^2.81.0", 24 | "xmldom": "^0.1.27", 25 | "xpath": "0.0.24" 26 | } 27 | } 28 | --------------------------------------------------------------------------------