├── .gitignore
├── .npmignore
├── LICENSE
├── README.md
├── doc
│   ├── API.md
│   ├── CONFIG.md
│   └── EXAMPLE.md
├── example
│   ├── parser
│   │   ├── parser-baoliao5.js
│   │   ├── parser-healthno1.js
│   │   └── parser-qiushibaike.js
│   └── test
│       ├── baoliao5.js
│       ├── healthno1.js
│       └── qiushibaike.js
├── index.js
├── lib
│   ├── crawler.js
│   ├── helper.js
│   └── parser.js
├── package-lock.json
└── package.json
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 |
8 | # Runtime data
9 | pids
10 | *.pid
11 | *.seed
12 | *.pid.lock
13 |
14 | # Directory for instrumented libs generated by jscoverage/JSCover
15 | lib-cov
16 |
17 | # Coverage directory used by tools like istanbul
18 | coverage
19 |
20 | # nyc test coverage
21 | .nyc_output
22 |
23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
24 | .grunt
25 |
26 | # Bower dependency directory (https://bower.io/)
27 | bower_components
28 |
29 | # node-waf configuration
30 | .lock-wscript
31 |
32 | # Compiled binary addons (http://nodejs.org/api/addons.html)
33 | build/Release
34 |
35 | # Dependency directories
36 | node_modules/
37 | jspm_packages/
38 |
39 | # Typescript v1 declaration files
40 | typings/
41 |
42 | # Optional npm cache directory
43 | .npm
44 |
45 | # Optional eslint cache
46 | .eslintcache
47 |
48 | # Optional REPL history
49 | .node_repl_history
50 |
51 | # Output of 'npm pack'
52 | *.tgz
53 |
54 | # Yarn Integrity file
55 | .yarn-integrity
56 |
57 | # dotenv environment variables file
58 | .env
59 |
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | doc
2 | example
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 coolfish
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 页面爬虫解析器核心
2 | 此工具适用于
3 | 1. 对单独页面链接进行解析
4 | 2. 配合队列进行多页面解析
5 |
6 |
7 | ## 解释说明
8 | 支持详情页下一页抓取,支持繁体转换,支持对字数统计,支持对图片数量统计。
9 | 目前主要针对静态页的解析,对json请求和jsonp请求的解析做了预留(暂不支持)。
10 |
11 | [NPM: almighty-parser-core](https://nodei.co/npm/almighty-parser-core/)
12 |
13 | []()
14 | []()
15 | []()
16 |
17 | ## 安装
18 | ```
19 | npm i --save almighty-parser-core
20 | ```
21 |
22 | ## api接口
23 | - [x] `getLinks` 获取待抓页链接
24 | - [x] `getContent` 获取详情页内容
25 | - [x] `parse` 解析获取内容[为`getLinks`与`getContent`的集合]
26 | - [x] `isArticleUrl` 检测链接是否是详情页
27 | - [x] `isListUrl` 检测链接是否是列表页
28 | - [x] `getIdFromArticleUrl` 获取页面链接的唯一标识
29 |
30 | ## 配置参数
31 | [文档说明](https://github.com/coolfishstudio/almighty-parser-core/blob/master/doc/CONFIG.md)
32 |
33 | ## 实例
34 | ### 解析器案例
35 | [糗事百科 - 基础](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-qiushibaike.js)
36 | [今日健康 - 繁体](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-healthno1.js)
37 | [爆料网 - 详情下一页](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-baoliao5.js)
38 |
39 | ### 定义网站规则
40 | ```
41 | module.exports = {
42 | // 域名 网站域名,设置域名后只处理这些域名下的网页
43 | domains: 'https://www.qiushibaike.com/',
44 | // 列表页url的正则,符合这些正则的页面会被当作列表页处理
45 | listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/],
46 | // 内容页url的正则,符合这些正则的页面会被当作内容页处理
47 | contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/],
48 | // 从内容页中抽取需要的数据
49 | fields: [{
50 | // 作者
51 | name: 'author',
52 | meta: {
53 | selector: ['.author h2'],
54 | format: 'text'
55 | }
56 | }, {
57 | // 标签
58 | name: 'tags',
59 | meta: {
60 | format: 'text',
61 | selector: ['.source a'],
62 | index: 0
63 | }
64 | }, {
65 | // 网页关键字
66 | name: 'keywords',
67 | meta: {
68 | format: 'meta',
69 | selector: ['meta[name="keywords"]']
70 | }
71 | }, {
72 | // 网页描述
73 | name: 'description',
74 | meta: {
75 | format: 'meta',
76 | selector: ['meta[name="description"]']
77 | }
78 | }, {
79 | // 详情
80 | name: 'content',
81 | meta: {
82 | selector: ['.content', '.thumb'],
83 | format: 'html'
84 | },
85 | required: true
86 | }, {
87 | name: 'imagesCount',
88 | meta: {
89 | selector: ['.thumb'],
90 | format: 'count',
91 | countType: 'image'
92 | },
93 | defaultValue: 0
94 | }, {
95 | name: 'wordsCount',
96 | meta: {
97 | selector: ['.content'],
98 | format: 'count',
99 | countType: 'text'
100 | },
101 | defaultValue: 0
102 | }, {
103 | name: 'comments',
104 | meta: {
105 | selector: ['.stats-comments .number'],
106 | format: 'text'
107 | },
108 | defaultValue: 0
109 | }, {
110 | name: 'likes',
111 | meta: {
112 | selector: ['.stats-vote .number'],
113 | format: 'text'
114 | },
115 | defaultValue: 0
116 | }],
117 | // 是否模拟用户请求
118 | userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
119 | // 编码 默认utf-8
120 | charset: null,
121 | // 回调函数 对所有数据做处理
122 | afterExtractAll: function (data) {
123 | data.fields['hits'] = 0;
124 | return data;
125 | },
126 | afterExtractField: function (fieldsName, data) {
127 | if (fieldsName === 'tags') {
128 | data = data ? data.split(',') : [];
129 | }
130 | if (fieldsName === 'comments') {
131 | data = +data;
132 | }
133 | if (fieldsName === 'likes') {
134 | data = +data;
135 | }
136 | return data;
137 | }
138 | };
139 | ```
140 |
141 | ### 引入
142 | ```
143 | const Crawler = require('almighty-parser-core')
144 | const options = require('../parser/parser-qiushibaike.js')
145 | const parser = new Crawler(options)
146 | ```
147 |
148 | ### API测试
149 | [测试案例](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/test/qiushibaike.js)
150 | #### parse
151 | ```
152 | { fields:
153 | { author: '草莓、牛奶巧克力',
154 | tags: [ '搞笑图片' ],
155 | keywords: '',
156 | description: '笑死我了',
157 | content: '
\n\n笑死我了\n\n
\n\n

\n\n
',
158 | imagesCount: 1,
159 | wordsCount: 4,
160 | comments: 0,
161 | likes: 457,
162 | from: 'https://www.qiushibaike.com/article/119095438',
163 | sourceId: 'com.qiushibaike.www-article-119095438',
164 | site: 'www.qiushibaike.com',
165 | hits: 0 },
166 | urls:
167 | [ 'https://www.qiushibaike.com/',
168 | 'https://www.qiushibaike.com/hot/',
169 | 'https://www.qiushibaike.com/imgrank/',
170 | 'https://www.qiushibaike.com/text/',
171 | 'https://www.qiushibaike.com/history/',
172 | 'https://www.qiushibaike.com/pic/',
173 | 'https://www.qiushibaike.com/textnew/',
174 | 'https://www.qiushibaike.com/my',
175 | 'https://www.qiushibaike.com/article/116423562',
176 | 'https://www.qiushibaike.com/article/116424718',
177 | 'https://www.qiushibaike.com/article/116421669',
178 | 'https://www.qiushibaike.com/article/116423344',
179 | 'https://www.qiushibaike.com/article/116426229',
180 | 'https://www.qiushibaike.com/article/116423107',
181 | 'https://www.qiushibaike.com/article/104614784',
182 | 'https://www.qiushibaike.com/article/104590828',
183 | 'https://www.qiushibaike.com/article/104629666',
184 | 'https://www.qiushibaike.com/article/104599846',
185 | 'https://www.qiushibaike.com/article/104598154',
186 | 'https://www.qiushibaike.com/article/104619022',
187 | 'https://www.qiushibaike.com/article/118954381',
188 | 'https://www.qiushibaike.com/article/118491926',
189 | 'https://www.qiushibaike.com/article/118563113',
190 | 'https://www.qiushibaike.com/article/118806836',
191 | 'https://www.qiushibaike.com/article/118525804',
192 | 'https://www.qiushibaike.com/article/118770803',
193 | 'https://www.qiushibaike.com/article/119008939',
194 | 'https://www.qiushibaike.com/article/119033005',
195 | 'https://www.qiushibaike.com/article/119036209',
196 | 'https://www.qiushibaike.com/article/118922421',
197 | 'https://www.qiushibaike.com/article/119014594',
198 | 'https://www.qiushibaike.com/article/119009873',
199 | 'https://www.qiushibaike.com/article/118934286',
200 | 'https://www.qiushibaike.com/joke/',
201 | 'https://www.qiushibaike.com/article/' ] }
202 | ```
203 |
204 | 其余接口测试请下载后运行
205 | ```
206 | npm run test:qiushibaike
207 | ```
208 |
209 | ## License
210 |
211 | [MIT License](https://opensource.org/licenses/MIT)
212 |
--------------------------------------------------------------------------------
/doc/API.md:
--------------------------------------------------------------------------------
1 | ## API 文档
2 |
3 | - [x] `getLinks` 获取待抓页链接
4 | - [x] `getContent` 获取详情页内容
5 | - [x] `parse` 解析获取内容[为`getLinks`与`getContent`的集合]
6 | - [x] `isArticleUrl` 检测链接是否是详情页
7 | - [x] `isListUrl` 检测链接是否是列表页
8 | - [x] `getIdFromArticleUrl` 获取页面链接的唯一标识
9 |
--------------------------------------------------------------------------------
/doc/CONFIG.md:
--------------------------------------------------------------------------------
1 | ## 配置参数
2 |
3 | 针对不同的网站,需要编写各自的配置
4 |
5 | 注意 目前只支持html静态页的内容抓取
6 |
7 | 配置 | 描述 | 是否必填 | 类型
8 | ------------- | ------------- | ------------- | -------------
9 | domains | 网站域名 | 必填 | 字符串
10 | listUrlRegexes | 列表页url的正则,符合这些正则的页面会被当作列表页处理 | 必填 | 数组
11 | contentUrlRegexes | 内容页url的正则,符合这些正则的页面会被当作内容页处理 | 必填 | 数组
12 | fields | 从内容页中抽取需要的数据 | 必填 | fields示例
13 | userAgent | 是否模拟用户请求 | 选填 | 字符串
14 | charset | 编码 默认utf-8 | 选填 | 字符串
15 | afterExtractField | 回调函数 对每一个抽取出来的数据进行处理 | 选填 | 方法
16 | afterExtractAll | 回调函数 对所有抽取出来的数据进行处理 | 选填 | 方法
17 | contentPage | 对详情页下一页内容处理 | 选填 | contentPage示例
18 |
19 | ## fields示例
20 | 字段 | 描述 | 类型
21 | ------------- | ------------- | -------------
22 | name | 定义字段名字 | 字符串 必填
23 | meta | 选择器 | meta示例 必填
24 | defaultValue | 默认值 | 任意 选填
25 |
26 | ### meta示例
27 | 字段 | 描述 | 类型
28 | ------------- | ------------- | -------------
29 | selector | 选择器(支持多个拼接) | 数组 必填
30 | format | 返回内容的格式[text/html/meta/count 默认text] | 字符串 选填
31 | countType | 统计类型[image/text],format 为 count 时使用 | 字符串 选填
32 | index | 下标 | 数字 选填
33 |
34 | ## contentPage示例
35 | 字段 | 描述 | 类型
36 | ------------- | ------------- | -------------
37 | urls | 下一页的正则 | 数组 必填
38 | selector | 选择器 | 数组 必填
39 | appendNode | 插入的位置 | 任意 必填
--------------------------------------------------------------------------------
/doc/EXAMPLE.md:
--------------------------------------------------------------------------------
1 | ## 实例
2 | ### 定义网站规则
3 | ```
4 | module.exports = {
5 | // 域名 网站域名,设置域名后只处理这些域名下的网页
6 | domains: 'https://www.qiushibaike.com/',
7 | // 列表页url的正则,符合这些正则的页面会被当作列表页处理
8 | listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/],
9 | // 内容页url的正则,符合这些正则的页面会被当作内容页处理
10 | contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/],
11 | // 从内容页中抽取需要的数据
12 | fields: [{
13 | // 作者
14 | name: 'author',
15 | meta: {
16 | selector: ['.author h2'],
17 | format: 'text'
18 | }
19 | }, {
20 | // 标签
21 | name: 'tags',
22 | meta: {
23 | format: 'text',
24 | selector: ['.source a'],
25 | index: 0
26 | }
27 | }, {
28 | // 网页关键字
29 | name: 'keywords',
30 | meta: {
31 | format: 'meta',
32 | selector: ['meta[name="keywords"]']
33 | }
34 | }, {
35 | // 网页描述
36 | name: 'description',
37 | meta: {
38 | format: 'meta',
39 | selector: ['meta[name="description"]']
40 | }
41 | }, {
42 | // 详情
43 | name: 'content',
44 | meta: {
45 | selector: ['.content', '.thumb'],
46 | format: 'html'
47 | },
48 | required: true
49 | }, {
50 | name: 'imagesCount',
51 | meta: {
52 | selector: ['.thumb'],
53 | format: 'count',
54 | countType: 'image'
55 | },
56 | defaultValue: 0
57 | }, {
58 | name: 'wordsCount',
59 | meta: {
60 | selector: ['.content'],
61 | format: 'count',
62 | countType: 'text'
63 | },
64 | defaultValue: 0
65 | }, {
66 | name: 'comments',
67 | meta: {
68 | selector: ['.stats-comments .number'],
69 | format: 'text'
70 | },
71 | defaultValue: 0
72 | }, {
73 | name: 'likes',
74 | meta: {
75 | selector: ['.stats-vote .number'],
76 | format: 'text'
77 | },
78 | defaultValue: 0
79 | }],
80 | // 是否模拟用户请求
81 | userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
82 | // 编码 默认utf-8
83 | charset: null,
84 | // 回调函数 对所有数据做处理
85 | afterExtractAll: function (data) {
86 | data.fields['hits'] = 0;
87 | return data;
88 | },
89 | afterExtractField: function (fieldsName, data) {
90 | if (fieldsName === 'tags') {
91 | data = data ? data.split(',') : [];
92 | }
93 | if (fieldsName === 'comments') {
94 | data = +data;
95 | }
96 | if (fieldsName === 'likes') {
97 | data = +data;
98 | }
99 | return data;
100 | }
101 | };
102 | ```
103 |
104 | ### 引入
105 | ```
106 | const Crawler = require('almighty-parser-core')
107 | const options = require('../parser/parser-qiushibaike.js')
108 | const parser = new Crawler(options)
109 | ```
110 |
111 | ### API测试
112 | #### parse
113 | ```
114 | { fields:
115 | { author: '草莓、牛奶巧克力',
116 | tags: [ '搞笑图片' ],
117 | keywords: '',
118 | description: '笑死我了',
119 | content: '\n\n笑死我了\n\n
\n\n

\n\n
',
120 | imagesCount: 1,
121 | wordsCount: 4,
122 | comments: 0,
123 | likes: 457,
124 | from: 'https://www.qiushibaike.com/article/119095438',
125 | sourceId: 'com.qiushibaike.www-article-119095438',
126 | site: 'www.qiushibaike.com',
127 | hits: 0 },
128 | urls:
129 | [ 'https://www.qiushibaike.com/',
130 | 'https://www.qiushibaike.com/hot/',
131 | 'https://www.qiushibaike.com/imgrank/',
132 | 'https://www.qiushibaike.com/text/',
133 | 'https://www.qiushibaike.com/history/',
134 | 'https://www.qiushibaike.com/pic/',
135 | 'https://www.qiushibaike.com/textnew/',
136 | 'https://www.qiushibaike.com/my',
137 | 'https://www.qiushibaike.com/article/116423562',
138 | 'https://www.qiushibaike.com/article/116424718',
139 | 'https://www.qiushibaike.com/article/116421669',
140 | 'https://www.qiushibaike.com/article/116423344',
141 | 'https://www.qiushibaike.com/article/116426229',
142 | 'https://www.qiushibaike.com/article/116423107',
143 | 'https://www.qiushibaike.com/article/104614784',
144 | 'https://www.qiushibaike.com/article/104590828',
145 | 'https://www.qiushibaike.com/article/104629666',
146 | 'https://www.qiushibaike.com/article/104599846',
147 | 'https://www.qiushibaike.com/article/104598154',
148 | 'https://www.qiushibaike.com/article/104619022',
149 | 'https://www.qiushibaike.com/article/118954381',
150 | 'https://www.qiushibaike.com/article/118491926',
151 | 'https://www.qiushibaike.com/article/118563113',
152 | 'https://www.qiushibaike.com/article/118806836',
153 | 'https://www.qiushibaike.com/article/118525804',
154 | 'https://www.qiushibaike.com/article/118770803',
155 | 'https://www.qiushibaike.com/article/119008939',
156 | 'https://www.qiushibaike.com/article/119033005',
157 | 'https://www.qiushibaike.com/article/119036209',
158 | 'https://www.qiushibaike.com/article/118922421',
159 | 'https://www.qiushibaike.com/article/119014594',
160 | 'https://www.qiushibaike.com/article/119009873',
161 | 'https://www.qiushibaike.com/article/118934286',
162 | 'https://www.qiushibaike.com/joke/',
163 | 'https://www.qiushibaike.com/article/' ] }
164 | ```
165 |
166 | 其余接口测试请下载后运行
167 | ```
168 | npm run test:qiushibaike
169 | ```
170 |
171 |
172 |
173 |
174 |
175 |
--------------------------------------------------------------------------------
/example/parser/parser-baoliao5.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | /**
3 | * 爆料网
4 | * http://www.baoliao5.com/
5 | */
6 | module.exports = {
7 | // 域名 网站域名,设置域名后只处理这些域名下的网页
8 | domains: 'http://www.baoliao5.com/',
9 | // 列表页url的正则,符合这些正则的页面会被当作列表页处理
10 | listUrlRegexes: [/http:\/\/www\.baoliao5\.com\/((?!meitu)[a-z]+\/?)*$/, /http:\/\/www\.baoliao5\.com\/((?!meitu)[a-z]+\/?)+\/list[0-9_]+\.html*$/],
11 | // 内容页url的正则,符合这些正则的页面会被当作内容页处理
12 | contentUrlRegexes: [/http:\/\/www\.baoliao5\.com\/(?!meitu)[a-z]+\/[0-9]+\/[0-9]+\.html/],
13 | // 从内容页中抽取需要的数据
14 | fields: [{
15 | // 标题
16 | name: 'title',
17 | meta: {
18 | // 默认 type 为 jquery/text/xpath
19 | selector: ['.t4Btit'],
20 | format: 'text'
21 | },
22 | required: true
23 | }, {
24 | // 详情
25 | name: 'content',
26 | meta: {
27 | selector: ['#icontent'],
28 | format: 'html'
29 | },
30 | required: true
31 | }, {
32 | // 作者
33 | name: 'author',
34 | meta: {
35 | selector: ['.t4Bexp'],
36 | format: 'text'
37 | }
38 | }, {
39 | // 标签
40 | name: 'tags',
41 | meta: {
42 | format: 'text',
43 | selector: ['.itj_lt .lc a'],
44 | index: 1
45 | }
46 | }, {
47 | // 网页关键字
48 | name: 'keywords',
49 | meta: {
50 | format: 'meta',
51 | selector: ['meta[name="keywords"]']
52 | }
53 | }, {
54 | // 网页描述
55 | name: 'description',
56 | meta: {
57 | format: 'meta',
58 | selector: ['meta[name="description"]']
59 | }
60 | }, {
61 | name: 'imagesCount',
62 | meta: {
63 | selector: ['#icontent'],
64 | format: 'count',
65 | countType: 'image'
66 | },
67 | defaultValue: 0
68 | }, {
69 | name: 'wordsCount',
70 | meta: {
71 | selector: ['#icontent'],
72 | format: 'count',
73 | countType: 'text'
74 | },
75 | defaultValue: 0
76 | }, {
77 | name: 'publishedAt',
78 | meta: {
79 | format: 'text',
80 | selector: ['.t4Bexp']
81 | }
82 | }],
83 | // 内容下一页
84 | contentPage: {
85 | urls: [/http:\/\/www\.baoliao5\.com\/(?!meitu)[a-z]+\/[0-9]+\/[0-9]+_[0-9]+\.html/],
86 | selector: ['#icontent'],
87 | appendNode: '#icontent'
88 | },
89 | // 是否模拟用户请求
90 | userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
91 | // 编码 默认utf-8
92 | charset: 'gb2312',
93 | // 回调函数 对所有数据做处理
94 | afterExtractAll: function (data) {
95 | data.fields['comments'] = 0;
96 | data.fields['hits'] = 0;
97 | data.fields['likes'] = 0;
98 | return data;
99 | },
100 | afterExtractField: function (fieldsName, data) {
101 | if (fieldsName === 'author') {
102 | data = data.trim()
103 | if (data.indexOf('编辑:') >= 0) {
104 | var arr = data.split('编辑:');
105 | data = arr[arr.length - 1];
106 | } else {
107 | data = '';
108 | }
109 | }
110 | if (fieldsName === 'publishedAt') {
111 | data = new Date(data.replace(/[^0-9\-\: ]+/img, '')).getTime() || new Date().getTime();
112 | }
113 | if (fieldsName === 'tags') {
114 | data = (data !== '') ? [data] : [];
115 | }
116 | return data;
117 | }
118 | };
119 |
--------------------------------------------------------------------------------
/example/parser/parser-healthno1.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | /**
3 | * healthNo1
4 | * http://www.healthno1.com/
5 | */
6 | module.exports = {
7 | // 域名 网站域名,设置域名后只处理这些域名下的网页
8 | domains: 'http://www.healthno1.com/',
9 | // 列表页url的正则,符合这些正则的页面会被当作列表页处理
10 | listUrlRegexes: [/^http:\/\/www\.healthno1\.com(\/[a-z_]+(\.html)?)*(\/)?(\?start=[0-9]+)?$/],
11 | // 内容页url的正则,符合这些正则的页面会被当作内容页处理
12 | contentUrlRegexes: [/^http:\/\/www\.healthno1\.com\/([a-z_]+\/)*[0-9-]+\.html$/],
13 | // 从内容页中抽取需要的数据
14 | fields: [{
15 | // 标题
16 | name: 'title',
17 | meta: {
18 | // 默认 type 为 jquery/text/xpath
19 | selector: ['#gkContentWrap .item-page header h1'],
20 | format: 'text'
21 | },
22 | required: true
23 | }, {
24 | // 详情
25 | name: 'content',
26 | meta: {
27 | selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap .item-page .itemBody p'],
28 | format: 'html'
29 | },
30 | required: true
31 | }, {
32 | // 作者
33 | name: 'author',
34 | meta: {
35 | format: 'meta',
36 | selector: ['meta[name="author"]']
37 | }
38 | }, {
39 | // 标签
40 | name: 'tags',
41 | meta: {
42 | format: 'text',
43 | selector: ['.category-name a'],
44 | index: 0
45 | }
46 | }, {
47 | // 网页关键字
48 | name: 'keywords',
49 | meta: {
50 | format: 'meta',
51 | selector: ['meta[name="keywords"]']
52 | }
53 | }, {
54 | // 网页描述
55 | name: 'description',
56 | meta: {
57 | format: 'meta',
58 | selector: ['meta[name="description"]']
59 | }
60 | }, {
61 | name: 'imagesCount',
62 | meta: {
63 | selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap .item-page .itemBody p'],
64 | format: 'count',
65 | countType: 'image'
66 | },
67 | defaultValue: 0
68 | }, {
69 | name: 'wordsCount',
70 | meta: {
71 | selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap .item-page .itemBody p'],
72 | format: 'count',
73 | countType: 'text'
74 | },
75 | defaultValue: 0
76 | }, {
77 | name: 'publishedAt',
78 | meta: {
79 | format: 'text',
80 | selector: ['.created time']
81 | }
82 | }, {
83 | name: 'hits',
84 | meta: {
85 | format: 'text',
86 | selector: ['.hits'],
87 | index: 0
88 | },
89 | defaultValue: 0
90 | }],
91 | // 是否模拟用户请求
92 | userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
93 | // 编码 默认utf-8
94 | charset: null,
95 | // 语言格式
96 | i18n: 'tw2s',
97 | // 回调函数 对所有数据做处理
98 | afterExtractAll: function (data) {
99 | data.fields['comments'] = 0;
100 | data.fields['likes'] = 0;
101 | return data;
102 | },
103 | afterExtractField: function (fieldsName, data) {
104 | if (fieldsName === 'publishedAt') {
105 | data = new Date(data.replace(/[^0-9\- \:]+/img, '')).getTime() || new Date().getTime();
106 | }
107 | if (fieldsName === 'tags') {
108 | data = (data !== '') ? [data] : [];
109 | }
110 | if (fieldsName === 'title') {
111 | data = data.trim();
112 | }
113 | if (fieldsName === 'hits') {
114 | data = data.replace(/[^0-9]+/img, '') || 0;
115 | }
116 | return data;
117 | }
118 | };
119 |
--------------------------------------------------------------------------------
/example/parser/parser-qiushibaike.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | /**
3 | * 糗事百科
4 | * https://www.qiushibaike.com/
5 | */
6 | module.exports = {
7 | // 域名 网站域名,设置域名后只处理这些域名下的网页
8 | domains: 'https://www.qiushibaike.com/',
9 | // 列表页url的正则,符合这些正则的页面会被当作列表页处理
10 | listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/],
11 | // 内容页url的正则,符合这些正则的页面会被当作内容页处理
12 | contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/],
13 | // 从内容页中抽取需要的数据
14 | fields: [{
15 | // 作者
16 | name: 'author',
17 | meta: {
18 | selector: ['.author h2'],
19 | format: 'text'
20 | }
21 | }, {
22 | // 标签
23 | name: 'tags',
24 | meta: {
25 | format: 'text',
26 | selector: ['.source a'],
27 | index: 0
28 | }
29 | }, {
30 | // 网页关键字
31 | name: 'keywords',
32 | meta: {
33 | format: 'meta',
34 | selector: ['meta[name="keywords"]']
35 | }
36 | }, {
37 | // 网页描述
38 | name: 'description',
39 | meta: {
40 | format: 'meta',
41 | selector: ['meta[name="description"]']
42 | }
43 | }, {
44 | // 详情
45 | name: 'content',
46 | meta: {
47 | selector: ['.content', '.thumb'],
48 | format: 'html'
49 | },
50 | required: true
51 | }, {
52 | name: 'imagesCount',
53 | meta: {
54 | selector: ['.thumb'],
55 | format: 'count',
56 | countType: 'image'
57 | },
58 | defaultValue: 0
59 | }, {
60 | name: 'wordsCount',
61 | meta: {
62 | selector: ['.content'],
63 | format: 'count',
64 | countType: 'text'
65 | },
66 | defaultValue: 0
67 | }, {
68 | name: 'comments',
69 | meta: {
70 | selector: ['.stats-comments .number'],
71 | format: 'text'
72 | },
73 | defaultValue: 0
74 | }, {
75 | name: 'likes',
76 | meta: {
77 | selector: ['.stats-vote .number'],
78 | format: 'text'
79 | },
80 | defaultValue: 0
81 | }],
82 | // 是否模拟用户请求
83 | userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
84 | // 编码 默认utf-8
85 | charset: null,
86 | // 回调函数 对所有数据做处理
87 | afterExtractAll: function (data) {
88 | data.fields['hits'] = 0;
89 | return data;
90 | },
91 | afterExtractField: function (fieldsName, data) {
92 | if (fieldsName === 'tags') {
93 | data = data ? data.split(',') : [];
94 | }
95 | if (fieldsName === 'comments') {
96 | data = +data;
97 | }
98 | if (fieldsName === 'likes') {
99 | data = +data;
100 | }
101 | return data;
102 | }
103 | };
104 |
--------------------------------------------------------------------------------
/example/test/baoliao5.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | const Crawler = require('../../index.js')
3 | const options = require('../parser/parser-baoliao5.js')
4 | const parser = new Crawler(options)
5 |
6 | // const url = 'http://www.baoliao5.com/'
7 | const url = 'http://www.baoliao5.com/yule/201701/1867.html'
8 | // const url = 'http://www.baoliao5.com/yingshi/201701/1835.html'
9 | // const url = 'http://www.baoliao5.com/yingshi/'
10 | // const url = 'http://www.baoliao5.com/yingshi/list_7_11.html'
11 | // const url = 'http://www.baoliao5.com/meitu/201701/1848.html'
12 | // const url = 'http://www.baoliao5.com/yule/neidi/'
13 |
14 | let errorItems = []
15 |
/**
 * Calls `parser.parse` on the module-level `url` and prints what it resolves to.
 * If the call fails, the error message is logged and this test's name is
 * appended to `errorItems`.
 */
async function testParseDate () {
  const testName = 'testParseDate'
  try {
    console.log('获取数据内容为', await parser.parse(url))
  } catch (err) {
    const { message } = err
    console.error('[抓取数据出错]', message)
    errorItems.push(testName)
  }
}
/**
 * Calls `parser.isArticleUrl` on the module-level `url` (detail-page check)
 * and prints the returned value. A thrown error is logged and this test's
 * name is appended to `errorItems`.
 */
function testIsArticleUrl () {
  const testName = 'testIsArticleUrl'
  try {
    console.log('获取数据内容为', parser.isArticleUrl(url))
  } catch (err) {
    const { message } = err
    console.error('[抓取数据出错]', message)
    errorItems.push(testName)
  }
}
/**
 * Calls `parser.getIdFromArticleUrl` on the module-level `url` (unique id of
 * the page link) and prints the returned value. A thrown error is logged and
 * this test's name is appended to `errorItems`.
 */
function testGetIdFromArticleUrl () {
  const testName = 'testGetIdFromArticleUrl'
  try {
    console.log('获取数据内容为', parser.getIdFromArticleUrl(url))
  } catch (err) {
    const { message } = err
    console.error('[抓取数据出错]', message)
    errorItems.push(testName)
  }
}
46 |
/**
 * Calls `parser.getContent` on the module-level `url` (detail-page content)
 * and prints what it resolves to. If the call fails, the error message is
 * logged and this test's name is appended to `errorItems`.
 */
async function testGetContent () {
  const testName = 'testGetContent'
  try {
    console.log('获取数据内容为', await parser.getContent(url))
  } catch (err) {
    const { message } = err
    console.error('[抓取数据出错]', message)
    errorItems.push(testName)
  }
}
57 |
/**
 * Calls `parser.getLinks` on the module-level `url` (links still to be
 * crawled) and prints what it resolves to. If the call fails, the error
 * message is logged and this test's name is appended to `errorItems`.
 */
async function testGetLinks () {
  const testName = 'testGetLinks'
  try {
    console.log('获取数据内容为', await parser.getLinks(url))
  } catch (err) {
    const { message } = err
    console.error('[抓取数据出错]', message)
    errorItems.push(testName)
  }
}
68 |
/**
 * Test entry point. Prints a start banner, runs every step in order between
 * its begin/end banners, then reports the test names collected in
 * `errorItems` (or that everything passed when it is empty).
 */
async function start () {
  // `run: null` marks a step whose call is disabled; only its banners are printed.
  const steps = [
    { begin: '测试步骤1 获取内容', end: '测试步骤1 获取内容 结束', run: testParseDate },
    { begin: '测试步骤2 校验链接是否为详情页', end: '测试步骤2 校验链接是否为详情页 结束', run: testIsArticleUrl },
    { begin: '测试步骤3 获取页面链接的唯一标示', end: '测试步骤3 获取页面链接的唯一标示 结束', run: testGetIdFromArticleUrl },
    { begin: '测试步骤4 获取详情页内容', end: '测试步骤4 获取详情页内容 结束', run: null },
    { begin: '测试步骤5 获取列表页内容', end: '测试步骤5 获取列表页内容 结束', run: testGetLinks }
  ]

  console.log('测试开始')
  console.log('------')
  for (const step of steps) {
    console.log(step.begin)
    if (step.run !== null) {
      await step.run()
    }
    console.log(step.end)
    console.log('------')
  }
  console.log('所有接口均已测试结束')

  if (errorItems.length === 0) {
    console.log('测试结果: 所有接口都正常。')
    return
  }
  console.log('测试结果: ', errorItems.join(','), '异常。')
}
100 | start()
101 |
--------------------------------------------------------------------------------
/example/test/healthno1.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | const Crawler = require('../../index.js')
3 | const options = require('../parser/parser-healthno1.js')
4 | const parser = new Crawler(options)
5 |
6 | // const url = 'http://www.healthno1.com/'
7 | // const url = 'http://www.healthno1.com/feature_articles.html?start=12'
8 | // const url = 'http://www.healthno1.com/feature_articles.html'
9 | // const url = 'http://www.healthno1.com/health_info/16841-2017-05-12-03-10-00.html'
10 | const url = 'http://www.healthno1.com/16939-2017-05-19-10-16-00.html'
11 |
12 | let errorItems = []
13 |
/**
 * Calls `parser.parse` on the module-level `url` and prints what it resolves to.
 * If the call fails, the error message is logged and this test's name is
 * appended to `errorItems`.
 */
async function testParseDate () {
  const testName = 'testParseDate'
  try {
    console.log('获取数据内容为', await parser.parse(url))
  } catch (err) {
    const { message } = err
    console.error('[抓取数据出错]', message)
    errorItems.push(testName)
  }
}
/**
 * Calls `parser.isArticleUrl` on the module-level `url` (detail-page check)
 * and prints the returned value. A thrown error is logged and this test's
 * name is appended to `errorItems`.
 */
function testIsArticleUrl () {
  const testName = 'testIsArticleUrl'
  try {
    console.log('获取数据内容为', parser.isArticleUrl(url))
  } catch (err) {
    const { message } = err
    console.error('[抓取数据出错]', message)
    errorItems.push(testName)
  }
}
/**
 * Calls `parser.getIdFromArticleUrl` on the module-level `url` (unique id of
 * the page link) and prints the returned value. A thrown error is logged and
 * this test's name is appended to `errorItems`.
 */
function testGetIdFromArticleUrl () {
  const testName = 'testGetIdFromArticleUrl'
  try {
    console.log('获取数据内容为', parser.getIdFromArticleUrl(url))
  } catch (err) {
    const { message } = err
    console.error('[抓取数据出错]', message)
    errorItems.push(testName)
  }
}
44 |
/**
 * Calls `parser.getContent` on the module-level `url` (detail-page content)
 * and prints what it resolves to. If the call fails, the error message is
 * logged and this test's name is appended to `errorItems`.
 */
async function testGetContent () {
  const testName = 'testGetContent'
  try {
    console.log('获取数据内容为', await parser.getContent(url))
  } catch (err) {
    const { message } = err
    console.error('[抓取数据出错]', message)
    errorItems.push(testName)
  }
}
55 |
/**
 * Calls `parser.getLinks` on the module-level `url` (links still to be
 * crawled) and prints what it resolves to. If the call fails, the error
 * message is logged and this test's name is appended to `errorItems`.
 */
async function testGetLinks () {
  const testName = 'testGetLinks'
  try {
    console.log('获取数据内容为', await parser.getLinks(url))
  } catch (err) {
    const { message } = err
    console.error('[抓取数据出错]', message)
    errorItems.push(testName)
  }
}
66 |
/**
 * Test entry point. Prints a start banner, runs every step in order between
 * its begin/end banners, then reports the test names collected in
 * `errorItems` (or that everything passed when it is empty).
 */
async function start () {
  // `run: null` marks a step whose call is disabled; only its banners are printed.
  const steps = [
    { begin: '测试步骤1 获取内容', end: '测试步骤1 获取内容 结束', run: testParseDate },
    { begin: '测试步骤2 校验链接是否为详情页', end: '测试步骤2 校验链接是否为详情页 结束', run: testIsArticleUrl },
    { begin: '测试步骤3 获取页面链接的唯一标示', end: '测试步骤3 获取页面链接的唯一标示 结束', run: testGetIdFromArticleUrl },
    { begin: '测试步骤4 获取详情页内容', end: '测试步骤4 获取详情页内容 结束', run: null },
    { begin: '测试步骤5 获取列表页内容', end: '测试步骤5 获取列表页内容 结束', run: null }
  ]

  console.log('测试开始')
  console.log('------')
  for (const step of steps) {
    console.log(step.begin)
    if (step.run !== null) {
      await step.run()
    }
    console.log(step.end)
    console.log('------')
  }
  console.log('所有接口均已测试结束')

  if (errorItems.length === 0) {
    console.log('测试结果: 所有接口都正常。')
    return
  }
  console.log('测试结果: ', errorItems.join(','), '异常。')
}
98 | start()
99 |
--------------------------------------------------------------------------------
/example/test/qiushibaike.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | const Crawler = require('../../index.js')
3 | const options = require('../parser/parser-qiushibaike.js')
4 | const parser = new Crawler(options)
5 |
6 | // const url = 'https://www.qiushibaike.com/hot/'
7 | // const url = 'https://www.qiushibaike.com/hot/page/4/?s=4987995'
8 | // const url = 'https://www.qiushibaike.com/article/119101871'
9 | // const url = 'https://www.qiushibaike.com/article/119102864'
10 | const url = 'https://www.qiushibaike.com/article/119095438'
11 |
12 | let errorItems = []
13 |
/**
 * Calls `parser.parse` on the module-level `url` and prints what it resolves to.
 * If the call fails, the error message is logged and this test's name is
 * appended to `errorItems`.
 */
async function testParseDate () {
  const testName = 'testParseDate'
  try {
    console.log('获取数据内容为', await parser.parse(url))
  } catch (err) {
    const { message } = err
    console.error('[抓取数据出错]', message)
    errorItems.push(testName)
  }
}
// Test parser.isArticleUrl(): print whether `url` is a detail (article) page.
// A failure is logged and recorded in errorItems instead of being rethrown.
function testIsArticleUrl () {
  try {
    console.log('获取数据内容为', parser.isArticleUrl(url))
  } catch (err) {
    console.error('[抓取数据出错]', err.message)
    errorItems.push('testIsArticleUrl')
  }
}
// Test parser.isListUrl(): print whether `url` is a list page.
// A failure is logged and recorded in errorItems instead of being rethrown.
function testIsListUrl () {
  try {
    console.log('获取数据内容为', parser.isListUrl(url))
  } catch (err) {
    console.error('[抓取数据出错]', err.message)
    errorItems.push('testIsListUrl')
  }
}
// Test parser.getIdFromArticleUrl(): print the unique identifier built from `url`.
// A failure is logged and recorded in errorItems instead of being rethrown.
function testGetIdFromArticleUrl () {
  try {
    console.log('获取数据内容为', parser.getIdFromArticleUrl(url))
  } catch (err) {
    console.error('[抓取数据出错]', err.message)
    errorItems.push('testGetIdFromArticleUrl')
  }
}
54 |
// Test parser.getContent(): print the fields extracted from the detail page at `url`.
// A failure is logged and recorded in errorItems instead of being rethrown.
async function testGetContent () {
  try {
    console.log('获取数据内容为', await parser.getContent(url))
  } catch (err) {
    console.error('[抓取数据出错]', err.message)
    errorItems.push('testGetContent')
  }
}
65 |
// Test parser.getLinks(): print the links collected from the page at `url`.
// A failure is logged and recorded in errorItems instead of being rethrown.
async function testGetLinks () {
  try {
    console.log('获取数据内容为', await parser.getLinks(url))
  } catch (err) {
    console.error('[抓取数据出错]', err.message)
    errorItems.push('testGetLinks')
  }
}
76 |
// Test entry point: runs the steps in order, then prints which tests failed.
async function start () {
  const separator = '------'
  // Steps without `run` only print their banner: their calls
  // (testGetContent for step 5, testGetLinks for step 6) are disabled.
  const steps = [
    { title: '测试步骤1 获取内容', run: testParseDate },
    { title: '测试步骤2 校验链接是否为详情页', run: testIsArticleUrl },
    { title: '测试步骤3 校验链接是否为列表页', run: testIsListUrl },
    { title: '测试步骤4 获取页面链接的唯一标示', run: testGetIdFromArticleUrl },
    { title: '测试步骤5 获取详情页内容' },
    { title: '测试步骤6 获取列表页内容' }
  ]

  console.log('测试开始')
  console.log(separator)
  for (const step of steps) {
    console.log(step.title)
    const pending = step.run ? step.run() : undefined
    // Only asynchronous steps are awaited; synchronous ones already finished.
    if (pending && typeof pending.then === 'function') {
      await pending
    }
    console.log(step.title + ' 结束')
    console.log(separator)
  }
  console.log('所有接口均已测试结束')

  if (errorItems.length === 0) {
    console.log('测试结果: 所有接口都正常。')
    return
  }
  console.log('测试结果: ', errorItems.join(','), '异常。')
}
start()
113 |
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
// Package entry point: re-exports the Crawler constructor from lib/crawler.js.
module.exports = require('./lib/crawler.js');
--------------------------------------------------------------------------------
/lib/crawler.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | var async = require('async');
3 |
4 | var helper = require('./helper'),
5 | parser = require('./parser');
6 |
/**
 * Crawler constructor.
 *
 * @param {Object} [options] Parser configuration. At least one of `domains`,
 *   `listUrlRegexes`, `contentUrlRegexes` or `fields` must be present.
 * @throws {Error} When none of those keys is present in `options`.
 */
function Crawler (options) {
  var settings = options || {};
  var knownKeys = ['domains', 'listUrlRegexes', 'contentUrlRegexes', 'fields'];
  var hasKnownKey = false;
  for (var i = 0; i < knownKeys.length && !hasKnownKey; i++) {
    hasKnownKey = knownKeys[i] in settings;
  }
  if (!hasKnownKey) {
    throw new Error('options is invalid data format.');
  }
  // Copy the configuration onto the instance.
  this._init(settings);
}
/**
 * Copy the user configuration onto the instance, applying defaults.
 *
 * The `options` object itself is left untouched.
 *
 * @param {Object} options Configuration object passed to the constructor.
 */
Crawler.prototype._init = function (options) {
  // --- Core ---
  // Site home URL, normalised by helper.formatUrl. Falls back to '' when not
  // configured, so a missing `domains` does not crash inside formatUrl.
  this.domains = options.domains ? helper.formatUrl(options.domains) : '';
  this.listUrlRegexes = options.listUrlRegexes || [];// regexes matching list-page URLs
  this.contentUrlRegexes = options.contentUrlRegexes || [];// regexes matching content-page URLs
  this.fields = options.fields || [];// fields to extract from a content page
  this.contentPage = options.contentPage || null;// "next page" (pagination) settings
  this.sourceId = options.sourceId || [2, 5, 4];// URL parts that make up the unique id
  // --- Request settings ---
  this.userAgent = options.userAgent || null;// User-Agent header to send
  this.charset = options.charset || null;// response charset
  this.format = options.format || 'html';// response format: html | json | jsonp
  this.i18n = options.i18n || null;// conversion: s2t | t2s | s2tw | tw2s | s2hk | hk2s | t2tw | t2hk
  // --- Hooks ---
  this.afterExtractField = options.afterExtractField || null;// called for every extracted field
  this.afterExtractAll = options.afterExtractAll || null;// called with the complete result
  this.afterExtractUrls = options.afterExtractUrls || null;// called with the extracted URLs
  this.attachFields = options.attachFields || null;// extra fields fetched from another URL
};
/**
 * Classify a URL against the configured list/content regexes.
 *
 * @param {string} url URL to classify.
 * @param {string} [type] Optional filter, 'list' or 'post'.
 * @returns {string|boolean} Without `type`: 'post' when a content regex
 *   matches, otherwise 'list' when a list regex matches, otherwise ''.
 *   With `type`: true when the URL is of that type, false otherwise.
 */
Crawler.prototype._judge = function (url, type) {
  var matchesAny = function (regexes) {
    return regexes.some(function (urlRegex) {
      // Regexes with the g or y flag remember lastIndex between test() calls,
      // which makes repeated checks of the same URL flip between true and
      // false. Reset it before and after so every check starts from 0.
      urlRegex.lastIndex = 0;
      var matched = urlRegex.test(url);
      urlRegex.lastIndex = 0;
      return matched;
    });
  };
  var result = '';
  if ((!type || type === 'list') && matchesAny(this.listUrlRegexes)) {
    result = 'list';
  }
  // Checked second on purpose: a content-page match wins over a list match.
  if ((!type || type === 'post') && matchesAny(this.contentUrlRegexes)) {
    result = 'post';
  }
  return type ? result === type : result;
};
/**
 * Build a unique identifier for a URL from the URL parts listed in
 * `this.sourceId`. Also stores the host part in `this._site`.
 *
 * @param {string} url URL to build the identifier for.
 * @returns {string|null} The identifier, or null when the URL is neither a
 *   list nor a content URL, or cannot be split into parts.
 */
Crawler.prototype._getSourceId = function (url) {
  var type = this._judge(url);
  if (!type) {
    console.error('The url type is not list or post.');
    return null;
  }
  // Capture groups: 1 scheme, 2 host, 3 port, 4 directory path (ends with /),
  // 5 last path segment, 6 fragment, 7 query string.
  var regex = /(\w+):\/\/([^\:|\/]+)(\:\d*)?(.*\/)([^#|\?|\n]+)?(#.*)?(\?.*)?/i;
  var arr = url.match(regex);
  if (!arr) {
    // e.g. a URL without a scheme: there is nothing to build the id from.
    console.error('The url can not be split into parts.');
    return null;
  }
  this._site = arr[2];
  var sources = '';
  this.sourceId.forEach(function (item) {
    if (!!arr[item]) {
      if (item === 2) {
        // Host is written in reverse order: www.a.com -> com.a.www
        sources += arr[item].split('.').reverse().join('.');
      } else {
        // Every '/' becomes '-'; only the first '.' is replaced (kept as-is
        // so previously generated identifiers stay stable).
        sources += arr[item].replace(/\//img, '-').replace('.', '-');
      }
    }
  });
  // Drop a single trailing '-' left over from a trailing '/'.
  sources = ((sources.substring(sources.length - 1) === '-') ? sources.substring(0, sources.length - 1) : sources).trim();
  return sources;
};
/**
 * Fetch a page and, when it is a content ('post') page, extract its fields.
 *
 * Steps, run in order: fetch the page body, merge paginated content
 * (`contentPage`), fetch the optional attach page (`attachFields`), then
 * extract the fields.
 *
 * @param {string} url Page URL.
 * @param {Function} callback Called as callback(error) on failure or
 *   callback(null, result) with `result.bodyData` (page body) and
 *   `result.fields` (extracted fields, null for non-content pages).
 *   Errors thrown while extracting fields are reported through the callback.
 */
Crawler.prototype._getContent = function (url, callback) {
  var self = this;
  self.url = url;
  var result = {};
  var resultAttachFields = {};
  result.bodyData = null;
  result.fields = null;
  // A fresh object per request: helper.request fills in a default format.
  var requestOptions = function () {
    return {
      format: self.format,
      charset: self.charset,
      userAgent: self.userAgent
    };
  };
  // Step 1: fetch the page body.
  var getBodyFields = function (done) {
    helper.request(url, requestOptions(), function (error, body) {
      if (error) {
        return done(error);
      }
      result.bodyData = body;
      done(error);
    });
  };
  // Step 2: merge the content of the following pages into the body.
  var getBodyPage = function (done) {
    if (!(self._judge(url, 'post') && !!result.bodyData && !!self.contentPage)) {
      return done(null);
    }
    parser.getContentPage(self, { body: result.bodyData, url }, function (error, body) {
      if (error) {
        return done(error);
      }
      if (body) {
        result.bodyData = body;
      }
      done(error);
    });
  };
  // Step 3: fetch the attach page and extract the attached fields.
  var getAttachBodyFields = function (done) {
    if (!self._judge(url, 'post')) {
      return done(null);
    }
    if (!self.attachFields) {
      return done(null);
    }
    if (!self.attachFields.url) {
      return done(null);
    }
    parser.getAttachUrl({
      url: self.attachFields.url,
      meta: self.attachFields.meta,
      body: result.bodyData
    }, function (error, _url) {
      if (error) {
        return done(error);
      }
      helper.request(_url, requestOptions(), function (error, body) {
        if (error) {
          return done(error);
        }
        try {
          resultAttachFields = parser.getFieldsBySelector(body, self.attachFields.fields);
        } catch (e) {
          return done(e);
        }
        done(error);
      });
    });
  };
  async.waterfall([getBodyFields, getBodyPage, getAttachBodyFields], function (error) {
    if (error) {
      return callback(error);
    }

    if (self._judge(url, 'post') && !!result.bodyData) {
      // Extraction (e.g. a missing required field) and the user hook may
      // throw. Report that through the callback: an exception thrown here
      // would escape the request callback and never reach the caller.
      try {
        result.fields = parser.getFields(result.bodyData, self);
        result.fields.from = url;
        result.fields.sourceId = self._getSourceId(url);
        result.fields.site = self._site;
        // Attached fields override extracted fields of the same name.
        for (var name in resultAttachFields) {
          result.fields[name] = resultAttachFields[name];
        }
        // Let the user post-process the complete result.
        if (self.afterExtractAll) {
          result = self.afterExtractAll(result);
        }
      } catch (e) {
        return callback(e);
      }
    }
    callback(error, result);
  });
};
/**
 * Fetch a page and collect the URLs found in it.
 *
 * @param {string} url Page URL.
 * @param {Function} callback Called as callback(error, result) where
 *   `result.urls` holds the parsed URLs, or null when no body was received.
 */
Crawler.prototype._getLinks = function (url, callback) {
  var self = this;
  var requestOptions = {
    format: self.format,
    charset: self.charset,
    userAgent: self.userAgent
  };
  helper.request(url, requestOptions, function (error, body) {
    var links = body ? self._parseUrls(body, url) : null;
    callback(error, { urls: links });
  });
};
/**
 * Extract the URLs from a page body.
 *
 * Records `url` on the instance as `this.url` before delegating to
 * parser.getUrls.
 *
 * @param {*} bodyData Page body.
 * @param {string} url URL the body was fetched from.
 * @returns {*} Whatever parser.getUrls returns.
 */
Crawler.prototype._parseUrls = function (bodyData, url) {
  this.url = url;
  return parser.getUrls(bodyData, this);
};
/**
 * Fetch a page and return both its extracted fields and the links found in
 * it (a combination of `getContent` and `getLinks`).
 *
 * @param {string} url Page URL; must match a list or content regex.
 * @param {Function} [callback] Optional node-style callback(error, result).
 * @returns {Promise<Object>} Resolves with `{ fields, urls }`. When a
 *   callback is supplied, failures are delivered to the callback only and
 *   the promise is left unsettled.
 */
Crawler.prototype.parse = function (url, callback) {
  url = helper.formatUrl(url);
  var self = this;
  var result = {};
  var bodyData = null;

  // Step 1: make sure the URL is a list or content URL.
  var judge = function (done) {
    var type = self._judge(url);
    if (type) {
      done(null, type);
    } else {
      // A real Error, so that callers can rely on `error.message`.
      done(new Error('url mismatch'));
    }
  };
  // Step 2: fetch the page and extract its fields.
  var parserFields = function (type, done) {
    self._getContent(url, function (error, data) {
      if (error) {
        return done(error);
      }
      if (data) {
        bodyData = data.bodyData;
        result.fields = data.fields;
      }
      // Always hand one value to the next step so its arguments line up.
      done(null, result);
    });
  };
  // Step 3: collect the links from the fetched body.
  var parserUrls = function (data, done) {
    result.urls = null;
    if (!!bodyData) {
      result.urls = self._parseUrls(bodyData, url);
    }
    done(null, result);
  };
  return new Promise(function (resolve, reject) {
    async.waterfall([judge, parserFields, parserUrls], function (error, result) {
      if (error) {
        console.error(error);
        if (callback) return callback(error);
        return reject(error);
      }
      resolve(result);
      if (callback) {
        callback(null, result);
      }
    });
  });
};
/**
 * Collect the links found in a page.
 *
 * @param {string} url Page URL.
 * @param {Function} [callback] Optional node-style callback(error, urls).
 * @returns {Promise<Array>|null} null (synchronously) when the URL matches
 *   neither a list nor a content regex; otherwise a promise for the URLs.
 *   When a callback is supplied, failures are delivered to the callback only
 *   and the promise is left unsettled.
 */
Crawler.prototype.getLinks = function (url, callback) {
  var self = this;
  var target = helper.formatUrl(url);
  if (!self._judge(target)) {
    return null;
  }
  return new Promise(function (resolve, reject) {
    self._getLinks(target, function (error, result) {
      if (!error) {
        resolve(result.urls);
        if (callback) {
          callback(null, result.urls);
        }
        return;
      }
      console.error(error);
      if (callback) {
        return callback(error);
      }
      return reject(error);
    });
  });
};
/**
 * Extract the fields of a content (detail) page.
 *
 * @param {string} url Page URL.
 * @param {Function} [callback] Optional node-style callback(error, fields).
 * @returns {Promise<Object>|null} null (synchronously) when the URL is not a
 *   content URL; otherwise a promise for the extracted fields. When a
 *   callback is supplied, failures are delivered to the callback only and
 *   the promise is left unsettled.
 */
Crawler.prototype.getContent = function (url, callback) {
  var self = this;
  var target = helper.formatUrl(url);
  if (!self._judge(target) || !self.isArticleUrl(target)) {
    return null;
  }
  return new Promise(function (resolve, reject) {
    self._getContent(target, function (error, result) {
      if (!error) {
        resolve(result.fields);
        if (callback) {
          callback(null, result.fields);
        }
        return;
      }
      console.error(error);
      if (callback) {
        return callback(error);
      }
      return reject(error);
    });
  });
};
/**
 * Tell whether a URL is a content (detail) page URL.
 *
 * @param {string} url URL to check.
 * @returns {boolean} Result of `_judge(url, 'post')` on the normalised URL.
 */
Crawler.prototype.isArticleUrl = function (url) {
  var normalized = helper.formatUrl(url);
  var isPost = this._judge(normalized, 'post');
  return isPost;
};
/**
 * Tell whether a URL is a list page URL.
 *
 * @param {string} url URL to check.
 * @returns {boolean} Result of `_judge(url, 'list')` on the normalised URL.
 */
Crawler.prototype.isListUrl = function (url) {
  var normalized = helper.formatUrl(url);
  var isList = this._judge(normalized, 'list');
  return isList;
};
/**
 * Build the unique identifier of a page from its URL.
 *
 * @param {string} url Page URL.
 * @returns {string|null} The identifier from `_getSourceId`, or null when
 *   the URL matches neither a list nor a content regex.
 */
Crawler.prototype.getIdFromArticleUrl = function (url) {
  var normalized = helper.formatUrl(url);
  if (!this._judge(normalized)) {
    return null;
  }
  return this._getSourceId(normalized);
};
321 |
322 | module.exports = Crawler;
323 |
--------------------------------------------------------------------------------
/lib/helper.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 | var request = require('request'),
3 | iconv = require('iconv-lite'),
4 | OpenCC = require('opencc');
5 |
/**
 * Route requests through the proxy named by the HTTP_PROXY, HTTPS_PROXY or
 * ALL_PROXY environment variable (first one set wins).
 *
 * It is called before every request, so the proxy defaults are applied only
 * when the proxy value changes. Re-wrapping `request` on every call would
 * stack one more defaults wrapper per request.
 */
var _proxy = (function () {
  var appliedProxy = null;
  return function () {
    var proxy = process.env.HTTP_PROXY || process.env.HTTPS_PROXY || process.env.ALL_PROXY;
    if (proxy && proxy !== appliedProxy) {
      request = request.defaults({'proxy': proxy});
      appliedProxy = proxy;
    }
  };
})();
/**
 * Core GET request.
 *
 * @param {string} url URL to fetch.
 * @param {Object} options `charset` (decoded with iconv when set and not
 *   'utf-8') and `userAgent` (sent as the User-Agent header).
 * @param {Function} callback Called as callback(error) when the request
 *   fails or the status code is not 200, otherwise callback(null, body).
 */
var _requestCore = function (url, options, callback) {
  _proxy();
  var needsDecoding = Boolean(options.charset && options.charset !== 'utf-8');
  var query = {};
  query.url = url;
  query.headers = {};
  if (needsDecoding) {
    // encoding: null keeps the body undecoded so iconv can decode it below.
    query.encoding = null;
  }
  if (options.userAgent) {
    query.headers = {
      'User-Agent': options.userAgent
    };
  }
  request.get(query, function (err, res, body) {
    if (err) {
      console.error(err);
      return callback(err);
    }
    if (res.statusCode !== 200) {
      // A non-200 answer is a failure: report it instead of handing an
      // undefined body to the caller as if the request had succeeded.
      var statusError = new Error('Request failed with status code ' + res.statusCode + ': ' + url);
      console.error(statusError.message);
      return callback(statusError);
    }
    if (needsDecoding) {
      body = iconv.decode(body, options.charset);// convert to a JS string
    }
    callback(null, body);
  });
};
/**
 * Request handlers, one per response format.
 *
 * Each handler has the signature (url, options, callback) and calls
 * callback(error, body). Request errors and unparsable payloads are reported
 * through the callback rather than thrown.
 */
var _request = {
  // Plain body, passed through untouched.
  html: function (url, options, callback) {
    _requestCore(url, options, function (error, body) {
      callback(error, body);
    });
  },
  // Body parsed as JSON.
  json: function (url, options, callback) {
    _requestCore(url, options, function (error, body) {
      if (error) {
        return callback(error);
      }
      var data;
      try {
        data = JSON.parse(body);
      } catch (e) {
        return callback(e);
      }
      callback(null, data);
    });
  },
  // JSONP: the JSON payload sits between the first '(' and the last ')'.
  jsonp: function (url, options, callback) {
    _requestCore(url, options, function (error, body) {
      if (error) {
        return callback(error);
      }
      var data;
      try {
        var open = body.indexOf('(');
        var close = body.lastIndexOf(')');
        // Without parentheses, fall back to the fixed 9-character prefix
        // (the length of "callback(") this handler has always assumed.
        var payload = (open !== -1 && close > open)
          ? body.substring(open + 1, close)
          : body.substring(9, body.length - 1);
        data = JSON.parse(payload);
      } catch (e) {
        return callback(e);
      }
      callback(null, data);
    });
  }
};
66 |
/**
 * Request entry point. Supported formats: html, json, jsonp.
 *
 * @param {string} url URL to fetch.
 * @param {Object} options Request options; `format` defaults to 'html'.
 * @param {Function} callback Called as callback(error, body). An unsupported
 *   format is reported as an error so the caller is never left waiting.
 */
var requestUrl = function (url, options, callback) {
  options.format = options.format || 'html';
  // Own-property check so names such as 'constructor' are not treated as formats.
  if (!Object.prototype.hasOwnProperty.call(_request, options.format)) {
    console.error('The request format is error.');
    return callback(new Error('The request format is error.'));
  }
  _request[options.format](url, options, callback);
};
/**
 * Convert text between Chinese variants with OpenCC.
 *
 * Supported conversion types:
 *   s2t   Simplified to Traditional
 *   t2s   Traditional to Simplified
 *   s2tw  Simplified to Traditional (Taiwan standard)
 *   tw2s  Traditional (Taiwan standard) to Simplified
 *   s2hk  Simplified to Traditional (Hong Kong standard)
 *   hk2s  Traditional (Hong Kong standard) to Simplified
 *   t2tw  Traditional to Traditional (Taiwan standard)
 *   t2hk  Traditional to Traditional (Hong Kong standard)
 *
 * @param {string} str Text to convert.
 * @param {string} [type] Conversion type; defaults to 'tw2s'.
 * @returns {string} Converted text, or `str` unchanged for an unknown type.
 */
var translate = function (str, type) {
  var supported = ['s2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 't2tw', 't2hk'];
  var mode = type || 'tw2s';
  if (supported.indexOf(mode) === -1) {
    console.error(mode, 'in i18n is null');
    return str;
  }
  return new OpenCC(mode + '.json').convertSync(str);
};
106 |
/**
 * Append the trailing '/' to a home-page URL.
 *
 * A URL containing exactly two slashes (e.g. 'http://a.com') gets a '/'
 * appended; every other string is returned unchanged.
 *
 * @param {string} url URL to normalise.
 * @returns {string} The normalised URL. Non-string input (e.g. undefined) is
 *   returned as-is instead of throwing.
 */
var formatUrl = function (url) {
  if (typeof url !== 'string') {
    return url;
  }
  if (url.split('/').length - 1 === 2) {
    url += '/';
  }
  return url;
};
/**
 * Encode the characters of a string that lie outside U+0000-U+00FF.
 *
 * Each such character is passed through escape() and the "%u" prefix of the
 * resulting "%uXXXX" sequence is rewritten by the replacement string.
 *
 * NOTE(review): as listed, the replacement string is just "$2" (the four hex
 * digits). A prefix/suffix may have been lost from this listing - confirm
 * against the repository source before relying on the exact output.
 *
 * @param {string} str Text to encode.
 * @returns {string} Encoded text.
 */
var encode = function (str) {
  return str.replace(/[^\u0000-\u00FF]/g, function ($0) {
    return escape($0).replace(/(%u)(\w{4})/gi, "$2")
  });
};
/**
 * Reverse transformation used on extracted HTML: rewrites matched sequences
 * to "%uXXXX", runs unescape() on the result, then replaces "%uA0" and the
 * pattern of the last replace() with a plain space.
 *
 * NOTE(review): as listed, the first pattern starts with an empty capture
 * group and the last replace() swaps a space for a space. Text (presumably an
 * entity prefix and a non-breaking-space entity) appears to be missing from
 * this listing - confirm against the repository source.
 *
 * @param {string} str Text to decode.
 * @returns {string} Decoded text.
 */
var rencode = function (str) {
  return unescape(str.replace(/()(\w{4});/gi, "%u$2")).replace(/%uA0/img, ' ').replace(/ /img, ' ');
};
/**
 * Remove duplicates from an array, keeping the first occurrence of each
 * value. Values are compared by their string form.
 *
 * @param {Array} arr Input array (not modified).
 * @returns {Array} New array without duplicates.
 */
var deDuplication = function (arr) {
  // Prototype-less lookup table: on a plain {} values such as 'constructor'
  // or 'toString' already look "seen" and would be dropped entirely.
  var seen = Object.create(null);
  return arr.filter(function (_item) {
    if (seen[_item]) {
      return false;
    }
    seen[_item] = true;
    return true;
  });
};
142 |
143 | module.exports = {
144 | request: requestUrl,
145 | translate: translate,
146 | formatUrl: formatUrl,
147 | encode: encode,
148 | rencode: rencode,
149 | deDuplication: deDuplication
150 | };
151 |
--------------------------------------------------------------------------------
/lib/parser.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 | var async = require('async'),
3 | cheerio = require('cheerio'),
4 | xpath = require('xpath'),
5 | dom = require('xmldom').DOMParser;
6 | var helper = require('./helper');
7 |
/**
 * Selector parsers.
 */
/**
 * jQuery-style selector parser (run against a cheerio document).
 *
 * Every selector in `item.selector` is evaluated; what is collected depends
 * on `item.format`:
 *   - 'text' (or unset): text of the matched nodes
 *   - 'meta': the `content` attribute
 *   - 'count': with countType 'image' the `src` of each image, with
 *     countType 'text' the running length of the matched text
 *   - 'html': serialised HTML of the matched nodes, passed through
 *     helper.rencode
 * When `item.index` is set, only the node at that position is used and the
 * value replaces (rather than extends) what was collected so far.
 *
 * @param {Function} $ Document root as returned by cheerio.load.
 * @param {Object} item Field meta: `selector` (array), `format`, `index`,
 *   `countType`.
 * @returns {string|number|Array} Array of image sources for countType
 *   'image', a number for countType 'text', otherwise a string.
 */
var _jquerySelectorParser = function ($, item) {
  // Image counting collects sources in an array; every other mode builds a string.
  var data = item.countType === 'image' ? [] : '';
  item.selector.forEach(function (selector) {
    if (!item.format || item.format === 'text') {
      if (item.index !== undefined) {
        // An empty match keeps the value found by an earlier selector.
        data = $(selector).eq(item.index).text().trim() || data;
      } else {
        data += $(selector).text() || '';
      }
    }
    if (item.format === 'meta') {
      if (item.index !== undefined) {
        data = $(selector).eq(item.index).attr('content') || data;
      } else {
        data += $(selector).attr('content') || '';
      }
    }
    if (item.format === 'count') {
      if (item.countType === 'image') {
        // The selector either targets the images directly or a container
        // whose images are looked up with find('img').
        if (selector.indexOf(' img') > -1 || selector === 'img') {
          $(selector).each(function (i) {
            data.push($(this).attr('src'));
          })
        } else {
          $(selector).find('img').each(function (i) {
            data.push($(this).attr('src'));
          })
        }
      }
      if (item.countType === 'text') {
        var text = $(selector).text();
        // Line breaks are removed before counting.
        text = text.replace(/\n[\s| | ]*\r/g, '\n').replace(/\n/img, '');
        // Unary plus turns the initial '' (via `|| 0`) into a number.
        data = +(data || 0) + text.length;
      }
    }
    if (item.format === 'html') {
      if (item.index !== undefined) {
        data = `${helper.rencode($(selector).eq(item.index).html() || data)}`;
      } else {
        // Image selectors are handled separately from other nodes.
        if (selector.indexOf(' img') > -1 || selector === 'img') {
          $(selector).each(function (_index) {
            var value = $(this).attr('src');
            if (value) {
              // NOTE(review): in this listing the template literal below holds
              // only a line break; markup that presumably wrapped `value`
              // appears to be missing - confirm against the repository source.
              data += `
`;
            }
          });
        } else {
          $(selector).each(function (_index) {
            var value = ($.html(this) || '').trim();
            // NOTE(review): the pattern is empty in this listing, so this
            // replace is a no-op as shown - confirm the intended pattern.
            var regExpExplain = new RegExp('', 'img');
            value = value.replace(regExpExplain, '');
            data += `${helper.rencode(value)}`

          });
        }
      }
    }
  });
  return data;
};
/**
 * Context parser: returns the text found between two literal markers.
 *
 * @param {string} body Raw page body.
 * @param {Object} item Field meta; `item.selector` must be
 *   [startMarker, endMarker].
 * @returns {string|null} Text between the first start marker and the first
 *   end marker that follows it; null when `selector` does not hold exactly
 *   two entries or a marker is not found.
 */
var _contextSelectorParser = function (body, item) {
  if (item.selector.length !== 2) {
    return null;
  }
  var startMarker = item.selector[0];
  var endMarker = item.selector[1];

  var upIndex = body.indexOf(startMarker);
  if (upIndex < 0) {
    return null;
  }
  var contentStart = upIndex + startMarker.length;
  // Search for the end marker only after the start marker: an earlier
  // occurrence would make substring() swap its arguments and return the
  // wrong slice.
  var downIndex = body.indexOf(endMarker, contentStart);
  if (downIndex < 0) {
    return null;
  }
  return body.substring(contentStart, downIndex);
};
/**
 * XPath parser: concatenates the text content of the first node matched by
 * each XPath expression.
 *
 * @param {Object} $ Object whose `doc` property is the DOM document that the
 *   expressions are evaluated against.
 * @param {Object} item Field meta; `item.selector` is an array of XPath
 *   expressions.
 * @returns {string} Concatenated text ('' when nothing matches).
 */
var _xpathSelectorParser = function ($, item) {
  return item.selector.reduce(function (text, selector) {
    var nodes = xpath.select(selector, $.doc);
    if (nodes.length > 0) {
      return text + (nodes[0].textContent || '');
    }
    return text;
  }, '');
};
/**
 * Turn a link found in a page into an absolute URL.
 *
 * @param {string} url Raw href/src value.
 * @param {Object} options `domains` (site home URL ending with '/') and
 *   `url` (URL of the page the link was found in).
 * @returns {string|null} Absolute URL without fragment, or null for an empty
 *   value or a `javascript:` link.
 */
var _completionUrl = function (url, options) {
  if (!url) { return null; }
  // Only the javascript: scheme is rejected. A plain substring test would
  // also discard real links such as /tags/javascript.
  if (/^\s*javascript:/i.test(url)) { return null; }
  if (url.substr(0, 4) !== 'http') {
    if (url.substr(0, 2) === '//') {
      // Protocol-relative: reuse the scheme of the configured domain.
      url = options.domains.split('//')[0] + url;
    } else if (url.substr(0, 1) === '/') {
      // Root-relative: append to the configured domain.
      url = url.substr(1);
      url = options.domains + url.replace(/^http(s)?:\/\//, '').replace(options.domains.replace(/^http(s)?:\/\//, ''), '');
    } else {
      // Relative to the current page: replace its last path segment.
      if (url.substr(0, 2) === './') {
        url = url.substr(2);
      }
      var arrUrl = options.url.split('/');
      arrUrl[arrUrl.length - 1] = url;
      url = arrUrl.join('/');
    }
  }
  // Drop the fragment.
  if (url.indexOf('#') !== -1) {
    url = url.split('#')[0];
  }
  url = url.trim();
  return url;
};
/**
 * Extract the list/content URLs linked from a page.
 *
 * Every <a href> is made absolute, then kept once for each list or content
 * regex it matches. The `afterExtractUrls` hook (if any) runs on that raw
 * list before duplicates are removed.
 *
 * @param {string} bodyHtml Page HTML.
 * @param {Object} self Crawler instance (`domains`, `url`, `listUrlRegexes`,
 *   `contentUrlRegexes`, `afterExtractUrls`).
 * @returns {Array} De-duplicated URLs.
 */
var getUrls = function (bodyHtml, self) {
  var $ = cheerio.load(bodyHtml);
  var matched = [];
  $('a').each(function (index) {
    var link = _completionUrl($(this).attr('href'), {
      domains: self.domains,
      url: self.url
    });
    if (!link) {
      return true;
    }
    // List regexes first, then content regexes; one entry per matching regex.
    var keepIfMatching = function (urlRegex) {
      if (urlRegex.test(link)) {
        matched.push(link);
      }
    };
    self.listUrlRegexes.forEach(keepIfMatching);
    self.contentUrlRegexes.forEach(keepIfMatching);
  });
  // User hook
  var processed = self.afterExtractUrls ? self.afterExtractUrls(matched) : matched;
  // Remove duplicates
  return helper.deDuplication(processed);
};
/**
 * Extract the configured fields from a content page.
 *
 * @param {string} bodyHtml Page HTML.
 * @param {Object} self Crawler instance (`fields`, `domains`, `url`, `i18n`,
 *   `afterExtractField`).
 * @returns {Object} Map of field name to extracted value. A field with
 *   countType 'image' stores the image count under its own name and the
 *   image sources under `imagesList`.
 * @throws {Error} When a field marked `required` ends up undefined or ''.
 */
var getFields = function (bodyHtml, self) {
  var $ = cheerio.load(bodyHtml);
  // DOM document attached for XPath evaluation; parser diagnostics are silenced.
  // NOTE(review): no field type handled below reads `$.doc` - confirm whether
  // an xpath field type was meant to be dispatched here.
  $.doc = new dom({
    errorHandler: {
      warning : function (err) {},
      error : function (err) {},
      fatalError: function (err) {}
    }
  }).parseFromString($.html());
  // Make relative image sources absolute.
  $('img').each(function (item) {
    var url = _completionUrl($(this).attr('src'), {
      domains: self.domains,
      url: self.url
    });
    if (!url) { return true; }
    if ($(this).attr('src').substr(0, 4) !== 'http') {
      $(this).attr('src', url);
    }
  });

  var result = {};
  self.fields.forEach(function (item) {
    result[item.name] = '';
    // Extract
    if (!item.meta.type || item.meta.type === 'jq' || item.meta.type === '$' || item.meta.type === 'jquery') {
      result[item.name] = _jquerySelectorParser($, item.meta);
      if (item.meta.countType === 'image') {
        result['imagesList'] = result[item.name];
        result[item.name] = result[item.name].length;
      }
    } else if (item.meta.type === 'context') {
      // The context parser works on the raw page text, i.e. `bodyHtml`.
      result[item.name] = _contextSelectorParser(bodyHtml, item.meta);
    }
    // Default value
    if (!result[item.name] && item.defaultValue !== undefined) {
      result[item.name] = item.defaultValue;
    }
    // Translation
    if (typeof result[item.name] === 'string' && self.i18n) {
      result[item.name] = helper.translate(result[item.name], self.i18n);
    }
    // Trim leading/trailing whitespace
    if (typeof result[item.name] === 'string') {
      result[item.name] = result[item.name].trim();
    }
    // Per-field user hook
    if (self.afterExtractField) {
      result[item.name] = self.afterExtractField(item.name, result[item.name]);
    }
    // Required check
    if (item.required && (result[item.name] === undefined || result[item.name] === '')) {
      console.error('fields[', item.name, '] value is emtly');
      throw new Error('fields value is emtly');
    }
  });
  return result;
};
/**
 * Extract fields from a page with the jQuery-style parser only.
 *
 * @param {string} bodyHtml Page HTML.
 * @param {Array} fields Field definitions (`name`, `meta`, optional
 *   `defaultValue`).
 * @returns {Object} Map of field name to extracted value; an empty (falsy)
 *   value is replaced by `defaultValue` when one is defined.
 */
var getFieldsBySelector = function (bodyHtml, fields) {
  var $ = cheerio.load(bodyHtml);
  return fields.reduce(function (collected, field) {
    var value = _jquerySelectorParser($, field.meta);
    var useDefault = !value && field.defaultValue !== undefined;
    collected[field.name] = useDefault ? field.defaultValue : value;
    return collected;
  }, {});
};
/**
 * Build the attach-page URL by filling `{{name}}` placeholders with values
 * read from the current page.
 *
 * @param {Object} options `url` (template), `meta` (array of
 *   `{ name, selector, format }`) and `body` (HTML of the current page).
 *   With format 'value' the element's value is used, otherwise its text.
 * @param {Function} callback Called as callback(null, url).
 */
var getAttachUrl = function (options, callback) {
  if (!options.meta || !options.meta.length) {
    return callback(null, options.url);
  }
  var $ = cheerio.load(options.body);
  var url = options.url;
  options.meta.forEach(function (item) {
    var value = null
    if (item.format === 'value') {
      value = $(item.selector).val();
    } else {
      value = $(item.selector).text();
    }
    // split/join inserts the scraped value literally (no `$&` / `$1`
    // expansion as with a string replacement) and fills every occurrence.
    url = url.split(`{{${item.name}}}`).join(String(value));
  });
  callback(null, url);
};
/**
 * Pagination ("next page") support: fetch the follow-up pages of an article
 * and merge their content into the first page.
 *
 * Links in the current body that match `self.contentPage.urls` are fetched
 * one after another; the nodes matched by `self.contentPage.selector` in each
 * of them are serialised and inserted into the current document at
 * `prependNode` and/or `appendNode`.
 *
 * @param {Object} self Crawler instance (`contentPage`, `domains`, `url`,
 *   `format`, `charset`, `userAgent`).
 * @param {Object} options `url` and `body` of the current page.
 * @param {Function} callback Called as callback(error, html); html is null
 *   when there is nothing to merge.
 */
var getContentPage = function (self, options, callback) {
  if (!options.url || !options.body) {
    return callback(null, null);
  }
  if (!self.contentPage || !self.contentPage.urls || !self.contentPage.selector) {
    return callback(null, null);
  }
  var $ = cheerio.load(options.body);
  var urls = [];
  // Collect the links that point to follow-up pages.
  $('a').each(function (item) {
    var url = _completionUrl($(this).attr('href'), {
      domains: self.domains,
      url: self.url
    });
    if (!url) { return true; }
    self.contentPage.urls.forEach(function (urlRegex) {
      if (urlRegex.test(url)) {
        urls.push(url);
      }
    });
  });
  // Remove duplicates
  urls = helper.deDuplication(urls);
  if (!urls.length) {
    return callback(null, null);
  }
  // NOTE(review): the pattern is empty in this listing, so the replace below
  // is a no-op as shown - confirm the intended pattern in the repository.
  var regExpExplain = new RegExp('', 'img');
  var data = '';
  // Fetch the pages in order and accumulate their selected content in `data`.
  async.mapSeries(urls, function (url, done) {
    helper.request(url, {
      format: self.format,
      charset: self.charset,
      userAgent: self.userAgent
    }, function (error, body) {
      // NOTE(review): `body` is loaded before `error` is looked at.
      var _$ = cheerio.load(body);
      self.contentPage.selector.forEach(function (selector) {
        _$(selector).each(function (_index) {
          // Nodes of the fetched page are serialised through the first
          // page's `$`.
          var value = ($.html(this) || '').trim();
          value = value.replace(regExpExplain, '');
          data += `${helper.rencode(value)}`;
        });
      });
      done(error, null);
    });
  }, function (err, result) {
    // Insert the accumulated content at the configured position(s).
    if (self.contentPage.prependNode) {
      $(self.contentPage.prependNode).prepend(data);
    }
    if (self.contentPage.appendNode) {
      $(self.contentPage.appendNode).append(data);
    }
    callback(err, $.html());
  });
};
308 | module.exports = {
309 | // 提取url
310 | getUrls: getUrls,
311 | // 提取数据
312 | getFields: getFields,
313 | // 下一页
314 | getContentPage: getContentPage,
315 | // 附加数据
316 | getAttachUrl: getAttachUrl,
317 | // 根据选择器获取数据
318 | getFieldsBySelector: getFieldsBySelector
319 | };
320 |
--------------------------------------------------------------------------------
/package-lock.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "almighty-parser-core",
3 | "version": "0.0.1",
4 | "lockfileVersion": 1,
5 | "dependencies": {
6 | "@types/node": {
7 | "version": "6.0.74",
8 | "resolved": "https://registry.npmjs.org/@types/node/-/node-6.0.74.tgz",
9 | "integrity": "sha512-fjUDu2//vsHodfhWeo6bkJcY+YjHAnQSaOahcY6M3hvl7KIHf/5EosXLTJB8gTN2Yfsfaov+FpUtkR/gfgrQXA=="
10 | },
11 | "ajv": {
12 | "version": "4.11.8",
13 | "resolved": "https://registry.npmjs.org/ajv/-/ajv-4.11.8.tgz",
14 | "integrity": "sha1-gv+wKynmYq5TvcIK8VlHcGc5xTY="
15 | },
16 | "asn1": {
17 | "version": "0.2.3",
18 | "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.3.tgz",
19 | "integrity": "sha1-2sh4dxPJlmhJ/IGAd36+nB3fO4Y="
20 | },
21 | "assert-plus": {
22 | "version": "0.2.0",
23 | "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-0.2.0.tgz",
24 | "integrity": "sha1-104bh+ev/A24qttwIfP+SBAasjQ="
25 | },
26 | "async": {
27 | "version": "2.4.1",
28 | "resolved": "https://registry.npmjs.org/async/-/async-2.4.1.tgz",
29 | "integrity": "sha1-YqVrJ5yYoR0JhwlqAcw+6463u9c="
30 | },
31 | "asynckit": {
32 | "version": "0.4.0",
33 | "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
34 | "integrity": "sha1-x57Zf380y48robyXkLzDZkdLS3k="
35 | },
36 | "aws-sign2": {
37 | "version": "0.6.0",
38 | "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.6.0.tgz",
39 | "integrity": "sha1-FDQt0428yU0OW4fXY81jYSwOeU8="
40 | },
41 | "aws4": {
42 | "version": "1.6.0",
43 | "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.6.0.tgz",
44 | "integrity": "sha1-g+9cqGCysy5KDe7e6MdxudtXRx4="
45 | },
46 | "bcrypt-pbkdf": {
47 | "version": "1.0.1",
48 | "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.1.tgz",
49 | "integrity": "sha1-Y7xdy2EzG5K8Bf1SiVPDNGKgb40=",
50 | "optional": true
51 | },
52 | "boolbase": {
53 | "version": "1.0.0",
54 | "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
55 | "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24="
56 | },
57 | "boom": {
58 | "version": "2.10.1",
59 | "resolved": "https://registry.npmjs.org/boom/-/boom-2.10.1.tgz",
60 | "integrity": "sha1-OciRjO/1eZ+D+UkqhI9iWt0Mdm8="
61 | },
62 | "buffer-shims": {
63 | "version": "1.0.0",
64 | "resolved": "https://registry.npmjs.org/buffer-shims/-/buffer-shims-1.0.0.tgz",
65 | "integrity": "sha1-mXjOMXOIxkmth5MCjDR37wRKi1E="
66 | },
67 | "caseless": {
68 | "version": "0.12.0",
69 | "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz",
70 | "integrity": "sha1-G2gcIf+EAzyCZUMJBolCDRhxUdw="
71 | },
72 | "cheerio": {
73 | "version": "1.0.0-rc.1",
74 | "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.1.tgz",
75 | "integrity": "sha1-KvNzOeq3E+9rcs3pjO+mcrh2Qf4="
76 | },
77 | "co": {
78 | "version": "4.6.0",
79 | "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
80 | "integrity": "sha1-bqa989hTrlTMuOR7+gvz+QMfsYQ="
81 | },
82 | "combined-stream": {
83 | "version": "1.0.5",
84 | "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.5.tgz",
85 | "integrity": "sha1-k4NwpXtKUd6ix3wV1cX9+JUWQAk="
86 | },
87 | "core-util-is": {
88 | "version": "1.0.2",
89 | "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
90 | "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac="
91 | },
92 | "cryptiles": {
93 | "version": "2.0.5",
94 | "resolved": "https://registry.npmjs.org/cryptiles/-/cryptiles-2.0.5.tgz",
95 | "integrity": "sha1-O9/s3GCBR8HGcgL6KR59ylnqo7g="
96 | },
97 | "css-select": {
98 | "version": "1.2.0",
99 | "resolved": "https://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz",
100 | "integrity": "sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg="
101 | },
102 | "css-what": {
103 | "version": "2.1.0",
104 | "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.0.tgz",
105 | "integrity": "sha1-lGfQMsOM+u+58teVASUwYvh/ob0="
106 | },
107 | "dashdash": {
108 | "version": "1.14.1",
109 | "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz",
110 | "integrity": "sha1-hTz6D3y+L+1d4gMmuN1YEDX24vA=",
111 | "dependencies": {
112 | "assert-plus": {
113 | "version": "1.0.0",
114 | "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz",
115 | "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU="
116 | }
117 | }
118 | },
119 | "delayed-stream": {
120 | "version": "1.0.0",
121 | "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
122 | "integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk="
123 | },
124 | "dom-serializer": {
125 | "version": "0.1.0",
126 | "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz",
127 | "integrity": "sha1-BzxpdUbOB4DOI75KKOKT5AvDDII=",
128 | "dependencies": {
129 | "domelementtype": {
130 | "version": "1.1.3",
131 | "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.1.3.tgz",
132 | "integrity": "sha1-vSh3PiZCiBrsUVRJJCmcXNgiGFs="
133 | }
134 | }
135 | },
136 | "domelementtype": {
137 | "version": "1.3.0",
138 | "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.3.0.tgz",
139 | "integrity": "sha1-sXrtguirWeUt2cGbF1bg/BhyBMI="
140 | },
141 | "domhandler": {
142 | "version": "2.4.1",
143 | "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.4.1.tgz",
144 | "integrity": "sha1-iS5HAAqZvlW783dP/qBWHYh5wlk="
145 | },
146 | "domutils": {
147 | "version": "1.5.1",
148 | "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.5.1.tgz",
149 | "integrity": "sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8="
150 | },
151 | "ecc-jsbn": {
152 | "version": "0.1.1",
153 | "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.1.tgz",
154 | "integrity": "sha1-D8c6ntXw1Tw4GTOYUj735UN3dQU=",
155 | "optional": true
156 | },
157 | "entities": {
158 | "version": "1.1.1",
159 | "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.1.tgz",
160 | "integrity": "sha1-blwtClYhtdra7O+AuQ7ftc13cvA="
161 | },
162 | "extend": {
163 | "version": "3.0.1",
164 | "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.1.tgz",
165 | "integrity": "sha1-p1Xqe8Gt/MWjHOfnYtuq3F5jZEQ="
166 | },
167 | "extsprintf": {
168 | "version": "1.0.2",
169 | "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.0.2.tgz",
170 | "integrity": "sha1-4QgOBljjALBilJkMxw4VAiNf1VA="
171 | },
172 | "forever-agent": {
173 | "version": "0.6.1",
174 | "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.6.1.tgz",
175 | "integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE="
176 | },
177 | "form-data": {
178 | "version": "2.1.4",
179 | "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.1.4.tgz",
180 | "integrity": "sha1-M8GDrPGTJ27KqYFDpp6Uv+4XUNE="
181 | },
182 | "getpass": {
183 | "version": "0.1.7",
184 | "resolved": "https://registry.npmjs.org/getpass/-/getpass-0.1.7.tgz",
185 | "integrity": "sha1-Xv+OPmhNVprkyysSgmBOi6YhSfo=",
186 | "dependencies": {
187 | "assert-plus": {
188 | "version": "1.0.0",
189 | "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz",
190 | "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU="
191 | }
192 | }
193 | },
194 | "har-schema": {
195 | "version": "1.0.5",
196 | "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-1.0.5.tgz",
197 | "integrity": "sha1-0mMTX0MwfALGAq/I/pWXDAFRNp4="
198 | },
199 | "har-validator": {
200 | "version": "4.2.1",
201 | "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-4.2.1.tgz",
202 | "integrity": "sha1-M0gdDxu/9gDdID11gSpqX7oALio="
203 | },
204 | "hawk": {
205 | "version": "3.1.3",
206 | "resolved": "https://registry.npmjs.org/hawk/-/hawk-3.1.3.tgz",
207 | "integrity": "sha1-B4REvXwWQLD+VA0sm3PVlnjo4cQ="
208 | },
209 | "hoek": {
210 | "version": "2.16.3",
211 | "resolved": "https://registry.npmjs.org/hoek/-/hoek-2.16.3.tgz",
212 | "integrity": "sha1-ILt0A9POo5jpHcRxCo/xuCdKJe0="
213 | },
214 | "htmlparser2": {
215 | "version": "3.9.2",
216 | "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.9.2.tgz",
217 | "integrity": "sha1-G9+HrMoPP55T+k/M6w9LTLsAszg="
218 | },
219 | "http-signature": {
220 | "version": "1.1.1",
221 | "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.1.1.tgz",
222 | "integrity": "sha1-33LiZwZs0Kxn+3at+OE0qPvPkb8="
223 | },
224 | "iconv-lite": {
225 | "version": "0.4.17",
226 | "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.17.tgz",
227 | "integrity": "sha1-T9qjs4rLwsAxsEXQ7c3+HsqxjI0="
228 | },
229 | "inherits": {
230 | "version": "2.0.3",
231 | "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz",
232 | "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4="
233 | },
234 | "is-typedarray": {
235 | "version": "1.0.0",
236 | "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz",
237 | "integrity": "sha1-5HnICFjfDBsR3dppQPlgEfzaSpo="
238 | },
239 | "isarray": {
240 | "version": "1.0.0",
241 | "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
242 | "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE="
243 | },
244 | "isstream": {
245 | "version": "0.1.2",
246 | "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz",
247 | "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo="
248 | },
249 | "jodid25519": {
250 | "version": "1.0.2",
251 | "resolved": "https://registry.npmjs.org/jodid25519/-/jodid25519-1.0.2.tgz",
252 | "integrity": "sha1-BtSRIlUJNBlHfUJWM2BuDpB4KWc=",
253 | "optional": true
254 | },
255 | "jsbn": {
256 | "version": "0.1.1",
257 | "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
258 | "integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM=",
259 | "optional": true
260 | },
261 | "json-schema": {
262 | "version": "0.2.3",
263 | "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.2.3.tgz",
264 | "integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM="
265 | },
266 | "json-stable-stringify": {
267 | "version": "1.0.1",
268 | "resolved": "https://registry.npmjs.org/json-stable-stringify/-/json-stable-stringify-1.0.1.tgz",
269 | "integrity": "sha1-mnWdOcXy/1A/1TAGRu1EX4jE+a8="
270 | },
271 | "json-stringify-safe": {
272 | "version": "5.0.1",
273 | "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz",
274 | "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus="
275 | },
276 | "jsonify": {
277 | "version": "0.0.0",
278 | "resolved": "https://registry.npmjs.org/jsonify/-/jsonify-0.0.0.tgz",
279 | "integrity": "sha1-LHS27kHZPKUbe1qu6PUDYx0lKnM="
280 | },
281 | "jsprim": {
282 | "version": "1.4.0",
283 | "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.0.tgz",
284 | "integrity": "sha1-o7h+QCmNjDgFUtjMdiigu5WiKRg=",
285 | "dependencies": {
286 | "assert-plus": {
287 | "version": "1.0.0",
288 | "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz",
289 | "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU="
290 | }
291 | }
292 | },
293 | "lodash": {
294 | "version": "4.17.4",
295 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.4.tgz",
296 | "integrity": "sha1-eCA6TRwyiuHYbcpkYONptX9AVa4="
297 | },
298 | "mime-db": {
299 | "version": "1.27.0",
300 | "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.27.0.tgz",
301 | "integrity": "sha1-gg9XIpa70g7CXtVeW13oaeVDbrE="
302 | },
303 | "mime-types": {
304 | "version": "2.1.15",
305 | "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.15.tgz",
306 | "integrity": "sha1-pOv1BkCUVpI3uM9wBGd20J/JKu0="
307 | },
308 | "nan": {
309 | "version": "2.6.2",
310 | "resolved": "https://registry.npmjs.org/nan/-/nan-2.6.2.tgz",
311 | "integrity": "sha1-5P805slf37WuzAjeZZb0NgWn20U="
312 | },
313 | "nth-check": {
314 | "version": "1.0.1",
315 | "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.1.tgz",
316 | "integrity": "sha1-mSms32KPwsQQmN6rgqxYDPFJquQ="
317 | },
318 | "oauth-sign": {
319 | "version": "0.8.2",
320 | "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.8.2.tgz",
321 | "integrity": "sha1-Rqarfwrq2N6unsBWV4C31O/rnUM="
322 | },
323 | "opencc": {
324 | "version": "1.0.5",
325 | "resolved": "https://registry.npmjs.org/opencc/-/opencc-1.0.5.tgz",
326 | "integrity": "sha1-U6korCncNVehseBY02m0I2c6t+U="
327 | },
328 | "parse5": {
329 | "version": "3.0.2",
330 | "resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.2.tgz",
331 | "integrity": "sha1-Be/1fw70V3+xRKefi5qWemzERRA="
332 | },
333 | "performance-now": {
334 | "version": "0.2.0",
335 | "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-0.2.0.tgz",
336 | "integrity": "sha1-M+8wxcd9TqIcWlOGnZG1bY8lVeU="
337 | },
338 | "process-nextick-args": {
339 | "version": "1.0.7",
340 | "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz",
341 | "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M="
342 | },
343 | "punycode": {
344 | "version": "1.4.1",
345 | "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz",
346 | "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4="
347 | },
348 | "qs": {
349 | "version": "6.4.0",
350 | "resolved": "https://registry.npmjs.org/qs/-/qs-6.4.0.tgz",
351 | "integrity": "sha1-E+JtKK1rD/qpExLNO/cI7TUecjM="
352 | },
353 | "readable-stream": {
354 | "version": "2.2.9",
355 | "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.2.9.tgz",
356 | "integrity": "sha1-z3jsb0ptHrQ9JkiMrJfwQudLf8g="
357 | },
358 | "request": {
359 | "version": "2.81.0",
360 | "resolved": "https://registry.npmjs.org/request/-/request-2.81.0.tgz",
361 | "integrity": "sha1-xpKJRqDgbF+Nb4qTM0af/aRimKA="
362 | },
363 | "safe-buffer": {
364 | "version": "5.0.1",
365 | "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.0.1.tgz",
366 | "integrity": "sha1-0mPKVGls2KMGtcplUekt5XkY++c="
367 | },
368 | "sntp": {
369 | "version": "1.0.9",
370 | "resolved": "https://registry.npmjs.org/sntp/-/sntp-1.0.9.tgz",
371 | "integrity": "sha1-ZUEYTMkK7qbG57NeJlkIJEPGYZg="
372 | },
373 | "sshpk": {
374 | "version": "1.13.0",
375 | "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.13.0.tgz",
376 | "integrity": "sha1-/yo+T9BEl1Vf7Zezmg/YL6+zozw=",
377 | "dependencies": {
378 | "assert-plus": {
379 | "version": "1.0.0",
380 | "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz",
381 | "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU="
382 | }
383 | }
384 | },
385 | "string_decoder": {
386 | "version": "1.0.1",
387 | "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.0.1.tgz",
388 | "integrity": "sha1-YuIA8DmVWmgQ2N8KM//A8BNmLZg="
389 | },
390 | "stringstream": {
391 | "version": "0.0.5",
392 | "resolved": "https://registry.npmjs.org/stringstream/-/stringstream-0.0.5.tgz",
393 | "integrity": "sha1-TkhM1N5aC7vuGORjB3EKioFiGHg="
394 | },
395 | "tough-cookie": {
396 | "version": "2.3.2",
397 | "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.3.2.tgz",
398 | "integrity": "sha1-8IH3bkyFcg5sN6X6ztc3FQ2EByo="
399 | },
400 | "tunnel-agent": {
401 | "version": "0.6.0",
402 | "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
403 | "integrity": "sha1-J6XeoGs2sEoKmWZ3SykIaPD8QP0="
404 | },
405 | "tweetnacl": {
406 | "version": "0.14.5",
407 | "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz",
408 | "integrity": "sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q=",
409 | "optional": true
410 | },
411 | "util-deprecate": {
412 | "version": "1.0.2",
413 | "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
414 | "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8="
415 | },
416 | "uuid": {
417 | "version": "3.0.1",
418 | "resolved": "https://registry.npmjs.org/uuid/-/uuid-3.0.1.tgz",
419 | "integrity": "sha1-ZUS7ot/ajBzxfmKaOjBeK7H+5sE="
420 | },
421 | "verror": {
422 | "version": "1.3.6",
423 | "resolved": "https://registry.npmjs.org/verror/-/verror-1.3.6.tgz",
424 | "integrity": "sha1-z/XfEpRtKX0rqu+qJoniW+AcAFw="
425 | },
426 | "xmldom": {
427 | "version": "0.1.27",
428 | "resolved": "https://registry.npmjs.org/xmldom/-/xmldom-0.1.27.tgz",
429 | "integrity": "sha1-1QH5ezvbQDr4757MIFcxh6rawOk="
430 | },
431 | "xpath": {
432 | "version": "0.0.24",
433 | "resolved": "https://registry.npmjs.org/xpath/-/xpath-0.0.24.tgz",
434 | "integrity": "sha1-Gt4WLhzFI8jTn8fQavwW6iFvKfs="
435 | }
436 | }
437 | }
438 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "almighty-parser-core",
3 | "version": "1.0.7",
4 | "description": "crawler parser core",
5 | "main": "index.js",
6 | "scripts": {
7 | "test:qiushibaike": "node --harmony-async-await ./example/test/qiushibaike.js",
8 | "test:healthno1": "node --harmony-async-await ./example/test/healthno1.js",
9 | "test:baoliao5": "node --harmony-async-await ./example/test/baoliao5.js"
10 | },
11 | "repository": {
12 | "type": "git",
13 | "url": "git@github.com:coolfishstudio/almighty-parser-core.git"
14 | },
15 | "keywords": "crawler, parser",
16 | "author": "Yves",
17 | "license": "MIT",
18 | "dependencies": {
19 | "async": "^2.4.1",
20 | "cheerio": "^1.0.0-rc.1",
21 | "iconv-lite": "^0.4.17",
22 | "opencc": "^1.0.5",
23 | "request": "^2.81.0",
24 | "xmldom": "^0.1.27",
25 | "xpath": "0.0.24"
26 | }
27 | }
28 |
--------------------------------------------------------------------------------