├── .eslintrc.yml
├── .gitignore
├── README.md
├── package.json
├── test
    └── index.js
├── .circleci
    └── config.yml
├── LICENSE
└── lib
    ├── get-html.js
    └── index.js


/.eslintrc.yml:
--------------------------------------------------------------------------------
 1 | root: true
 2 | plugins:
 3 |   - prettier
 4 | extends:
 5 |   - plugin:prettier/recommended
 6 | env:
 7 |   browser: true
 8 |   es6: true
 9 |   mocha: true
10 |   node: true
11 | globals:
12 |   "$": true
13 |   jQuery: true
14 | rules:
15 |   no-useless-escape: 0
16 |   prettier/prettier:
17 |     - 2
18 |     -
19 |       trailingComma: none
20 |       singleQuote: true
21 |       semi: false
22 |   prefer-const: 2
23 |   no-unused-vars:
24 |     - 2
25 |     -
26 |       argsIgnorePattern: ^_
27 |       varsIgnorePattern: ^_
28 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Logs
 2 | logs
 3 | *.log
 4 | 
 5 | # Runtime data
 6 | pids
 7 | *.pid
 8 | *.seed
 9 | 
10 | # Directory for instrumented libs generated by jscoverage/JSCover
11 | lib-cov
12 | 
13 | # Coverage directory used by tools like istanbul
14 | coverage
15 | 
16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
17 | .grunt
18 | 
19 | # node-waf configuration
20 | .lock-wscript
21 | 
22 | # Compiled binary addons (http://nodejs.org/api/addons.html)
23 | build/Release
24 | 
25 | # Dependency directory
26 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git
27 | node_modules
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![CircleCI](https://circleci.com/gh/craftzdog/extract-main-text-node.svg?style=svg)](https://circleci.com/gh/craftzdog/extract-main-text-node)
 2 | 
 3 | extract-main-text-node
 4 | ======================
 5 | 
 6 | Ported from [mono0x/extractcontent](https://github.com/mono0x/extractcontent).
 7 | 
 8 | ## Installing
 9 | 
10 | ```
11 | npm install extract-main-text
12 | ```
13 | 
14 | ## Usage
15 | 
16 | ```JavaScript
17 | var BodyExtractor = require('extract-main-text');
18 | var extractor = new BodyExtractor({
19 |     url: 'http://***.com/'
20 |   });
21 | extractor.analyze()
22 |   .then(function(text) {
23 |     console.log(extractor.title);
24 |     console.log(extractor.mainText);
25 |   });
26 | ```
27 | 
28 | ## License
29 | 
30 | BSD 2-Clause License. See the [LICENSE](LICENSE) file for the full text.
31 | 
32 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "extract-main-text",
 3 |   "version": "1.0.3",
 4 |   "description": "Automatically grab the main text out of a webpage",
 5 |   "main": "lib/index.js",
 6 |   "scripts": {
 7 |     "test": "mocha --harmony"
 8 |   },
 9 |   "author": "",
10 |   "license": "BSD",
11 |   "devDependencies": {
12 |     "eslint": "^5.12.0",
13 |     "eslint-config-prettier": "^3.3.0",
14 |     "eslint-plugin-prettier": "^3.0.1",
15 |     "mocha": "^5.2.0",
16 |     "prettier": "^1.15.3",
17 |     "should": "^13.2.3"
18 |   },
19 |   "dependencies": {
20 |     "charset": "^1.0.1",
21 |     "html-entities": "^1.2.1",
22 |     "iconv-lite": "^0.4.24",
23 |     "jschardet": "^1.6.0",
24 |     "lodash.defaults": "^4.2.0",
25 |     "lodash.merge": "^4.6.1",
26 |     "request": "^2.88.0"
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/test/index.js:
--------------------------------------------------------------------------------
 1 | var BodyExtractor = require('../lib')
 2 | var should = require('should')
 3 | 
 4 | describe('The main text extractor', function() {
 5 |   var extractor
 6 | 
 7 |   it('can initialize', function() {
 8 |     extractor = new BodyExtractor({
 9 |       //url: 'http://toyokeizai.net/articles/-/75910'
10 |       //url: 'http://d.hatena.ne.jp/shi3z/20150720/1437347243'
11 |       url: 'https://anond.hatelabo.jp/20150719014315'
12 |     })
13 |   })
14 | 
15 |   it('can analyze', function() {
16 |     return extractor.analyze().then(function(text) {
17 |       should(text).be.ok()
18 |       extractor.should.have.property('mainText')
19 |       console.log(extractor.mainText)
20 |     })
21 |   })
22 | 
23 |   it('can extract title', function() {
24 |     should(extractor.title).be.ok()
25 |   })
26 | })
27 | 


--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | # Javascript Node CircleCI 2.0 configuration file
 2 | #
 3 | # Check https://circleci.com/docs/2.0/language-javascript/ for more details
 4 | #
 5 | version: 2
 6 | jobs:
 7 |   build:
 8 |     docker:
 9 |       - image: circleci/node:10.15.0
10 | 
11 |     working_directory: ~/repo
12 | 
13 |     steps:
14 |       - checkout
15 | 
16 |       # Download and cache dependencies
17 |       - restore_cache:
18 |           keys:
19 |           - v1-dependencies-{{ checksum "package.json" }}
20 |           # fallback to using the latest cache if no exact match is found
21 |           - v1-dependencies-
22 | 
23 |       - run: npm install
24 | 
25 |       - save_cache:
26 |           paths:
27 |             - node_modules
28 |           key: v1-dependencies-{{ checksum "package.json" }}
29 | 
30 |       # run tests!
31 |       - run: npm test
32 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015, Takuya Matsuyama
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 
25 | 


--------------------------------------------------------------------------------
/lib/get-html.js:
--------------------------------------------------------------------------------
 1 | var request = require('request');
 2 | var charset = require('charset');
 3 | var iconv = require('iconv-lite');
 4 | var jschardet = require('jschardet');
 5 | 
 6 | module.exports = getHTML;
 7 | 
 8 | /**
 9 |  * Fetch HTML page
10 |  * retrieveHTML('http://hoge', function(err, html, url){ ... })
11 |  * The url argument of callback function is actual URL.
12 |  * It's different from specified one if the page is redirected like shorten URL.
13 |  *
14 |  * @param {string}   url  The URL to fetch
15 |  * @param {function} cb   The callback function
16 |  * @return {Promise}  The promise resolving the HTML content:
17 |  *  {
18 |  *    html: {string} The html content
19 |  *    url: {string} The actual URL retrieved from
20 |  *  }
21 |  */
22 | function getHTML (url){
23 |   return new Promise(function(fulfill, reject) {
24 |     var purl = require('url').parse(url);
25 |     if (!purl.protocol) {
26 |       purl = require('url').parse("http://"+url);
27 |     }
28 |     url = require('url').format(purl);
29 | 
30 |     var options = {
31 |       url: url,
32 |       encoding: null,
33 |       followRedirect: true,
34 |       headers: {
35 |         'User-Agent': 'Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.65 Safari/537.36'
36 |       }
37 |     };
38 | 
39 |     request(options, function(err, res, body) {
40 |       if (err) {
41 |         reject(err);
42 |       }
43 |       else {
44 |         var enc = charset(res.headers, body) || jschardet.detect(body).encoding.toLowerCase();
45 |         body = iconv.decode(body, enc);
46 |         if (res.statusCode >= 300 && res.statusCode < 400) {
47 |           retrieveHTML(res.headers.location).then(fulfill, reject);
48 |         }
49 |         else {
50 |           fulfill({ html: body, url: res.request.uri.href });
51 |         }
52 |       }
53 |     });
54 |   });
55 | }
56 | 


--------------------------------------------------------------------------------
/lib/index.js:
--------------------------------------------------------------------------------
  1 | var assert = require('assert')
  2 | var getHTML = require('./get-html')
  3 | var merge = require('lodash.merge')
  4 | var defaults = require('lodash.defaults')
  5 | var Entities = require('html-entities').AllHtmlEntities
  6 | var entities = new Entities()
  7 | 
  8 | /**
  9 |  * Initialize new extractor.
 10 |  * Either parans.html or params.url must be specified.
 11 |  *
 12 |  * @param {object} params   The parameters
 13 |  * @param {string} params.html  Optional, the HTML content
 14 |  * @param {string} params.url   Optional, the URL
 15 |  */
 16 | function BodyExtractor(params, opts) {
 17 |   assert.equal(
 18 |     typeof params,
 19 |     'object',
 20 |     'The params must be an object: ' + params
 21 |   )
 22 | 
 23 |   this.html = params.html
 24 |   this.url = params.url
 25 |   merge(
 26 |     this,
 27 |     defaults(opts || {}, {
 28 |       threshold: 100,
 29 |       min_length: 80,
 30 |       decay_factor: 0.73,
 31 |       continuous_factor: 1.62,
 32 |       punctuation_weight: 10,
 33 |       punctuations: /([、。，．！？]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/,
 34 |       waste_expressions: /Copyright|All Rights Reserved/i,
 35 |       debug: true
 36 |     })
 37 |   )
 38 | }
 39 | 
 40 | BodyExtractor.prototype.loadHTML = function() {
 41 |   assert.equal(
 42 |     typeof this.url,
 43 |     'string',
 44 |     'The this.url must be a stirng: ' + this.url
 45 |   )
 46 |   var self = this
 47 |   return getHTML(this.url).then(function(res) {
 48 |     self.html = res.html
 49 |     self.url = res.url
 50 |     return res
 51 |   })
 52 | }
 53 | 
 54 | /**
 55 |  * Parse HTML content
 56 |  * @return {Promise}  The promise
 57 |  */
 58 | BodyExtractor.prototype.analyze = function() {
 59 |   var self = this
 60 |   var promise = Promise.resolve()
 61 |   if (!this.html && this.url) {
 62 |     promise = promise.then(function() {
 63 |       return self.loadHTML()
 64 |     })
 65 |   }
 66 |   promise = promise.then(function() {
 67 |     var html = self.html
 68 | 
 69 |     if (
 70 |       html.match(
 71 |         /<\/frameset>|<meta\s+http-equiv\s*=\s*["']?refresh['"]?[^>]*url/i
 72 |       )
 73 |     ) {
 74 |       return
 75 |     }
 76 |     html = html.replace(
 77 |       /<!--\s*google_ad_section_start\(weight=ignore\)\s*-->[\s\S]*?<!--\s*google_ad_section_end.*?-->/gm,
 78 |       ''
 79 |     )
 80 |     if (html.match(/<!--\s*google_ad_section_start[^>]*-->/)) {
 81 |       var m = html.match(
 82 |         /<!--\s*google_ad_section_start[^>]*-->([\s\S]*?)<!--\s*google_ad_section_end.*?-->/m
 83 |       )
 84 |       html = m[1]
 85 |     }
 86 | 
 87 |     html = eliminate_useless_tags(html)
 88 | 
 89 |     var title = self.title
 90 |     // h? block including title
 91 |     html = html.replace(/(<h\d\s*>\s*(.*?)\s*<\/h\d\s*>)/gi, function(
 92 |       $0,
 93 |       $1,
 94 |       $2,
 95 |       _$3
 96 |     ) {
 97 |       if ($2.length >= 3 && title.indexOf($2) >= 0) {
 98 |         return '<div>' + $2 + '</div>'
 99 |       } else {
100 |         return $1
101 |       }
102 |     })
103 | 
104 |     var factor = (continuous = 1.0)
105 |     var body = ''
106 |     var score = 0
107 |     var bodylist = []
108 |     var list = html.split(
109 |       /<\/?(?:div|center|td)[^>]*>|<p\s*[^>]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/
110 |     )
111 |     list.forEach(function(block) {
112 |       if (!block) {
113 |         return
114 |       }
115 |       block = block.trim()
116 |       if (has_only_tags(block)) {
117 |         return
118 |       }
119 |       if (body.length > 0) {
120 |         continuous /= self.continuous_factor
121 |       }
122 | 
123 |       // リンク除外＆リンクリスト判定
124 |       var notlinked = eliminate_link(block)
125 |       if (notlinked.length < self.min_length) {
126 |         return
127 |       }
128 | 
129 |       // スコア算出
130 |       var c =
131 |         (notlinked.length +
132 |           str_scan(notlinked, self.punctuations).length *
133 |             self.punctuation_weight) *
134 |         factor
135 |       factor *= self.decay_factor
136 |       var not_body_rate =
137 |         str_scan(block, self.waste_expressions).length +
138 |         str_scan(block, /amazon[a-z0-9\.\/\-\?&]+-22/i).length / 2.0
139 |       if (not_body_rate > 0) {
140 |         c *= Math.pow(0.72, not_body_rate)
141 |       }
142 |       var c1 = c * continuous
143 | 
144 |       if (self.debug) {
145 |         console.log(c, '*', continuous, '=', c1, notlinked.length)
146 |       }
147 | 
148 |       // ブロック抽出＆スコア加算
149 |       if (c1 > self.threshold) {
150 |         body += block.trim() + '\n'
151 |         score += c1
152 |         continuous = self.continuous_factor
153 |       } else if (c > self.threshold) {
154 |         // continuous block end
155 |         bodylist.push([body, score])
156 |         body = block.trim() + '\n'
157 |         score = c
158 |         continuous = self.continuous_factor
159 |       }
160 |     })
161 |     bodylist.push([body, score])
162 |     body = bodylist.reduce(
163 |       function(a, b) {
164 |         if (a[1] >= b[1]) {
165 |           return a
166 |         } else {
167 |           return b
168 |         }
169 |       },
170 |       ['', 0]
171 |     )
172 |     self.mainText = strip_tags(body[0], self.dom_separator)
173 |     return self.mainText
174 |   })
175 |   return promise
176 | }
177 | 
/**
 * The page title: the tag-stripped content of the <title> element of
 * this.html, or '' when there is no <title> or the HTML is not loaded yet.
 *
 * Defined with Object.defineProperty because __defineGetter__ is deprecated;
 * the property stays enumerable and configurable as before.
 */
Object.defineProperty(BodyExtractor.prototype, 'title', {
  configurable: true,
  enumerable: true,
  get: function() {
    if (typeof this.html !== 'string') {
      return ''
    }
    var m = this.html.match(/<title[^>]*>\s*(.*?)\s*<\/title\s*>/i)
    return m ? strip_tags(m[1]) : ''
  }
})

module.exports = BodyExtractor
188 | 
/**
 * Removes decorative symbols and markup that never contains body text.
 *
 * @param {string} html  The HTML content
 * @return {string}  The HTML without decorative symbols, script/style/
 *                   select/noscript elements, meta tags, comments,
 *                   <!...> declarations, alpslab-slide blocks and the
 *                   opening tag of "more" divs
 */
function eliminate_useless_tags(html) {
  // eliminate useless symbols:
  //   U+2018-U+201D curly quotes, U+2190-U+2193 arrows,
  //   U+25A0-U+25BD / U+25C6-U+25EF geometric shapes, U+2605 U+2606 stars.
  // They are written as code points; UTF-8 byte escapes such as
  // \342\200\230 would be read by JavaScript as Latin-1 characters.
  html = html.replace(
    /[\u2018-\u201D\u2190-\u2193\u25A0-\u25BD\u25C6-\u25EF\u2605\u2606]/g,
    ''
  )

  // eliminate useless html tags
  html = html.replace(
    /<(script|style|select|noscript)[^>]*>[\s\S]*?<\/\1\s*>/gim,
    ''
  )
  // [^>]* keeps the match inside a single tag; a greedy .* would swallow
  // everything up to the last "/>" on the same line.
  html = html.replace(/<meta\b[^>]*>/gi, '')
  html = html.replace(/<!--[\s\S]*?-->/gm, '')
  html = html.replace(/<![A-Za-z].*?>/g, '')
  html = html.replace(
    /<div\s[^>]*class\s*=\s*['"]?alpslab-slide["']?[^>]*>[\s\S]*?<\/div\s*>/gm,
    ''
  )
  // Only the opening tag of a "more" div is removed.
  html = html.replace(
    /<div\s[^>]*(id|class)\s*=\s*['"]?\S*more\S*["']?[^>]*>/gi,
    ''
  )

  return html
}
215 | 
// Tells whether the block is made of markup only: true when nothing but
// whitespace remains once tags and &nbsp; entities are taken out.
function has_only_tags(st) {
  var withoutTags = st.replace(/<[^>]*>/gim, '')
  var withoutNbsp = withoutTags.replace(/&nbsp;/g, '')
  return withoutNbsp.trim() === ''
}
225 | 
// Removes links and detects link lists.
// Returns the tag-stripped text of the block without its <a> and <form>
// elements, or '' when the block is judged to be navigation: either less
// than 20 characters of text remain per removed link, or islinklist()
// reports the block as a link list.
function eliminate_link(html) {
  var anchors = 0
  var withoutAnchors = html.replace(/<a\s[^>]*>[\s\S]*?<\/a\s*>/gim, function() {
    anchors += 1
    return ''
  })
  var withoutForms = withoutAnchors.replace(
    /<form\s[^>]*>[\s\S]*?<\/form\s*>/gim,
    ''
  )
  var text = strip_tags(withoutForms)

  var mostlyLinks = text.length < 20 * anchors
  if (mostlyLinks || islinklist(html)) {
    return ''
  }
  return text
}
241 | 
/*
 * Strips tags from html and normalises the remaining text.
 *
 * Every tag is replaced by `separator` (default ''), fullwidth letters,
 * digits and symbols are converted to their halfwidth forms, box-drawing
 * characters are removed, HTML entities are decoded and whitespace is
 * collapsed.
 *
 * @param {string} html       The HTML fragment
 * @param {string} separator  Optional, the text put in place of each tag
 * @return {string}  The plain text
 */
function strip_tags(html, separator) {
  if (separator === undefined) {
    separator = ''
  }
  // Characters in the class below that are outside the fullwidth ASCII block
  // (U+FF01-U+FF5E); shifting them by 0xFEE0 would yield garbage.
  var extras = { '\u201D': '"', '\u2019': "'", '\uFFE5': '\u00A5' }

  // [\s\S] instead of "." so that tags spanning several lines are stripped.
  var st = html.replace(/<[\s\S]+?>/g, separator)
  // Convert from wide character to ascii
  // symbols, 0-9, A-Z
  st = st.replace(
    /[Ａ-Ｚａ-ｚ０-９－！”＃＄％＆’（）＝＜＞，．？＿［］｛｝＠＾～￥]/g,
    function(s) {
      var code = s.charCodeAt(0)
      if (code >= 0xff01 && code <= 0xff5e) {
        return String.fromCharCode(code - 0xfee0)
      }
      return extras[s] || s
    }
  )
  // keisen (box-drawing characters, U+2500-U+257F)
  st = st.replace(/[\u2500-\u257F]/g, '')
  // ideographic space
  st = st.replace(/\u3000/g, ' ')
  st = entities.decode(st)
  // replace() returns a new string, so the result has to be assigned.
  st = st.replace(/[ \t]+/g, ' ')
  st = st.replace(/\n\s*/g, '\n')
  return st
}
269 | 
// Link list detection.
// A block that is a list is excluded as non-body text.
//
// Returns true when the text outside the ul/dl elements is short compared
// with the whole block; the allowed share grows with the rate returned by
// evaluate_list() for the items of the first ul/dl/ol element.
// Returns false when the block has no list at all.
function islinklist(st) {
  // [\s\S] instead of "." so that lists spanning several lines are found;
  // the "m" flag does not make "." match newlines in JavaScript.
  var m = st.match(/<(?:ul|dl|ol)([\s\S]+?)<\/(?:ul|dl|ol)>/i)
  if (!m) {
    return false
  }
  var listpart = m[1]
  // NOTE(review): <ol> is detected above but not removed here — confirm
  // whether that asymmetry is intended.
  var outside = st
    .replace(/<(?:ul|dl)([\s\S]+?)<\/(?:ul|dl)>/gi, '')
    .replace(/<[\s\S]+?>/g, '')
    .replace(/\s+/g, ' ')
  var list = listpart.split(/<li[^>]*>/)
  // The first piece is whatever precedes the first <li>, not an item.
  list.shift()
  var rate = evaluate_list(list)
  return outside.length <= st.length / (45 / rate)
}
288 | 
// Rates how much a list looks like a link list.
// The result is 9 * r^2 + 1, where r is the share of items containing an
// <a href=...> tag; an empty list rates 1.
function evaluate_list(list) {
  var total = list.length
  if (total == 0) {
    return 1
  }
  var linked = list.filter(function(item) {
    return /<a\s+href=(['"]?)([^"'\s]+)\1/im.test(item)
  }).length
  var ratio = (1.0 * linked) / total
  return 9 * Math.pow(ratio, 2) + 1
}
302 | 
/**
 * Collects every match of regexp in str.
 *
 * @param {string} str     The text to search
 * @param {RegExp} regexp  The pattern; all its matches are collected whether
 *                         or not it carries the "g" flag
 * @return {Array}  One entry per match, each an array holding the match's
 *                  capture groups (empty when the pattern has none)
 */
function str_scan(str, regexp) {
  var pattern = regexp
  // Without the "g" flag replace() stops after the first match, which would
  // cap every count at 1. Scan with a global copy instead.
  if (regexp instanceof RegExp && !regexp.global) {
    pattern = new RegExp(regexp.source, regexp.flags + 'g')
  }
  var r = []
  str.replace(pattern, function() {
    // arguments: match, groups..., offset, whole string
    r.push(Array.prototype.slice.call(arguments, 1, -2))
  })
  return r
}
310 | 


--------------------------------------------------------------------------------