├── .eslintrc.yml ├── .gitignore ├── README.md ├── package.json ├── test └── index.js ├── .circleci └── config.yml ├── LICENSE └── lib ├── get-html.js └── index.js /.eslintrc.yml: -------------------------------------------------------------------------------- 1 | root: true 2 | plugins: 3 | - prettier 4 | extends: 5 | - plugin:prettier/recommended 6 | env: 7 | browser: true 8 | es6: true 9 | mocha: true 10 | node: true 11 | globals: 12 | "$": true 13 | jQuery: true 14 | rules: 15 | no-useless-escape: 0 16 | prettier/prettier: 17 | - 2 18 | - 19 | trailingComma: none 20 | singleQuote: true 21 | semi: false 22 | prefer-const: 2 23 | no-unused-vars: 24 | - 2 25 | - 26 | argsIgnorePattern: ^_ 27 | varsIgnorePattern: ^_ 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | 5 | # Runtime data 6 | pids 7 | *.pid 8 | *.seed 9 | 10 | # Directory for instrumented libs generated by jscoverage/JSCover 11 | lib-cov 12 | 13 | # Coverage directory used by tools like istanbul 14 | coverage 15 | 16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 17 | .grunt 18 | 19 | # node-waf configuration 20 | .lock-wscript 21 | 22 | # Compiled binary addons (http://nodejs.org/api/addons.html) 23 | build/Release 24 | 25 | # Dependency directory 26 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git 27 | node_modules 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![CircleCI](https://circleci.com/gh/craftzdog/extract-main-text-node.svg?style=svg)](https://circleci.com/gh/craftzdog/extract-main-text-node) 2 | 3 | extract-main-text-node 4 | ====================== 5 | 6 | Ported from 
[mono0x/extractcontent](https://github.com/mono0x/extractcontent). 7 | 8 | ## Installing 9 | 10 | ``` 11 | npm install extract-main-text 12 | ``` 13 | 14 | ## Usage 15 | 16 | ```JavaScript 17 | var BodyExtractor = require('extract-main-text'); 18 | var extractor = new BodyExtractor({ 19 | url: 'http://***.com/' 20 | }); 21 | extractor.analyze() 22 | .then(function(text) { 23 | console.log(extractor.title); 24 | console.log(extractor.mainText); 25 | }); 26 | ``` 27 | 28 | ## License 29 | 30 | The BSD license 31 | 32 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "extract-main-text", 3 | "version": "1.0.3", 4 | "description": "Automatically grab the main text out of a webpage", 5 | "main": "lib/index.js", 6 | "scripts": { 7 | "test": "mocha --harmony" 8 | }, 9 | "author": "", 10 | "license": "BSD", 11 | "devDependencies": { 12 | "eslint": "^5.12.0", 13 | "eslint-config-prettier": "^3.3.0", 14 | "eslint-plugin-prettier": "^3.0.1", 15 | "mocha": "^5.2.0", 16 | "prettier": "^1.15.3", 17 | "should": "^13.2.3" 18 | }, 19 | "dependencies": { 20 | "charset": "^1.0.1", 21 | "html-entities": "^1.2.1", 22 | "iconv-lite": "^0.4.24", 23 | "jschardet": "^1.6.0", 24 | "lodash.defaults": "^4.2.0", 25 | "lodash.merge": "^4.6.1", 26 | "request": "^2.88.0" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /test/index.js: -------------------------------------------------------------------------------- 1 | var BodyExtractor = require('../lib') 2 | var should = require('should') 3 | 4 | describe('The main text extractor', function() { 5 | var extractor 6 | 7 | it('can initialize', function() { 8 | extractor = new BodyExtractor({ 9 | //url: 'http://toyokeizai.net/articles/-/75910' 10 | //url: 'http://d.hatena.ne.jp/shi3z/20150720/1437347243' 11 | url: 
'https://anond.hatelabo.jp/20150719014315' 12 | }) 13 | }) 14 | 15 | it('can analyze', function() { 16 | return extractor.analyze().then(function(text) { 17 | should(text).be.ok() 18 | extractor.should.have.property('mainText') 19 | console.log(extractor.mainText) 20 | }) 21 | }) 22 | 23 | it('can extract title', function() { 24 | should(extractor.title).be.ok() 25 | }) 26 | }) 27 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Javascript Node CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-javascript/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | - image: circleci/node:10.15.0 10 | 11 | working_directory: ~/repo 12 | 13 | steps: 14 | - checkout 15 | 16 | # Download and cache dependencies 17 | - restore_cache: 18 | keys: 19 | - v1-dependencies-{{ checksum "package.json" }} 20 | # fallback to using the latest cache if no exact match is found 21 | - v1-dependencies- 22 | 23 | - run: npm install 24 | 25 | - save_cache: 26 | paths: 27 | - node_modules 28 | key: v1-dependencies-{{ checksum "package.json" }} 29 | 30 | # run tests! 31 | - run: npm test 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Takuya Matsuyama 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 
9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | -------------------------------------------------------------------------------- /lib/get-html.js: -------------------------------------------------------------------------------- 1 | var request = require('request'); 2 | var charset = require('charset'); 3 | var iconv = require('iconv-lite'); 4 | var jschardet = require('jschardet'); 5 | 6 | module.exports = getHTML; 7 | 8 | /** 9 | * Fetch HTML page 10 | * getHTML('http://hoge').then(function(res){ ... }) 11 | * The resolved url is the actual URL. 12 | * It differs from the specified one if the page was redirected, e.g. by a shortened URL. 
13 | * 14 | * @param {string} url The URL to fetch 15 | * @param {function} cb The callback function 16 | * @return {Promise} The promise resolving the HTML content: 17 | * { 18 | * html: {string} The html content 19 | * url: {string} The actual URL retrieved from 20 | * } 21 | */ 22 | function getHTML (url){ 23 | return new Promise(function(fulfill, reject) { 24 | var purl = require('url').parse(url); 25 | if (!purl.protocol) { 26 | purl = require('url').parse("http://"+url); 27 | } 28 | url = require('url').format(purl); 29 | 30 | var options = { 31 | url: url, 32 | encoding: null, 33 | followRedirect: true, 34 | headers: { 35 | 'User-Agent': 'Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.65 Safari/537.36' 36 | } 37 | }; 38 | 39 | request(options, function(err, res, body) { 40 | if (err) { 41 | reject(err); 42 | } 43 | else { 44 | var enc = charset(res.headers, body) || jschardet.detect(body).encoding.toLowerCase(); 45 | body = iconv.decode(body, enc); 46 | if (res.statusCode >= 300 && res.statusCode < 400) { 47 | retrieveHTML(res.headers.location).then(fulfill, reject); 48 | } 49 | else { 50 | fulfill({ html: body, url: res.request.uri.href }); 51 | } 52 | } 53 | }); 54 | }); 55 | } 56 | -------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert') 2 | var getHTML = require('./get-html') 3 | var merge = require('lodash.merge') 4 | var defaults = require('lodash.defaults') 5 | var Entities = require('html-entities').AllHtmlEntities 6 | var entities = new Entities() 7 | 8 | /** 9 | * Initialize new extractor. 10 | * Either parans.html or params.url must be specified. 
11 | * 12 | * @param {object} params The parameters 13 | * @param {string} params.html Optional, the HTML content 14 | * @param {string} params.url Optional, the URL 15 | */ 16 | function BodyExtractor(params, opts) { 17 | assert.equal( 18 | typeof params, 19 | 'object', 20 | 'The params must be an object: ' + params 21 | ) 22 | 23 | this.html = params.html 24 | this.url = params.url 25 | merge( 26 | this, 27 | defaults(opts || {}, { 28 | threshold: 100, 29 | min_length: 80, 30 | decay_factor: 0.73, 31 | continuous_factor: 1.62, 32 | punctuation_weight: 10, 33 | punctuations: /([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/, 34 | waste_expressions: /Copyright|All Rights Reserved/i, 35 | debug: true 36 | }) 37 | ) 38 | } 39 | 40 | BodyExtractor.prototype.loadHTML = function() { 41 | assert.equal( 42 | typeof this.url, 43 | 'string', 44 | 'The this.url must be a stirng: ' + this.url 45 | ) 46 | var self = this 47 | return getHTML(this.url).then(function(res) { 48 | self.html = res.html 49 | self.url = res.url 50 | return res 51 | }) 52 | } 53 | 54 | /** 55 | * Parse HTML content 56 | * @return {Promise} The promise 57 | */ 58 | BodyExtractor.prototype.analyze = function() { 59 | var self = this 60 | var promise = Promise.resolve() 61 | if (!this.html && this.url) { 62 | promise = promise.then(function() { 63 | return self.loadHTML() 64 | }) 65 | } 66 | promise = promise.then(function() { 67 | var html = self.html 68 | 69 | if ( 70 | html.match( 71 | /<\/frameset>|]*url/i 72 | ) 73 | ) { 74 | return 75 | } 76 | html = html.replace( 77 | /[\s\S]*?/gm, 78 | '' 79 | ) 80 | if (html.match(//)) { 81 | var m = html.match( 82 | /([\s\S]*?)/m 83 | ) 84 | html = m[1] 85 | } 86 | 87 | html = eliminate_useless_tags(html) 88 | 89 | var title = self.title 90 | // h? block including title 91 | html = html.replace(/(\s*(.*?)\s*<\/h\d\s*>)/gi, function( 92 | $0, 93 | $1, 94 | $2, 95 | _$3 96 | ) { 97 | if ($2.length >= 3 && title.indexOf($2) >= 0) { 98 | return '
' + $2 + '
' 99 | } else { 100 | return $1 101 | } 102 | }) 103 | 104 | var factor = (continuous = 1.0) 105 | var body = '' 106 | var score = 0 107 | var bodylist = [] 108 | var list = html.split( 109 | /<\/?(?:div|center|td)[^>]*>|]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/ 110 | ) 111 | list.forEach(function(block) { 112 | if (!block) { 113 | return 114 | } 115 | block = block.trim() 116 | if (has_only_tags(block)) { 117 | return 118 | } 119 | if (body.length > 0) { 120 | continuous /= self.continuous_factor 121 | } 122 | 123 | // リンク除外&リンクリスト判定 124 | var notlinked = eliminate_link(block) 125 | if (notlinked.length < self.min_length) { 126 | return 127 | } 128 | 129 | // スコア算出 130 | var c = 131 | (notlinked.length + 132 | str_scan(notlinked, self.punctuations).length * 133 | self.punctuation_weight) * 134 | factor 135 | factor *= self.decay_factor 136 | var not_body_rate = 137 | str_scan(block, self.waste_expressions).length + 138 | str_scan(block, /amazon[a-z0-9\.\/\-\?&]+-22/i).length / 2.0 139 | if (not_body_rate > 0) { 140 | c *= Math.pow(0.72, not_body_rate) 141 | } 142 | var c1 = c * continuous 143 | 144 | if (self.debug) { 145 | console.log(c, '*', continuous, '=', c1, notlinked.length) 146 | } 147 | 148 | // ブロック抽出&スコア加算 149 | if (c1 > self.threshold) { 150 | body += block.trim() + '\n' 151 | score += c1 152 | continuous = self.continuous_factor 153 | } else if (c > self.threshold) { 154 | // continuous block end 155 | bodylist.push([body, score]) 156 | body = block.trim() + '\n' 157 | score = c 158 | continuous = self.continuous_factor 159 | } 160 | }) 161 | bodylist.push([body, score]) 162 | body = bodylist.reduce( 163 | function(a, b) { 164 | if (a[1] >= b[1]) { 165 | return a 166 | } else { 167 | return b 168 | } 169 | }, 170 | ['', 0] 171 | ) 172 | self.mainText = strip_tags(body[0], self.dom_separator) 173 | return self.mainText 174 | }) 175 | return promise 176 | } 177 | 178 | BodyExtractor.prototype.__defineGetter__('title', function() { 179 | var m 
= this.html.match(/]*>\s*(.*?)\s*<\/title\s*>/i) 180 | if (m) { 181 | return strip_tags(m[1]) 182 | } else { 183 | return '' 184 | } 185 | }) 186 | 187 | module.exports = BodyExtractor 188 | 189 | function eliminate_useless_tags(html) { 190 | // eliminate useless symbols 191 | html = html.replace( 192 | /[\342\200\230-\342\200\235]|[\342\206\220-\342\206\223]|[\342\226\240-\342\226\275]|[\342\227\206-\342\227\257]|\342\230\205|\342\230\206/g, 193 | '' 194 | ) 195 | 196 | // eliminate useless html tags 197 | html = html.replace( 198 | /<(script|style|select|noscript)[^>]*>[\s\S]*?<\/\1\s*>/gim, 199 | '' 200 | ) 201 | html = html.replace(//gi, '') 202 | html = html.replace(//gm, '') 203 | html = html.replace(//g, '') 204 | html = html.replace( 205 | /]*class\s*=\s*['"]?alpslab-slide["']?[^>]*>[\s\S]*?<\/div\s*>/gm, 206 | '' 207 | ) 208 | html = html.replace( 209 | /]*(id|class)\s*=\s*['"]?\S*more\S*["']?[^>]*>/gi, 210 | '' 211 | ) 212 | 213 | return html 214 | } 215 | 216 | // Checks if the given block has only tags without text. 217 | function has_only_tags(st) { 218 | return ( 219 | st 220 | .replace(/<[^>]*>/gim, '') 221 | .replace(/ /g, '') 222 | .trim().length == 0 223 | ) 224 | } 225 | 226 | // リンク除外&リンクリスト判定 227 | function eliminate_link(html) { 228 | var count = 0 229 | var notlinked = html 230 | .replace(/]*>[\s\S]*?<\/a\s*>/gim, function() { 231 | count += 1 232 | return '' 233 | }) 234 | .replace(/]*>[\s\S]*?<\/form\s*>/gim, '') 235 | notlinked = strip_tags(notlinked) 236 | if (notlinked.length < 20 * count || islinklist(html)) { 237 | return '' 238 | } 239 | return notlinked 240 | } 241 | 242 | /* 243 | * Strips tags from html. 
244 | */ 245 | function strip_tags(html, separator) { 246 | if (separator === undefined) { 247 | separator = '' 248 | } 249 | var st = html.replace(/<.+?>/gm, separator) 250 | // Convert from wide character to ascii 251 | // symbols, 0-9, A-Z 252 | st = st.replace( 253 | /[A-Za-z0-9-!”#$%&’()=<>,.?_[]{}@^~¥]/g, 254 | function(s) { 255 | return String.fromCharCode(s.charCodeAt(0) - 0xfee0) 256 | } 257 | ) 258 | // keisen 259 | st = st.replace( 260 | /[\342\224\200-\342\224\277]|[\342\225\200-\342\225\277]/g, 261 | '' 262 | ) 263 | st = st.replace(/\343\200\200/g, ' ') 264 | st = entities.decode(st) 265 | st.replace(/[ \t]+/g, ' ') 266 | st.replace(/\n\s*/g, '\n') 267 | return st 268 | } 269 | 270 | // リンクリスト判定 271 | // リストであれば非本文として除外する 272 | function islinklist(st) { 273 | var m = st.match(/<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im) 274 | if (m) { 275 | var listpart = m[1] 276 | var outside = st 277 | .replace(/<(?:ul|dl)(.+?)<\/(?:ul|dl)>/gim, '') 278 | .replace(/<.+?>/gm, '') 279 | .replace(/\s+/g, ' ') 280 | var list = listpart.split(/]*>/) 281 | list.shift() 282 | var rate = evaluate_list(list) 283 | return outside.length <= st.length / (45 / rate) 284 | } else { 285 | return false 286 | } 287 | } 288 | 289 | // リンクリストらしさを評価 290 | function evaluate_list(list) { 291 | if (list.length == 0) { 292 | return 1 293 | } 294 | var hit = 0 295 | list.forEach(function(line) { 296 | if (line.match(/