├── .editorconfig ├── .gitignore ├── .npmignore ├── .travis.yml ├── Gruntfile.coffee ├── LICENSE ├── README.md ├── _src ├── lib │ └── html_extractor.coffee └── test │ ├── readme_example_advanced.coffee │ ├── readme_example_simple.coffee │ ├── test.coffee │ └── test_data.coffee ├── appveyor.yml ├── coffeelint.json └── package.json /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig helps developers define and maintain consistent 2 | # coding styles between different editors and IDEs 3 | # editorconfig.org 4 | 5 | root = true 6 | 7 | 8 | [*] 9 | 10 | # Change these settings to your own preference 11 | indent_style = tab 12 | end_of_line = lf 13 | charset = utf-8 14 | trim_trailing_whitespace = true 15 | insert_final_newline = true 16 | 17 | [*.coffee] 18 | trim_trailing_whitespace = false 19 | 20 | [*.md] 21 | trim_trailing_whitespace = false 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.profile 2 | *.lock 3 | *.conflict 4 | *.DS_Store 5 | *.zip 6 | *.rdb 7 | *.log 8 | 9 | .project 10 | .settings 11 | .idea 12 | 13 | *.mo 14 | *.sublime* 15 | config.json 16 | config*.json 17 | deploy.json 18 | /node_modules 19 | /_release 20 | /lib 21 | /test 22 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | /_src 2 | /_docs 3 | /node_modules 4 | /_release 5 | Gruntfile.* 6 | *.sublime* 7 | config.json 8 | config*.json 9 | deploy.json 10 | .editorconfig 11 | 12 | *.yml 13 | *.profile 14 | *.lock 15 | *.conflict 16 | *.DS_Store 17 | *.zip 18 | *.rdb 19 | *.log 20 | 21 | .project 22 | .settings 23 | .idea 24 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - 0.10 4 | - 0.12 5 | - 4.0 6 | - 4.4 7 | - 5.0 8 | - 5.5 9 | - 5.11 10 | - 6.0 11 | - 6.1 12 | - iojs 13 | - node 14 | before_script: 15 | - "npm install -g mocha grunt-cli" 16 | - "grunt build" 17 | -------------------------------------------------------------------------------- /Gruntfile.coffee: -------------------------------------------------------------------------------- 1 | module.exports = (grunt) -> 2 | 3 | # Project configuration. 4 | grunt.initConfig 5 | pkg: grunt.file.readJSON("package.json") 6 | watch: 7 | lib: 8 | files: ["_src/**/*.coffee"] 9 | tasks: [ "coffee:base" ] 10 | module_test: 11 | files: [ "_src/**/*.coffee" ] 12 | tasks: [ "coffee:base", "test" ] 13 | 14 | coffee: 15 | base: 16 | expand: true 17 | cwd: '_src', 18 | src: ["**/*.coffee"] 19 | dest: "" 20 | ext: ".js" 21 | 22 | options: 23 | flatten: false 24 | bare: false 25 | 26 | mochacli: 27 | options: 28 | require: [ "should" ] 29 | reporter: "spec" 30 | bail: if process.env.BAIL? 
then true else false 31 | timeout: 10000 32 | env: 33 | COUNT: process.env.COUNT 34 | 35 | all: [ "test/test.js" ] 36 | 37 | 38 | # Load npm modules 39 | grunt.loadNpmTasks "grunt-contrib-watch" 40 | grunt.loadNpmTasks "grunt-contrib-coffee" 41 | grunt.loadNpmTasks "grunt-mocha-cli" 42 | 43 | # ALIAS TASKS 44 | grunt.registerTask "default", "build" 45 | grunt.registerTask "test", [ "build", "mochacli" ] 46 | grunt.registerTask( "watch-test", [ "watch:module_test" ] ) 47 | 48 | # ALIAS SHORTS 49 | grunt.registerTask( "b", "build" ) 50 | grunt.registerTask( "w", "watch:lib" ) 51 | grunt.registerTask( "wt", "watch-test" ) 52 | grunt.registerTask( "t", "test" ) 53 | 54 | grunt.registerTask "build", [ "coffee:base" ] 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 mpneuried 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | html-extractor 2 | ============== 3 | 4 | [![Build Status](https://secure.travis-ci.org/mpneuried/html-extractor.png?branch=master)](http://travis-ci.org/mpneuried/html-extractor) 5 | [![Windows Tests](https://img.shields.io/appveyor/ci/mpneuried/html-extractor.svg?label=Windows%20Test)](https://ci.appveyor.com/project/mpneuried/html-extractor) 6 | [![Dependency Status](https://david-dm.org/mpneuried/html-extractor.png)](https://david-dm.org/mpneuried/html-extractor) 7 | [![NPM version](https://badge.fury.io/js/html-extractor.png)](http://badge.fury.io/js/html-extractor) 8 | 9 | Extract meta-data from a html string. It extracts the body, title, meta-tags and first headlines to a object to push them to a search indexer like elastic-search 10 | 11 | [![NPM](https://nodei.co/npm/html-extractor.png?downloads=true&stars=true)](https://nodei.co/npm/html-extractor/) 12 | 13 | ## Install 14 | 15 | ``` 16 | npm install html-extractor 17 | ``` 18 | 19 | ## Initialize 20 | 21 | 22 | ```js 23 | var Extrator = require("html-extractor"); 24 | var myExtrator = new Extrator(); 25 | ``` 26 | 27 | ### `new Extrator( debug )` 28 | 29 | **arguments** 30 | - **debug** : *( `Boolean` optional: default = `false` )* 31 | Output the parsing time 32 | 33 | ## Methods 34 | 35 | ### Extrator.extract( html[, reduced], cb ) 36 | 37 | Call `.extract()` to get the data of an html string. 38 | HTML entities will be decoded. 
39 | 40 | **arguments:** 41 | 42 | - **html** : *( `String` required )* 43 | The html string to process 44 | - **reduced** : *( `Object` optional )* 45 | An object to reduce the content of body to a specific site content. It is not possible to reduce to a tag without an attribute filter. 46 | - **reduced.tag** : *( `String` required if `reduced` is set )* 47 | The tag name of the html element to reduce to 48 | - **reduced.attr** : *( `String` required if `reduced` is set )* 49 | The attribute of the html element to reduce to 50 | - **reduced.val** : *( `String` required if `reduced` is set )* 51 | The attribute value of the html element to reduce to 52 | - **reduced.list** : *( `Boolean` default = `false` )* 53 | Return every found reduced block as an array within body. 54 | - **cb** : *( `Function` required )* 55 | The callback function 56 | 57 | **callback arguments:** 58 | 59 | - **error** : *( `Error` )* 60 | Error information. If no error occurred this will be `null` 61 | - **data** : *( `Object` )* 62 | The extraction result 63 | - **data.body** : *( `String|Array` )* 64 | The whole body content or the content within the configured reduced element. There will be just the text content without html tags/attributes and without the content in script tags. 65 | If the reduced feature is used and `reduced.list = true` the body will be an array of all found reduced blocks. 66 | - **data.h1** : *( `Array` )* 67 | An array containing all `h1` text contents. Including the `h1` elements outside the configured reduced element 68 | - **data.meta** : *( `Object` )* 69 | An object of all found meta tags with the syntax ``. Other meta tags will be ignored. 70 | - **data.meta.charset** : *( `String` optional )* 71 | If a metatag with the charset setting like `` is defined it will be returned under `data.meta.charset` 72 | - **data.meta.title** : *( `String` default = `""` )* 73 | If the title tag is defined it will be returned under `data.meta.title`. 
Otherwise the key will contain an empty string 74 | - **data.meta.description** : *( `String` default = `""` )* 75 | If a metatag with the name `description` is defined it will be returned under `data.meta.description`. Otherwise the key will contain an empty string 76 | - **data.meta.keywords** : *( `Array` default = `[]` )* 77 | If a metatag with the name `keywords` is defined it will be returned as a trimmed array of strings under `data.meta.keywords`. Otherwise the key will contain an empty array 78 | 79 | ## Examples 80 | 81 | ### simple 82 | 83 | This is a simple example to extract the content of a html document 84 | 85 | ```js 86 | var Extrator = require("html-extractor"); 87 | var myExtrator = new Extrator(); 88 | 89 | var html = ` 90 | 91 | 92 | Testpage 93 | 94 | 95 |

Header 1

96 |

Content

97 | 98 | 99 | ` 100 | 101 | myExtrator.extract( html, function( err, data ){ 102 | if( err ){ 103 | throw( err ) 104 | } else { 105 | console.log( data ); 106 | // { 107 | // meta: { 108 | // title: 'Testpage', 109 | // description: '', 110 | // keywords: [] 111 | // }, 112 | // body: ' Header 1 Content ', 113 | // h1: [ 'Header 1' ] 114 | // } 115 | } 116 | }); 117 | ``` 118 | 119 | > see `test/readme_example_simple` or [run in Tonic](https://tonicdev.com/mpneuried/5767a1b1444f3a1400e793c2) 120 | 121 | ### advanced 122 | 123 | This is a advanced example to show the usage of the reducing. 124 | With the reduce feature it is possible to reduce the body content to the content of a specific html element. 125 | 126 | ```js 127 | var Extrator = require("html-extractor"); 128 | var myExtrator = new Extrator(); 129 | 130 | var html = ` 131 | 132 | 133 | Super page 134 | 135 | 136 | 137 | 138 | 139 | 142 | 147 |
148 |

First article €

149 |

Lorem ipsum dolor sit amet ...

150 |

Second article  ...  

151 |

Aenean commodo ligula eget dolor.

152 | 155 |
156 |
157 |

ABC 1

158 |

Lorem ipsum dolor sit amet ...

159 |
160 |
161 |

XYZ 1

162 |

Lorem ipsum dolor sit amet ...

163 |
164 |
165 |

ABC 2

166 |

Lorem ipsum dolor sit amet ...

167 |
168 | 171 | 172 | 173 | ` 174 | 175 | var reduceTo = { 176 | tag: "div", 177 | attr: "id", 178 | val: "content" 179 | } 180 | 181 | myExtrator.extract( html, reduceTo, function( err, data ){ 182 | if( err ){ 183 | throw( err ) 184 | } else { 185 | console.log( "String", data ); 186 | //{ 187 | // meta: { 188 | // title: 'Super page', 189 | // description: 'Look at this super page', 190 | // keywords: ['X', 'Y', 'Z'], 191 | // generator: 'Super pageCMS' 192 | // }, 193 | // body: 'First article € Lorem ipsum dolor sit amet ... Second article ... Aenean commodo ligula eget dolor. ', 194 | // h1: ['My super page2', 'First article €', 'Second article ...'] 195 | //} 196 | } 197 | }); 198 | 199 | var reduceToList = { 200 | tag: "div", 201 | attr: "id", 202 | val: "content", 203 | list: true 204 | }; 205 | 206 | myExtrator.extract( html, reduceToList, function( err, data ){ 207 | if( err ){ 208 | throw( err ) 209 | } else { 210 | console.log( "List", data ); 211 | //{ 212 | // meta: { 213 | // title: 'Super page', 214 | // description: 'Look at this super page', 215 | // keywords: ['X', 'Y', 'Z'], 216 | // generator: 'Super pageCMS' 217 | // }, 218 | // body: [ 219 | // 'ABC 1 Lorem ipsum dolor sit amet ... ', 220 | // 'ABC 2 Lorem ipsum dolor sit amet ... ' 221 | // ], 222 | // h1: ['My super page2', 'First article', 'Second article'] 223 | //} 224 | } 225 | }); 226 | ``` 227 | 228 | > see `test/readme_example_advanced` or [run in Tonic](https://tonicdev.com/mpneuried/5767a178b29b431300aeb02f) 229 | 230 | ## Work in progress 231 | 232 | `html-extractor` is work in progress. Your ideas, suggestions etc. are very welcome. 233 | 234 | ## Release History 235 | |Version|Date|Description| 236 | |:--:|:--:|:--| 237 | |0.2.2|2016-07-1|Fixed trimming when `reduced.list` is active #3. Thanks to [Javier Castro](https://github.com/jacargentina)| 238 | |0.2.1|2016-06-30|Fixed handling of html entities #1. 
Thanks to [Javier Castro](https://github.com/jacargentina)| 239 | |0.2.0|2016-06-20|Added option to return reduced elements as list; Fixed reduced value check for classes; Optimized dev env.| 240 | |0.1.4|-|Updated and pinned dependencies and optimized tests| 241 | |0.1.3|-|Fixed extraction to remove style-tag content| 242 | |0.1.2|-|Updated documentation| 243 | |0.1.1|-|Added raw documentation; Fixed `travis.yml` | 244 | |0.1.0|-|Initial version| 245 | 246 | [![NPM](https://nodei.co/npm-dl/html-extractor.png?months=6)](https://nodei.co/npm/html-extractor/) 247 | 248 | ## License 249 | 250 | (The MIT License) 251 | 252 | Copyright (c) 2016 M. Peter, http://www.tcs.de 253 | 254 | Permission is hereby granted, free of charge, to any person obtaining 255 | a copy of this software and associated documentation files (the 256 | 'Software'), to deal in the Software without restriction, including 257 | without limitation the rights to use, copy, modify, merge, publish, 258 | distribute, sublicense, and/or sell copies of the Software, and to 259 | permit persons to whom the Software is furnished to do so, subject to 260 | the following conditions: 261 | 262 | The above copyright notice and this permission notice shall be 263 | included in all copies or substantial portions of the Software. 264 | 265 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 266 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 267 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 268 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 269 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 270 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 271 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
272 | -------------------------------------------------------------------------------- /_src/lib/html_extractor.coffee: -------------------------------------------------------------------------------- 1 | # # HTMLExtractor 2 | # 3 | # Extract meta-data from a html string. 4 | # It extracts the body, title, meta-tags and first headlines to a object to push them to a search indexer like elastic-search 5 | 6 | # import external modules 7 | htmlparser = require("htmlparser2") 8 | _isEmpty = require('lodash/isEmpty') 9 | _isString = require('lodash/isString') 10 | _isArray = require('lodash/isArray') 11 | 12 | # export extractor class 13 | module.exports = class HTMLExtractor 14 | 15 | ### 16 | ## constructor 17 | 18 | `new HTMLExtractor( debug )` 19 | 20 | initializes a extractor instance 21 | 22 | @param { Boolean } [debug=false] Output the parsing time 23 | 24 | ### 25 | constructor: ( @debug = false )-> 26 | return 27 | 28 | # **_trimRegex** *RegEx* Regular expression for trimming. 29 | _trimRegex: /^\s+/ 30 | 31 | ### 32 | ## _trim 33 | 34 | `html_extractor._trim( str )` 35 | 36 | Trim method to remove whitespace 37 | 38 | @param { String } [str=""] String to trim 39 | 40 | @return { String } Trimmed string 41 | 42 | @api private 43 | ### 44 | _trim: ( str = "")=> 45 | str = str.replace( @_trimRegex, "") 46 | i = str.length - 1 47 | 48 | while i >= 0 49 | if /\S/.test(str.charAt(i)) 50 | str = str.substring(0, i + 1) 51 | break 52 | i-- 53 | str 54 | 55 | ### 56 | ## extract 57 | 58 | `html_extractor.extract( html[, reduce], cb )` 59 | 60 | Main method to extract the contens out of a html string 61 | 62 | @param { String } html Raw html string to extract the meta, title and body 63 | @param { Object } [reduce] Reduce config object to reduce the body results to a specific element. 
Example: `{ tag: "div", attr: "id", val: "myContent" }` 64 | @param { Function } reduce Callback function 65 | 66 | @api public 67 | ### 68 | extract: ( [ html, reduce ]..., cb )=> 69 | # default return Object 70 | _ret = 71 | meta: 72 | title: "" 73 | description: "" 74 | keywords: "" 75 | body: null 76 | h1: [] 77 | 78 | # init benchmarking on `debug = true` 79 | console.time( "\t\tparse Time" ) if @debug 80 | 81 | # run extractor 82 | @_extract html, _ret, reduce, ( err, data )=> 83 | if err 84 | cb( err ) 85 | return 86 | # return time on `debug = true` 87 | console.timeEnd( "\t\tparse Time" ) if @debug 88 | 89 | # trim results 90 | _ret.meta.title = @_trim( _ret.meta.title ) if _ret.meta?.title?.length 91 | _ret.meta.description = @_trim( _ret.meta.description ) if _ret.meta?.description?.length 92 | if _isString( _ret.body ) and _ret.body.length 93 | _ret.body = @_trim( _ret.body ) 94 | else if _isArray( _ret.body ) and _ret.body.length 95 | for listEl, idx in _ret.body when listEl?.length 96 | _ret.body[ idx ] = @_trim( listEl ) 97 | 98 | for _h, idx in _ret.h1 when _h?.length 99 | _ret.h1[ idx ] = @_trim( _h ) 100 | 101 | cb( null, data ) 102 | return 103 | return 104 | 105 | _extract: ( html, _ret, reduce, cb )=> 106 | 107 | # check the reduce config and disable it if one key is missing 108 | if not reduce?.tag? or not reduce.attr? or not reduce.val? 109 | reduce = null 110 | 111 | if reduce?.list? 
112 | reduce.list = true 113 | 114 | # set some flags 115 | _bodyMode = false 116 | _scriptMode = false 117 | _reducedBody = [] 118 | _reducedBodyIdx = 0 119 | _reduce_stack = null 120 | _body = [] 121 | _currTag = null 122 | _startBody = null 123 | _h1Open = false 124 | _h1LastOpen = false 125 | 126 | # allwasy create a instance of htmlparser2 to prevent race conditions through a possible instance parser value 127 | parser = new htmlparser.Parser( 128 | # event on tag open 129 | onopentag: ( name, attr )-> 130 | _currTag = name 131 | # check and start the reduced section by saving the current start stack. The collectin will be done within the `ontext` event. 132 | if reduce? and reduce.tag is name and attr[ reduce.attr ]?.indexOf( reduce.val ) >= 0 133 | _reducedBody[ _reducedBodyIdx ] = "" 134 | _reduce_stack = parser._stack.slice( 0,-1 ).join( "§§" ) 135 | 136 | switch name 137 | 138 | # get the meta tag attributes and set the meta return object 139 | when "meta" 140 | if attr? and attr.name? and attr.content? 141 | _ret.meta[ attr.name ] = attr.content 142 | #else if attr? and attr.property? and attr.content? 143 | # _ret.meta[ attr.property ] = attr.content 144 | #else if attr? and attr[ 'http-equiv' ]? and attr.content? 145 | # _ret.meta[ attr[ 'http-equiv' ] ] = attr.content 146 | else if attr? and attr.charset? 
147 | _ret.meta.charset = attr.charset 148 | 149 | # start the body section to activate the text body collector 150 | when "body" 151 | _bodyMode = true 152 | _startBody = parser._tokenizer._index 153 | 154 | # start a script section to prevent text get within scripts 155 | when "script", "style" 156 | _scriptMode = true 157 | 158 | # start a h1 section to pull the text in h1 tags out of the html 159 | when "h1" 160 | _h1Open = true 161 | return 162 | 163 | # event on a text fragment 164 | ontext: ( text )=> 165 | 166 | # check if the parser is in body and not in a script tag 167 | if _bodyMode and not _scriptMode 168 | 169 | # if reduce is active only push to the body if a stack is defined 170 | if reduce? and _reduce_stack? 171 | _body.push( text ) 172 | _reducedBody[ _reducedBodyIdx ] += text 173 | else if not reduce? 174 | _body.push( text ) 175 | 176 | # if the h1 state is active push the text to the h1 array 177 | if _h1Open 178 | # on subtag in the h1 tag the `_h1LastOpen` will be true so the sub tag content will be added to the latest h1 element 179 | if _h1LastOpen 180 | _ret.h1[ _ret.h1.length - 1 ] += text 181 | else 182 | _ret.h1.push text 183 | _h1LastOpen = true 184 | else 185 | _h1LastOpen = false 186 | 187 | 188 | switch _currTag 189 | # save the content of the title tag to the meta object 190 | when "title" 191 | _ret.meta.title += text 192 | 193 | return 194 | 195 | # event on tag close 196 | onclosetag: ( name )-> 197 | _currTag = null 198 | 199 | # check if the stack matches the stack on reduce start and stop an active reduce section 200 | if _reduce_stack? 
and _reduce_stack is parser._stack.join( "§§" ) 201 | _reducedBodyIdx++ 202 | _reduce_stack = null 203 | 204 | switch name 205 | # stop the body section 206 | when "body" 207 | if _startBody < parser._tokenizer._index 208 | _bodyMode = false 209 | # stop a h1 section 210 | when "h1" 211 | _h1Open = false 212 | _h1LastOpen = false 213 | # stop a script section 214 | when "script", "style" 215 | _scriptMode = false 216 | return 217 | 218 | return 219 | onend: => 220 | # if keywords are defined convert it to an array 221 | if _ret.meta.keywords? 222 | _ret.meta.keywords = for _word in _ret.meta.keywords.split( "," ) when not _isEmpty( _word ) 223 | @_trim( _word ) 224 | 225 | if reduce?.list? 226 | _ret.body = [] 227 | for _redTxt in _reducedBody 228 | _redTxt = @_trim( _redTxt ).replace( /\s\s+/g, " " ) 229 | if _redTxt.length 230 | _ret.body.push _redTxt 231 | else 232 | _ret.body = _body.join( " " ).replace( /\s\s+/g, " " ) 233 | 234 | cb( null, _ret ) 235 | return 236 | 237 | # allways us lowertags because tags could be written upper or lowercase 238 | , {lowerCaseTags: true, decodeEntities: true } ) 239 | 240 | # push the html to the parser 241 | parser.write( html ) 242 | 243 | # finish the parsing and let the parser call end 244 | parser.end() 245 | 246 | return 247 | -------------------------------------------------------------------------------- /_src/test/readme_example_advanced.coffee: -------------------------------------------------------------------------------- 1 | Extrator = require("../lib/html_extractor") 2 | myExtrator = new Extrator() 3 | 4 | html = """ 5 | 6 | 7 | Super page 8 | 9 | 10 | 11 | 12 | 13 | 16 | 21 |
22 |

First article

23 |

Lorem ipsum dolor sit amet ...

24 |

Second article

25 |

Aenean commodo ligula eget dolor.

26 |
27 |

ABC 1

28 |

Lorem ipsum dolor sit amet ...

29 |
30 |
31 |

XYZ 1

32 |

Lorem ipsum dolor sit amet ...

33 |
34 |
35 |

ABC 2

36 |

Lorem ipsum dolor sit amet ...

37 |
38 |
39 |

XYZ 2

40 |

Lorem ipsum dolor sit amet ...

41 |
42 |
43 |

ABC 3

44 |

Lorem ipsum dolor sit amet ...

45 |
46 | 49 |
50 | 53 | 54 | 55 | """ 56 | 57 | reduceTo = 58 | tag: "div" 59 | attr: "id" 60 | val: "content" 61 | 62 | myExtrator.extract html, reduceTo, ( err, data )-> 63 | if err 64 | throw err 65 | else 66 | console.log data 67 | # { 68 | # meta: { 69 | # title: 'Super page', 70 | # description: 'Look at this super page', 71 | # keywords: ['X', 'Y', 'Z'], 72 | # generator: 'Super pageCMS' 73 | # }, 74 | # body: ' First article Lorem ipsum dolor sit amet ... Second article Aenean commodo ligula eget dolor. ', 75 | # h1: ['My super page2', 'First article', 'Second article'] 76 | # } 77 | 78 | 79 | reduceTo2 = 80 | tag: "section" 81 | attr: "class" 82 | val: "abc" 83 | list: true 84 | 85 | myExtrator.extract html, reduceTo2, ( err, data )-> 86 | if err 87 | throw err 88 | else 89 | console.log data 90 | # { 91 | # meta: { 92 | # title: 'Super page', 93 | # description: 'Look at this super page', 94 | # keywords: ['X', 'Y', 'Z'], 95 | # generator: 'Super pageCMS' 96 | # }, 97 | # body: ' First article Lorem ipsum dolor sit amet ... Second article Aenean commodo ligula eget dolor. ', 98 | # h1: ['My super page2', 'First article', 'Second article'] 99 | # } 100 | return 101 | 102 | return 103 | -------------------------------------------------------------------------------- /_src/test/readme_example_simple.coffee: -------------------------------------------------------------------------------- 1 | Extrator = require("../lib/html_extractor") 2 | myExtrator = new Extrator() 3 | 4 | html = """ 5 | 6 | 7 | Testpage 8 | 9 | 10 |

Header 1

11 |

Content

12 | 13 | 14 | """ 15 | 16 | myExtrator.extract html, ( err, data )-> 17 | if err 18 | throw err 19 | else 20 | console.log data 21 | # { 22 | # meta: { 23 | # title: 'Testpage', 24 | # description: '', 25 | # keywords: [] 26 | # }, 27 | # body: ' Header 1 Content ', 28 | # h1: [ 'Header 1' ] 29 | # } 30 | return 31 | -------------------------------------------------------------------------------- /_src/test/test.coffee: -------------------------------------------------------------------------------- 1 | HTMLExtractor = require( "../lib/html_extractor" ) 2 | testData = require( "./test_data" ) 3 | 4 | request = require( "request" ) 5 | 6 | should = require( "should" ) 7 | 8 | _extractor = new HTMLExtractor( true ) 9 | 10 | getHTML = ( link, cb )-> 11 | request.get link, ( err, data )-> 12 | if err 13 | throw err 14 | cb( data.body ) 15 | return 16 | return 17 | 18 | describe 'HTML-dispatch-TEST', -> 19 | 20 | before ( done )-> 21 | done() 22 | return 23 | 24 | after ( done )-> 25 | done() 26 | return 27 | 28 | 29 | 30 | describe 'TEST Parser', -> 31 | it "Test tcs.de HTML", ( done )-> 32 | 33 | _extractor.extract testData.html[ 0 ], ( err, data )-> 34 | if err 35 | throw err 36 | 37 | should.exist( data.meta ) 38 | should.exist( data.meta.title ) 39 | data.meta.title.should.equal("TCS: Team Centric Software GmbH & Co. 
KG") 40 | should.exist( data.body ) 41 | data.body.should.not.be.empty 42 | 43 | data.body.should.not.containEql( "$('#contactform')" ) 44 | data.body.should.not.containEql( ".testcssselector" ) 45 | data.body.should.not.containEql( " 52 | 53 | _extractor.extract testData.html[ 1 ], ( err, data )-> 54 | if err 55 | throw err 56 | 57 | should.exist( data.meta ) 58 | should.exist( data.meta.title ) 59 | data.meta.title.should.equal("SPIEGEL ONLINE - Nachrichten") 60 | should.exist( data.body ) 61 | data.body.should.not.containEql( " 71 | 72 | it "test get HTML", ( done )-> 73 | 74 | getHTML testData.links[ 0 ], ( html )-> 75 | html.should.be.a.String() 76 | html.length.should.be.above( 0 ) 77 | html.should.containEql( "Team Centric Software GmbH" ) 78 | done() 79 | return 80 | return 81 | 82 | describe 'Test Parser with multiple pages', -> 83 | _count = process.env.COUNT or 5 84 | for _link, idx in testData.links[ 0.._count ] 85 | do( _link )-> 86 | it "#{ idx }: Parse '#{ _link }'", ( done )-> 87 | 88 | getHTML _link, ( html )-> 89 | 90 | _extractor.extract html, ( err, data )-> 91 | if err 92 | throw err 93 | should.exist( data.meta ) 94 | should.exist( data.meta.title ) 95 | should.exist( data.body ) 96 | data.body.should.not.containEql( " 109 | for _reduce, idx in testData.reduce 110 | do( _reduce, idx )-> 111 | it "#{ idx }: Reduced parse '#{ _reduce.url }'", ( done )-> 112 | getHTML _reduce.url, ( html )-> 113 | 114 | _extractor.extract html, _reduce.reduced, ( err, data )-> 115 | if err 116 | throw err 117 | should.exist( data.meta ) 118 | should.exist( data.meta.title ) 119 | should.exist( data.body ) 120 | data.body.should.not.be.empty 121 | switch idx 122 | when 0 123 | data.body.should.be.instanceof( String ) 124 | data.body.should.not.containEql( " 153 | it "#1 Returned body contains html entities", ( done )-> 154 | _html = '

 HELLO! 

 Headline > < €   ...   

' 155 | _exp = 156 | meta: 157 | title: "" 158 | description: "" 159 | keywords: [] 160 | body: "HELLO! Headline > < € ..." 161 | h1: [ "Headline > < €   ..." ] 162 | 163 | _extractor.extract _html, ( err, data )-> 164 | if err 165 | throw err 166 | should.exist( data ) 167 | data.should.eql( _exp ) 168 | done() 169 | return 170 | return 171 | 172 | return 173 | 174 | it "#3 str.replace is not a function when using reduce with list: true", ( done )-> 175 | _html = '

term one

non indexable content

term   two   

' 176 | _exp = 177 | meta: 178 | title: "" 179 | description: "" 180 | keywords: [] 181 | body: ["term one", "term   two"] 182 | h1: [] 183 | _reduce = 184 | tag: "p" 185 | attr: "id" 186 | val: "indexable" 187 | list: true 188 | 189 | _extractor.extract _html, _reduce, ( err, data )-> 190 | if err 191 | throw err 192 | should.exist( data ) 193 | data.should.eql( _exp ) 194 | done() 195 | return 196 | return 197 | 198 | 199 | return 200 | 201 | 202 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # http://www.appveyor.com/docs/appveyor-yml 2 | 3 | # Test against these versions of Node.js. 4 | environment: 5 | matrix: 6 | - nodejs_version: "0.10" 7 | - nodejs_version: "0.12" 8 | - nodejs_version: "4" 9 | - nodejs_version: "5" 10 | - nodejs_version: "6" 11 | 12 | pull_requests: 13 | do_not_increment_build_number: true 14 | 15 | platform: Any CPU 16 | shallow_clone: true 17 | 18 | # Install scripts. (runs after repo cloning) 19 | install: 20 | # Get the latest stable version of Node 0.STABLE.latest 21 | - ps: Install-Product node $env:nodejs_version 22 | # Typical npm stuff. Use msvs 2013 for the hiredis parser 23 | - npm install 24 | - npm install -g grunt-cli 25 | - grunt build 26 | 27 | # Post-install test scripts. 28 | test_script: 29 | # Output useful info for debugging. 30 | - node --version 31 | - npm --version 32 | - cmd: npm t 33 | 34 | os: 35 | - Default Azure 36 | - Windows Server 2012 R2 37 | 38 | # Don't actually build using MSBuild 39 | build: off 40 | 41 | # Set build version format here instead of in the admin panel. 
42 | version: "{build}" 43 | -------------------------------------------------------------------------------- /coffeelint.json: -------------------------------------------------------------------------------- 1 | { 2 | "arrow_spacing": { 3 | "level": "ignore" 4 | }, 5 | "braces_spacing": { 6 | "level": "ignore", 7 | "spaces": 0, 8 | "empty_object_spaces": 0 9 | }, 10 | "camel_case_classes": { 11 | "level": "error" 12 | }, 13 | "coffeescript_error": { 14 | "level": "error" 15 | }, 16 | "colon_assignment_spacing": { 17 | "level": "ignore", 18 | "spacing": { 19 | "left": 0, 20 | "right": 0 21 | } 22 | }, 23 | "cyclomatic_complexity": { 24 | "value": 10, 25 | "level": "ignore" 26 | }, 27 | "duplicate_key": { 28 | "level": "error" 29 | }, 30 | "empty_constructor_needs_parens": { 31 | "level": "ignore" 32 | }, 33 | "ensure_comprehensions": { 34 | "level": "warn" 35 | }, 36 | "indentation": { 37 | "value": 1, 38 | "level": "error" 39 | }, 40 | "line_endings": { 41 | "level": "ignore", 42 | "value": "unix" 43 | }, 44 | "max_line_length": { 45 | "value": 120, 46 | "level": "ignore", 47 | "limitComments": false 48 | }, 49 | "missing_fat_arrows": { 50 | "level": "ignore", 51 | "is_strict": false 52 | }, 53 | "newlines_after_classes": { 54 | "value": 3, 55 | "level": "ignore" 56 | }, 57 | "no_backticks": { 58 | "level": "error" 59 | }, 60 | "no_debugger": { 61 | "level": "warn" 62 | }, 63 | "no_empty_functions": { 64 | "level": "ignore" 65 | }, 66 | "no_empty_param_list": { 67 | "level": "ignore" 68 | }, 69 | "no_implicit_braces": { 70 | "level": "ignore", 71 | "strict": true 72 | }, 73 | "no_implicit_parens": { 74 | "strict": true, 75 | "level": "ignore" 76 | }, 77 | "no_interpolation_in_single_quotes": { 78 | "level": "ignore" 79 | }, 80 | "no_plusplus": { 81 | "level": "ignore" 82 | }, 83 | "no_stand_alone_at": { 84 | "level": "ignore" 85 | }, 86 | "no_tabs": { 87 | "level": "ignore" 88 | }, 89 | "no_throwing_strings": { 90 | "level": "error" 91 | }, 92 | 
"no_trailing_semicolons": { 93 | "level": "error" 94 | }, 95 | "no_trailing_whitespace": { 96 | "level": "error", 97 | "allowed_in_comments": false, 98 | "allowed_in_empty_lines": true 99 | }, 100 | "no_unnecessary_double_quotes": { 101 | "level": "ignore" 102 | }, 103 | "no_unnecessary_fat_arrows": { 104 | "level": "warn" 105 | }, 106 | "non_empty_constructor_needs_parens": { 107 | "level": "ignore" 108 | }, 109 | "prefer_english_operator": { 110 | "level": "ignore", 111 | "doubleNotLevel": "ignore" 112 | }, 113 | "space_operators": { 114 | "level": "ignore" 115 | }, 116 | "spacing_after_comma": { 117 | "level": "ignore" 118 | }, 119 | "transform_messes_up_line_numbers": { 120 | "level": "warn" 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "html-extractor", 3 | "description": "Extract meta-data from a html string. It extracts the body, title, meta-tags and first headlines to a object to push them to a search indexer like elastic-search", 4 | "version": "0.2.2", 5 | "homepage": "https://github.com/mpneuried/html-extractor", 6 | "keywords": [ 7 | "html", "parse", "extract", "body", "search", "tool", "elastic", "headlines", "meta", "data" 8 | ], 9 | "author": { 10 | "name": "Mathias Peter" 11 | }, 12 | "repository": { 13 | "type": "git", 14 | "url": "git://github.com/mpneuried/html-extractor.git" 15 | }, 16 | "bugs": { 17 | "url": "https://github.com/mpneuried/html-extractor/issues" 18 | }, 19 | "licenses": [ 20 | { 21 | "type": "MIT", 22 | "url": "https://github.com/mpneuried/html-extractor/blob/master/LICENSE-MIT" 23 | } 24 | ], 25 | "main": "./lib/html_extractor.js", 26 | "engines": { 27 | "node": ">= 0.8.10" 28 | }, 29 | "scripts": { 30 | "test": "grunt test" 31 | }, 32 | "dependencies": { 33 | "htmlparser2": "3.9.x", 34 | "lodash": "4.x" 35 | }, 36 | "devDependencies": { 37 | "should": "9.x", 
38 | "request": "2.x", 39 | "grunt-contrib-watch": "*", 40 | "grunt-contrib-coffee": "1.x", 41 | "grunt-mocha-cli": "2.x", 42 | "grunt": "1.x" 43 | } 44 | } 45 | --------------------------------------------------------------------------------