First article €
149 |Lorem ipsum dolor sit amet ...
150 |Second article ...
151 |Aenean commodo ligula eget dolor.
152 | 155 |├── .editorconfig ├── .gitignore ├── .npmignore ├── .travis.yml ├── Gruntfile.coffee ├── LICENSE ├── README.md ├── _src ├── lib │ └── html_extractor.coffee └── test │ ├── readme_example_advanced.coffee │ ├── readme_example_simple.coffee │ ├── test.coffee │ └── test_data.coffee ├── appveyor.yml ├── coffeelint.json └── package.json /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig helps developers define and maintain consistent 2 | # coding styles between different editors and IDEs 3 | # editorconfig.org 4 | 5 | root = true 6 | 7 | 8 | [*] 9 | 10 | # Change these settings to your own preference 11 | indent_style = tab 12 | end_of_line = lf 13 | charset = utf-8 14 | trim_trailing_whitespace = true 15 | insert_final_newline = true 16 | 17 | [*.coffee] 18 | trim_trailing_whitespace = false 19 | 20 | [*.md] 21 | trim_trailing_whitespace = false 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.profile 2 | *.lock 3 | *.conflict 4 | *.DS_Store 5 | *.zip 6 | *.rdb 7 | *.log 8 | 9 | .project 10 | .settings 11 | .idea 12 | 13 | *.mo 14 | *.sublime* 15 | config.json 16 | config*.json 17 | deploy.json 18 | /node_modules 19 | /_release 20 | /lib 21 | /test 22 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | /_src 2 | /_docs 3 | /node_modules 4 | /_release 5 | Gruntfile.* 6 | *.sublime* 7 | config.json 8 | config*.json 9 | deploy.json 10 | .editorconfig 11 | 12 | *.yml 13 | *.profile 14 | *.lock 15 | *.conflict 16 | *.DS_Store 17 | *.zip 18 | *.rdb 19 | *.log 20 | 21 | .project 22 | .settings 23 | .idea 24 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - 0.10 4 | - 0.12 5 | - 4.0 6 | - 4.4 7 | - 5.0 8 | - 5.5 9 | - 5.11 10 | - 6.0 11 | - 6.1 12 | - iojs 13 | - node 14 | before_script: 15 | - "npm install -g mocha grunt-cli" 16 | - "grunt build" 17 | -------------------------------------------------------------------------------- /Gruntfile.coffee: -------------------------------------------------------------------------------- 1 | module.exports = (grunt) -> 2 | 3 | # Project configuration. 4 | grunt.initConfig 5 | pkg: grunt.file.readJSON("package.json") 6 | watch: 7 | lib: 8 | files: ["_src/**/*.coffee"] 9 | tasks: [ "coffee:base" ] 10 | module_test: 11 | files: [ "_src/**/*.coffee" ] 12 | tasks: [ "coffee:base", "test" ] 13 | 14 | coffee: 15 | base: 16 | expand: true 17 | cwd: '_src', 18 | src: ["**/*.coffee"] 19 | dest: "" 20 | ext: ".js" 21 | 22 | options: 23 | flatten: false 24 | bare: false 25 | 26 | mochacli: 27 | options: 28 | require: [ "should" ] 29 | reporter: "spec" 30 | bail: if process.env.BAIL? 
then true else false 31 | timeout: 10000 32 | env: 33 | COUNT: process.env.COUNT 34 | 35 | all: [ "test/test.js" ] 36 | 37 | 38 | # Load npm modules 39 | grunt.loadNpmTasks "grunt-contrib-watch" 40 | grunt.loadNpmTasks "grunt-contrib-coffee" 41 | grunt.loadNpmTasks "grunt-mocha-cli" 42 | 43 | # ALIAS TASKS 44 | grunt.registerTask "default", "build" 45 | grunt.registerTask "test", [ "build", "mochacli" ] 46 | grunt.registerTask( "watch-test", [ "watch:module_test" ] ) 47 | 48 | # ALIAS SHORTS 49 | grunt.registerTask( "b", "build" ) 50 | grunt.registerTask( "w", "watch:lib" ) 51 | grunt.registerTask( "wt", "watch-test" ) 52 | grunt.registerTask( "t", "test" ) 53 | 54 | grunt.registerTask "build", [ "coffee:base" ] 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 mpneuried 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | html-extractor 2 | ============== 3 | 4 | [](http://travis-ci.org/mpneuried/html-extractor) 5 | [](https://ci.appveyor.com/project/mpneuried/html-extractor) 6 | [](https://david-dm.org/mpneuried/html-extractor) 7 | [](http://badge.fury.io/js/html-extractor) 8 | 9 | Extract meta-data from an HTML string. It extracts the body, title, meta-tags and first headlines to an object to push them to a search indexer like elastic-search 10 | 11 | [](https://nodei.co/npm/html-extractor/) 12 | 13 | ## Install 14 | 15 | ``` 16 | npm install html-extractor 17 | ``` 18 | 19 | ## Initialize 20 | 21 | 22 | ```js 23 | var Extrator = require("html-extractor"); 24 | var myExtrator = new Extrator(); 25 | ``` 26 | 27 | ### `new Extrator( debug )` 28 | 29 | **arguments** 30 | - **debug** : *( `Boolean` optional: default = `false` )* 31 | Output the parsing time 32 | 33 | ## Methods 34 | 35 | ### Extrator.extract( html[, reduced], cb ) 36 | 37 | Call `.extract()` to get the data of an html string. 38 | HTML entities will be decoded. 39 | 40 | **arguments:** 41 | 42 | - **html** : *( `String` required )* 43 | The html string to process 44 | - **reduced** : *( `Object` optional )* 45 | An object to reduce the content of body to a specific site content. It is not possible to reduce to a tag without an attribute filter. 
46 | - **reduced.tag** : *( `String` required if `reduced` is set )* 47 | The tag name of the html element to reduce to 48 | - **reduced.attr** : *( `String` required if `reduced` is set )* 49 | The attribute of the html element to reduce to 50 | - **reduced.val** : *( `String` required if `reduced` is set )* 51 | The attribute value of the html element to reduce to 52 | - **reduced.list** : *( `Boolean` default = `false` )* 53 | Return every found reduced block as an array within body. 54 | - **cb** : *( `Function` required )* 55 | The callback function 56 | 57 | **callback arguments:** 58 | 59 | - **error** : *( `Error` )* 60 | Error information. If no error occurred this will be `null` 61 | - **data** : *( `Object` )* 62 | The extraction result 63 | - **data.body** : *( `String|Array` )* 64 | The whole body content or the content within the configured reduced element. There will be just the text content without html tags/attributes and without the content in script tags. 65 | If the reduced feature is used and `reduced.list = true` the body will be an array of all found reduced blocks. 66 | - **data.h1** : *( `Array` )* 67 | An array containing all `h1` text contents. Including the `h1` elements outside the configured reduced element 68 | - **data.meta** : *( `Object` )* 69 | An object of all found meta tags with the syntax `<meta name="..." content="...">`. Other meta tags will be ignored. 70 | - **data.meta.charset** : *( `String` optional )* 71 | If a metatag with the charset setting like `<meta charset="utf-8">` is defined it will be returned under `data.meta.charset` 72 | - **data.meta.title** : *( `String` default = `""` )* 73 | If a title tag is defined it will be returned under `data.meta.title`. Otherwise the key will contain an empty string 74 | - **data.meta.description** : *( `String` default = `""` )* 75 | If a metatag with the name `description` is defined it will be returned under `data.meta.description`. 
Otherwise the key will contain an empty string 76 | - **data.meta.keywords** : *( `Array` default = `[]` )* 77 | If a metatag with the name `keywords` is defined it will be returned as trimmed array of strings under `data.meta.keywords`. Otherwise the key will contain an empty array 78 | 79 | ## Examples 80 | 81 | ### simple 82 | 83 | This is a simple example to extract the content of an HTML document 84 | 85 | ```js 86 | var Extrator = require("html-extractor"); 87 | var myExtrator = new Extrator(); 88 | 89 | var html = ` 90 | 91 |
92 |Content
97 | 98 | 99 | ` 100 | 101 | myExtrator.extract( html, function( err, data ){ 102 | if( err ){ 103 | throw( err ) 104 | } else { 105 | console.log( data ); 106 | // { 107 | // meta: { 108 | // title: 'Testpage', 109 | // description: '', 110 | // keywords: [] 111 | // }, 112 | // body: ' Header 1 Content ', 113 | // h1: [ 'Header 1' ] 114 | // } 115 | } 116 | }); 117 | ``` 118 | 119 | > see `test/readme_example_simple` or [run in Tonic](https://tonicdev.com/mpneuried/5767a1b1444f3a1400e793c2) 120 | 121 | ### advanced 122 | 123 | This is an advanced example showing how to use the reduce feature. 124 | With the reduce feature it is possible to reduce the body content to the content of a specific html element. 125 | 126 | ```js 127 | var Extrator = require("html-extractor"); 128 | var myExtrator = new Extrator(); 129 | 130 | var html = ` 131 | 132 | 133 |Lorem ipsum dolor sit amet ...
150 |Aenean commodo ligula eget dolor.
152 | 155 |Lorem ipsum dolor sit amet ...
159 |Lorem ipsum dolor sit amet ...
163 |Lorem ipsum dolor sit amet ...
167 |Lorem ipsum dolor sit amet ...
24 |Aenean commodo ligula eget dolor.
26 |Lorem ipsum dolor sit amet ...
29 |Lorem ipsum dolor sit amet ...
33 |Lorem ipsum dolor sit amet ...
37 |Lorem ipsum dolor sit amet ...
41 |Lorem ipsum dolor sit amet ...
45 |Content
12 | 13 | 14 | """ 15 | 16 | myExtrator.extract html, ( err, data )-> 17 | if err 18 | throw err 19 | else 20 | console.log data 21 | # { 22 | # meta: { 23 | # title: 'Testpage', 24 | # description: '', 25 | # keywords: [] 26 | # }, 27 | # body: ' Header 1 Content ', 28 | # h1: [ 'Header 1' ] 29 | # } 30 | return 31 | -------------------------------------------------------------------------------- /_src/test/test.coffee: -------------------------------------------------------------------------------- 1 | HTMLExtractor = require( "../lib/html_extractor" ) 2 | testData = require( "./test_data" ) 3 | 4 | request = require( "request" ) 5 | 6 | should = require( "should" ) 7 | 8 | _extractor = new HTMLExtractor( true ) 9 | 10 | getHTML = ( link, cb )-> 11 | request.get link, ( err, data )-> 12 | if err 13 | throw err 14 | cb( data.body ) 15 | return 16 | return 17 | 18 | describe 'HTML-dispatch-TEST', -> 19 | 20 | before ( done )-> 21 | done() 22 | return 23 | 24 | after ( done )-> 25 | done() 26 | return 27 | 28 | 29 | 30 | describe 'TEST Parser', -> 31 | it "Test tcs.de HTML", ( done )-> 32 | 33 | _extractor.extract testData.html[ 0 ], ( err, data )-> 34 | if err 35 | throw err 36 | 37 | should.exist( data.meta ) 38 | should.exist( data.meta.title ) 39 | data.meta.title.should.equal("TCS: Team Centric Software GmbH & Co. 
KG") 40 | should.exist( data.body ) 41 | data.body.should.not.be.empty 42 | 43 | data.body.should.not.containEql( "$('#contactform')" ) 44 | data.body.should.not.containEql( ".testcssselector" ) 45 | data.body.should.not.containEql( "" ) 46 | #console.log data.meta, data.body.length, data.h1 47 | done() 48 | return 49 | return 50 | 51 | it "Test spiegel.de HTML", ( done )-> 52 | 53 | _extractor.extract testData.html[ 1 ], ( err, data )-> 54 | if err 55 | throw err 56 | 57 | should.exist( data.meta ) 58 | should.exist( data.meta.title ) 59 | data.meta.title.should.equal("SPIEGEL ONLINE - Nachrichten") 60 | should.exist( data.body ) 61 | data.body.should.not.containEql( "" ) 62 | data.body.should.not.be.empty 63 | 64 | #console.log data.meta, data.body.length, data.h1 65 | done() 66 | return 67 | return 68 | return 69 | 70 | describe 'Test Request', -> 71 | 72 | it "test get HTML", ( done )-> 73 | 74 | getHTML testData.links[ 0 ], ( html )-> 75 | html.should.be.a.String() 76 | html.length.should.be.above( 0 ) 77 | html.should.containEql( "Team Centric Software GmbH" ) 78 | done() 79 | return 80 | return 81 | 82 | describe 'Test Parser with multiple pages', -> 83 | _count = process.env.COUNT or 5 84 | for _link, idx in testData.links[ 0.._count ] 85 | do( _link )-> 86 | it "#{ idx }: Parse '#{ _link }'", ( done )-> 87 | 88 | getHTML _link, ( html )-> 89 | 90 | _extractor.extract html, ( err, data )-> 91 | if err 92 | throw err 93 | should.exist( data.meta ) 94 | should.exist( data.meta.title ) 95 | should.exist( data.body ) 96 | data.body.should.not.containEql( "" ) 97 | data.body.should.not.be.empty 98 | 99 | #console.log "\nHEADER of #{ _link }\n", data.meta.title, "\n", JSON.stringify( data.meta, true, 2 ), "\n", JSON.stringify( data.h1, true, 2 ) 100 | 101 | done() 102 | return 103 | return 104 | return 105 | 106 | return 107 | 108 | describe 'Test reducing', -> 109 | for _reduce, idx in testData.reduce 110 | do( _reduce, idx )-> 111 | it "#{ idx }: Reduced parse 
'#{ _reduce.url }'", ( done )-> 112 | getHTML _reduce.url, ( html )-> 113 | 114 | _extractor.extract html, _reduce.reduced, ( err, data )-> 115 | if err 116 | throw err 117 | should.exist( data.meta ) 118 | should.exist( data.meta.title ) 119 | should.exist( data.body ) 120 | data.body.should.not.be.empty 121 | switch idx 122 | when 0 123 | data.body.should.be.instanceof( String ) 124 | data.body.should.not.containEql( "" ) 125 | data.body.should.not.containEql "EDV-Downloadbereich" 126 | data.body.should.not.containEql "Spitalgasse 31" 127 | 128 | data.body.should.containEql "Herzlich willkommen im APO-Shop" 129 | when 1 130 | data.body.should.be.instanceof( String ) 131 | data.body.should.not.containEql( "" ) 132 | data.body.should.not.containEql "Impressum" 133 | data.body.should.not.containEql "Haftungsausschluss" 134 | 135 | data.body.should.containEql "Geschäftsführung" 136 | 137 | when 2 138 | data.body.should.be.instanceof( Array ) 139 | data.body.should.have.length( 11 ) 140 | data.body[ 0 ].should.startWith "Dynamo DB" 141 | 142 | #console.log "\nBody of #{ _reduce.url }\n", data.body 143 | 144 | done() 145 | return 146 | return 147 | return 148 | return 149 | return 150 | return 151 | 152 | describe 'Issues', -> 153 | it "#1 Returned body contains html entities", ( done )-> 154 | _html = 'HELLO!
term one
non indexable content
term two
' 176 | _exp = 177 | meta: 178 | title: "" 179 | description: "" 180 | keywords: [] 181 | body: ["term one", "term two"] 182 | h1: [] 183 | _reduce = 184 | tag: "p" 185 | attr: "id" 186 | val: "indexable" 187 | list: true 188 | 189 | _extractor.extract _html, _reduce, ( err, data )-> 190 | if err 191 | throw err 192 | should.exist( data ) 193 | data.should.eql( _exp ) 194 | done() 195 | return 196 | return 197 | 198 | 199 | return 200 | 201 | 202 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # http://www.appveyor.com/docs/appveyor-yml 2 | 3 | # Test against these versions of Node.js. 4 | environment: 5 | matrix: 6 | - nodejs_version: "0.10" 7 | - nodejs_version: "0.12" 8 | - nodejs_version: "4" 9 | - nodejs_version: "5" 10 | - nodejs_version: "6" 11 | 12 | pull_requests: 13 | do_not_increment_build_number: true 14 | 15 | platform: Any CPU 16 | shallow_clone: true 17 | 18 | # Install scripts. (runs after repo cloning) 19 | install: 20 | # Get the latest stable version of Node 0.STABLE.latest 21 | - ps: Install-Product node $env:nodejs_version 22 | # Typical npm stuff. Use msvs 2013 for the hiredis parser 23 | - npm install 24 | - npm install -g grunt-cli 25 | - grunt build 26 | 27 | # Post-install test scripts. 28 | test_script: 29 | # Output useful info for debugging. 30 | - node --version 31 | - npm --version 32 | - cmd: npm t 33 | 34 | os: 35 | - Default Azure 36 | - Windows Server 2012 R2 37 | 38 | # Don't actually build using MSBuild 39 | build: off 40 | 41 | # Set build version format here instead of in the admin panel. 
42 | version: "{build}" 43 | -------------------------------------------------------------------------------- /coffeelint.json: -------------------------------------------------------------------------------- 1 | { 2 | "arrow_spacing": { 3 | "level": "ignore" 4 | }, 5 | "braces_spacing": { 6 | "level": "ignore", 7 | "spaces": 0, 8 | "empty_object_spaces": 0 9 | }, 10 | "camel_case_classes": { 11 | "level": "error" 12 | }, 13 | "coffeescript_error": { 14 | "level": "error" 15 | }, 16 | "colon_assignment_spacing": { 17 | "level": "ignore", 18 | "spacing": { 19 | "left": 0, 20 | "right": 0 21 | } 22 | }, 23 | "cyclomatic_complexity": { 24 | "value": 10, 25 | "level": "ignore" 26 | }, 27 | "duplicate_key": { 28 | "level": "error" 29 | }, 30 | "empty_constructor_needs_parens": { 31 | "level": "ignore" 32 | }, 33 | "ensure_comprehensions": { 34 | "level": "warn" 35 | }, 36 | "indentation": { 37 | "value": 1, 38 | "level": "error" 39 | }, 40 | "line_endings": { 41 | "level": "ignore", 42 | "value": "unix" 43 | }, 44 | "max_line_length": { 45 | "value": 120, 46 | "level": "ignore", 47 | "limitComments": false 48 | }, 49 | "missing_fat_arrows": { 50 | "level": "ignore", 51 | "is_strict": false 52 | }, 53 | "newlines_after_classes": { 54 | "value": 3, 55 | "level": "ignore" 56 | }, 57 | "no_backticks": { 58 | "level": "error" 59 | }, 60 | "no_debugger": { 61 | "level": "warn" 62 | }, 63 | "no_empty_functions": { 64 | "level": "ignore" 65 | }, 66 | "no_empty_param_list": { 67 | "level": "ignore" 68 | }, 69 | "no_implicit_braces": { 70 | "level": "ignore", 71 | "strict": true 72 | }, 73 | "no_implicit_parens": { 74 | "strict": true, 75 | "level": "ignore" 76 | }, 77 | "no_interpolation_in_single_quotes": { 78 | "level": "ignore" 79 | }, 80 | "no_plusplus": { 81 | "level": "ignore" 82 | }, 83 | "no_stand_alone_at": { 84 | "level": "ignore" 85 | }, 86 | "no_tabs": { 87 | "level": "ignore" 88 | }, 89 | "no_throwing_strings": { 90 | "level": "error" 91 | }, 92 | 
"no_trailing_semicolons": { 93 | "level": "error" 94 | }, 95 | "no_trailing_whitespace": { 96 | "level": "error", 97 | "allowed_in_comments": false, 98 | "allowed_in_empty_lines": true 99 | }, 100 | "no_unnecessary_double_quotes": { 101 | "level": "ignore" 102 | }, 103 | "no_unnecessary_fat_arrows": { 104 | "level": "warn" 105 | }, 106 | "non_empty_constructor_needs_parens": { 107 | "level": "ignore" 108 | }, 109 | "prefer_english_operator": { 110 | "level": "ignore", 111 | "doubleNotLevel": "ignore" 112 | }, 113 | "space_operators": { 114 | "level": "ignore" 115 | }, 116 | "spacing_after_comma": { 117 | "level": "ignore" 118 | }, 119 | "transform_messes_up_line_numbers": { 120 | "level": "warn" 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "html-extractor", 3 | "description": "Extract meta-data from a html string. It extracts the body, title, meta-tags and first headlines to a object to push them to a search indexer like elastic-search", 4 | "version": "0.2.2", 5 | "homepage": "https://github.com/mpneuried/html-extractor", 6 | "keywords": [ 7 | "html", "parse", "extract", "body", "search", "tool", "elastic", "headlines", "meta", "data" 8 | ], 9 | "author": { 10 | "name": "Mathias Peter" 11 | }, 12 | "repository": { 13 | "type": "git", 14 | "url": "git://github.com/mpneuried/html-extractor.git" 15 | }, 16 | "bugs": { 17 | "url": "https://github.com/mpneuried/html-extractor/issues" 18 | }, 19 | "licenses": [ 20 | { 21 | "type": "MIT", 22 | "url": "https://github.com/mpneuried/html-extractor/blob/master/LICENSE-MIT" 23 | } 24 | ], 25 | "main": "./lib/html_extractor.js", 26 | "engines": { 27 | "node": ">= 0.8.10" 28 | }, 29 | "scripts": { 30 | "test": "grunt test" 31 | }, 32 | "dependencies": { 33 | "htmlparser2": "3.9.x", 34 | "lodash": "4.x" 35 | }, 36 | "devDependencies": { 37 | "should": "9.x", 
38 | "request": "2.x", 39 | "grunt-contrib-watch": "*", 40 | "grunt-contrib-coffee": "1.x", 41 | "grunt-mocha-cli": "2.x", 42 | "grunt": "1.x" 43 | } 44 | } 45 | --------------------------------------------------------------------------------