├── .editorconfig ├── .gitignore ├── .npmignore ├── .travis.yml ├── Gruntfile.coffee ├── LICENSE ├── README.md ├── _src ├── lib │ └── html_extractor.coffee └── test │ ├── readme_example_advanced.coffee │ ├── readme_example_simple.coffee │ ├── test.coffee │ └── test_data.coffee ├── appveyor.yml ├── coffeelint.json └── package.json /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig helps developers define and maintain consistent 2 | # coding styles between different editors and IDEs 3 | # editorconfig.org 4 | 5 | root = true 6 | 7 | 8 | [*] 9 | 10 | # Change these settings to your own preference 11 | indent_style = tab 12 | end_of_line = lf 13 | charset = utf-8 14 | trim_trailing_whitespace = true 15 | insert_final_newline = true 16 | 17 | [*.coffee] 18 | trim_trailing_whitespace = false 19 | 20 | [*.md] 21 | trim_trailing_whitespace = false 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.profile 2 | *.lock 3 | *.conflict 4 | *.DS_Store 5 | *.zip 6 | *.rdb 7 | *.log 8 | 9 | .project 10 | .settings 11 | .idea 12 | 13 | *.mo 14 | *.sublime* 15 | config.json 16 | config*.json 17 | deploy.json 18 | /node_modules 19 | /_release 20 | /lib 21 | /test 22 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | /_src 2 | /_docs 3 | /node_modules 4 | /_release 5 | Gruntfile.* 6 | *.sublime* 7 | config.json 8 | config*.json 9 | deploy.json 10 | .editorconfig 11 | 12 | *.yml 13 | *.profile 14 | *.lock 15 | *.conflict 16 | *.DS_Store 17 | *.zip 18 | *.rdb 19 | *.log 20 | 21 | .project 22 | .settings 23 | .idea 24 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - 0.10 4 | - 0.12 5 | - 4.0 6 | - 4.4 7 | - 5.0 8 | - 5.5 9 | - 5.11 10 | - 6.0 11 | - 6.1 12 | - iojs 13 | - node 14 | before_script: 15 | - "npm install -g mocha grunt-cli" 16 | - "grunt build" 17 | -------------------------------------------------------------------------------- /Gruntfile.coffee: -------------------------------------------------------------------------------- 1 | module.exports = (grunt) -> 2 | 3 | # Project configuration. 4 | grunt.initConfig 5 | pkg: grunt.file.readJSON("package.json") 6 | watch: 7 | lib: 8 | files: ["_src/**/*.coffee"] 9 | tasks: [ "coffee:base" ] 10 | module_test: 11 | files: [ "_src/**/*.coffee" ] 12 | tasks: [ "coffee:base", "test" ] 13 | 14 | coffee: 15 | base: 16 | expand: true 17 | cwd: '_src', 18 | src: ["**/*.coffee"] 19 | dest: "" 20 | ext: ".js" 21 | 22 | options: 23 | flatten: false 24 | bare: false 25 | 26 | mochacli: 27 | options: 28 | require: [ "should" ] 29 | reporter: "spec" 30 | bail: if process.env.BAIL? 
then true else false 31 | timeout: 10000 32 | env: 33 | COUNT: process.env.COUNT 34 | 35 | all: [ "test/test.js" ] 36 | 37 | 38 | # Load npm modules 39 | grunt.loadNpmTasks "grunt-contrib-watch" 40 | grunt.loadNpmTasks "grunt-contrib-coffee" 41 | grunt.loadNpmTasks "grunt-mocha-cli" 42 | 43 | # ALIAS TASKS 44 | grunt.registerTask "default", "build" 45 | grunt.registerTask "test", [ "build", "mochacli" ] 46 | grunt.registerTask( "watch-test", [ "watch:module_test" ] ) 47 | 48 | # ALIAS SHORTS 49 | grunt.registerTask( "b", "build" ) 50 | grunt.registerTask( "w", "watch:lib" ) 51 | grunt.registerTask( "wt", "watch-test" ) 52 | grunt.registerTask( "t", "test" ) 53 | 54 | grunt.registerTask "build", [ "coffee:base" ] 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 mpneuried 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | html-extractor 2 | ============== 3 | 4 | [![Build Status](https://secure.travis-ci.org/mpneuried/html-extractor.png?branch=master)](http://travis-ci.org/mpneuried/html-extractor) 5 | [![Windows Tests](https://img.shields.io/appveyor/ci/mpneuried/html-extractor.svg?label=Windows%20Test)](https://ci.appveyor.com/project/mpneuried/html-extractor) 6 | [![Dependency Status](https://david-dm.org/mpneuried/html-extractor.png)](https://david-dm.org/mpneuried/html-extractor) 7 | [![NPM version](https://badge.fury.io/js/html-extractor.png)](http://badge.fury.io/js/html-extractor) 8 | 9 | Extract meta-data from a html string. It extracts the body, title, meta-tags and first headlines to a object to push them to a search indexer like elastic-search 10 | 11 | [![NPM](https://nodei.co/npm/html-extractor.png?downloads=true&stars=true)](https://nodei.co/npm/html-extractor/) 12 | 13 | ## Install 14 | 15 | ``` 16 | npm install html-extractor 17 | ``` 18 | 19 | ## Initialize 20 | 21 | 22 | ```js 23 | var Extrator = require("html-extractor"); 24 | var myExtrator = new Extrator(); 25 | ``` 26 | 27 | ### `new Extrator( debug )` 28 | 29 | **arguments** 30 | - **debug** : *( `Boolean` optional: default = `false` )* 31 | Output the parsing time 32 | 33 | ## Methods 34 | 35 | ### Extrator.extract( html[, reduced], cb ) 36 | 37 | Call `.extract()` to get the data of an html string. 38 | HTML entities will be decoded. 
39 | 40 | **arguments:** 41 | 42 | - **html** : *( `String` required )* 43 | The html string to process 44 | - **reduced** : *( `Object` optional )* 45 | An object to reduce the content of body to a specific site content. It is not possible to reduce to a tag without an attribute filter. 46 | - **reduced.tag** : *( `String` required if `reduced` is set )* 47 | The tag name of the html element to reduce to 48 | - **reduced.attr** : *( `String` required if `reduced` is set )* 49 | The attribute of the html element to reduce to 50 | - **reduced.val** : *( `String` required if `reduced` is set )* 51 | The attribute value of the html element to reduce to 52 | - **reduced.list** : *( `Boolean` default = `false` )* 53 | Return every found reduced block as an array within body. 54 | - **cb** : *( `Function` required )* 55 | The callback function 56 | 57 | **callback arguments:** 58 | 59 | - **error** : *( `Error` )* 60 | Error information. If no error occurred this will be `null` 61 | - **data** : *( `Object` )* 62 | The extraction result 63 | - **data.body** : *( `String|Array` )* 64 | The whole body content or the content within the configured reduced element. There will be just the text content without html tags/attributes and without the content in script tags. 65 | If the reduced feature is used and `reduced.list = true` the body will be an array of all found reduced blocks. 66 | - **data.h1** : *( `Array` )* 67 | An array containing all `h1` text contents. Including the `h1` elements outside the configured reduced element 68 | - **data.meta** : *( `Object` )* 69 | An object of all found meta tags with the syntax ``. Other meta tags will be ignored. 70 | - **data.meta.charset** : *( `String` optional )* 71 | If a metatag with the charset setting like `` is defined it will be returned under `data.meta.charset` 72 | - **data.meta.title** : *( `String` default = `""` )* 73 | If the title tag is defined it will be returned under `data.meta.title`. 
Otherwise the key will contain an empty string 74 | - **data.meta.description** : *( `String` default = `""` )* 75 | If a metatag with the name `description` is defined it will be returned under `data.meta.description`. Otherwise the key will contain an empty string 76 | - **data.meta.keywords** : *( `Array` default = `[]` )* 77 | If a metatag with the name `keywords` is defined it will be returned as a trimmed array of strings under `data.meta.keywords`. Otherwise the key will contain an empty array 78 | 79 | ## Examples 80 | 81 | ### simple 82 | 83 | This is a simple example to extract the content of a html document 84 | 85 | ```js 86 | var Extrator = require("html-extractor"); 87 | var myExtrator = new Extrator(); 88 | 89 | var html = ` 90 | 91 | 92 | Testpage 93 | 94 | 95 |

Header 1

96 |

Content

97 | 98 | 99 | ` 100 | 101 | myExtrator.extract( html, function( err, data ){ 102 | if( err ){ 103 | throw( err ) 104 | } else { 105 | console.log( data ); 106 | // { 107 | // meta: { 108 | // title: 'Testpage', 109 | // description: '', 110 | // keywords: [] 111 | // }, 112 | // body: ' Header 1 Content ', 113 | // h1: [ 'Header 1' ] 114 | // } 115 | } 116 | }); 117 | ``` 118 | 119 | > see `test/readme_example_simple` or [run in Tonic](https://tonicdev.com/mpneuried/5767a1b1444f3a1400e793c2) 120 | 121 | ### advanced 122 | 123 | This is a advanced example to show the usage of the reducing. 124 | With the reduce feature it is possible to reduce the body content to the content of a specific html element. 125 | 126 | ```js 127 | var Extrator = require("html-extractor"); 128 | var myExtrator = new Extrator(); 129 | 130 | var html = ` 131 | 132 | 133 | Super page 134 | 135 | 136 | 137 | 138 | 139 | 142 | 147 |
148 |

First article €

149 |

Lorem ipsum dolor sit amet ...

150 |

Second article  ...  

151 |

Aenean commodo ligula eget dolor.

152 | 155 |
156 |
157 |

ABC 1

158 |

Lorem ipsum dolor sit amet ...

159 |
160 |
161 |

XYZ 1

162 |

Lorem ipsum dolor sit amet ...

163 |
164 |
165 |

ABC 2

166 |

Lorem ipsum dolor sit amet ...

167 |
168 | 171 | 172 | 173 | ` 174 | 175 | var reduceTo = { 176 | tag: "div", 177 | attr: "id", 178 | val: "content" 179 | } 180 | 181 | myExtrator.extract( html, reduceTo, function( err, data ){ 182 | if( err ){ 183 | throw( err ) 184 | } else { 185 | console.log( "String", data ); 186 | //{ 187 | // meta: { 188 | // title: 'Super page', 189 | // description: 'Look at this super page', 190 | // keywords: ['X', 'Y', 'Z'], 191 | // generator: 'Super pageCMS' 192 | // }, 193 | // body: 'First article € Lorem ipsum dolor sit amet ... Second article ... Aenean commodo ligula eget dolor. ', 194 | // h1: ['My super page2', 'First article €', 'Second article ...'] 195 | //} 196 | } 197 | }); 198 | 199 | var reduceToList = { 200 | tag: "div", 201 | attr: "id", 202 | val: "content", 203 | list: true 204 | }; 205 | 206 | myExtrator.extract( html, reduceToList, function( err, data ){ 207 | if( err ){ 208 | throw( err ) 209 | } else { 210 | console.log( "List", data ); 211 | //{ 212 | // meta: { 213 | // title: 'Super page', 214 | // description: 'Look at this super page', 215 | // keywords: ['X', 'Y', 'Z'], 216 | // generator: 'Super pageCMS' 217 | // }, 218 | // body: [ 219 | // 'ABC 1 Lorem ipsum dolor sit amet ... ', 220 | // 'ABC 2 Lorem ipsum dolor sit amet ... ' 221 | // ], 222 | // h1: ['My super page2', 'First article', 'Second article'] 223 | //} 224 | } 225 | }); 226 | ``` 227 | 228 | > see `test/readme_example_advanced` or [run in Tonic](https://tonicdev.com/mpneuried/5767a178b29b431300aeb02f) 229 | 230 | ## Work in progress 231 | 232 | `html-extractor` is work in progress. Your ideas, suggestions etc. are very welcome. 233 | 234 | ## Release History 235 | |Version|Date|Description| 236 | |:--:|:--:|:--| 237 | |0.2.2|2016-07-1|Fixed trimming when `reduced.list` is active #3. Thanks to [Javier Castro](https://github.com/jacargentina)| 238 | |0.2.1|2016-06-30|Fixed handling of html entities #1. 
Thanks to [Javier Castro](https://github.com/jacargentina)| 239 | |0.2.0|2016-06-20|Added option to return reduced elements as list; Fixed reduced value check for classes; Optimized dev env.| 240 | |0.1.4|-|Updated and pinned dependencies and optimized tests| 241 | |0.1.3|-|Fixed extraction to remove style-tag content| 242 | |0.1.2|-|Updated documentation| 243 | |0.1.1|-|Added raw documentation; Fixed `travis.yml` | 244 | |0.1.0|-|Initial version| 245 | 246 | [![NPM](https://nodei.co/npm-dl/html-extractor.png?months=6)](https://nodei.co/npm/html-extractor/) 247 | 248 | ## License 249 | 250 | (The MIT License) 251 | 252 | Copyright (c) 2016 M. Peter, http://www.tcs.de 253 | 254 | Permission is hereby granted, free of charge, to any person obtaining 255 | a copy of this software and associated documentation files (the 256 | 'Software'), to deal in the Software without restriction, including 257 | without limitation the rights to use, copy, modify, merge, publish, 258 | distribute, sublicense, and/or sell copies of the Software, and to 259 | permit persons to whom the Software is furnished to do so, subject to 260 | the following conditions: 261 | 262 | The above copyright notice and this permission notice shall be 263 | included in all copies or substantial portions of the Software. 264 | 265 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 266 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 267 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 268 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 269 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 270 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 271 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
272 | -------------------------------------------------------------------------------- /_src/lib/html_extractor.coffee: -------------------------------------------------------------------------------- 1 | # # HTMLExtractor 2 | # 3 | # Extract meta-data from a html string. 4 | # It extracts the body, title, meta-tags and first headlines to a object to push them to a search indexer like elastic-search 5 | 6 | # import external modules 7 | htmlparser = require("htmlparser2") 8 | _isEmpty = require('lodash/isEmpty') 9 | _isString = require('lodash/isString') 10 | _isArray = require('lodash/isArray') 11 | 12 | # export extractor class 13 | module.exports = class HTMLExtractor 14 | 15 | ### 16 | ## constructor 17 | 18 | `new HTMLExtractor( debug )` 19 | 20 | initializes a extractor instance 21 | 22 | @param { Boolean } [debug=false] Output the parsing time 23 | 24 | ### 25 | constructor: ( @debug = false )-> 26 | return 27 | 28 | # **_trimRegex** *RegEx* Regular expression for trimming. 29 | _trimRegex: /^\s+/ 30 | 31 | ### 32 | ## _trim 33 | 34 | `html_extractor._trim( str )` 35 | 36 | Trim method to remove whitespace 37 | 38 | @param { String } [str=""] String to trim 39 | 40 | @return { String } Trimmed string 41 | 42 | @api private 43 | ### 44 | _trim: ( str = "")=> 45 | str = str.replace( @_trimRegex, "") 46 | i = str.length - 1 47 | 48 | while i >= 0 49 | if /\S/.test(str.charAt(i)) 50 | str = str.substring(0, i + 1) 51 | break 52 | i-- 53 | str 54 | 55 | ### 56 | ## extract 57 | 58 | `html_extractor.extract( html[, reduce], cb )` 59 | 60 | Main method to extract the contens out of a html string 61 | 62 | @param { String } html Raw html string to extract the meta, title and body 63 | @param { Object } [reduce] Reduce config object to reduce the body results to a specific element. 
Example: `{ tag: "div", attr: "id", val: "myContent" }` 64 | @param { Function } reduce Callback function 65 | 66 | @api public 67 | ### 68 | extract: ( [ html, reduce ]..., cb )=> 69 | # default return Object 70 | _ret = 71 | meta: 72 | title: "" 73 | description: "" 74 | keywords: "" 75 | body: null 76 | h1: [] 77 | 78 | # init benchmarking on `debug = true` 79 | console.time( "\t\tparse Time" ) if @debug 80 | 81 | # run extractor 82 | @_extract html, _ret, reduce, ( err, data )=> 83 | if err 84 | cb( err ) 85 | return 86 | # return time on `debug = true` 87 | console.timeEnd( "\t\tparse Time" ) if @debug 88 | 89 | # trim results 90 | _ret.meta.title = @_trim( _ret.meta.title ) if _ret.meta?.title?.length 91 | _ret.meta.description = @_trim( _ret.meta.description ) if _ret.meta?.description?.length 92 | if _isString( _ret.body ) and _ret.body.length 93 | _ret.body = @_trim( _ret.body ) 94 | else if _isArray( _ret.body ) and _ret.body.length 95 | for listEl, idx in _ret.body when listEl?.length 96 | _ret.body[ idx ] = @_trim( listEl ) 97 | 98 | for _h, idx in _ret.h1 when _h?.length 99 | _ret.h1[ idx ] = @_trim( _h ) 100 | 101 | cb( null, data ) 102 | return 103 | return 104 | 105 | _extract: ( html, _ret, reduce, cb )=> 106 | 107 | # check the reduce config and disable it if one key is missing 108 | if not reduce?.tag? or not reduce.attr? or not reduce.val? 109 | reduce = null 110 | 111 | if reduce?.list? 
112 | reduce.list = true 113 | 114 | # set some flags 115 | _bodyMode = false 116 | _scriptMode = false 117 | _reducedBody = [] 118 | _reducedBodyIdx = 0 119 | _reduce_stack = null 120 | _body = [] 121 | _currTag = null 122 | _startBody = null 123 | _h1Open = false 124 | _h1LastOpen = false 125 | 126 | # allwasy create a instance of htmlparser2 to prevent race conditions through a possible instance parser value 127 | parser = new htmlparser.Parser( 128 | # event on tag open 129 | onopentag: ( name, attr )-> 130 | _currTag = name 131 | # check and start the reduced section by saving the current start stack. The collectin will be done within the `ontext` event. 132 | if reduce? and reduce.tag is name and attr[ reduce.attr ]?.indexOf( reduce.val ) >= 0 133 | _reducedBody[ _reducedBodyIdx ] = "" 134 | _reduce_stack = parser._stack.slice( 0,-1 ).join( "§§" ) 135 | 136 | switch name 137 | 138 | # get the meta tag attributes and set the meta return object 139 | when "meta" 140 | if attr? and attr.name? and attr.content? 141 | _ret.meta[ attr.name ] = attr.content 142 | #else if attr? and attr.property? and attr.content? 143 | # _ret.meta[ attr.property ] = attr.content 144 | #else if attr? and attr[ 'http-equiv' ]? and attr.content? 145 | # _ret.meta[ attr[ 'http-equiv' ] ] = attr.content 146 | else if attr? and attr.charset? 
147 | _ret.meta.charset = attr.charset 148 | 149 | # start the body section to activate the text body collector 150 | when "body" 151 | _bodyMode = true 152 | _startBody = parser._tokenizer._index 153 | 154 | # start a script section to prevent text get within scripts 155 | when "script", "style" 156 | _scriptMode = true 157 | 158 | # start a h1 section to pull the text in h1 tags out of the html 159 | when "h1" 160 | _h1Open = true 161 | return 162 | 163 | # event on a text fragment 164 | ontext: ( text )=> 165 | 166 | # check if the parser is in body and not in a script tag 167 | if _bodyMode and not _scriptMode 168 | 169 | # if reduce is active only push to the body if a stack is defined 170 | if reduce? and _reduce_stack? 171 | _body.push( text ) 172 | _reducedBody[ _reducedBodyIdx ] += text 173 | else if not reduce? 174 | _body.push( text ) 175 | 176 | # if the h1 state is active push the text to the h1 array 177 | if _h1Open 178 | # on subtag in the h1 tag the `_h1LastOpen` will be true so the sub tag content will be added to the latest h1 element 179 | if _h1LastOpen 180 | _ret.h1[ _ret.h1.length - 1 ] += text 181 | else 182 | _ret.h1.push text 183 | _h1LastOpen = true 184 | else 185 | _h1LastOpen = false 186 | 187 | 188 | switch _currTag 189 | # save the content of the title tag to the meta object 190 | when "title" 191 | _ret.meta.title += text 192 | 193 | return 194 | 195 | # event on tag close 196 | onclosetag: ( name )-> 197 | _currTag = null 198 | 199 | # check if the stack matches the stack on reduce start and stop an active reduce section 200 | if _reduce_stack? 
and _reduce_stack is parser._stack.join( "§§" ) 201 | _reducedBodyIdx++ 202 | _reduce_stack = null 203 | 204 | switch name 205 | # stop the body section 206 | when "body" 207 | if _startBody < parser._tokenizer._index 208 | _bodyMode = false 209 | # stop a h1 section 210 | when "h1" 211 | _h1Open = false 212 | _h1LastOpen = false 213 | # stop a script section 214 | when "script", "style" 215 | _scriptMode = false 216 | return 217 | 218 | return 219 | onend: => 220 | # if keywords are defined convert it to an array 221 | if _ret.meta.keywords? 222 | _ret.meta.keywords = for _word in _ret.meta.keywords.split( "," ) when not _isEmpty( _word ) 223 | @_trim( _word ) 224 | 225 | if reduce?.list? 226 | _ret.body = [] 227 | for _redTxt in _reducedBody 228 | _redTxt = @_trim( _redTxt ).replace( /\s\s+/g, " " ) 229 | if _redTxt.length 230 | _ret.body.push _redTxt 231 | else 232 | _ret.body = _body.join( " " ).replace( /\s\s+/g, " " ) 233 | 234 | cb( null, _ret ) 235 | return 236 | 237 | # allways us lowertags because tags could be written upper or lowercase 238 | , {lowerCaseTags: true, decodeEntities: true } ) 239 | 240 | # push the html to the parser 241 | parser.write( html ) 242 | 243 | # finish the parsing and let the parser call end 244 | parser.end() 245 | 246 | return 247 | -------------------------------------------------------------------------------- /_src/test/readme_example_advanced.coffee: -------------------------------------------------------------------------------- 1 | Extrator = require("../lib/html_extractor") 2 | myExtrator = new Extrator() 3 | 4 | html = """ 5 | 6 | 7 | Super page 8 | 9 | 10 | 11 | 12 | 13 | 16 | 21 |
22 |

First article

23 |

Lorem ipsum dolor sit amet ...

24 |

Second article

25 |

Aenean commodo ligula eget dolor.

26 |
27 |

ABC 1

28 |

Lorem ipsum dolor sit amet ...

29 |
30 |
31 |

XYZ 1

32 |

Lorem ipsum dolor sit amet ...

33 |
34 |
35 |

ABC 2

36 |

Lorem ipsum dolor sit amet ...

37 |
38 |
39 |

XYZ 2

40 |

Lorem ipsum dolor sit amet ...

41 |
42 |
43 |

ABC 3

44 |

Lorem ipsum dolor sit amet ...

45 |
46 | 49 |
50 | 53 | 54 | 55 | """ 56 | 57 | reduceTo = 58 | tag: "div" 59 | attr: "id" 60 | val: "content" 61 | 62 | myExtrator.extract html, reduceTo, ( err, data )-> 63 | if err 64 | throw err 65 | else 66 | console.log data 67 | # { 68 | # meta: { 69 | # title: 'Super page', 70 | # description: 'Look at this super page', 71 | # keywords: ['X', 'Y', 'Z'], 72 | # generator: 'Super pageCMS' 73 | # }, 74 | # body: ' First article Lorem ipsum dolor sit amet ... Second article Aenean commodo ligula eget dolor. ', 75 | # h1: ['My super page2', 'First article', 'Second article'] 76 | # } 77 | 78 | 79 | reduceTo2 = 80 | tag: "section" 81 | attr: "class" 82 | val: "abc" 83 | list: true 84 | 85 | myExtrator.extract html, reduceTo2, ( err, data )-> 86 | if err 87 | throw err 88 | else 89 | console.log data 90 | # { 91 | # meta: { 92 | # title: 'Super page', 93 | # description: 'Look at this super page', 94 | # keywords: ['X', 'Y', 'Z'], 95 | # generator: 'Super pageCMS' 96 | # }, 97 | # body: ' First article Lorem ipsum dolor sit amet ... Second article Aenean commodo ligula eget dolor. ', 98 | # h1: ['My super page2', 'First article', 'Second article'] 99 | # } 100 | return 101 | 102 | return 103 | -------------------------------------------------------------------------------- /_src/test/readme_example_simple.coffee: -------------------------------------------------------------------------------- 1 | Extrator = require("../lib/html_extractor") 2 | myExtrator = new Extrator() 3 | 4 | html = """ 5 | 6 | 7 | Testpage 8 | 9 | 10 |

Header 1

11 |

Content

12 | 13 | 14 | """ 15 | 16 | myExtrator.extract html, ( err, data )-> 17 | if err 18 | throw err 19 | else 20 | console.log data 21 | # { 22 | # meta: { 23 | # title: 'Testpage', 24 | # description: '', 25 | # keywords: [] 26 | # }, 27 | # body: ' Header 1 Content ', 28 | # h1: [ 'Header 1' ] 29 | # } 30 | return 31 | -------------------------------------------------------------------------------- /_src/test/test.coffee: -------------------------------------------------------------------------------- 1 | HTMLExtractor = require( "../lib/html_extractor" ) 2 | testData = require( "./test_data" ) 3 | 4 | request = require( "request" ) 5 | 6 | should = require( "should" ) 7 | 8 | _extractor = new HTMLExtractor( true ) 9 | 10 | getHTML = ( link, cb )-> 11 | request.get link, ( err, data )-> 12 | if err 13 | throw err 14 | cb( data.body ) 15 | return 16 | return 17 | 18 | describe 'HTML-dispatch-TEST', -> 19 | 20 | before ( done )-> 21 | done() 22 | return 23 | 24 | after ( done )-> 25 | done() 26 | return 27 | 28 | 29 | 30 | describe 'TEST Parser', -> 31 | it "Test tcs.de HTML", ( done )-> 32 | 33 | _extractor.extract testData.html[ 0 ], ( err, data )-> 34 | if err 35 | throw err 36 | 37 | should.exist( data.meta ) 38 | should.exist( data.meta.title ) 39 | data.meta.title.should.equal("TCS: Team Centric Software GmbH & Co. 
KG") 40 | should.exist( data.body ) 41 | data.body.should.not.be.empty 42 | 43 | data.body.should.not.containEql( "$('#contactform')" ) 44 | data.body.should.not.containEql( ".testcssselector" ) 45 | data.body.should.not.containEql( " 52 | 53 | _extractor.extract testData.html[ 1 ], ( err, data )-> 54 | if err 55 | throw err 56 | 57 | should.exist( data.meta ) 58 | should.exist( data.meta.title ) 59 | data.meta.title.should.equal("SPIEGEL ONLINE - Nachrichten") 60 | should.exist( data.body ) 61 | data.body.should.not.containEql( " 71 | 72 | it "test get HTML", ( done )-> 73 | 74 | getHTML testData.links[ 0 ], ( html )-> 75 | html.should.be.a.String() 76 | html.length.should.be.above( 0 ) 77 | html.should.containEql( "Team Centric Software GmbH" ) 78 | done() 79 | return 80 | return 81 | 82 | describe 'Test Parser with multiple pages', -> 83 | _count = process.env.COUNT or 5 84 | for _link, idx in testData.links[ 0.._count ] 85 | do( _link )-> 86 | it "#{ idx }: Parse '#{ _link }'", ( done )-> 87 | 88 | getHTML _link, ( html )-> 89 | 90 | _extractor.extract html, ( err, data )-> 91 | if err 92 | throw err 93 | should.exist( data.meta ) 94 | should.exist( data.meta.title ) 95 | should.exist( data.body ) 96 | data.body.should.not.containEql( " 109 | for _reduce, idx in testData.reduce 110 | do( _reduce, idx )-> 111 | it "#{ idx }: Reduced parse '#{ _reduce.url }'", ( done )-> 112 | getHTML _reduce.url, ( html )-> 113 | 114 | _extractor.extract html, _reduce.reduced, ( err, data )-> 115 | if err 116 | throw err 117 | should.exist( data.meta ) 118 | should.exist( data.meta.title ) 119 | should.exist( data.body ) 120 | data.body.should.not.be.empty 121 | switch idx 122 | when 0 123 | data.body.should.be.instanceof( String ) 124 | data.body.should.not.containEql( " 153 | it "#1 Returned body contains html entities", ( done )-> 154 | _html = '

 HELLO! 

 Headline > < €   ...   

' 155 | _exp = 156 | meta: 157 | title: "" 158 | description: "" 159 | keywords: [] 160 | body: "HELLO! Headline > < € ..." 161 | h1: [ "Headline > < €   ..." ] 162 | 163 | _extractor.extract _html, ( err, data )-> 164 | if err 165 | throw err 166 | should.exist( data ) 167 | data.should.eql( _exp ) 168 | done() 169 | return 170 | return 171 | 172 | return 173 | 174 | it "#3 str.replace is not a function when using reduce with list: true", ( done )-> 175 | _html = '

term one

non indexable content

term   two   

' 176 | _exp = 177 | meta: 178 | title: "" 179 | description: "" 180 | keywords: [] 181 | body: ["term one", "term   two"] 182 | h1: [] 183 | _reduce = 184 | tag: "p" 185 | attr: "id" 186 | val: "indexable" 187 | list: true 188 | 189 | _extractor.extract _html, _reduce, ( err, data )-> 190 | if err 191 | throw err 192 | should.exist( data ) 193 | data.should.eql( _exp ) 194 | done() 195 | return 196 | return 197 | 198 | 199 | return 200 | 201 | 202 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # http://www.appveyor.com/docs/appveyor-yml 2 | 3 | # Test against these versions of Node.js. 4 | environment: 5 | matrix: 6 | - nodejs_version: "0.10" 7 | - nodejs_version: "0.12" 8 | - nodejs_version: "4" 9 | - nodejs_version: "5" 10 | - nodejs_version: "6" 11 | 12 | pull_requests: 13 | do_not_increment_build_number: true 14 | 15 | platform: Any CPU 16 | shallow_clone: true 17 | 18 | # Install scripts. (runs after repo cloning) 19 | install: 20 | # Get the latest stable version of Node 0.STABLE.latest 21 | - ps: Install-Product node $env:nodejs_version 22 | # Typical npm stuff. Use msvs 2013 for the hiredis parser 23 | - npm install 24 | - npm install -g grunt-cli 25 | - grunt build 26 | 27 | # Post-install test scripts. 28 | test_script: 29 | # Output useful info for debugging. 30 | - node --version 31 | - npm --version 32 | - cmd: npm t 33 | 34 | os: 35 | - Default Azure 36 | - Windows Server 2012 R2 37 | 38 | # Don't actually build using MSBuild 39 | build: off 40 | 41 | # Set build version format here instead of in the admin panel. 
42 | version: "{build}" 43 | -------------------------------------------------------------------------------- /coffeelint.json: -------------------------------------------------------------------------------- 1 | { 2 | "arrow_spacing": { 3 | "level": "ignore" 4 | }, 5 | "braces_spacing": { 6 | "level": "ignore", 7 | "spaces": 0, 8 | "empty_object_spaces": 0 9 | }, 10 | "camel_case_classes": { 11 | "level": "error" 12 | }, 13 | "coffeescript_error": { 14 | "level": "error" 15 | }, 16 | "colon_assignment_spacing": { 17 | "level": "ignore", 18 | "spacing": { 19 | "left": 0, 20 | "right": 0 21 | } 22 | }, 23 | "cyclomatic_complexity": { 24 | "value": 10, 25 | "level": "ignore" 26 | }, 27 | "duplicate_key": { 28 | "level": "error" 29 | }, 30 | "empty_constructor_needs_parens": { 31 | "level": "ignore" 32 | }, 33 | "ensure_comprehensions": { 34 | "level": "warn" 35 | }, 36 | "indentation": { 37 | "value": 1, 38 | "level": "error" 39 | }, 40 | "line_endings": { 41 | "level": "ignore", 42 | "value": "unix" 43 | }, 44 | "max_line_length": { 45 | "value": 120, 46 | "level": "ignore", 47 | "limitComments": false 48 | }, 49 | "missing_fat_arrows": { 50 | "level": "ignore", 51 | "is_strict": false 52 | }, 53 | "newlines_after_classes": { 54 | "value": 3, 55 | "level": "ignore" 56 | }, 57 | "no_backticks": { 58 | "level": "error" 59 | }, 60 | "no_debugger": { 61 | "level": "warn" 62 | }, 63 | "no_empty_functions": { 64 | "level": "ignore" 65 | }, 66 | "no_empty_param_list": { 67 | "level": "ignore" 68 | }, 69 | "no_implicit_braces": { 70 | "level": "ignore", 71 | "strict": true 72 | }, 73 | "no_implicit_parens": { 74 | "strict": true, 75 | "level": "ignore" 76 | }, 77 | "no_interpolation_in_single_quotes": { 78 | "level": "ignore" 79 | }, 80 | "no_plusplus": { 81 | "level": "ignore" 82 | }, 83 | "no_stand_alone_at": { 84 | "level": "ignore" 85 | }, 86 | "no_tabs": { 87 | "level": "ignore" 88 | }, 89 | "no_throwing_strings": { 90 | "level": "error" 91 | }, 92 | 
"no_trailing_semicolons": { 93 | "level": "error" 94 | }, 95 | "no_trailing_whitespace": { 96 | "level": "error", 97 | "allowed_in_comments": false, 98 | "allowed_in_empty_lines": true 99 | }, 100 | "no_unnecessary_double_quotes": { 101 | "level": "ignore" 102 | }, 103 | "no_unnecessary_fat_arrows": { 104 | "level": "warn" 105 | }, 106 | "non_empty_constructor_needs_parens": { 107 | "level": "ignore" 108 | }, 109 | "prefer_english_operator": { 110 | "level": "ignore", 111 | "doubleNotLevel": "ignore" 112 | }, 113 | "space_operators": { 114 | "level": "ignore" 115 | }, 116 | "spacing_after_comma": { 117 | "level": "ignore" 118 | }, 119 | "transform_messes_up_line_numbers": { 120 | "level": "warn" 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "html-extractor", 3 | "description": "Extract meta-data from a html string. It extracts the body, title, meta-tags and first headlines to a object to push them to a search indexer like elastic-search", 4 | "version": "0.2.2", 5 | "homepage": "https://github.com/mpneuried/html-extractor", 6 | "keywords": [ 7 | "html", "parse", "extract", "body", "search", "tool", "elastic", "headlines", "meta", "data" 8 | ], 9 | "author": { 10 | "name": "Mathias Peter" 11 | }, 12 | "repository": { 13 | "type": "git", 14 | "url": "git://github.com/mpneuried/html-extractor.git" 15 | }, 16 | "bugs": { 17 | "url": "https://github.com/mpneuried/html-extractor/issues" 18 | }, 19 | "licenses": [ 20 | { 21 | "type": "MIT", 22 | "url": "https://github.com/mpneuried/html-extractor/blob/master/LICENSE-MIT" 23 | } 24 | ], 25 | "main": "./lib/html_extractor.js", 26 | "engines": { 27 | "node": ">= 0.8.10" 28 | }, 29 | "scripts": { 30 | "test": "grunt test" 31 | }, 32 | "dependencies": { 33 | "htmlparser2": "3.9.x", 34 | "lodash": "4.x" 35 | }, 36 | "devDependencies": { 37 | "should": "9.x", 
38 | "request": "2.x", 39 | "grunt-contrib-watch": "*", 40 | "grunt-contrib-coffee": "1.x", 41 | "grunt-mocha-cli": "2.x", 42 | "grunt": "1.x" 43 | } 44 | } 45 | --------------------------------------------------------------------------------