├── .npmignore ├── .npmrc ├── .babelrc ├── .gitignore ├── .travis.yml ├── bin └── build.sh ├── src ├── parsers │ ├── utils.js │ ├── metatag-parser.js │ ├── jsonld-parser.js │ └── micro-rdfa-parser.js └── index.js ├── tonicExample.js ├── test ├── test.js └── resources │ ├── expectedResult.json │ └── testPage.html ├── LICENSE ├── package.json └── README.md /.npmignore: -------------------------------------------------------------------------------- 1 | src 2 | test 3 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | registry=https://registry.npmjs.org 2 | -------------------------------------------------------------------------------- /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | "es2015", 4 | "stage-0" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | npm-debug.log 3 | dist 4 | .tmp 5 | .DS_Store 6 | build/ 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - '5' 4 | 5 | script: 6 | - npm run lint 7 | - npm run test 8 | -------------------------------------------------------------------------------- /bin/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | source ~/.nvm/nvm.sh 5 | nvm install 5.9 6 | nvm use 5.9 7 | 8 | npm version $GO_PIPELINE_LABEL 9 | npm install 10 | npm install -g npm-cli-login 11 | npm-cli-login 12 | npm publish . 13 | -------------------------------------------------------------------------------- /src/parsers/utils.js: -------------------------------------------------------------------------------- 1 | import $ from 'cheerio' 2 | 3 | export function getCheerioObject (html) { 4 | let $html 5 | if (typeof html === 'string') { 6 | $html = $.load(html, { xmlMode: true }) 7 | } else if ($(html).cheerio) { 8 | $html = html 9 | } else { 10 | throw new Error('Invalid argument: pass valid html string or cheerio object') 11 | } 12 | return $html 13 | } 14 | -------------------------------------------------------------------------------- /tonicExample.js: -------------------------------------------------------------------------------- 1 | var WAE = require('web-auto-extractor').default 2 | //ES6: import WAE from 'web-auto-extractor' 3 | var request = require('request') 4 | 5 | var pageUrl = 'http://southernafricatravel.com/' 6 | //var pageUrl = 'https://raw.githubusercontent.com/ind9/web-auto-extractor/master/test/resources/testPage.html' 7 | 8 | request(pageUrl, function (error, response, body) { 9 | 10 | var wae = WAE() 11 | 12 | var parsed = wae.parse(body) 13 | 14 | console.log(parsed) 15 | 16 | }) 17 | -------------------------------------------------------------------------------- /src/parsers/metatag-parser.js: -------------------------------------------------------------------------------- 1 | 2 | export default ($) => { 3 | let metatagsData = {} 4 | $('meta').each((index, elem) => { 5 | const nameKey = Object.keys(elem.attribs).find((attr) => ['name', 'property', 'itemprop', 'http-equiv'].indexOf(attr) !== -1) 6 | const name = elem.attribs[nameKey] 7 | const value = elem.attribs['content'] 8 | if (!metatagsData[name]) { 9 | metatagsData[name] = [] 10 | } 11 | metatagsData[name].push(value) 12 | }) 13 | return metatagsData 14 | } 15 | -------------------------------------------------------------------------------- /src/parsers/jsonld-parser.js: -------------------------------------------------------------------------------- 1 | import { getCheerioObject } from './utils' 2 | import $ from 'cheerio' 3 | 4 | export default function (html, config = {}) { 5 | const $html = getCheerioObject(html) 6 | let jsonldData = {} 7 | 8 | $html('script[type="application/ld+json"]').each((index, item) => { 9 | try { 10 | let parsedJSON = JSON.parse($(item).text()) 11 | if (!Array.isArray(parsedJSON)) { 12 | parsedJSON = [parsedJSON] 13 | } 14 | parsedJSON.forEach(obj => { 15 | const type = obj['@type'] 16 | jsonldData[type] = jsonldData[type] || [] 17 | jsonldData[type].push(obj) 18 | }) 19 | } catch (e) { 20 | console.log(`Error in jsonld parse - ${e}`) 21 | } 22 | }) 23 | 24 | return jsonldData 25 | } 26 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | import $ from 'cheerio' 2 | import MetaTagsParser from './parsers/metatag-parser' 3 | import MicroRdfaParser from './parsers/micro-rdfa-parser' 4 | import JsonldParser from './parsers/jsonld-parser' 5 | if (!global._babelPolyfill) { 6 | require('babel-polyfill') 7 | } 8 | 9 | export default function () { 10 | let $html = null 11 | 12 | const loadCheerioObject = function (_$html) { 13 | $html = _$html 14 | } 15 | 16 | const parse = function (html, options) { 17 | if (!($html && $html.prototype && $html.prototype.cheerio)) { 18 | $html = $.load(html, options) 19 | } 20 | 21 | return { 22 | metatags: MetaTagsParser($html), 23 | microdata: MicroRdfaParser(html, 'micro'), 24 | rdfa: MicroRdfaParser(html, 'rdfa'), 25 | jsonld: JsonldParser($html) 26 | } 27 | } 28 | 29 | return { 30 | parse, 31 | loadCheerioObject 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /test/test.js: -------------------------------------------------------------------------------- 1 | /* eslint-env mocha */ 2 | import 'babel-polyfill' 3 | import fs from 'fs' 4 | import { assert } from 'chai' 5 | import WAE from '../src' 6 | 7 | const fileReader = (fileName) => fs.readFileSync(fileName, { encoding: 'utf-8' }) 8 | const expectedResult = JSON.parse(fileReader('test/resources/expectedResult.json')) 9 | const testPage = fileReader('test/resources/testPage.html') 10 | const { microdata, rdfa, metatags, jsonld } = WAE().parse(testPage) 11 | 12 | describe('Web Auto Extractor', function () { 13 | it('should find all elements with microdata', function () { 14 | assert.deepEqual(microdata, expectedResult.microdata) 15 | }) 16 | 17 | it('should find all elements with rdfa', function () { 18 | assert.deepEqual(rdfa, expectedResult.rdfa) 19 | }) 20 | 21 | it('should find embedded json-ld', function () { 22 | assert.deepEqual(jsonld, expectedResult.jsonld) 23 | }) 24 | 25 | it('should find embedded meta tags', function () { 26 | assert.deepEqual(metatags, expectedResult.metatags) 27 | }) 28 | }) 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "web-auto-extractor", 3 | "version": "1.0.1", 4 | "description": "Automatically extracts structured information from webpages", 5 | "main": "dist/index.js", 6 | "scripts": { 7 | "build": "babel src -d dist", 8 | "test": "mocha --recursive --compilers js:babel-register", 9 | "lint": "standard", 10 | "dev": "nodemon --exec npm run lint", 11 | "prepublish": "npm run lint && npm run test && npm run build" 12 | }, 13 | "engines": { 14 | "node": ">=4.4" 15 | }, 16 | "license": "MIT", 17 | "dependencies": { 18 | "babel-polyfill": "^6.8.0", 19 | "cheerio": "^0.22.0", 20 | "htmlparser2": "^3.9.1" 21 | }, 22 | "devDependencies": { 23 | "babel-cli": "^6.8.0", 24 | "babel-eslint": "^6.0.4", 25 | "babel-preset-es2015": "^6.6.0", 26 | "babel-preset-stage-0": "^6.5.0", 27 | "chai": "^3.5.0", 28 | "mocha": "^2.4.5", 29 | "nodemon": "^1.9.2", 30 | "sinon": "^1.17.4", 31 | "standard": "^7.0.1" 32 | }, 33 | "standard": { 34 | "parser": "babel-eslint", 35 | "ignore": [ 36 | "tonicExample.js" 37 | ] 38 | }, 39 | "keywords": [ 40 | "crawler", 41 | "parser", 42 | "html parser", 43 | "schema.org", 44 | "microdata", 45 | "rdfa", 46 | "jsonld", 47 | "seo" 48 | ], 49 | "repository": { 50 | "type": "git", 51 | "url": "https://github.com/indix/web-auto-extractor.git" 52 | }, 53 | "bugs": { 54 | "url": "https://github.com/indix/web-auto-extractor/issues" 55 | }, 56 | "tonicExampleFilename": "tonicExample.js" 57 | } 58 | -------------------------------------------------------------------------------- /src/parsers/micro-rdfa-parser.js: -------------------------------------------------------------------------------- 1 | import htmlparser from 'htmlparser2' 2 | 3 | function getPropValue (tagName, attribs, TYPE, PROP) { 4 | if (attribs[TYPE]) { 5 | return null 6 | } else if ((tagName === 'a' || tagName === 'link') && attribs.href) { 7 | return attribs.href.trim() 8 | } else if (attribs.content) { 9 | return attribs.content.trim() 10 | } else if (attribs[PROP] === 'image' && attribs.src) { 11 | return attribs.src.trim() 12 | } else { 13 | return null 14 | } 15 | } 16 | 17 | const getAttrNames = (specName) => { 18 | let TYPE, PROP 19 | if (specName.toLowerCase().startsWith('micro')) { 20 | TYPE = 'itemtype' 21 | PROP = 'itemprop' 22 | } else if (specName.toLowerCase().startsWith('rdfa')) { 23 | TYPE = 'typeof' 24 | PROP = 'property' 25 | } else { 26 | throw new Error('Unsupported spec: use either micro or rdfa') 27 | } 28 | return { TYPE, PROP } 29 | } 30 | 31 | const getType = (typeString) => { 32 | const match = (/(.*\/)(\w+)/g).exec(typeString) 33 | return { 34 | context: match && match[1] ? match[1] : undefined, 35 | type: match && match[2] ? match[2] : typeString 36 | } 37 | } 38 | 39 | const createHandler = function (specName) { 40 | let scopes = [] 41 | let tags = [] 42 | let topLevelScope = {} 43 | let textForProp = null 44 | const { TYPE, PROP } = getAttrNames(specName) 45 | 46 | const onopentag = function (tagName, attribs) { 47 | let currentScope = scopes[scopes.length - 1] 48 | let tag = false 49 | 50 | if (attribs[TYPE]) { 51 | if (attribs[PROP] && currentScope) { 52 | let newScope = {} 53 | currentScope[attribs[PROP]] = currentScope[attribs[PROP]] || [] 54 | currentScope[attribs[PROP]].push(newScope) 55 | currentScope = newScope 56 | } else { 57 | currentScope = {} 58 | const { type } = getType(attribs[TYPE]) 59 | topLevelScope[type] = topLevelScope[type] || [] 60 | topLevelScope[type].push(currentScope) 61 | } 62 | } 63 | 64 | if (currentScope) { 65 | if (attribs[TYPE]) { 66 | const { context, type } = getType(attribs[TYPE]) 67 | const vocab = attribs.vocab 68 | currentScope['@context'] = context || vocab 69 | currentScope['@type'] = type 70 | tag = TYPE 71 | scopes.push(currentScope) 72 | } else if (attribs[PROP]) { 73 | if (currentScope[attribs[PROP]] && !Array.isArray(currentScope[attribs[PROP]])) { 74 | // PROP occurs for the second time, storing it as an array 75 | currentScope[attribs[PROP]] = [currentScope[attribs[PROP]]] 76 | } 77 | 78 | var value = getPropValue(tagName, attribs, TYPE, PROP) 79 | if (!value) { 80 | tag = PROP 81 | if (Array.isArray(currentScope[attribs[PROP]])) { 82 | currentScope[attribs[PROP]].push('') 83 | } else { 84 | currentScope[attribs[PROP]] = '' 85 | } 86 | textForProp = attribs[PROP] 87 | } else { 88 | if (Array.isArray(currentScope[attribs[PROP]])) { 89 | currentScope[attribs[PROP]].push(value) 90 | } else { 91 | currentScope[attribs[PROP]] = value 92 | } 93 | } 94 | } 95 | } 96 | tags.push(tag) 97 | } 98 | const ontext = function (text) { 99 | if (textForProp) { 100 | if (Array.isArray(scopes[scopes.length - 1][textForProp])) { 101 | scopes[scopes.length - 1][textForProp][scopes[scopes.length - 1][textForProp].length - 1] += text.trim() 102 | } else { 103 | scopes[scopes.length - 1][textForProp] += text.trim() 104 | } 105 | } 106 | } 107 | const onclosetag = function (tagname) { 108 | const tag = tags.pop() 109 | if (tag === TYPE) { 110 | let scope = scopes.pop() 111 | if (!scope['@context']) { 112 | delete scope['@context'] 113 | } 114 | Object.keys(scope).forEach((key) => { 115 | if (Array.isArray(scope[key]) && scope[key].length === 1) { 116 | scope[key] = scope[key][0] 117 | } 118 | }) 119 | } else if (tag === PROP) { 120 | textForProp = false 121 | } 122 | } 123 | 124 | return { 125 | onopentag, 126 | ontext, 127 | onclosetag, 128 | topLevelScope 129 | } 130 | } 131 | 132 | export default (html, specName) => { 133 | const handler = createHandler(specName) 134 | new htmlparser.Parser(handler).end(html) 135 | return handler.topLevelScope 136 | } 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web Auto Extractor 2 | [](https://travis-ci.org/indix/web-auto-extractor) 3 | 4 | Parse semantically structured information from any HTML webpage. 5 | 6 | Supported formats:- 7 | - Encodings that support [Schema.org](http://schema.org/) vocabularies:- 8 | - Microdata 9 | - RDFa-lite 10 | - JSON-LD 11 | - Random Meta tags 12 | 13 | Popularly, many websites mark up their webpages with Schema.org vocabularies for better SEO. This library helps you parse that information to JSON. 14 | 15 | **[Demo](https://tonicdev.com/npm/web-auto-extractor)** it on tonicdev 16 | 17 | ## Installation 18 | `npm install web-auto-extractor` 19 | 20 | ## [Usage](#usage) 21 | 22 | ```js 23 | // IF CommonJS 24 | var WAE = require('web-auto-extractor').default 25 | // IF ES6 26 | import WAE from 'web-auto-extractor' 27 | 28 | var parsed = WAE().parse(sampleHTML) 29 | 30 | ``` 31 | 32 | Let's use the following text as the `sampleHTML` in our example. It uses Schema.org vocabularies to structure a Product information and is encoded in `microdata` format. 33 | 34 | #### [Input](#input) 35 | ```html 36 |
40 | Sleeker than ACME's Classic Anvil, the
41 | Executive Anvil is perfect for the business traveler
42 | looking for something to drop from a height.
43 |
44 | Product #: 925872
45 |
46 | 4.4 stars, based on 89
47 | reviews
48 |
49 |
50 |
51 | Regular price: $179.99
52 |
53 | $119.99
54 | (Sale ends )
56 | Available from:
57 | Executive Objects
58 |
59 | Condition: Previously owned,
60 | in excellent condition
61 | In stock! Order now!
62 |
63 | 130 |
60 | Sleeker than ACME's Classic Anvil, the
61 | Executive Anvil is perfect for the business traveler
62 | looking for something to drop from a height.
63 |
64 | Product #: 925872
65 |
66 | 4.4 stars, based on 89
67 | reviews
68 |
69 |
70 |
71 | Regular price: $179.99
72 |
73 | $119.99
74 | (Sale ends )
76 | Available from:
77 | Executive Objects
78 |
79 | Condition: Previously owned,
80 | in excellent condition
81 | In stock! Order now!
82 |
83 |
92 | This classic banana bread recipe comes
93 | from my mom -- the walnuts add a nice texture and flavor to the banana
94 | bread.
95 | Prep Time: 15 minutes
96 | Cook time: 1 hour
97 | Yield: 1 loaf
98 | Tags: Low fat
99 |
127 | Sleeker than ACME's Classic Anvil, the
128 | Executive Anvil is perfect for the business traveler
129 | looking for something to drop from a height.
130 |
131 | Product #: 925872
132 |
133 | 4.4 stars, based on 89
134 | reviews
135 |
136 |
137 |
138 | Regular price: $179.99
139 |
140 | $119.99
141 | (Sale ends )
143 | Available from:
144 | Executive Objects
145 |
146 | Condition: Previously owned,
147 | in excellent condition
148 | In stock! Order now!
149 |
150 |