├── .editorconfig ├── .gitignore ├── .npmignore ├── LICENSE ├── index.js ├── package-lock.json ├── package.json ├── readme.md └── test.js /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | indent_size = 2 7 | indent_style = space 8 | trim_trailing_whitespace = true 9 | 10 | [**.{js,json,md}] 11 | insert_final_newline = true 12 | 13 | [**.html] 14 | insert_final_newline = false 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | coverage 4 | .nyc_output 5 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .git* 3 | coverage 4 | .nyc_output 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Alexey Novikov http://2dubs.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
/*
 * (LICENSE, continued)
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ---- /index.js ----
 */
"use strict";
const url = require("url");
const got = require("got");
const htmlparser = require("htmlparser2");
const FileType = require("file-type");
const Transform = require("stream").Transform;
const StringDecoder = require("string_decoder").StringDecoder;
const VERSION = require("./package.json").version;

const USERAGENT = `meta-extractor/${VERSION} (https://github.com/velocityzen/meta-extractor)`;

// Response body size limit (in bytes) applied when `opts.limit` is not set.
const DEFAULT_LIMIT = 2 * 1024 * 1024;

// Meta names that are collected when `opts.rxMeta` is not set.
const DEFAULT_RX_META = /charset|description|keywords|twitter:|og:|vk:|al:|theme-color/im;

/**
 * Converts a got `HTTPError` into a plain `Error` that carries the response
 * status code as `statusCode`. Any other error is returned unchanged.
 *
 * @param {Error} error - error emitted by the got stream
 * @returns {Error}
 */
function getError(error) {
  if (error instanceof got.HTTPError) {
    const err = new Error(error.message);
    err.statusCode = error.response.statusCode;
    return err;
  }

  return error;
}

/**
 * Turns a meta name into a camelCase key: every `:`, `_` or `-` that is
 * followed by a word character is dropped and that character is upper-cased
 * (`og:video:width` -> `ogVideoWidth`).
 *
 * @param {string} name
 * @returns {string}
 */
function fixName(name) {
  return name.replace(/(?::|_|-)(\w)/g, (matches, letter) =>
    letter.toUpperCase()
  );
}

/**
 * Extracts a `[key, value]` pair from the attributes of a `<meta>` tag.
 *
 * The meta name is taken from `name`, then `property`, then the first
 * attribute present (which covers `<meta charset="...">`). The value is
 * `content`, or the value of the attribute named like the meta itself.
 *
 * @param {Object} attrs - attributes of the tag
 * @param {RegExp} rx - only names matching this expression are returned
 * @returns {Array|undefined} `[camelCaseName, value]`, or `undefined` when
 *   the tag has no usable name or the name does not match `rx`
 */
function parseMeta(attrs, rx) {
  const name = attrs.name || attrs.property || Object.keys(attrs)[0];

  // A <meta> without any attribute has nothing to offer.
  if (!name) {
    return undefined;
  }

  // A caller-supplied expression may carry the `g` or `y` flag, which makes
  // `test` resume from `lastIndex`; always start matching from the beginning.
  rx.lastIndex = 0;
  if (!rx.test(name)) {
    return undefined;
  }

  return [fixName(name), attrs.content || attrs[name]];
}

/**
 * Describes the feed referenced by a `<link>` tag.
 *
 * @param {Object} attrs - attributes of the tag
 * @returns {{type: string, href: string, title: string}|undefined} the feed,
 *   or `undefined` when `type` is not `application/atom+xml` or
 *   `application/rss+xml` (case-insensitive)
 */
function parseFeed(attrs) {
  const match = /^application\/(atom|rss)\+xml$/i.exec(attrs.type);

  if (!match) {
    return undefined;
  }

  return {
    type: match[1],
    href: attrs.href,
    title: attrs.title,
  };
}

/**
 * Creates the htmlparser2 parser that fills `res` while the document is
 * being written to it:
 *
 * - `res.title`  — text of `<title>` found inside `<head>`
 * - `res[<key>]` — first value of every `<meta>` accepted by `opts.rx`
 * - `res.images` — `Set` of `<img src>` values resolved against `opts.uri`
 * - `res.feeds`  — array of atom/rss `<link>` tags found inside `<head>`
 *
 * @param {Object} res - result object, mutated in place
 * @param {{uri: string, rx: RegExp}} opts
 * @returns {Object} htmlparser2 `Parser` instance
 */
function createHtmlParser(res, opts) {
  let isHead = false;
  let current;

  return new htmlparser.Parser(
    {
      onopentag: (name, attrs) => {
        current = name;

        if (name === "head") {
          isHead = true;
        } else if (name === "meta") {
          const meta = parseMeta(attrs, opts.rx);
          // The first occurrence of a key wins.
          if (meta && !res[meta[0]]) {
            res[meta[0]] = meta[1];
          }
        } else if (name === "img") {
          const src = attrs.src;
          // Skip inline `data:` URIs only. Comparing just the first four
          // characters with "data" also dropped relative URLs such as
          // "data/logo.png" or "database.png".
          if (src && !/^data:/i.test(src)) {
            if (!res.images) {
              res.images = new Set();
            }
            res.images.add(url.resolve(opts.uri, src));
          }
        }

        if (isHead && name === "link") {
          const feed = parseFeed(attrs);
          if (feed) {
            if (!res.feeds) {
              res.feeds = [];
            }
            res.feeds.push(feed);
          }
        }
      },
      ontext: (text) => {
        if (isHead && current === "title") {
          res.title += text;
        }
      },
      onclosetag: (name) => {
        // Forget the tag once it is closed so that the text following
        // `</title>` is not appended to the title.
        if (name === current) {
          current = undefined;
        }
        if (name === "head") {
          isHead = false;
        }
      },
    },
    { decodeEntities: true }
  );
}

/**
 * Creates the Transform stream the response body is piped into. It never
 * pushes data downstream; it only collects the result and hands it to `done`,
 * which is called at most once:
 *
 * - with an error as soon as more than `opts.limit` bytes were received;
 * - with `res.file` set as soon as the first chunk is recognised as a binary
 *   file by file-type;
 * - with the parsed meta data once the whole body was received.
 *
 * After `done` has been called the remaining chunks are discarded; stopping
 * the download itself is up to the owner of the source stream.
 *
 * @param {{uri: string, limit: number, rx: RegExp}} opts
 * @param {function(?Error, Object=)} done
 * @returns {Transform}
 * @throws {TypeError} when `opts.uri` is not a valid absolute URL
 */
function createParser(opts, done) {
  const limit = opts.limit;
  // Named `target` so that the `url` module is not shadowed.
  const target = new URL(opts.uri);

  const res = {
    host: target.host,
    pathname: target.pathname,
    title: "",
  };

  // htmlparser2 works on strings. Decoding every chunk on its own would
  // corrupt multi-byte characters that are split across two chunks, so the
  // decoder keeps the incomplete bytes until the next chunk arrives.
  const decoder = new StringDecoder("utf8");

  let parser;
  let size = 0;
  let finished = false;

  // Marks the parser as finished, releases the current chunk and reports.
  // `finished` is set first because `cb` may synchronously deliver the next
  // chunk.
  const report = (cb, err, result) => {
    finished = true;
    cb();
    done(err, result);
  };

  return new Transform({
    transform(chunk, enc, cb) {
      if (finished) {
        return cb();
      }

      size += chunk.length;

      if (size > limit) {
        return report(cb, new Error("Response body limit exceeded"));
      }

      if (parser) {
        try {
          parser.write(decoder.write(chunk));
        } catch (err) {
          return report(cb, err);
        }
        return cb();
      }

      // First chunk: find out whether this is a binary file before parsing.
      FileType.fromBuffer(chunk).then(
        (file) => {
          if (file) {
            res.file = file;
            return report(cb, null, res);
          }

          try {
            parser = createHtmlParser(res, {
              uri: opts.uri,
              rx: opts.rx,
            });
            parser.write(decoder.write(chunk));
          } catch (err) {
            return report(cb, err);
          }

          cb();
        },
        (err) => report(cb, err)
      );
    },

    flush(cb) {
      if (finished) {
        return cb();
      }

      try {
        if (parser) {
          // Hand over the bytes still held by the decoder and let the parser
          // emit whatever it has buffered.
          const tail = decoder.end();
          if (tail) {
            parser.write(tail);
          }
          parser.end();
        }
      } catch (err) {
        return report(cb, err);
      }

      // Collapse every run of whitespace into a single space. Removing the
      // runs altogether glued the words of multi-line titles together.
      res.title = res.title.replace(/\s+/g, " ").trim();
      report(cb, null, res);
    },
  });
}

/**
 * Callback-only implementation behind `extract`.
 *
 * @param {Object} opts - see `extract`
 * @param {function(?Error, Object=)} done - called exactly once
 */
function _extract(opts, done) {
  // Keep our own options out of what is handed to got, and leave the
  // caller's object untouched.
  const { uri, limit, rxMeta, ...gotOptions } = opts;
  gotOptions.headers = Object.assign(
    {
      "User-Agent": USERAGENT,
    },
    opts.headers
  );

  let isDone = false;
  let request;

  // Single exit point: whichever of the request or the parser reports first
  // wins and every later report is ignored.
  const finish = (err, res) => {
    if (isDone) {
      return;
    }
    isDone = true;

    // The result is known, so stop the download instead of leaving the
    // connection open with nobody reading from it.
    if (request) {
      request.destroy();
    }

    done(err, res);
  };

  try {
    // Created before the request is started so that an invalid `uri` does
    // not leave a request behind.
    const parser = createParser(
      {
        uri,
        limit: limit || DEFAULT_LIMIT,
        rx: rxMeta || DEFAULT_RX_META,
      },
      finish
    );

    request = got.stream(uri, gotOptions);
    request.on("error", (err) => finish(getError(err)));
    request.pipe(parser);
  } catch (err) {
    // Errors raised while setting up are reported through the callback
    // instead of being thrown at the caller.
    finish(err);
  }
}

/**
 * Fetches `opts.uri` and extracts its meta data.
 *
 * @param {Object} opts - options; everything except `uri`, `limit` and
 *   `rxMeta` is passed on to `got.stream`
 * @param {string} opts.uri - address of the resource
 * @param {RegExp} [opts.rxMeta] - expression selecting the meta names to
 *   collect; defaults to charset, description, keywords, twitter:, og:, vk:,
 *   al: and theme-color
 * @param {number} [opts.limit] - response body size limit in bytes, 2 MiB
 *   when not set
 * @param {Object} [opts.headers] - request headers; a `User-Agent`
 *   identifying this module is sent unless one is given
 * @param {function(?Error, Object=)} [done] - node-style callback
 * @returns {Promise<Object>|undefined} a Promise when no callback is given.
 *   The result has `host`, `pathname` and `title`, one camelCase key per
 *   collected meta, and, when found, `images` (a `Set` of absolute URLs),
 *   `feeds` (an array) or `file` (the file-type result for a binary
 *   resource).
 */
function extract(opts, done) {
  if (!done) {
    return new Promise((resolve, reject) => {
      _extract(opts, (err, res) => (err ? reject(err) : resolve(res)));
    });
  }

  _extract(opts, done);
}

module.exports = extract;

/*
 * ---- /package.json (its closing braces follow this block) ----
 *
 * {
 *   "name": "meta-extractor",
 *   "version": "2.1.0",
 *   "description": "Simple, stream based, html meta data extractor",
 *   "main": "index.js",
 *   "scripts": {
 *     "test": "nyc ava"
 *   },
 *   "repository": {
 *     "type": "git",
 *     "url": "git+https://github.com/velocityzen/meta-extractor.git"
 *   },
 *   "keywords": [
 *     "meta",
 *     "og",
 *     "opengraph",
 *     "html",
 *     "parser",
 *     "extractor"
 *   ],
 *   "author": "Alexey Novikov (http://2dubs.com)",
 *   "license": "MIT",
 *   "bugs": {
 *     "url": "https://github.com/velocityzen/meta-extractor/issues"
 *   },
 *   "homepage": "https://github.com/velocityzen/meta-extractor#readme",
 *   "dependencies": {
 *     "file-type": "^16.0.0",
 *     "got": "^11.8.0",
 *     "htmlparser2": "^6.1.0"
 *   },
 *   "devDependencies": {
 *     "ava": "^3.0.0",
 *     "nyc": "^15.0.0"
 */
35 | } 36 | } 37 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # meta-extractor 2 | 3 | [![NPM Version](https://img.shields.io/npm/v/meta-extractor.svg?style=flat-square)](https://www.npmjs.com/package/meta-extractor) 4 | [![NPM Downloads](https://img.shields.io/npm/dt/meta-extractor.svg?style=flat-square)](https://www.npmjs.com/package/meta-extractor) 5 | 6 | Super simple and fast meta data extractor with a low memory footprint. 7 | 8 | Extracts: 9 | 10 | - title 11 | - description 12 | - charset 13 | - theme-color 14 | - rss/atom feeds 15 | - all opengraph meta data 16 | - all twitter meta data 17 | - all app links meta data 18 | - all vk meta data 19 | - all unique image urls (absolute) 20 | - **returns mime and extension for binary files without downloading the whole file** 21 | 22 | ## install 23 | 24 | `npm i meta-extractor` 25 | 26 | ## usage 27 | 28 | ```js 29 | const extract = require('meta-extractor'); 30 | 31 | extract({ uri: 'http://www.newyorker.com' }, (err, res) => 32 | console.log(err, res) 33 | ); 34 | 35 | // or, with a Promise: 36 | 37 | const res = await extract({ uri: 'http://www.newyorker.com' }); 38 | console.log(res); 39 | ``` 40 | 41 | If no callback is provided, `extract` returns a Promise. 42 | 43 | The first parameter, `opts`, accepts the same options as the [got](https://github.com/sindresorhus/got) module, plus: 44 | 45 | - **uri** — uri to get meta from. 46 | - rxMeta — regexp, custom regexp for meta data. 47 | - limit — number, response body size limit in bytes. Default: 2 MB. 
/*
 * (readme.md, continued)
 *
 * License MIT;
 *
 * © velocityzen
 *
 * ---- /test.js ----
 */
"use strict";
const test = require("ava");
const extract = require("./index");

// Addresses that are requested by more than one test.
const W3C_PAGE = "http://www.w3.org/TR/html4/index/list.html";
const YOUTUBE_VIDEO =
  "https://www.youtube.com/watch?v=9M77quPL3vY&list=RDEMhe2AFH_WvB5nuMd9tU5CHg&index=27";

// Assertions shared by the two tests that fetch the W3C page.
function assertW3cPage(t, meta) {
  t.truthy(meta);
  t.is(meta.host, "www.w3.org");
  t.truthy(meta.title);
  t.truthy(meta.pathname);
}

test("404 Not Found resource", async (t) => {
  try {
    await extract({ uri: "http://www.newyorker.com/doesnotexist" });
    t.fail();
  } catch (error) {
    t.is(error.statusCode, 404);
  }
});

test.cb("host resource with callback syntax", (t) => {
  const expectedKeys = [
    "title",
    "description",
    "images",
    "ogTitle",
    "ogDescription",
  ];

  extract({ uri: "https://www.nytimes.com/" }, (error, meta) => {
    t.falsy(error);
    t.truthy(meta);
    t.is(meta.host, "www.nytimes.com");
    for (const key of expectedKeys) {
      t.truthy(meta[key]);
    }
    t.end();
  });
});

test("page resource and promise syntax", (t) => {
  const pending = extract({ uri: W3C_PAGE });
  return pending.then((meta) => assertW3cPage(t, meta));
});

test("page resource", async (t) => {
  assertW3cPage(t, await extract({ uri: W3C_PAGE }));
});

test("binary file", async (t) => {
  const uri =
    "https://media.newyorker.com/photos/597238624867016af4a67a62/16:9/w_1200,h_630,c_limit/HP-Social-Tout-B-072117.png";
  const meta = await extract({ uri });

  t.truthy(meta);
  t.is(meta.host, "media.newyorker.com");
  t.truthy(meta.file);
  t.is(meta.file.ext, "png");
  t.is(meta.file.mime, "image/png");
});

test("the media resource", async (t) => {
  const meta = await extract({ uri: YOUTUBE_VIDEO });

  t.truthy(meta);
  t.is(meta.host, "www.youtube.com");
  t.is(meta.ogType, "video.other");
  t.is(meta.ogVideoWidth, "480");
  t.is(meta.ogVideoHeight, "360");
});

test("the url with redirects", async (t) => {
  const uri =
    "https://uxdesign.cc/how-ux-helped-me-learn-english-7f763b81bf0e#.hhgkmdu3r";
  const meta = await extract({ uri });

  t.truthy(meta);
  t.is(meta.host, "uxdesign.cc");
});

test("gets the custom meta", async (t) => {
  const options = { uri: "https://mail.ru", rxMeta: /msapplication/im };
  const meta = await extract(options);

  t.truthy(meta);
  t.truthy(meta.msapplicationName);
});

test("feeds links", async (t) => {
  const meta = await extract({ uri: "https://www.nytimes.com/section/world" });

  t.truthy(meta);
  t.truthy(meta.feeds);
});

test.cb("the response limit", (t) => {
  // Ten bytes are exceeded by any real page, so the callback gets the error.
  const options = { uri: YOUTUBE_VIDEO, limit: 10 };

  extract(options, (error, meta) => {
    t.truthy(error);
    t.is(error.message, "Response body limit exceeded");
    t.falsy(meta);
    t.end();
  });
});