├── .editorconfig
├── .gitignore
├── .npmignore
├── LICENSE
├── index.js
├── package-lock.json
├── package.json
├── readme.md
└── test.js


/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | [*]
 4 | charset = utf-8
 5 | end_of_line = lf
 6 | indent_size = 2
 7 | indent_style = space
 8 | trim_trailing_whitespace = true
 9 | 
10 | [**.{js,json,md}]
11 | insert_final_newline = true
12 | 
13 | [**.html]
14 | insert_final_newline = false
15 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | node_modules
3 | coverage
4 | .nyc_output
5 | 


--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .git*
3 | coverage
4 | .nyc_output
5 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) Alexey Novikov http://2dubs.com
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
  1 | "use strict";
  2 | const url = require("url");
  3 | const got = require("got");
  4 | const htmlparser = require("htmlparser2");
  5 | const FileType = require("file-type");
  6 | const Transform = require("stream").Transform;
  7 | const VERSION = require("./package.json").version;
  8 | 
  9 | const USERAGENT = `meta-extractor/${VERSION} (https://github.com/velocityzen/meta-extractor)`;
 10 | 
 11 | function getError(error) {
 12 |   if (error instanceof got.HTTPError) {
 13 |     let err = new Error(error.message);
 14 |     err.statusCode = error.response.statusCode;
 15 |     return err;
 16 |   }
 17 | 
 18 |   return error;
 19 | }
 20 | 
 21 | function fixName(name) {
 22 |   return name.replace(/(?::|_|-)(\w)/g, (matches, letter) =>
 23 |     letter.toUpperCase()
 24 |   );
 25 | }
 26 | 
 27 | function parseMeta(attrs, rx) {
 28 |   const name = attrs.name || attrs.property || Object.keys(attrs)[0];
 29 | 
 30 |   if (rx.test(name)) {
 31 |     return [fixName(name), attrs.content || attrs[name]];
 32 |   }
 33 | }
 34 | 
 35 | function parseFeed(attrs) {
 36 |   const match = /^application\/(atom|rss)\+xml$/i.exec(attrs.type);
 37 | 
 38 |   if (!match) {
 39 |     return;
 40 |   }
 41 | 
 42 |   return {
 43 |     type: match[1],
 44 |     href: attrs.href,
 45 |     title: attrs.title,
 46 |   };
 47 | }
 48 | 
 49 | function createHtmlParser(res, opts) {
 50 |   let isHead = false;
 51 |   let current;
 52 | 
 53 |   return new htmlparser.Parser(
 54 |     {
 55 |       onopentag: (name, attrs) => {
 56 |         current = name;
 57 |         if (name === "head") {
 58 |           isHead = true;
 59 |         } else if (name === "meta") {
 60 |           const meta = parseMeta(attrs, opts.rx);
 61 |           if (meta && !res[meta[0]]) {
 62 |             res[meta[0]] = meta[1];
 63 |           }
 64 |         } else if (name === "img") {
 65 |           const src = attrs.src;
 66 |           if (src && src.substr(0, 4) !== "data") {
 67 |             if (!res.images) {
 68 |               res.images = new Set();
 69 |             }
 70 |             res.images.add(url.resolve(opts.uri, src));
 71 |           }
 72 |         }
 73 | 
 74 |         if (isHead && name === "link") {
 75 |           const feed = parseFeed(attrs);
 76 |           if (feed) {
 77 |             if (!res.feeds) {
 78 |               res.feeds = [];
 79 |             }
 80 |             res.feeds.push(feed);
 81 |           }
 82 |         }
 83 |       },
 84 |       ontext: (text) => {
 85 |         if (isHead && current === "title") {
 86 |           res.title += text;
 87 |         }
 88 |       },
 89 |       onclosetag: (name) => {
 90 |         if (name === "head") {
 91 |           isHead = false;
 92 |         }
 93 |       },
 94 |     },
 95 |     { decodeEntities: true }
 96 |   );
 97 | }
 98 | 
 99 | function createParser(opts, done) {
100 |   const limit = opts.limit;
101 |   const url = new URL(opts.uri);
102 | 
103 |   const res = {
104 |     host: url.host,
105 |     pathname: url.pathname,
106 |     title: "",
107 |   };
108 | 
109 |   let parser;
110 |   let size = 0;
111 | 
112 |   return new Transform({
113 |     transform: function (chunk, enc, cb) {
114 |       size += chunk.length;
115 | 
116 |       if (size >= limit) {
117 |         this.resume();
118 |         return done(new Error("Response body limit exceeded"));
119 |       }
120 | 
121 |       if (!parser) {
122 |         FileType.fromBuffer(Buffer.from(chunk)).then((file) => {
123 |           if (file) {
124 |             res.file = file;
125 |             this.resume();
126 |             return done(null, res);
127 |           }
128 | 
129 |           parser = createHtmlParser(res, {
130 |             uri: opts.uri,
131 |             rx: opts.rx,
132 |           });
133 | 
134 |           parser.write(chunk);
135 |           cb();
136 |         });
137 |       } else {
138 |         parser.write(chunk);
139 |         cb();
140 |       }
141 |     },
142 | 
143 |     flush: (cb) => {
144 |       res.title = res.title.replace(/\s{2,}|\n/gim, "");
145 |       cb();
146 |       done(null, res);
147 |     },
148 |   });
149 | }
150 | 
/**
 * Requests `opts.uri` and pipes the response through the metadata parser.
 *
 * `done` is invoked exactly once, with either the first error or the parsed
 * result, and the request is destroyed at that point so that nothing more is
 * downloaded after an early finish.
 *
 * @param {Object} opts - options passed on to `got.stream`, plus:
 * @param {string} opts.uri - URI to fetch.
 * @param {number} [opts.limit] - body size limit in bytes (default 2 MiB).
 * @param {RegExp} [opts.rxMeta] - custom pattern for the meta names to keep.
 * @param {Object} [opts.headers] - extra request headers; may override the
 *   default `User-Agent`.
 * @param {function(?Error, Object=)} done - completion callback.
 */
function _extract(opts, done) {
  const uri = opts.uri;
  const limit = opts.limit || 2 * 1024 * 1024;

  // Work on a copy so the caller's options object is left untouched.
  const requestOpts = Object.assign({}, opts, {
    headers: Object.assign(
      {
        "User-Agent": USERAGENT,
      },
      opts.headers
    ),
  });

  let isDone = false;
  const request = got.stream(uri, requestOpts);

  // Single exit point: the request can error after the parser has already
  // reported, and the callback must not fire twice.
  const finish = (err, res) => {
    if (isDone) {
      return;
    }
    isDone = true;
    // Release the connection; the parser may have finished before the body
    // was fully read (binary file detected or size limit reached).
    request.destroy();
    done(err, res);
  };

  request
    .on("error", (err) => {
      finish(getError(err));
    })
    .pipe(
      createParser(
        {
          uri,
          limit,
          rx:
            opts.rxMeta ||
            /charset|description|keywords|twitter:|og:|vk:|al:|theme-color/im,
        },
        finish
      )
    );
}
184 | 
/**
 * Public entry point: extracts metadata for `opts.uri`.
 *
 * With a callback the result is delivered node-style and nothing is returned;
 * without one a Promise for the result is returned instead.
 *
 * @param {Object} opts - extraction options, handed to `_extract` as is.
 * @param {function(?Error, Object=)} [done] - optional completion callback.
 * @returns {Promise<Object>|undefined} a Promise only when `done` is omitted.
 */
function extract(opts, done) {
  if (done) {
    _extract(opts, done);
    return;
  }

  return new Promise((resolve, reject) => {
    _extract(opts, (err, res) => {
      if (err) {
        reject(err);
      } else {
        resolve(res);
      }
    });
  });
}
194 | 
195 | module.exports = extract;
196 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "meta-extractor",
 3 |   "version": "2.1.0",
 4 |   "description": "Simple, stream based, html meta data extractor",
 5 |   "main": "index.js",
 6 |   "scripts": {
 7 |     "test": "nyc ava"
 8 |   },
 9 |   "repository": {
10 |     "type": "git",
11 |     "url": "git+https://github.com/velocityzen/meta-extractor.git"
12 |   },
13 |   "keywords": [
14 |     "meta",
15 |     "og",
16 |     "opengraph",
17 |     "html",
18 |     "parser",
19 |     "extractor"
20 |   ],
21 |   "author": "Alexey Novikov <v@2dubs.com> (http://2dubs.com)",
22 |   "license": "MIT",
23 |   "bugs": {
24 |     "url": "https://github.com/velocityzen/meta-extractor/issues"
25 |   },
26 |   "homepage": "https://github.com/velocityzen/meta-extractor#readme",
27 |   "dependencies": {
28 |     "file-type": "^16.0.0",
29 |     "got": "^11.8.0",
30 |     "htmlparser2": "^6.1.0"
31 |   },
32 |   "devDependencies": {
33 |     "ava": "^3.0.0",
34 |     "nyc": "^15.0.0"
35 |   }
36 | }
37 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | # meta-extractor
 2 | 
 3 | [![NPM Version](https://img.shields.io/npm/v/meta-extractor.svg?style=flat-square)](https://www.npmjs.com/package/meta-extractor)
 4 | [![NPM Downloads](https://img.shields.io/npm/dt/meta-extractor.svg?style=flat-square)](https://www.npmjs.com/package/meta-extractor)
 5 | 
 6 | Super simple and fast meta data extractor with low memory footprint.
 7 | 
 8 | Extracts:
 9 | 
10 | - title
11 | - description
12 | - charset
13 | - theme-color
14 | - rss/atom feeds
15 | - all opengraph meta data
16 | - all twitter meta data
17 | - all app links meta data
18 | - all vk meta data
19 | - all unique image urls (absolute)
20 | - **returns mime and extension for binary files without downloading the whole file**
21 | 
22 | ## install
23 | 
24 | `npm i meta-extractor`
25 | 
26 | ## usage
27 | 
28 | ```js
29 | const extract = require('meta-extractor');
30 | 
31 | extract({ uri: 'http://www.newyorker.com' }, (err, res) =>
32 |   console.log(err, res)
33 | );
34 | 
35 | // or, using the Promise API:
36 | 
37 | const res = await extract({ uri: 'http://www.newyorker.com' });
38 | console.log(res);
39 | ```
40 | 
41 | If no callback is provided, `extract` returns a Promise.
42 | 
43 | The first parameter, `opts`, accepts the same options as the [got](https://github.com/sindresorhus/got) module, plus:
44 | 
45 | - **uri** — uri to get meta from.
46 | - rxMeta — regexp, custom regexp for meta data.
47 | - limit — number, response body size limit in bytes. Defaults to 2 MiB (2 \* 1024 \* 1024 bytes).
48 | 
49 | License MIT;
50 | 
51 | © velocityzen
52 | 


--------------------------------------------------------------------------------
/test.js:
--------------------------------------------------------------------------------
  1 | "use strict";
  2 | const test = require("ava");
  3 | const extract = require("./index");
  4 | 
  5 | test("404 Not Found resource", (t) =>
  6 |   extract({ uri: "http://www.newyorker.com/doesnotexist" })
  7 |     .then(() => t.fail())
  8 |     .catch((err) => {
  9 |       t.is(err.statusCode, 404);
 10 |     }));
 11 | 
 12 | test.cb("host resource with callback syntax", (t) => {
 13 |   extract({ uri: "https://www.nytimes.com/" }, (err, res) => {
 14 |     t.falsy(err);
 15 |     t.truthy(res);
 16 |     t.is(res.host, "www.nytimes.com");
 17 |     t.truthy(res.title);
 18 |     t.truthy(res.description);
 19 |     t.truthy(res.images);
 20 |     t.truthy(res.ogTitle);
 21 |     t.truthy(res.ogDescription);
 22 |     t.end();
 23 |   });
 24 | });
 25 | 
 26 | test("page resource and promise syntax", (t) =>
 27 |   extract({ uri: "http://www.w3.org/TR/html4/index/list.html" }).then((res) => {
 28 |     t.truthy(res);
 29 |     t.is(res.host, "www.w3.org");
 30 |     t.truthy(res.title);
 31 |     t.truthy(res.pathname);
 32 |   }));
 33 | 
 34 | test("page resource", async (t) => {
 35 |   const res = await extract({
 36 |     uri: "http://www.w3.org/TR/html4/index/list.html",
 37 |   });
 38 |   t.truthy(res);
 39 |   t.is(res.host, "www.w3.org");
 40 |   t.truthy(res.title);
 41 |   t.truthy(res.pathname);
 42 | });
 43 | 
 44 | test("binary file", async (t) => {
 45 |   const res = await extract({
 46 |     uri:
 47 |       "https://media.newyorker.com/photos/597238624867016af4a67a62/16:9/w_1200,h_630,c_limit/HP-Social-Tout-B-072117.png",
 48 |   });
 49 |   t.truthy(res);
 50 |   t.is(res.host, "media.newyorker.com");
 51 |   t.truthy(res.file);
 52 |   t.is(res.file.ext, "png");
 53 |   t.is(res.file.mime, "image/png");
 54 | });
 55 | 
 56 | test("the media resource", async (t) => {
 57 |   const res = await extract({
 58 |     uri:
 59 |       "https://www.youtube.com/watch?v=9M77quPL3vY&list=RDEMhe2AFH_WvB5nuMd9tU5CHg&index=27",
 60 |   });
 61 |   t.truthy(res);
 62 |   t.is(res.host, "www.youtube.com");
 63 |   t.is(res.ogType, "video.other");
 64 |   t.is(res.ogVideoWidth, "480");
 65 |   t.is(res.ogVideoHeight, "360");
 66 | });
 67 | 
 68 | test("the url with redirects", async (t) => {
 69 |   const res = await extract({
 70 |     uri:
 71 |       "https://uxdesign.cc/how-ux-helped-me-learn-english-7f763b81bf0e#.hhgkmdu3r",
 72 |   });
 73 |   t.truthy(res);
 74 |   t.is(res.host, "uxdesign.cc");
 75 | });
 76 | 
 77 | test("gets the custom meta", async (t) => {
 78 |   const res = await extract({
 79 |     uri: "https://mail.ru",
 80 |     rxMeta: /msapplication/im,
 81 |   });
 82 | 
 83 |   t.truthy(res);
 84 |   t.truthy(res.msapplicationName);
 85 | });
 86 | 
 87 | test("feeds links", async (t) => {
 88 |   const res = await extract({
 89 |     uri: "https://www.nytimes.com/section/world",
 90 |   });
 91 |   t.truthy(res);
 92 |   t.truthy(res.feeds);
 93 | });
 94 | 
 95 | test.cb("the response limit", (t) => {
 96 |   extract(
 97 |     {
 98 |       uri:
 99 |         "https://www.youtube.com/watch?v=9M77quPL3vY&list=RDEMhe2AFH_WvB5nuMd9tU5CHg&index=27",
100 |       limit: 10,
101 |     },
102 |     (err, res) => {
103 |       t.truthy(err);
104 |       t.is(err.message, "Response body limit exceeded");
105 |       t.falsy(res);
106 |       t.end();
107 |     }
108 |   );
109 | });
110 | 


--------------------------------------------------------------------------------