├── .babelrc ├── .eslintignore ├── .eslintrc ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── circle.yml ├── karma.conf.js ├── package.json ├── parser.js ├── tests ├── getMetadata.test.js ├── index.js ├── metadataRules.test.js └── test-utils.js ├── url-utils.js └── webpack.config.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": ["es2015"] 3 | } 4 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | coverage 2 | client/* 3 | -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | // When adding items to this file please check for effects on sub-directories. 3 | "parserOptions": { 4 | "ecmaFeatures": { 5 | "jsx": true 6 | } 7 | }, 8 | "env": { 9 | "browser": true, 10 | "es6": true, 11 | "mocha": true, 12 | "node": true, 13 | }, 14 | "globals": { 15 | "StopIteration": true, 16 | "__CONFIG__": true, 17 | "platform_exports": true, 18 | "platform_require": true 19 | }, 20 | "plugins": [ 21 | "mozilla", 22 | ], 23 | "extends": [ 24 | "eslint:recommended" 25 | ], 26 | "rules": { 27 | "mozilla/components-imports": 1, 28 | "mozilla/import-globals-from": 1, 29 | "mozilla/this-top-level-scope": 1, 30 | 31 | "array-bracket-spacing": [2, "never"], 32 | "camelcase": 0, 33 | "comma-dangle": 0, 34 | "comma-spacing": 2, 35 | "computed-property-spacing": [2, "never"], 36 | "default-case": 0, 37 | "eqeqeq": 2, 38 | "func-names": 0, 39 | "func-style": 0, 40 | "generator-star-spacing": [2, {"before": false, "after": false}], 41 | "global-require": 0, 42 | "id-blacklist": 0, 43 | "id-length": 0, 44 | "id-match": 0, 45 | "init-declarations": 0, 46 | "max-len": 0, 47 | "max-params": 0, 48 | "newline-after-var": 0, 49 | "no-bitwise": 0, 50 | 
"no-console": 1, 51 | "no-empty-function": 0, 52 | "no-inline-comments": 0, 53 | "no-invalid-this": 0, 54 | "no-magic-numbers": 0, 55 | "no-negated-condition": 0, 56 | "no-shadow": 1, 57 | "no-trailing-spaces": [2, {"skipBlankLines": false}], 58 | "no-undef": 2, 59 | "no-underscore-dangle": 0, 60 | "no-unused-vars": [2, {"vars": "all", "args": "none"}], 61 | "no-var": 2, 62 | "no-warning-comments": 1, 63 | "object-curly-spacing": [2, "never"], 64 | "prefer-const": 1, 65 | "prefer-reflect": 0, 66 | "quotes": [2, "single", "avoid-escape"], 67 | "semi": [2, "always"], 68 | "space-before-function-paren": [2, {"anonymous": "never", "named": "never"}], 69 | "space-infix-ops": 2, 70 | "space-unary-ops": 2, 71 | "strict": 0 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | 6 | # Runtime data 7 | pids 8 | *.pid 9 | *.seed 10 | 11 | # Directory for instrumented libs generated by jscoverage/JSCover 12 | lib-cov 13 | 14 | # Coverage directory used by tools like istanbul 15 | coverage 16 | 17 | # nyc test coverage 18 | .nyc_output 19 | 20 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 21 | .grunt 22 | 23 | # node-waf configuration 24 | .lock-wscript 25 | 26 | # Compiled binary addons (http://nodejs.org/api/addons.html) 27 | build/Release 28 | 29 | # Dependency directories 30 | node_modules 31 | jspm_packages 32 | package-lock.json 33 | 34 | # Optional npm cache directory 35 | .npm 36 | 37 | # Optional REPL history 38 | .node_repl_history 39 | 40 | .DS_Store 41 | 42 | # build artifact 43 | client 44 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository 
is governed by Mozilla's code of conduct and etiquette guidelines. 4 | For more details, please read the 5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 9 | 10 | 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. 
"Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. 
For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 
115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. 
You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. 
Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. 
Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. 
Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. 
This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. 
Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
374 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Page Metadata Parser 2 | A Javascript library for parsing metadata in web pages. 3 | 4 | [](https://circleci.com/gh/mozilla/page-metadata-parser) 5 | 6 | [](https://coveralls.io/github/mozilla/page-metadata-parser?branch=master) 7 | 8 | ## Overview 9 | 10 | ### Purpose 11 | 12 | The purpose of this library is to be able to find a consistent set of metadata for any given web page. Each individual kind of metadata has many rules which define how it may be located. For example, a description of a page could be found in any of the following DOM elements: 13 | 14 | 15 | 16 | 17 | 18 | Because different web pages represent their metadata in any number of possible DOM elements, the Page Metadata Parser collects rules for different ways a given kind of metadata may be represented and abstracts them away from the caller. 19 | 20 | The output of the metadata parser for the above example would be 21 | 22 | {description: "A page's description"} 23 | 24 | regardless of which particular kind of description tag was used. 25 | 26 | ### Supported schemas 27 | 28 | This library employs parsers for the following formats: 29 | 30 | [opengraph](http://ogp.me/) 31 | 32 | [twitter](https://dev.twitter.com/cards/markup) 33 | 34 | [meta tags](https://developer.mozilla.org/en/docs/Web/HTML/Element/meta) 35 | 36 | ### Requirements 37 | 38 | This library is meant to be used either in the browser (embedded directly in a website or into a browser addon/extension) or on a server (node.js). 39 | 40 | The parser depends only on the [Node URL library](https://nodejs.org/api/url.html) or the [Browser URL library](https://developer.mozilla.org/en-US/docs/Web/API/Document/URL). 
41 | 42 | Each function expects to be passed a [Document](https://developer.mozilla.org/en-US/docs/Web/API/Document) object, which may be created either directly by a browser or on the server using a [Document](https://developer.mozilla.org/en-US/docs/Web/API/Document) compatible object, such as that provided by [domino](https://github.com/fgnass/domino). 43 | 44 | ## Usage 45 | 46 | ### Installation 47 | 48 | npm install --save page-metadata-parser 49 | 50 | ### Usage in the browser 51 | 52 | The library can be built to be deployed directly to a modern browser by using 53 | 54 | npm run bundle 55 | 56 | and embedding the resultant js file directly into a page like so: 57 | 58 | 59 | 60 | 67 | 68 | ### Usage in node 69 | 70 | To use the library in node, you must first construct a DOM API compatible object from an HTML string, for example: 71 | 72 | const {getMetadata} = require('page-metadata-parser'); 73 | const domino = require('domino'); 74 | 75 | const url = 'https://github.com/mozilla/page-metadata-parser'; 76 | const response = await fetch(url); 77 | const html = await response.text(); 78 | const doc = domino.createWindow(html).document; 79 | const metadata = getMetadata(doc, url); 80 | 81 | ## Metadata Rules 82 | 83 | ### Rules 84 | 85 | A single rule instructs the parser on a possible DOM node to locate a specific piece of content. 86 | 87 | For instance, a rule to parse the title of a page found in a DOM tag like this: 88 | 89 | 90 | 91 | Would be represented with the following rule: 92 | 93 | ['meta[property="og:title"]', element => element.getAttribute('content')] 94 | 95 | A rule consists of two parts, a [query selector](https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector) compatible string which is used to look up the target content, and a callable which receives an [element](https://developer.mozilla.org/en-US/docs/Web/API/Element) and returns the desired content from that element. 96 | 97 | Many rules together form a Rule Set. 
This library will apply each rule to a page and choose the 'best' result. The order in which rules are defined indicates their preference, with the first rule being the most preferred. A Rule Set can be defined like so: 98 | 99 | const titleRules = { 100 | rules: [ 101 | ['meta[property="og:title"]', element => element.getAttribute('content')], 102 | ['title', element => element.text], 103 | ] 104 | }; 105 | 106 | In this case, the OpenGraph title will be preferred over the title tag. 107 | 108 | This library includes many rules for a single desired piece of metadata which should allow it to consistently find metadata across many types of pages. This library is meant to be a community-driven effort, and so if there is no rule to find a piece of information from a particular website, contributors are encouraged to add new rules! 109 | 110 | ### Built-in Rule Sets 111 | 112 | This library provides rule sets to find the following forms of metadata in a page: 113 | 114 | Field | Description 115 | --- | --- 116 | description | A user displayable description for the page. 117 | icon | A URL which contains an icon for the page. 118 | image | A URL which contains a preview image for the page. 119 | keywords | The meta keywords for the page. 120 | provider | A string representation of the sub and primary domains. 121 | title | A user displayable title for the page. 122 | type | The type of content as defined by [opengraph](http://ogp.me/#types). 123 | url | A canonical URL for the page. 124 | 125 | To use a single rule set to find a particular piece of metadata within a page, simply pass that rule set, a URL, and a [Document](https://developer.mozilla.org/en-US/docs/Web/API/Document) object to getMetadata and it will apply each possible rule for that rule set until it finds a matching piece of information and return it.
126 | 127 | Example: 128 | 129 | const {getMetadata, metadataRuleSets} = require('page-metadata-parser'); 130 | 131 | const pageTitle = getMetadata(doc, url, {title: metadataRuleSets.title}); 132 | 133 | 134 | ### Extending a single rule 135 | 136 | To add your own additional custom rule to an existing rule set, you can simply push it into that rule set's `rules` array. 137 | 138 | Example: 139 | 140 | const {getMetadata, metadataRuleSets} = require('page-metadata-parser'); 141 | 142 | const customDescriptionRuleSet = metadataRuleSets.description; 143 | 144 | customDescriptionRuleSet.rules.push( 145 | ['meta[name="customDescription"]', element => element.getAttribute('content')] 146 | ); 147 | 148 | const pageDescription = getMetadata(doc, url, {description: customDescriptionRuleSet}); 149 | 150 | 151 | ### Using all rules 152 | 153 | To parse all of the available metadata on a page using all of the rule sets provided in this library, simply call getMetadata on the [Document](https://developer.mozilla.org/en-US/docs/Web/API/Document).
154 | 155 | const {getMetadata, metadataRuleSets} = require('page-metadata-parser'); 156 | 157 | const pageMetadata = getMetadata(doc, url); 158 | -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | machine: 2 | node: 3 | version: 6.2 4 | 5 | dependencies: 6 | override: 7 | - npm install --only=dev 8 | - npm update 9 | - sudo apt-get update && sudo apt-get install libpango1.0-0 libpangocairo-1.0-0 firefox 10 | - sudo rm /usr/bin/firefox;sudo ln -s $(which firefox.ubuntu) /usr/bin/firefox 11 | 12 | test: 13 | pre: 14 | - npm run bundle 15 | -------------------------------------------------------------------------------- /karma.conf.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | 3 | module.exports = function(config) { 4 | config.set({ 5 | singleRun: true, 6 | browsers: ['Firefox'], 7 | frameworks: ['mocha'], 8 | reporters: ['mocha', 'coverage'], 9 | coverageReporter: { 10 | dir: 'coverage', 11 | reporters: [ 12 | {type: 'lcov', subdir: 'lcov'}, 13 | {type: 'text-summary', subdir: '.', file: 'text-summary.txt'} 14 | ] 15 | }, 16 | files: [ 17 | 'tests/index.js' 18 | ], 19 | preprocessors: { 20 | 'tests/**/*.js': ['webpack', 'sourcemap'] 21 | }, 22 | webpack: { 23 | module: { 24 | loaders: [{test: /\.json$/, loader: 'json'}], 25 | postLoaders: [{ 26 | test: /\.js$/, 27 | loader: 'istanbul-instrumenter', 28 | include: [path.join(__dirname, 'parser.js')] 29 | }] 30 | }, 31 | externals: { 32 | 'jsdom': 'jsdom', 33 | }, 34 | devtool: 'inline-source-map' 35 | }, 36 | webpackMiddleware: { 37 | noInfo: true 38 | } 39 | }); 40 | }; 41 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "page-metadata-parser", 3 | "description": "A 
/**
 * Derive a human-readable provider name from a host name.
 *
 * A leading `www`/`www2`/... label is removed, the first `.co.` is collapsed
 * to `.`, the last label (the TLD) is dropped and the remaining labels are
 * joined with spaces, e.g. `www.example.co.uk` -> `example`,
 * `blog.example.com` -> `blog example`.
 *
 * @param {string} host - Host name such as `www.example.com`.
 * @returns {string} Space-separated provider labels (may be empty).
 */
function getProvider(host) {
  return host
    // Anchored to the start so that "www" inside a label (awwwards.com) is
    // left alone; the lookahead requires at least two labels to remain so a
    // domain that merely starts with "www" (wwwhatsnew.com) is not erased.
    .replace(/^www[a-zA-Z0-9]*\.(?=[^.]+\.)/, '')
    .replace('.co.', '.')
    .split('.')
    .slice(0, -1)
    .join(' ');
}

/**
 * Compile a rule set into a function that extracts one metadata value.
 *
 * Each rule is a `[selector, handler]` pair. Every element matched by a rule
 * starts with the score `rules.length - ruleIndex` (earlier rules win); each
 * optional scorer may replace that score by returning a truthy number. The
 * handler of the highest-scoring element supplies the value; on ties the
 * element seen first is kept.
 *
 * @param {Object} ruleSet
 * @param {Array} ruleSet.rules - Ordered `[selector, handler(element)]` pairs.
 * @param {Function[]} [ruleSet.scorers] - `(element, score) => newScore`.
 * @param {Function} [ruleSet.defaultValue] - `(context) => value`, used when
 *   no rule produced a truthy value.
 * @param {Function[]} [ruleSet.processors] - `(value, context) => value`,
 *   applied in order to the chosen value.
 * @returns {Function} `(doc, context) => value`, or `undefined` when neither
 *   the rules nor the default produced a value.
 */
function buildRuleSet(ruleSet) {
  return (doc, context) => {
    const ruleCount = ruleSet.rules.length;
    let maxScore = 0;
    let maxValue;

    ruleSet.rules.forEach(([query, handler], ruleIndex) => {
      for (const element of Array.from(doc.querySelectorAll(query))) {
        let score = ruleCount - ruleIndex;

        if (ruleSet.scorers) {
          for (const scorer of ruleSet.scorers) {
            // Scorers may return numeric strings (e.g. a parsed "sizes"
            // attribute). Coerce so the comparison below is numeric rather
            // than lexicographic ("96" > "192" as strings).
            const newScore = Number(scorer(element, score));

            if (newScore) {
              score = newScore;
            }
          }
        }

        if (score > maxScore) {
          maxScore = score;
          maxValue = handler(element);
        }
      }
    });

    if (!maxValue && ruleSet.defaultValue) {
      maxValue = ruleSet.defaultValue(context);
    }

    if (!maxValue) {
      return undefined;
    }

    if (ruleSet.processors) {
      for (const processor of ruleSet.processors) {
        maxValue = processor(maxValue, context);
      }
    }

    // A processor may return null/undefined or a non-string (e.g. an array of
    // keywords); only trim values that actually expose trim().
    if (maxValue !== undefined && maxValue !== null && typeof maxValue.trim === 'function') {
      maxValue = maxValue.trim();
    }

    return maxValue;
  };
}
element.getAttribute('href')], 80 | ['link[rel="mask-icon"]', element => element.getAttribute('href')], 81 | ], 82 | scorers: [ 83 | // Handles the case where multiple icons are listed with specific sizes ie 84 | // 85 | // 86 | (element, score) => { 87 | const sizes = element.getAttribute('sizes'); 88 | 89 | if (sizes) { 90 | const sizeMatches = sizes.match(/\d+/g); 91 | if (sizeMatches) { 92 | return sizeMatches[0]; 93 | } 94 | } 95 | } 96 | ], 97 | defaultValue: (context) => 'favicon.ico', 98 | processors: [ 99 | (icon_url, context) => makeUrlAbsolute(context.url, icon_url) 100 | ] 101 | }, 102 | 103 | image: { 104 | rules: [ 105 | ['meta[property="og:image:secure_url"]', element => element.getAttribute('content')], 106 | ['meta[property="og:image:url"]', element => element.getAttribute('content')], 107 | ['meta[property="og:image"]', element => element.getAttribute('content')], 108 | ['meta[name="twitter:image"]', element => element.getAttribute('content')], 109 | ['meta[property="twitter:image"]', element => element.getAttribute('content')], 110 | ['meta[name="thumbnail"]', element => element.getAttribute('content')], 111 | ], 112 | processors: [ 113 | (image_url, context) => makeUrlAbsolute(context.url, image_url) 114 | ], 115 | }, 116 | 117 | keywords: { 118 | rules: [ 119 | ['meta[name="keywords" i]', element => element.getAttribute('content')], 120 | ], 121 | processors: [ 122 | (keywords, context) => keywords.split(',').map((keyword) => keyword.trim()) 123 | ] 124 | }, 125 | 126 | title: { 127 | rules: [ 128 | ['meta[property="og:title"]', element => element.getAttribute('content')], 129 | ['meta[name="twitter:title"]', element => element.getAttribute('content')], 130 | ['meta[property="twitter:title"]', element => element.getAttribute('content')], 131 | ['meta[name="hdl"]', element => element.getAttribute('content')], 132 | ['title', element => element.text], 133 | ], 134 | }, 135 | 136 | language: { 137 | rules: [ 138 | ['html[lang]', element => 
element.getAttribute('lang')], 139 | ['meta[name="language" i]', element => element.getAttribute('content')], 140 | ], 141 | processors: [ 142 | (language, context) => language.split('-')[0] 143 | ] 144 | }, 145 | 146 | type: { 147 | rules: [ 148 | ['meta[property="og:type"]', element => element.getAttribute('content')], 149 | ], 150 | }, 151 | 152 | url: { 153 | rules: [ 154 | ['a.amp-canurl', element => element.getAttribute('href')], 155 | ['link[rel="canonical"]', element => element.getAttribute('href')], 156 | ['meta[property="og:url"]', element => element.getAttribute('content')], 157 | ], 158 | defaultValue: (context) => context.url, 159 | processors: [ 160 | (url, context) => makeUrlAbsolute(context.url, url) 161 | ] 162 | }, 163 | 164 | provider: { 165 | rules: [ 166 | ['meta[property="og:site_name"]', element => element.getAttribute('content')] 167 | ], 168 | defaultValue: (context) => getProvider(parseUrl(context.url)) 169 | }, 170 | }; 171 | 172 | function getMetadata(doc, url, customRuleSets) { 173 | const metadata = {}; 174 | const context = { 175 | url, 176 | }; 177 | 178 | const ruleSets = customRuleSets || metadataRuleSets; 179 | 180 | Object.keys(ruleSets).map(ruleSetKey => { 181 | const ruleSet = ruleSets[ruleSetKey]; 182 | const builtRuleSet = buildRuleSet(ruleSet); 183 | 184 | metadata[ruleSetKey] = builtRuleSet(doc, context); 185 | }); 186 | 187 | return metadata; 188 | } 189 | 190 | module.exports = { 191 | buildRuleSet, 192 | getMetadata, 193 | getProvider, 194 | metadataRuleSets 195 | }; 196 | -------------------------------------------------------------------------------- /tests/getMetadata.test.js: -------------------------------------------------------------------------------- 1 | // Tests for parse.js 2 | const {assert} = require('chai'); 3 | const {getProvider, getMetadata, metadataRuleSets} = require('../parser'); 4 | const {stringToDom} = require('./test-utils'); 5 | const {parseUrl} = require('../url-utils'); 6 | 7 | describe('Get 
Provider Tests', function() { 8 | it('gets a provider with no subdomain', function() { 9 | assert.equal(getProvider(parseUrl('https://example.com/this/?id=that')), 'example'); 10 | }); 11 | 12 | it('removes www as a subdomain', function() { 13 | assert.equal(getProvider(parseUrl('https://www.example.com/this/?id=that')), 'example'); 14 | }); 15 | 16 | it('removes www1 as a subdomain', function() { 17 | assert.equal(getProvider(parseUrl('https://www1.example.com/this/?id=that')), 'example'); 18 | }); 19 | 20 | it('preserves non-www subdomains', function() { 21 | assert.equal(getProvider(parseUrl('https://things.example.com/this/?id=that')), 'things example'); 22 | }); 23 | 24 | it('removes secondary TLDs', function() { 25 | assert.equal(getProvider(parseUrl('https://things.example.co.uk/this/?id=that')), 'things example'); 26 | }); 27 | }); 28 | 29 | describe('Get Metadata Tests', function() { 30 | const sampleDescription = 'A test page.'; 31 | const sampleIcon = 'http://www.example.com/favicon.ico'; 32 | const sampleImageHTTP = 'http://www.example.com/image.png'; 33 | const sampleImageHTTPS = 'https://www.example.com/secure_image.png'; 34 | const sampleTitle = 'Page Title'; 35 | const sampleType = 'article'; 36 | const sampleUrl = 'http://www.example.com/'; 37 | const sampleProviderName = 'Example Provider'; 38 | const sampleLanguage = 'en'; 39 | 40 | 41 | const sampleHtml = ` 42 | 43 |
44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | `; 56 | 57 | it('parses metadata', () => { 58 | const doc = stringToDom(sampleHtml); 59 | const metadata = getMetadata(doc, sampleUrl, metadataRuleSets); 60 | 61 | assert.equal(metadata.description, sampleDescription, `Unable to find ${sampleDescription} in ${sampleHtml}`); 62 | assert.equal(metadata.icon, sampleIcon, `Unable to find ${sampleIcon} in ${sampleHtml}`); 63 | assert.equal(metadata.image, sampleImageHTTPS, `Unable to find ${sampleImageHTTPS} in ${sampleHtml}`); 64 | assert.equal(metadata.title, sampleTitle, `Unable to find ${sampleTitle} in ${sampleHtml}`); 65 | assert.equal(metadata.type, sampleType, `Unable to find ${sampleType} in ${sampleHtml}`); 66 | assert.equal(metadata.url, sampleUrl, `Unable to find ${sampleUrl} in ${sampleHtml}`); 67 | assert.equal(metadata.provider, sampleProviderName, `Unable to find ${sampleProviderName} in ${sampleHtml}`); 68 | assert.equal(metadata.language, sampleLanguage, `Unable to find ${sampleLanguage} in ${sampleHtml}`); 69 | 70 | }); 71 | 72 | it('uses absolute URLs when url parameter passed in', () => { 73 | const relativeHtml = ` 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | `; 85 | 86 | const doc = stringToDom(relativeHtml); 87 | const metadata = getMetadata(doc, sampleUrl, metadataRuleSets); 88 | 89 | assert.equal(metadata.icon, sampleIcon, `Unable to find ${sampleIcon} in ${relativeHtml}`); 90 | assert.equal(metadata.image, sampleImageHTTP, `Unable to find ${sampleImageHTTP} in ${relativeHtml}`); 91 | }); 92 | 93 | it('adds a provider when URL passed in', () => { 94 | const emptyHtml = ` 95 | 96 | 97 | 98 | 99 | `; 100 | 101 | const sampleProvider = 'example'; 102 | const doc = stringToDom(emptyHtml); 103 | const metadata = getMetadata(doc, sampleUrl, metadataRuleSets); 104 | 105 | assert.equal(metadata.provider, sampleProvider, `Unable to find ${sampleProvider} in ${sampleUrl}`); 106 | }); 107 | 108 | it('prefers open graph site name over 
URL based provider', () => { 109 | const sampleProvider = 'OpenGraph Site Name'; 110 | const providerHtml = ` 111 | 112 | 113 | 114 | 115 | 116 | `; 117 | 118 | const doc = stringToDom(providerHtml); 119 | const metadata = getMetadata(doc, sampleUrl, metadataRuleSets); 120 | 121 | assert.equal(metadata.provider, sampleProvider, `Unable to find ${sampleProvider} in ${providerHtml}`); 122 | }); 123 | 124 | it('uses default favicon when no favicon is found', () => { 125 | const noIconHtml = ` 126 | 127 | 128 | 129 | 130 | `; 131 | 132 | const doc = stringToDom(noIconHtml); 133 | const metadata = getMetadata(doc, sampleUrl, metadataRuleSets); 134 | 135 | assert.equal(metadata.icon, sampleIcon, `Unable to find ${sampleIcon} in ${metadata.icon}`); 136 | }); 137 | it('falls back on provided url when no canonical url found', () => { 138 | const html = ` 139 | 140 | 141 | 142 | 143 | `; 144 | 145 | const doc = stringToDom(html); 146 | const metadata = getMetadata(doc, sampleUrl, metadataRuleSets); 147 | 148 | assert.equal(metadata.url, sampleUrl, `Unable to find ${sampleUrl} in ${JSON.stringify(metadata)}`); 149 | }); 150 | 151 | it('it fetches keywords, icon and description from uppercased metadata property titles', () => { 152 | const relativeHtml = ` 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | `; 161 | 162 | const doc = stringToDom(relativeHtml); 163 | const metadata = getMetadata(doc, sampleUrl, metadataRuleSets); 164 | 165 | assert.equal(metadata.icon, sampleIcon, `Unable to find ${sampleIcon} in ${relativeHtml}`); 166 | assert.equal(metadata.description, sampleDescription, `Unable to find ${sampleDescription} in ${relativeHtml}`); 167 | assert.equal(metadata.keywords, sampleTitle, `Unable to find ${sampleTitle} in ${relativeHtml}`); 168 | 169 | }); 170 | 171 | it('finds language in metadata', () => { 172 | const html = ` 173 | 174 | 175 | 176 | 177 | 178 | `; 179 | 180 | const doc = stringToDom(html); 181 | const metadata = getMetadata(doc, sampleUrl, 
metadataRuleSets); 182 | 183 | assert.equal(metadata.language, sampleLanguage, `Unable to find ${sampleLanguage} in ${html}`); 184 | }); 185 | 186 | it('allows custom rules', () => { 187 | const doc = stringToDom(sampleHtml); 188 | const rules = { 189 | url: metadataRuleSets.url, 190 | title: metadataRuleSets.title, 191 | description: metadataRuleSets.description 192 | }; 193 | 194 | const metadata = getMetadata(doc, sampleUrl, rules); 195 | 196 | assert.equal(metadata.url, sampleUrl, 'Error finding URL'); 197 | assert.equal(metadata.title, sampleTitle, 'Error finding title'); 198 | assert.equal(metadata.description, sampleDescription, 'Error finding description'); 199 | }); 200 | }); 201 | -------------------------------------------------------------------------------- /tests/index.js: -------------------------------------------------------------------------------- 1 | const req = require.context('.', true, /\.test.js$/); 2 | const files = req.keys(); 3 | 4 | files.forEach(file => req(file)); 5 | -------------------------------------------------------------------------------- /tests/metadataRules.test.js: -------------------------------------------------------------------------------- 1 | // Tests for parse.js 2 | const {assert} = require('chai'); 3 | const {buildRuleSet, metadataRuleSets} = require('../parser'); 4 | const {stringToDom} = require('./test-utils'); 5 | 6 | function buildHTML(tag) { 7 | return ` 8 | 9 | 10 | ${tag} 11 | 12 | 13 | `; 14 | } 15 | 16 | function ruleTest(testName, testRule, expected, testTag) { 17 | it(`finds ${testName}`, () => { 18 | const html = buildHTML(testTag); 19 | const doc = stringToDom(html); 20 | const rule = buildRuleSet(testRule); 21 | const found = rule(doc, { 22 | url: 'http://www.example.com/' 23 | }); 24 | assert.deepEqual(found, expected, `Unable to find ${testName} in ${html}`); 25 | }); 26 | } 27 | 28 | 29 | describe('Title Rule Tests', function() { 30 | const pageTitle = 'Page Title'; 31 | 32 | const ruleTests = [ 
33 | ['og:title', ``], 34 | ['twitter:title', ``], 35 | ['twitter:title', ``], 36 | ['hdl', ``], 37 | ['title', `