├── test ├── index.js ├── utils │ └── assert.js ├── static │ ├── turtle_movie.json │ ├── turtle_movie.html │ ├── turtle_article_errors.html │ ├── turtle_article.json │ ├── turtle_article_case.html │ └── turtle_article.html ├── static.js ├── errors.js ├── scraping.js └── parsing.js ├── .jshintignore ├── .travis.yml ├── .gitignore ├── .eslintrc.json ├── .jshintrc ├── .github └── workflows │ └── node.js.yml ├── LICENSE.md ├── package.json ├── README.md ├── index.js └── lib └── index.js /test/index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | -------------------------------------------------------------------------------- /.jshintignore: -------------------------------------------------------------------------------- 1 | coverage 2 | node_modules 3 | test 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "4" 4 | - "6" 5 | - "8" 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | coverage 2 | node_modules 3 | npm-debug.log 4 | .eslintcache 5 | .nyc_output 6 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "extends": [ 4 | "wikimedia/server" 5 | ], 6 | "rules": { 7 | "camelcase": "off", 8 | "no-console": "off", 9 | "no-process-exit": "off", 10 | "no-shadow": "off", 11 | "no-underscore-dangle": "off", 12 | "no-use-before-define": "off", 13 | "es-x/no-hashbang": "off", 14 | "n/no-process-exit": "off", 15 | "jsdoc/newline-after-description": "off" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /.jshintrc: 
-------------------------------------------------------------------------------- 1 | { 2 | "bitwise": true, 3 | "laxbreak": true, 4 | "curly": true, 5 | "eqeqeq": true, 6 | "immed": true, 7 | "latedef": "nofunc", 8 | "newcap": true, 9 | "noarg": true, 10 | "noempty": true, 11 | "nonew": true, 12 | "regexp": false, 13 | "undef": true, 14 | "strict": true, 15 | "trailing": true, 16 | "smarttabs": true, 17 | "multistr": true, 18 | "node": true, 19 | "nomen": false, 20 | "loopfunc": true, 21 | "esnext": true 22 | } 23 | -------------------------------------------------------------------------------- /test/utils/assert.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const { use } = require( 'chai' ); 4 | 5 | module.exports = use( ( _chai ) => { 6 | const { assert } = _chai; 7 | 8 | assert.fails = ( promise ) => { 9 | 10 | let failed = false; 11 | 12 | function trackFailure( e ) { 13 | failed = true; 14 | return e; 15 | } 16 | 17 | function check() { 18 | if ( !failed ) { 19 | throw new Error( 'expected error was not thrown' ); 20 | } 21 | } 22 | return promise.catch( trackFailure ).then( check ); 23 | 24 | }; 25 | 26 | } ).assert; 27 | -------------------------------------------------------------------------------- /.github/workflows/node.js.yml: -------------------------------------------------------------------------------- 1 | # This workflow will do a clean installation of node dependencies, cache/restore them, build the source code and run tests across different versions of node 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-nodejs 3 | 4 | name: Node.js CI 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | strategy: 18 | matrix: 19 | node-version: [18.x, 20.x, 22.x, 24.x] 20 | # See supported Node.js release schedule at 
https://nodejs.org/en/about/releases/ 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Use Node.js ${{ matrix.node-version }} 25 | uses: actions/setup-node@v3 26 | with: 27 | node-version: ${{ matrix.node-version }} 28 | cache: 'npm' 29 | - run: npm ci 30 | - run: npm run build --if-present 31 | - run: npm test 32 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Marielle Volz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "html-metadata", 3 | "version": "3.0.1", 4 | "description": "Scrapes metadata of several different standards", 5 | "main": "index.js", 6 | "dependencies": { 7 | "cheerio": "1.0.0-rc.12", 8 | "microdata-node": "^2.0.0" 9 | }, 10 | "devDependencies": { 11 | "chai": "^4.3.0", 12 | "eslint-config-wikimedia": "0.29.0", 13 | "mocha": "^11.1.0", 14 | "mocha-lcov-reporter": "^1.3.0", 15 | "nock": "^13.3.0", 16 | "nyc": "^17.1.0" 17 | }, 18 | "scripts": { 19 | "test": "npm run lint && mocha", 20 | "lint": "eslint --cache --max-warnings 0 --ext .js,.json .", 21 | "lint:fix": "eslint --fix .", 22 | "coverage": "nyc --reporter=lcov _mocha" 23 | }, 24 | "engines": { 25 | "node": ">=18" 26 | }, 27 | "keywords": [ 28 | "bepress", 29 | "coins", 30 | "dublin core", 31 | "eprints", 32 | "highwire press", 33 | "json-ld", 34 | "open graph", 35 | "metadata", 36 | "microdata", 37 | "prism", 38 | "twitter cards", 39 | "web scraper" 40 | ], 41 | "repository": { 42 | "type": "git", 43 | "url": "https://github.com/wikimedia/html-metadata.git" 44 | }, 45 | "author": "Marielle Volz ", 46 | "contributors": [ 47 | "Krzysztof Zbudniewek ", 48 | "Geoffrey Mon ", 49 | "Scimonster " 50 | ], 51 | "license": "MIT", 52 | "bugs": { 53 | "url": "https://github.com/wikimedia/html-metadata/issues" 54 | }, 55 | "homepage": "https://github.com/wikimedia/html-metadata" 56 | } 57 | -------------------------------------------------------------------------------- /test/static/turtle_movie.json: -------------------------------------------------------------------------------- 1 | { 2 | "dublinCore": { 3 | "title": "Turtles of the Jungle", 4 | "creator": "http://www.example.com/turtlelvr", 5 | "description": "A 2008 film about jungle turtles.", 6 | "date": "2012-02-04 12:00:00", 7 | "type": "Image.Moving" 8 | }, 9 | 
"general": { 10 | "appleTouchIcons": [ 11 | { 12 | "href": "movieturtleapple.png" 13 | }, 14 | { 15 | "href": "movieturtleapple2.png", 16 | "sizes": "72x72" 17 | } 18 | ], 19 | "author": "Turtle Lvr", 20 | "authorlink": "http://examples.com/turtlelvr", 21 | "canonical": "http://example.com/turtles", 22 | "description": "Exposition on the awesomeness of turtles", 23 | "icons": [ 24 | { 25 | "href": "movieturtle.png", 26 | "type": "image/png" 27 | }, 28 | { 29 | "href": "movieturtle2.png", 30 | "sizes": "18x18" 31 | } 32 | ], 33 | "publisher": "https://mediawiki.org", 34 | "robots": "we welcome our robot overlords", 35 | "shortlink": "http://example.com/c", 36 | "title": "Turtles are AWESOME!!1 | Awesome Turtles Website", 37 | "lang": "en" 38 | }, 39 | "openGraph": { 40 | "locale": "en_US", 41 | "type": "video.movie", 42 | "title": "Turtles of the Jungle", 43 | "description": "A 2008 film about jungle turtles.", 44 | "url": "http://example.com", 45 | "site_name": "Awesome Turtle Movies Website", 46 | "image": [ { 47 | "url": "http://example.com/turtle.jpg" 48 | }, { 49 | "url": "http://example.com/shell.jpg" 50 | } ], 51 | "tag": [ "turtle", "movie", "awesome" ], 52 | "director": "http://www.example.com/PhilTheTurtle", 53 | "actor": [ "http://www.example.com/PatTheTurtle", "http://www.example.com/SaminaTheTurtle" ], 54 | "writer": "http://www.example.com/TinaTheTurtle", 55 | "release_date": "2015-01-14T19:14:27+00:00", 56 | "duration": "1000000" 57 | }, 58 | "twitter": { 59 | "card": "summary", 60 | "site": "@Turtlessssssssss", 61 | "creator": "@Turtlessssssssss", 62 | "url": "http://www.example.com/turtles", 63 | "title": "Turtles of the Jungle", 64 | "description": "A 2008 film about jungle turtles.", 65 | "player": { 66 | "url": "http://www.example.com/turtles/player", 67 | "width": "400", 68 | "height": "400", 69 | "stream": { 70 | "url": "http://www.example.com/turtles/turtle.mp4", 71 | "content_type": "video/mp4" 72 | } 73 | } 74 | }, 75 | "prism": { 76 | 
"publicationName": "Turtles of the Jungle", 77 | "publicationDate": "2012-02-04", 78 | "copyright": "2012 Turtles Society", 79 | "rightsAgent": "permissions@turtles.com", 80 | "url": "https://www.turtles.com" 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /test/static.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | /** 4 | * Tests using files contained in ./static 5 | */ 6 | 7 | const assert = require( './utils/assert.js' ); 8 | const cheerio = require( 'cheerio' ); 9 | const meta = require( '../index' ); 10 | 11 | // mocha defines to avoid eslint breakage 12 | /* global describe, it */ 13 | 14 | describe( 'static tests', () => { 15 | let $; 16 | const fs = require( 'fs' ); 17 | let expected; 18 | 19 | describe( 'static files', () => { 20 | it( 'should get correct info from turtle movie file', () => { 21 | expected = JSON.parse( fs.readFileSync( './test/static/turtle_movie.json' ) ); 22 | $ = cheerio.load( fs.readFileSync( './test/static/turtle_movie.html' ) ); 23 | return meta.parseAll( $ ).then( ( results ) => { 24 | assert.deepEqual( results, expected ); 25 | } ); 26 | } ); 27 | 28 | it( 'should get correct info from turtle article file', () => { 29 | expected = JSON.parse( fs.readFileSync( './test/static/turtle_article.json' ) ); 30 | $ = cheerio.load( fs.readFileSync( './test/static/turtle_article.html' ) ); 31 | return meta.parseAll( $ ).then( ( results ) => { 32 | assert.deepEqual( results, expected ); 33 | } ); 34 | } ); 35 | 36 | it( 'should be case insensitive on turtle article file', () => { 37 | expected = JSON.parse( fs.readFileSync( './test/static/turtle_article.json' ) ); 38 | $ = cheerio.load( fs.readFileSync( './test/static/turtle_article_case.html' ) ); 39 | return meta.parseAll( $ ).then( ( results ) => { 40 | assert.deepEqual( results, expected ); 41 | } ); 42 | } ); 43 | 44 | it( 'should be case insensitive on turtle article 
file', () => { 45 | expected = JSON.parse( fs.readFileSync( './test/static/turtle_article.json' ) ); 46 | $ = cheerio.load( fs.readFileSync( './test/static/turtle_article_case.html' ) ); 47 | return meta.parseAll( $ ).then( ( results ) => { 48 | assert.deepEqual( results, expected ); 49 | } ); 50 | } ); 51 | } ); 52 | 53 | describe( 'loadFromString', () => { 54 | it( 'should get correct info using loadFromString method from turtle movie file ', () => { 55 | expected = JSON.parse( fs.readFileSync( './test/static/turtle_movie.json' ) ); 56 | const html = fs.readFileSync( './test/static/turtle_movie.html' ); 57 | return meta.loadFromString( html ).then( ( results ) => { 58 | assert.deepEqual( results, expected ); 59 | } ); 60 | } ); 61 | 62 | it( 'should get correct info using loadFromString method for self closing tag', () => { 63 | const html = '
'; 64 | const expected = { schemaOrg: { items: [ { properties: { priceCurrency: [ 'PLN' ], price: [ '139.90' ] } } ] } }; 65 | return meta.loadFromString( html ).then( ( results ) => { 66 | assert.deepEqual( results, expected ); 67 | } ); 68 | } ); 69 | } ); 70 | 71 | } ); 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | html-metadata 2 | ============= 3 | [![npm](https://img.shields.io/npm/v/html-metadata.svg)](https://www.npmjs.com/package/html-metadata) 4 | > MetaData html scraper and parser for Node.js (supports Promises only. Callbacks were deprecated in 3.0.0) 5 | 6 | The aim of this library is to be a comprehensive source for extracting all html embedded metadata. Currently it supports Schema.org microdata using a third party library, a native BEPress, Dublin Core, Highwire Press, JSON-LD, Open Graph, Twitter, EPrints, PRISM, and COinS implementation, and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag, or meta description tags). 7 | 8 | Planned is support for RDFa, AGLS, and other yet unheard of metadata types. Contributions and requests for other metadata types welcome! 
9 | 10 | ## Install 11 | 12 | npm install html-metadata 13 | 14 | ## Usage 15 | 16 | ```js 17 | var scrape = require('html-metadata'); 18 | 19 | var url = "http://blog.woorank.com/2013/04/dublin-core-metadata-for-seo-and-usability/"; 20 | 21 | scrape(url).then(function(metadata){ 22 | console.log(metadata); 23 | }); 24 | ``` 25 | 26 | The scrape method used here invokes the parseAll() method, which uses all the available methods registered in method metadataFunctions(), and are available for use separately as well, for example: 27 | 28 | ```js 29 | var cheerio = require('cheerio'); 30 | var parseDublinCore = require('html-metadata').parseDublinCore; 31 | 32 | var url = "http://blog.woorank.com/2013/04/dublin-core-metadata-for-seo-and-usability/"; 33 | 34 | fetch(url).then(function(response){ 35 | $ = cheerio.load(response.body); 36 | return parseDublinCore($).then(function(metadata){ 37 | console.log(metadata); 38 | }); 39 | }); 40 | ``` 41 | 42 | Options dictionary: 43 | 44 | You can also pass an [options dictionary](https://developer.mozilla.org/en-US/docs/Web/API/RequestInit) as the first argument containing extra parameters. Some websites require the user-agent or cookies to be set in order to get the response. This is identical to the RequestInit dictionary except that it should also contain the requested url as part of the dictionary. 
45 | 46 | ``` 47 | var scrape = require('html-metadata'); 48 | 49 | var options = { 50 | url: "http://example.com", 51 | headers: { 52 | 'User-Agent': 'webscraper' 53 | } 54 | }; 55 | 56 | scrape(options, function(error, metadata){ 57 | console.log(metadata); 58 | }); 59 | ``` 60 | 61 | The method parseGeneral obtains the following general metadata: 62 | 63 | ```html 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | ``` 77 | 78 | ## Tests 79 | 80 | ```npm test``` runs the mocha tests 81 | 82 | ```npm run-script coverage``` runs the tests and reports code coverage 83 | 84 | ## Contributing 85 | 86 | Contributions welcome! All contibutions should use [Promises](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise) instead of callbacks. 87 | -------------------------------------------------------------------------------- /test/static/turtle_movie.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Turtles are AWESOME!!1 | Awesome Turtles Website 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /test/static/turtle_article_errors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 8 | 9 | 10 | 11 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 
73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /test/errors.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | /** 4 | * Tests expecting promises to reject 5 | */ 6 | 7 | const cheerio = require( 'cheerio' ); 8 | const meta = require( '../index' ); 9 | const assert = require( './utils/assert.js' ); 10 | const fs = require( 'fs' ); 11 | 12 | // mocha defines to avoid eslint breakage 13 | /* global describe, it */ 14 | 15 | describe( 'errors', function () { 16 | 17 | this.timeout( 40000 ); 18 | 19 | function fetchBody( url ) { 20 | // res.body is a ReadableStream of a Uint8Array, but we just want the string 21 | // eslint-disable-next-line n/no-unsupported-features/node-builtins 22 | return fetch( url ).then( ( res ) => res.text() ); 23 | } 24 | 25 | it( 'should not find schema.org metadata, reject promise', () => { 26 | const url = 'http://example.com'; 27 | return fetchBody( url ) 28 | .then( ( body ) => { 29 | const $ = cheerio.load( body ); 30 | const prom = meta.parseSchemaOrgMicrodata( $ ); 31 | return assert.fails( prom ); 32 | } ); 33 | } ); 34 | 35 | it( 'should not find BE Press metadata, reject promise', () => { 36 | const url = 'http://example.com'; 37 | return fetchBody( url ) 38 | .then( ( body ) => { 39 | const $ = cheerio.load( body ); 40 | const prom = meta.parseBEPress( $ ); 41 | return assert.fails( prom ); 42 | } ); 43 | } ); 44 | 45 | it( 'should not find coins metadata, reject promise', () => { 46 | const url = 'http://example.com'; 47 | return fetchBody( url ) 48 | .then( ( body ) => { 49 | const $ = cheerio.load( body ); 50 | const prom = meta.parseCOinS( $ ); 51 | return assert.fails( prom ); 52 | } ); 53 | } ); 54 | 55 | it( 'should not find dublin core metadata, reject promise', () => { 56 | const url = 
'http://www.laprovence.com/article/actualites/3411272/marseille-un-proche-du-milieu-corse-abattu-par-balles-en-plein-jour.html'; 57 | return fetchBody( url ) 58 | .then( ( body ) => { 59 | const $ = cheerio.load( body ); 60 | const prom = meta.parseDublinCore( $ ); 61 | return assert.fails( prom ); 62 | } ); 63 | } ); 64 | 65 | it( 'should not find highwire press metadata, reject promise', () => { 66 | const url = 'http://example.com'; 67 | return fetchBody( url ) 68 | .then( ( body ) => { 69 | const $ = cheerio.load( body ); 70 | const prom = meta.parseHighwirePress( $ ); 71 | return assert.fails( prom ); 72 | } ); 73 | } ); 74 | 75 | it( 'should not find open graph metadata, reject promise', () => { 76 | const url = 'http://www.example.com'; 77 | return fetchBody( url ) 78 | .then( ( body ) => { 79 | const $ = cheerio.load( body ); 80 | const prom = meta.parseOpenGraph( $ ); 81 | return assert.fails( prom ); 82 | } ); 83 | } ); 84 | 85 | it( 'should not find eprints metadata, reject promise', () => { 86 | const url = 'http://example.com'; 87 | return fetchBody( url ) 88 | .then( ( body ) => { 89 | const $ = cheerio.load( body ); 90 | const prom = meta.parseEprints( $ ); 91 | return assert.fails( prom ); 92 | } ); 93 | } ); 94 | 95 | it( 'should not find twitter metadata, reject promise', () => { 96 | const url = 'http://example.com'; 97 | return fetchBody( url ) 98 | .then( ( body ) => { 99 | const $ = cheerio.load( body ); 100 | const prom = meta.parseTwitter( $ ); 101 | return assert.fails( prom ); 102 | } ); 103 | } ); 104 | 105 | it( 'should not find JSON-LD, reject promise', () => { 106 | const url = 'http://example.com'; 107 | return fetchBody( url ) 108 | .then( ( body ) => { 109 | const $ = cheerio.load( body ); 110 | const prom = meta.parseJsonLd( $ ); 111 | return assert.fails( prom ); 112 | } ); 113 | } ); 114 | 115 | it( 'should reject parseALL promise for entire error file', () => { 116 | const $ = cheerio.load( fs.readFileSync( 
'./test/static/turtle_article_errors.html' ) ); 117 | return assert.fails( meta.parseAll( $ ) ); 118 | } ); 119 | 120 | it( 'should reject promise with undefined cheerio object', () => { 121 | const prom = meta.parseOpenGraph( undefined ); 122 | return assert.fails( prom ); 123 | } ); 124 | 125 | it( 'should reject promise with non-string title', () => { 126 | const prom = meta.parseCOinSTitle( {} ); 127 | return assert.fails( prom ); 128 | } ); 129 | 130 | it( 'should reject promise with string with no keys', () => { 131 | const prom = meta.parseCOinSTitle( '' ); 132 | return assert.fails( prom ); 133 | } ); 134 | 135 | it( 'should reject promise with string with bad keys', () => { 136 | const prom = meta.parseCOinSTitle( 'badkey.a&badkey.b' ); 137 | return assert.fails( prom ); 138 | } ); 139 | 140 | } ); 141 | -------------------------------------------------------------------------------- /test/static/turtle_article.json: -------------------------------------------------------------------------------- 1 | { 2 | "bePress": { 3 | "series_title": "Turtles", 4 | "author": "Turtle Lvr", 5 | "author_institution": "Mediawiki", 6 | "title": "Turtles are AWESOME!!1", 7 | "date": "2012", 8 | "pdf_url": "http://www.example.com/turtlelvr/pdf", 9 | "abstract_html_url": "http://www.example.com/turtlelvr", 10 | "publisher": "Turtles Society", 11 | "online_date": "2012/02/04" 12 | }, 13 | "coins": [ { 14 | "ctx_ver": "Z39.88-2004", 15 | "rft_id": "info:doi/http://dx.doi.org/10.5555/12345678", 16 | "rfr_id": "info:sid/crossref.org:search", 17 | "rft_val_fmt": "info:ofi/fmt:kev:mtx:journal", 18 | "rft": { 19 | "atitle": "Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory", 20 | "jtitle": "Journal of Psychoceramics", 21 | "date": "2008", 22 | "volume": "5", 23 | "issue": "11", 24 | "spage": "1", 25 | "epage": "3", 26 | "aufirst": "Josiah", 27 | "aulast": "Carberry", 28 | "genre": "article", 29 | "au": [ "Josiah Carberry" ] 30 | } 31 | } ], 32 | "dublinCore": 
{ 33 | "title": "Turtles are AWESOME!!1", 34 | "creator": "http://www.example.com/turtlelvr", 35 | "description": "Exposition on the awesomeness of turtles", 36 | "date": "2012-02-04 12:00:00", 37 | "type": "Text.Article" 38 | }, 39 | "general": { 40 | "appleTouchIcons": [ 41 | { 42 | "href": "turtleapple.png", 43 | "sizes": "72x72" 44 | }, 45 | { 46 | "href": "turtleapple2.png" 47 | } 48 | ], 49 | "author": "Turtle Lvr", 50 | "authorlink": "http://examples.com/turtlelvr", 51 | "canonical": "http://example.com/turtles", 52 | "description": "Exposition on the awesomeness of turtles", 53 | "dir": "ltr", 54 | "icons": [ 55 | { 56 | "href": "turtle.png", 57 | "sizes": "18x18", 58 | "type": "image/png" 59 | }, 60 | { 61 | "href": "turtle2.png", 62 | "type": "image/png" 63 | } 64 | ], 65 | "publisher": "https://mediawiki.org", 66 | "robots": "we welcome our robot overlords", 67 | "shortlink": "http://example.com/c", 68 | "title": "Turtles are AWESOME!!1 | Awesome Turtles Website", 69 | "lang": "en" 70 | }, 71 | "highwirePress": { 72 | "journal_title": "Turtles", 73 | "issn": "1234-5678", 74 | "doi": "10.1000/123", 75 | "publication_date": "2012-02-04", 76 | "title": "Turtles are AWESOME!!1", 77 | "author": "Turtle Lvr", 78 | "author_institution": "Mediawiki", 79 | "volume": "150", 80 | "issue": "1", 81 | "firstpage": "123", 82 | "lastpage": "456", 83 | "publisher": "Turtles Society", 84 | "abstract": "Exposition on the awesomeness of turtles." 
85 | }, 86 | "jsonLd": { 87 | "@context": "http://schema.org", 88 | "@type": "Organization", 89 | "url": "https://www.turtles.com" 90 | }, 91 | "openGraph": { 92 | "locale": "en_US", 93 | "type": "article", 94 | "title": "Turtles are AWESOME!!1", 95 | "description": "Exposition on the awesomeness of turtles", 96 | "url": "http://example.com", 97 | "site_name": "Awesome Turtles Website", 98 | "image": [ { 99 | "url": "http://example.com/turtle.jpg", 100 | "secure_url": "https://secure.example.com/turtle.jpg", 101 | "type": "image/jpeg", 102 | "width": "400", 103 | "height": "300" 104 | }, { 105 | "url": "http://example.com/shell.jpg", 106 | "width": "200", 107 | "height": "150" 108 | } ], 109 | "audio": { 110 | "url": "http://example.com/sound.mp3", 111 | "secure_url": "https://secure.example.com/sound.mp3", 112 | "type": "audio/mpeg" 113 | }, 114 | "tag": [ "turtles", "are", "awesome" ], 115 | "section": [ "Turtles are tough", "Turtles are flawless", "Turtles are cute" ], 116 | "published_time": "2012-02-04T12:00:00+00:00", 117 | "modified_time": "2015-01-14T19:14:27+00:00", 118 | "author": "http://examples.com/turtlelvr", 119 | "publisher": "http://mediawiki.org" 120 | }, 121 | "eprints": { 122 | "title": "Turtles are AWESOME!!1", 123 | "creators_name": "http://www.example.com/turtlelvr", 124 | "abstract": "Exposition on the awesomeness of turtles", 125 | "datestamp": "2012-02-04 12:00:00", 126 | "type": "article" 127 | }, 128 | "twitter": { 129 | "card": "summary", 130 | "site": "@Turtlessssssssss", 131 | "creator": [ "@Turtlessssssssss", "@Turtlezzzzzzzzzz" ], 132 | "url": "http://www.example.com/turtles", 133 | "title": "Turtles are AWESOME!!1", 134 | "description": "Exposition on the awesomeness of turtles", 135 | "image": { 136 | "url": "http://example.com/turtles.jpg", 137 | "alt": "It's a bunch of turtles!" 
138 | }, 139 | "app": { 140 | "url": { 141 | "iphone": "turtle://", 142 | "googleplay": "turtle://" 143 | }, 144 | "id": { 145 | "iphone": "000", 146 | "googleplay": "superturtlearticle.androidapp" 147 | } 148 | } 149 | }, 150 | "prism": { 151 | "issn": "1234-5678", 152 | "publicationName": "Turtles Society", 153 | "publicationDate": "2012-02-04", 154 | "startingPage": "123", 155 | "copyright": "2012 Turtles Society", 156 | "rightsAgent": "permissions@turtles.com", 157 | "url": "https://www.turtles.com", 158 | "doi": "10.1000/123" 159 | }, 160 | "schemaOrg": { 161 | "items": [ 162 | { 163 | "properties": { 164 | "archivedAt": [ 165 | "http://www.archive.org/turtlearticle" 166 | ], 167 | "headline": [ 168 | "Turtles are AWESOME!!1" 169 | ], 170 | "author": [ 171 | "Turtle Lvr" 172 | ], 173 | "wordCount": [ 174 | "10" 175 | ] 176 | } 177 | } 178 | ] 179 | } 180 | 181 | } 182 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * https://github.com/wikimedia/html-metadata 3 | * 4 | * This file wraps all exportable functions so that they 5 | * can be used with Promises. 
6 | */ 7 | 8 | 'use strict'; 9 | 10 | /* 11 | Import modules 12 | */ 13 | const cheerio = require( 'cheerio' ); 14 | 15 | const index = require( './lib/index.js' ); 16 | 17 | /** 18 | * Default exported function that takes a url string or 19 | * request library options dictionary and returns a 20 | * Promise for all available metadata 21 | * 22 | * @param {Object} urlOrOpts url String or options dictionary 23 | * @return {Object} Promise for metadata 24 | */ 25 | exports = module.exports = function ( urlOrOpts ) { 26 | return new Promise( ( resolve, reject ) => { 27 | let url, opts; 28 | if ( urlOrOpts instanceof Object ) { 29 | if ( urlOrOpts.uri ) { 30 | url = urlOrOpts.uri; 31 | } 32 | opts = urlOrOpts; 33 | } else if ( typeof urlOrOpts === 'string' ) { 34 | url = urlOrOpts; 35 | } 36 | if ( !url ) { 37 | reject( new Error( 'No uri supplied in argument' ) ); 38 | } else { 39 | resolve( 40 | // eslint-disable-next-line n/no-unsupported-features/node-builtins 41 | fetch( url, opts ).then( 42 | ( response ) => response.text().then( 43 | ( body ) => index.parseAll( cheerio.load( body ) ) 44 | ) 45 | ) 46 | ); 47 | } 48 | } ); 49 | }; 50 | 51 | /** 52 | * Exported function that takes html string and 53 | * returns a Promise for all available metadata 54 | * 55 | * @param {string} html html String HTML of the page 56 | * @return {Object} Promise for metadata 57 | */ 58 | exports.loadFromString = function ( html ) { 59 | return index.parseAll( cheerio.load( html ) ); 60 | }; 61 | 62 | /** 63 | * Returns Object containing all available datatypes, keyed 64 | * using the same keys as in metadataFunctions. 
65 | * 66 | * @param {Object} chtml html Cheerio object to parse 67 | * @return {Object} Promise for metadata 68 | */ 69 | exports.parseAll = function ( chtml ) { 70 | return index.parseAll( chtml ); 71 | }; 72 | 73 | /** 74 | * Scrapes BE Press metadata given html object 75 | * 76 | * @param {Object} chtml html Cheerio object 77 | * @return {Object} Promise for metadata 78 | */ 79 | exports.parseBEPress = function ( chtml ) { 80 | return index.parseBEPress( chtml ); 81 | }; 82 | 83 | /** 84 | * Scrapes embedded COinS data given Cheerio loaded html object 85 | * 86 | * @param {Object} chtml html Cheerio object 87 | * @return {Object} Promise for metadata 88 | */ 89 | exports.parseCOinS = function ( chtml ) { 90 | return index.parseCOinS( chtml ); 91 | }; 92 | 93 | /** 94 | * Parses value of COinS title tag 95 | * 96 | * @param {string} title String corresponding to value of title tag in span element 97 | * @return {Object} Promise for metadata 98 | */ 99 | exports.parseCOinSTitle = function ( title ) { 100 | return index.parseCOinSTitle( title ); 101 | }; 102 | 103 | /** 104 | * Scrapes Dublin Core data given Cheerio loaded html object 105 | * 106 | * @param {Object} chtml html Cheerio object 107 | * @return {Object} Promise for metadata 108 | */ 109 | exports.parseDublinCore = function ( chtml ) { 110 | return index.parseDublinCore( chtml ); 111 | }; 112 | 113 | /** 114 | * Scrapes EPrints data given Cheerio loaded html object 115 | * 116 | * @param {Object} chtml html Cheerio object 117 | * @return {Object} Promise for metadata 118 | */ 119 | exports.parseEprints = function ( chtml ) { 120 | return index.parseEprints( chtml ); 121 | }; 122 | 123 | /** 124 | * Scrapes general metadata terms given Cheerio loaded html object 125 | * 126 | * @param {Object} chtml html Cheerio object 127 | * @return {Object} Promise for metadata 128 | */ 129 | exports.parseGeneral = function ( chtml ) { 130 | return index.parseGeneral( chtml ); 131 | }; 132 | 133 | /** 134 | * Scrapes 
Highwire Press metadata given html object 135 | * 136 | * @param {Object} chtml html Cheerio object 137 | * @return {Object} Promise for metadata 138 | */ 139 | exports.parseHighwirePress = function ( chtml ) { 140 | return index.parseHighwirePress( chtml ); 141 | }; 142 | 143 | /** 144 | * Retrieves JSON-LD for given html object 145 | * 146 | * @param {Object} chtml html Cheerio object 147 | * @return {Object} Promise for JSON-LD 148 | */ 149 | exports.parseJsonLd = function ( chtml ) { 150 | return index.parseJsonLd( chtml ); 151 | }; 152 | 153 | /** 154 | * Scrapes OpenGraph data given html object 155 | * 156 | * @param {Object} chtml html Cheerio object 157 | * @return {Object} Promise for metadata 158 | */ 159 | exports.parseOpenGraph = function ( chtml ) { 160 | return index.parseOpenGraph( chtml ); 161 | }; 162 | 163 | /** 164 | * Scrapes schema.org microdata given Cheerio loaded html object 165 | * 166 | * @param {Object} chtml html Cheerio object 167 | * @return {Object} Promise for metadata 168 | */ 169 | exports.parseSchemaOrgMicrodata = function ( chtml ) { 170 | return index.parseSchemaOrgMicrodata( chtml ); 171 | }; 172 | 173 | /** 174 | * Scrapes Twitter data given html object 175 | * 176 | * @param {Object} chtml html Cheerio object 177 | * @return {Object} Promise for metadata 178 | */ 179 | exports.parseTwitter = function ( chtml ) { 180 | return index.parseTwitter( chtml ); 181 | }; 182 | 183 | /** 184 | * Scrapes PRISM data given html object 185 | * 186 | * @param {Object} chtml html Cheerio object 187 | * @return {Object} Promise for metadata 188 | */ 189 | exports.parsePrism = function ( chtml ) { 190 | return index.parsePrism( chtml ); 191 | }; 192 | 193 | /** 194 | * Global exportable list of scraping promises with string keys 195 | * 196 | * @type {Object} 197 | */ 198 | exports.metadataFunctions = index.metadataFunctions; 199 | 200 | /* 201 | Export the version 202 | */ 203 | 204 | exports.version = require( './package' ).version; 205 | 
-------------------------------------------------------------------------------- /test/scraping.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const meta = require( '../index' ); 4 | const assert = require( 'assert' ); 5 | const cheerio = require( 'cheerio' ); 6 | 7 | // mocha defines to avoid eslint breakage 8 | /* global describe, it */ 9 | 10 | describe( 'scraping', function () { 11 | 12 | this.timeout( 100000 ); 13 | 14 | const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'; 15 | const acceptHeader = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'; 16 | 17 | function getWithHeaders( url ) { 18 | // eslint-disable-next-line n/no-unsupported-features/node-builtins 19 | return fetch( url, { 20 | method: 'GET', 21 | headers: { 22 | 'User-Agent': userAgent, 23 | Accept: acceptHeader 24 | } 25 | // res.body is a ReadableStream of a Uint8Array, but we just want the string 26 | } ).then( ( res ) => res.text() ); 27 | } 28 | 29 | describe( 'parseAll function', () => { 30 | 31 | describe( 'Promise style', () => { 32 | it( 'should resolve promise from woorank with headers', () => { 33 | const url = 'https://www.woorank.com/en/blog/dublin-core-metadata-for-seo-and-usability'; 34 | return meta( { uri: url, headers: { 'User-Agent': userAgent, Accept: acceptHeader } } ) 35 | .then( ( result ) => { 36 | assert.ok( result, 'Expected result to be truthy' ); 37 | } ) 38 | .catch( ( e ) => { 39 | console.error( 'Error in woorank test:', e ); 40 | throw e; 41 | } ); 42 | } ); 43 | 44 | it( 'should resolve promise from blog.schema.org without headers', () => { 45 | const url = 'http://blog.schema.org'; 46 | return meta( url ) 47 | .then( ( result ) => { 48 | assert.ok( result, 'Expected result to be truthy' ); 49 | } ) 50 | .catch( ( e ) => { 51 | console.error( 'Error in blog.schema.org test:', e ); 52 | 
throw e; 53 | } ); 54 | } ); 55 | 56 | it( 'should throw error if no uri supplied', () => meta() 57 | .then( () => { 58 | assert.fail( 'Should have rejected the promise' ); 59 | } ) 60 | .catch( ( e ) => { 61 | assert.ok( e instanceof Error, 'Error should be an Error object' ); 62 | assert.strictEqual( e.message, 'No uri supplied in argument', 'Error message should match expected message' ); 63 | } ) 64 | ); 65 | 66 | it( 'should not have any undefined values', () => { 67 | const url = 'http://web.archive.org/web/20220127144804/https://www.cnet.com/special-reports/vr101/'; 68 | return getWithHeaders( url ).then( ( body ) => { 69 | const chtml = cheerio.load( body ); 70 | return meta.parseAll( chtml ) 71 | .then( ( results ) => { 72 | Object.keys( results ).forEach( ( metadataType ) => { 73 | Object.keys( results[ metadataType ] ).forEach( ( key ) => { 74 | assert.notStrictEqual( results[ metadataType ][ key ], undefined, `${ metadataType }.${ key } should not be undefined` ); 75 | } ); 76 | } ); 77 | } ); 78 | } ); 79 | } ); 80 | 81 | } ); 82 | 83 | describe( 'Await style', () => { 84 | 85 | it( 'should support await implementation with headers', async () => { 86 | const url = 'http://blog.schema.org'; 87 | const result = await meta( { uri: url, headers: { 'User-Agent': userAgent, Accept: acceptHeader } } ); 88 | assert.ok( result, 'Expected result to be truthy' ); 89 | } ); 90 | 91 | it( 'should support await implementation without headers', async () => { 92 | const url = 'http://blog.schema.org'; 93 | const result = await meta( url ); 94 | assert.ok( result, 'Expected result to be truthy' ); 95 | } ); 96 | 97 | it( 'should throw error if no uri is supplied with async/await', async () => { 98 | try { 99 | await meta(); 100 | assert.fail( 'Should have thrown an error' ); 101 | } catch ( e ) { 102 | assert.ok( e instanceof Error, 'Error should be an Error object' ); 103 | assert.strictEqual( e.message, 'No uri supplied in argument', 'Error message should match 
expected message' ); 104 | } 105 | } ); 106 | } ); 107 | 108 | } ); 109 | 110 | describe( 'Individual metadata functions', () => { 111 | it( 'should get BE Press metadata tags', () => { 112 | const url = 'http://biostats.bepress.com/harvardbiostat/paper154/'; 113 | return getWithHeaders( url ).then( ( body ) => { 114 | const expectedAuthors = [ 'Claggett, Brian', 'Xie, Minge', 'Tian, Lu' ]; 115 | const expectedAuthorInstitutions = [ 'Harvard', 'Rutgers University - New Brunswick/Piscataway', 'Stanford University School of Medicine' ]; 116 | const chtml = cheerio.load( body ); 117 | return meta.parseBEPress( chtml ) 118 | .then( ( results ) => { 119 | assert.deepStrictEqual( results.author, expectedAuthors ); 120 | assert.deepStrictEqual( 121 | results.author_institution, 122 | expectedAuthorInstitutions 123 | ); 124 | [ 'series_title', 'author', 'author_institution', 'title', 'date', 'pdf_url', 125 | 'abstract_html_url', 'publisher', 'online_date' ].forEach( ( key ) => { 126 | assert.ok( results[ key ], `Expected to find the ${ key } key in the response!` ); 127 | } ); 128 | } ); 129 | } ); 130 | } ); 131 | 132 | it( 'should get COinS metadata', () => { 133 | const url = 'https://en.wikipedia.org/wiki/Viral_phylodynamics'; 134 | return getWithHeaders( url ).then( ( body ) => { 135 | const chtml = cheerio.load( body ); 136 | return meta.parseCOinS( chtml ) 137 | .then( ( results ) => { 138 | assert.ok( Array.isArray( results ), `Expected Array, got ${ typeof results }` ); 139 | assert.ok( results.length > 0, 'Expected Array with at least 1 item' ); 140 | assert.ok( results[ 0 ].rft, 'Expected first item of Array to contain key rft' ); 141 | } ); 142 | } ); 143 | } ); 144 | 145 | it( 'should get EPrints metadata', () => { 146 | const url = 'http://eprints.gla.ac.uk/113711/'; 147 | return getWithHeaders( url ).then( ( body ) => { 148 | const chtml = cheerio.load( body ); 149 | const expectedAuthors = [ 'Gatherer, Derek', 'Kohl, Alain' ]; 150 | 151 | return 
meta.parseEprints( chtml ) 152 | .then( ( results ) => { 153 | assert.deepStrictEqual( results.creators_name, expectedAuthors ); 154 | [ 'eprintid', 'datestamp', 'title', 'abstract', 'issn', 'creators_name', 'publication', 'citation' ].forEach( ( key ) => { 155 | assert.ok( results[ key ], `Expected to find the ${ key } key in the response!` ); 156 | } ); 157 | } ); 158 | } ); 159 | } ); 160 | 161 | it( 'should get general metadata', () => { 162 | const expected = 'Example Domain'; 163 | const url = 'http://example.com'; 164 | return getWithHeaders( url ).then( ( body ) => { 165 | const chtml = cheerio.load( body ); 166 | return meta.parseGeneral( chtml ).then( ( results ) => { 167 | assert.strictEqual( results.title, expected ); 168 | } ); 169 | } ); 170 | } ); 171 | } ); 172 | 173 | } ); 174 | -------------------------------------------------------------------------------- /test/parsing.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | /** 4 | * Tests using parsing methods only 5 | */ 6 | 7 | const assert = require( './utils/assert.js' ); 8 | const meta = require( '../index' ); 9 | 10 | // mocha defines to avoid eslint breakage 11 | /* global describe, it */ 12 | 13 | describe( 'parsing', () => { 14 | 15 | it( 'should get correct structure from decoded string', () => { 16 | const title = 'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&rft.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&rft.jtitle=Journal+of+Psychoceramics&rft.date=2008&rft.volume=5&rft.issue=11&rft.spage=1&rft.epage=3&rft.aufirst=Josiah&rft.aulast=Carberry&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&'; 17 | const expected = { 18 | ctx_ver: 'Z39.88-2004', 19 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12345678', 20 | rfr_id: 'info:sid/crossref.org:search', 21 | rft_val_fmt: 
'info:ofi/fmt:kev:mtx:journal', 22 | rft: { 23 | atitle: 'Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory', 24 | jtitle: 'Journal of Psychoceramics', 25 | date: '2008', 26 | volume: '5', 27 | issue: '11', 28 | spage: '1', 29 | epage: '3', 30 | aufirst: 'Josiah', 31 | aulast: 'Carberry', 32 | genre: 'article' 33 | } 34 | }; 35 | 36 | return meta.parseCOinSTitle( title ).then( ( results ) => { 37 | assert.deepEqual( results, expected ); 38 | } ); 39 | } ); 40 | 41 | it( 'should get correct structure from html encoded string', () => { 42 | const title = 'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&rft.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&rft.jtitle=Journal+of+Psychoceramics&rft.date=2008&rft.volume=5&rft.issue=11&rft.spage=1&rft.epage=3&rft.aufirst=Josiah&rft.aulast=Carberry&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&'; 43 | const expected = { 44 | ctx_ver: 'Z39.88-2004', 45 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12345678', 46 | rfr_id: 'info:sid/crossref.org:search', 47 | rft_val_fmt: 'info:ofi/fmt:kev:mtx:journal', 48 | rft: { 49 | atitle: 'Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory', 50 | jtitle: 'Journal of Psychoceramics', 51 | date: '2008', 52 | volume: '5', 53 | issue: '11', 54 | spage: '1', 55 | epage: '3', 56 | aufirst: 'Josiah', 57 | aulast: 'Carberry', 58 | genre: 'article' 59 | } 60 | }; 61 | 62 | return meta.parseCOinSTitle( title ).then( ( results ) => { 63 | assert.deepEqual( results, expected ); 64 | } ); 65 | } ); 66 | 67 | it( 'should not add rft object when there are no valid keys', () => { 68 | const title = 
'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&badkey.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&badkey.jtitle=Journal+of+Psychoceramics&badkey.date=2008&badkey.volume=5&badkey.issue=11&badkey.spage=1&badkey.epage=3&badkey.aufirst=Josiah&badkey.aulast=Carberry&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&badkey.genre=article&badkey.au=Josiah+Carberry'; 69 | const expected = { 70 | ctx_ver: 'Z39.88-2004', 71 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12345678', 72 | rfr_id: 'info:sid/crossref.org:search', 73 | rft_val_fmt: 'info:ofi/fmt:kev:mtx:journal' 74 | }; 75 | 76 | return meta.parseCOinSTitle( title ).then( ( results ) => { 77 | assert.deepEqual( results, expected ); 78 | } ); 79 | } ); 80 | 81 | it( 'should not replace encoded + symbol in doi', () => { 82 | const title = 'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12%2B345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&badkey.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&badkey.jtitle=Journal+of+Psychoceramics&badkey.date=2008&badkey.volume=5&badkey.issue=11&badkey.spage=1&badkey.epage=3&badkey.aufirst=Josiah&badkey.aulast=Carberry&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&badkey.genre=article&badkey.au=Josiah+Carberry'; 83 | const expected = { 84 | ctx_ver: 'Z39.88-2004', 85 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12+345678', 86 | rfr_id: 'info:sid/crossref.org:search', 87 | rft_val_fmt: 'info:ofi/fmt:kev:mtx:journal' 88 | }; 89 | 90 | return meta.parseCOinSTitle( title ).then( ( results ) => { 91 | assert.deepEqual( results, expected ); 92 | } ); 93 | } ); 94 | 95 | it( 'should add list for au field', () => { 96 | const title = 
'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.au=Josiah+Carberry&rft.au=Random+Name&rft.au=Name+of+an+organisation'; 97 | const expected = { 98 | ctx_ver: 'Z39.88-2004', 99 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12345678', 100 | rfr_id: 'info:sid/crossref.org:search', 101 | rft_val_fmt: 'info:ofi/fmt:kev:mtx:journal', 102 | rft: { 103 | genre: 'article', 104 | au: [ 105 | 'Josiah Carberry', 106 | 'Random Name', 107 | 'Name of an organisation' 108 | ] 109 | } 110 | }; 111 | 112 | return meta.parseCOinSTitle( title ).then( ( results ) => { 113 | assert.deepEqual( results, expected ); 114 | } ); 115 | } ); 116 | 117 | it( 'should add list for issn and aucorp field', () => { 118 | const title = 'rft.genre=article&rft.issn=1234-5678&rft.issn=2222-3333&rft.aucorp=Name+of+an+organisation'; 119 | const expected = { 120 | rft: { 121 | genre: 'article', 122 | aucorp: [ 123 | 'Name of an organisation' 124 | ], 125 | issn: [ 126 | '1234-5678', 127 | '2222-3333' 128 | ] 129 | } 130 | }; 131 | 132 | return meta.parseCOinSTitle( title ).then( ( results ) => { 133 | assert.deepEqual( results, expected ); 134 | } ); 135 | } ); 136 | 137 | it( 'should ignore bad hierarchical keys', () => { 138 | const title = 'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&badkey.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&badkey.jtitle=Journal+of+Psychoceramics&badkey.date=2008&badkey.volume=5&badkey.issue=11&badkey.spage=1&badkey.epage=3&badkey.aufirst=Josiah&badkey.aulast=Carberry&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&badkey.au=Josiah+Carberry'; 139 | const expected = { 140 | ctx_ver: 'Z39.88-2004', 141 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12345678', 142 | rfr_id: 
'info:sid/crossref.org:search', 143 | rft_val_fmt: 'info:ofi/fmt:kev:mtx:journal', 144 | rft: { 145 | genre: 'article' 146 | } 147 | }; 148 | 149 | return meta.parseCOinSTitle( title ).then( ( results ) => { 150 | assert.deepEqual( results, expected ); 151 | } ); 152 | } ); 153 | 154 | } ); 155 | -------------------------------------------------------------------------------- /test/static/turtle_article_case.html: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | Turtles are AWESOME!!1 | Awesome Turtles Website 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 |
153 |

Turtles are AWESOME!!1

154 | 155 | 156 | Turtle Article Archive 157 |
158 | 159 | 160 | 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /test/static/turtle_article.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Turtles are AWESOME!!1 | Awesome Turtles Website 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 |
151 |

Turtles are AWESOME!!1

152 | 153 | 154 | Turtle Article Archive 155 |
156 | 157 | 158 | 164 | 165 | 166 | 177 | 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const microdata = require( 'microdata-node' ); // Schema.org microdata 4 | 5 | /** 6 | * Returns Object containing all available datatypes, keyed 7 | * using the same keys as in metadataFunctions. 8 | * 9 | * @param {Object} chtml html Cheerio object to parse 10 | * @return {Object} Promise for metadata 11 | */ 12 | exports.parseAll = function ( chtml ) { 13 | // Array of keys corresponding to position of promise 14 | const keys = Object.keys( exports.metadataFunctions ); 15 | const meta = {}; // Metadata keyed by keys in exports.metadataFunctions 16 | // Array of promises for metadata of each type in exports.metadataFunctions 17 | const arr = keys.map( ( key ) => exports.metadataFunctions[ key ]( chtml ) ); 18 | 19 | let result; // Result in for loop over results 20 | let key; // Key corresponding to location of result 21 | 22 | return Promise.all( arr.map( ( promise ) => promise.then( 23 | // Create a promise that will always resolve with either the result or the error 24 | ( value ) => ( { status: 'fulfilled', value } ), 25 | ( error ) => ( { status: 'rejected', reason: error } ) 26 | ) 27 | ) ) 28 | .then( ( results ) => { 29 | Object.keys( results ).forEach( ( r ) => { 30 | result = results[ r ]; 31 | key = keys[ r ]; 32 | if ( result && result.status === 'fulfilled' && result.value ) { 33 | meta[ key ] = result.value; 34 | } 35 | } ); 36 | if ( Object.keys( meta ).length === 0 ) { 37 | throw new Error( 'No metadata found in page' ); 38 | } 39 | return meta; 40 | } ); 41 | }; 42 | 43 | /** 44 | * Base scraper for tags, used by some other parsing functions 45 | * 46 | * @param {Object} chtml html Cheerio object 47 | * @param {string[]} tags tag types to process 48 | * @param {string} reason 
message when metadata is not found 49 | * @param {Function} getProperty function that gets the property of an element 50 | * @param {Function} getContent function that gets the content of an element 51 | * @return {Object} promise of metadata object 52 | */ 53 | exports.parseBase = function ( chtml, tags, reason, getProperty, getContent ) { 54 | return new Promise( ( resolve, reject ) => { 55 | const meta = {}; 56 | const metaTags = chtml( tags.join() ); 57 | 58 | if ( !metaTags || metaTags.length === 0 ) { 59 | reject( new Error( reason ) ); 60 | } 61 | 62 | metaTags.each( function () { 63 | const element = chtml( this ); 64 | const property = getProperty( element ); 65 | const content = getContent( element ); 66 | 67 | // If lacks property or content, skip 68 | if ( !property || !content ) { 69 | return; 70 | } 71 | 72 | // If the property already exists, make the array of contents 73 | if ( meta[ property ] ) { 74 | if ( meta[ property ] instanceof Array ) { 75 | meta[ property ].push( content ); 76 | } else { 77 | meta[ property ] = [ meta[ property ], content ]; 78 | } 79 | } else { 80 | meta[ property ] = content; 81 | } 82 | } ); 83 | 84 | if ( !Object.keys( meta ).length ) { 85 | reject( new Error( reason ) ); 86 | } 87 | 88 | resolve( meta ); 89 | } ); 90 | }; 91 | 92 | /** 93 | * Scrapes BE Press metadata given html object 94 | * 95 | * @param {Object} chtml html Cheerio object 96 | * @return {Object} promise of BE Press metadata object 97 | */ 98 | exports.parseBEPress = function ( chtml ) { 99 | return exports.parseBase( 100 | chtml, 101 | [ 'meta' ], 102 | 'No BE Press metadata found in page', 103 | ( element ) => { 104 | const content = element.attr( 'content' ); 105 | const name = element.attr( 'name' ); 106 | 107 | // If the element isn't a BE Press property or if content is missing, skip it 108 | if ( !name || !content || ( name.slice( 0, 17 ).toLowerCase() !== 'bepress_citation_' ) ) { 109 | return; 110 | } 111 | 112 | return name.slice( 17 
).toLowerCase(); 113 | }, 114 | ( element ) => element.attr( 'content' ) 115 | ); 116 | }; 117 | 118 | /** 119 | * Scrapes COinS data given Cheerio loaded html object 120 | * 121 | * @param {Object} chtml html Cheerio object 122 | * @return {Object} Promise for COinS metadata 123 | */ 124 | exports.parseCOinS = function ( chtml ) { 125 | let title; 126 | const metadata = []; 127 | const tags = chtml( 'span[class=Z3988]' ); 128 | const promArray = []; 129 | 130 | // Add promises for parsed title tags to an Array 131 | tags.each( function () { 132 | title = chtml( this ).attr( 'title' ); 133 | promArray.push( exports.parseCOinSTitle( title ) ); 134 | } ); 135 | 136 | // Once promises have resolved, add any successfully parsed titles to the metadata Array 137 | return Promise.all( promArray.map( ( promise ) => promise.then( 138 | ( value ) => ( { status: 'fulfilled', value } ), 139 | ( error ) => ( { status: 'rejected', reason: error } ) 140 | ) ) ).then( ( results ) => { 141 | let result; 142 | for ( const r in results ) { 143 | result = results[ r ]; 144 | if ( result && result.status === 'fulfilled' && result.value ) { 145 | metadata.push( result.value ); 146 | } 147 | } 148 | if ( !metadata.length ) { 149 | throw new Error( 'No COinS metadata found' ); 150 | } else { 151 | return metadata; 152 | } 153 | } ); 154 | }; 155 | 156 | /** 157 | * Parses value of COinS title tag 158 | * 159 | * @param {string} title String corresponding to value of title tag in span element 160 | * @return {Object} Promise for CoinS metadata 161 | */ 162 | exports.parseCOinSTitle = function ( title ) { 163 | return new Promise( ( resolve, reject ) => { 164 | const metadata = {}; 165 | const rft = {}; 166 | let value; 167 | let key; 168 | if ( typeof title !== 'string' ) { 169 | reject( new Error( 'Provided value must be a string; Got ' + typeof title ) ); 170 | } 171 | title = title.replace( /&/g, '&' ); // Allows function to take the raw html string 172 | title = title.split( '&' ); 173 
| title.forEach( ( element ) => { 174 | element = element.split( '=' ); 175 | if ( element.length !== 2 ) { 176 | return; 177 | } // Invalid element 178 | key = element[ 0 ].toLowerCase(); // Be case-insensitive for properties 179 | value = decodeURIComponent( element[ 1 ].replace( /\+/g, '%20' ) ); // Replace + with encoded space since they aren't getting decoded as spaces 180 | key = key.split( '.' ); // Split hierarchical keys 181 | if ( key.length === 1 ) { // Top level key 182 | metadata[ key[ 0 ] ] = value; 183 | return; 184 | } 185 | if ( key.length === 2 ) { // Split key e.g. rft.date 186 | if ( key[ 0 ] !== 'rft' ) { 187 | return; 188 | } // Invalid hierarchical key 189 | // Keys that may have multiple values - return in list format 190 | if ( key[ 1 ] === 'au' || key[ 1 ] === 'isbn' || key[ 1 ] === 'issn' || key[ 1 ] === 'eissn' || key[ 1 ] === 'aucorp' ) { 191 | if ( !rft[ key[ 1 ] ] ) { 192 | rft[ key[ 1 ] ] = []; 193 | } 194 | rft[ key[ 1 ] ].push( value ); 195 | return; 196 | } 197 | // Add rft value to rft key - this will overwrite duplicates, if they exist 198 | rft[ key[ 1 ] ] = value; 199 | } 200 | } ); 201 | if ( Object.keys( rft ).length ) { // Add rft object if it is not empty 202 | metadata.rft = rft; 203 | } 204 | if ( !Object.keys( metadata ).length ) { 205 | reject( new Error( 'No COinS in provided string' ) ); 206 | } 207 | if ( metadata.rft && metadata.rft.genre ) { 208 | // Genre should be case insensitive as this field may be used programmatically 209 | metadata.rft.genre = metadata.rft.genre.toLowerCase(); 210 | } 211 | resolve( metadata ); 212 | } ); 213 | }; 214 | 215 | /** 216 | * Scrapes Dublin Core data given Cheerio loaded html object 217 | * 218 | * @param {Object} chtml html Cheerio object 219 | * @return {Object} Promise for DC metadata 220 | */ 221 | exports.parseDublinCore = function ( chtml ) { 222 | return exports.parseBase( 223 | chtml, 224 | [ 'meta', 'link' ], 225 | 'No Dublin Core metadata found in page', 226 | ( 
element ) => { 227 | const isLink = element[ 0 ].name === 'link'; 228 | const nameAttr = element.attr( isLink ? 'rel' : 'name' ); 229 | const value = element.attr( isLink ? 'href' : 'content' ); 230 | 231 | // If the element isn't a Dublin Core property or if value is missing, skip it 232 | if ( !nameAttr || !value || 233 | ( nameAttr.slice( 0, 3 ).toUpperCase() !== 'DC.' && 234 | nameAttr.slice( 0, 8 ).toUpperCase() !== 'DCTERMS.' ) ) { 235 | return; 236 | } 237 | 238 | const property = nameAttr.slice( Math.max( 0, nameAttr.lastIndexOf( '.' ) + 1 ) ).toLowerCase(); 239 | 240 | return property; 241 | }, 242 | ( element ) => { 243 | const isLink = element[ 0 ].name === 'link'; 244 | return element.attr( isLink ? 'href' : 'content' ); 245 | } 246 | ); 247 | }; 248 | 249 | /** 250 | * Scrapes EPrints data given Cheerio loaded html object 251 | * 252 | * @param {Object} chtml html Cheerio object 253 | * @return {Object} Promise for EPrints metadata 254 | */ 255 | exports.parseEprints = function ( chtml ) { 256 | return exports.parseBase( 257 | chtml, 258 | [ 'meta' ], 259 | 'No EPrints metadata found in page', 260 | ( element ) => { 261 | const nameAttr = element.attr( 'name' ); 262 | const content = element.attr( 'content' ); 263 | 264 | // If the element isn't an EPrints property or content is missing, skip it 265 | if ( !nameAttr || !content || nameAttr.slice( 0, 8 ).toLowerCase() !== 'eprints.' ) { 266 | return; 267 | } 268 | 269 | let property = nameAttr.slice( Math.max( 0, nameAttr.lastIndexOf( '.' 
) + 1 ) ); 270 | 271 | // Lowercase property 272 | property = property.toLowerCase(); 273 | return property; 274 | }, 275 | ( element ) => element.attr( 'content' ) 276 | ).then( ( results ) => { 277 | if ( results.type ) { 278 | results.type = results.type.toLowerCase(); // Standardise 'type' field to lowercase 279 | } 280 | return results; 281 | } ); 282 | }; 283 | 284 | /** 285 | * Scrapes general metadata terms given Cheerio loaded html object 286 | * 287 | * @param {Object} chtml html Cheerio object 288 | * @return {Object} Promise for general metadata 289 | */ 290 | exports.parseGeneral = function ( chtml ) { 291 | return new Promise( ( resolve, reject ) => { 292 | const clutteredMeta = { 293 | appleTouchIcons: chtml( 'link[rel=apple-touch-icon i]' ).map( ( i, e ) => ( { 294 | href: e.attribs.href, 295 | sizes: e.attribs.sizes 296 | } ) ).get(), // apple-touch-icon 297 | icons: chtml( 'link[rel="shortcut icon" i], link[rel="icon" i]' ).map( ( i, e ) => ( { 298 | href: e.attribs.href, 299 | sizes: e.attribs.sizes, 300 | type: e.attribs.type 301 | } ) ).get(), // icon 302 | author: chtml( 'meta[name=author i]' ).first().attr( 'content' ), // author 303 | authorlink: chtml( 'link[rel=author i]' ).first().attr( 'href' ), // author link 304 | canonical: chtml( 'link[rel=canonical i]' ).first().attr( 'href' ), // canonical link 305 | description: chtml( 'meta[name=description i]' ).attr( 'content' ), // meta description 306 | publisher: chtml( 'link[rel=publisher i]' ).first().attr( 'href' ), // publisher link 307 | robots: chtml( 'meta[name=robots i]' ).first().attr( 'content' ), // robots 308 | shortlink: chtml( 'link[rel=shortlink i]' ).first().attr( 'href' ), // short link 309 | title: chtml( 'title' ).first().text(), // title tag 310 | lang: chtml( 'html' ).first().attr( 'lang' ) || chtml( 'html' ).first().attr( 'xml:lang' ), // lang <html lang=""> or <html xml:lang=""> 311 | dir: chtml( 'html' ).first().attr( 'dir' ) // dir <html dir=""> 312 | }; 313 | 314 | 
// Copy key-value pairs with defined values to meta 315 | const meta = {}; 316 | let value; 317 | let notEmpty = false; 318 | Object.keys( clutteredMeta ).forEach( ( key ) => { 319 | notEmpty = false; 320 | value = clutteredMeta[ key ]; 321 | let innerValue; 322 | if ( value && typeof value === 'object' ) { 323 | let i; 324 | for ( i = 0; i < Object.keys( value ).length; i++ ) { 325 | const definedValue = {}; 326 | // eslint-disable-next-line no-loop-func 327 | Object.keys( value[ i ] ).forEach( ( objectProperty ) => { 328 | innerValue = value[ i ][ objectProperty ]; 329 | if ( innerValue ) { 330 | definedValue[ objectProperty ] = innerValue; 331 | notEmpty = true; 332 | } 333 | } ); 334 | value[ i ] = definedValue; 335 | } 336 | } else { 337 | notEmpty = true; 338 | } 339 | if ( value && notEmpty ) { // Only add if has value 340 | meta[ key ] = value; 341 | } 342 | } ); 343 | 344 | // Reject promise if meta is empty 345 | if ( Object.keys( meta ).length === 0 ) { 346 | reject( new Error( 'No general metadata found in page' ) ); 347 | } 348 | 349 | // Resolve on meta 350 | resolve( meta ); 351 | } ); 352 | }; 353 | 354 | /** 355 | * Scrapes Highwire Press metadata given html object 356 | * 357 | * @param {Object} chtml html Cheerio object 358 | * @return {Object} promise of highwire press metadata object 359 | */ 360 | exports.parseHighwirePress = function ( chtml ) { 361 | return exports.parseBase( 362 | chtml, 363 | [ 'meta' ], 364 | 'No Highwire Press metadata found in page', 365 | ( element ) => { 366 | const nameAttr = element.attr( 'name' ); 367 | const content = element.attr( 'content' ); 368 | 369 | // If the element isn't a Highwire Press property, skip it 370 | if ( !nameAttr || !content || ( nameAttr.slice( 0, 9 ).toLowerCase() !== 'citation_' ) ) { 371 | return; 372 | } 373 | 374 | return nameAttr.slice( Math.max( 0, nameAttr.indexOf( '_' ) + 1 ) ).toLowerCase(); 375 | }, 376 | ( element ) => element.attr( 'content' ) 377 | ); 378 | }; 379 | 380 | /** 
381 | * Returns JSON-LD provided by page given HTML object 382 | * 383 | * @param {Object} chtml html Cheerio object 384 | * @return {Object} Promise for JSON-LD 385 | */ 386 | exports.parseJsonLd = function ( chtml ) { 387 | return new Promise( ( resolve, reject ) => { 388 | const json = []; 389 | const jsonLd = chtml( 'script[type="application/ld+json"]' ); 390 | 391 | jsonLd.each( function () { 392 | let contents; 393 | try { 394 | contents = JSON.parse( this.children[ 0 ].data ); 395 | } catch ( e ) { 396 | // Fail silently, just in case there are valid tags 397 | return; 398 | } 399 | if ( contents ) { 400 | json.push( contents ); 401 | } else { 402 | return; 403 | } 404 | } ); 405 | 406 | if ( json.length === 0 ) { 407 | reject( new Error( 'No JSON-LD valid script tags present on page' ) ); 408 | } 409 | 410 | resolve( json.length > 1 ? json : json[ 0 ] ); 411 | } ); 412 | }; 413 | 414 | /** 415 | * Scrapes OpenGraph data given html object 416 | * 417 | * @param {Object} chtml html Cheerio object 418 | * @return {Object} promise of open graph metadata object 419 | */ 420 | exports.parseOpenGraph = function ( chtml ) { 421 | return new Promise( ( resolve, reject ) => { 422 | let property; 423 | let node; 424 | const meta = {}; 425 | const metaTags = chtml( 'meta' ); 426 | const namespace = [ 'og', 'fb' ]; 427 | const subProperty = { 428 | image: 'url', 429 | video: 'url', 430 | audio: 'url' 431 | }; 432 | const roots = {}; // Object to store roots of different type i.e. 
image, audio 433 | let subProp; // Current subproperty of interest 434 | const reason = new Error( 'No openGraph metadata found in page' ); 435 | 436 | if ( !metaTags || metaTags.length === 0 ) { 437 | reject( reason ); 438 | } 439 | 440 | metaTags.each( function () { 441 | const element = chtml( this ); 442 | let propertyValue = element.attr( 'property' ); 443 | const content = element.attr( 'content' ); 444 | 445 | if ( !propertyValue || !content ) { 446 | return; 447 | } else { 448 | propertyValue = propertyValue.toLowerCase().split( ':' ); 449 | } 450 | 451 | // If the property isn't in namespace, exit 452 | if ( !namespace.includes( propertyValue[ 0 ] ) ) { 453 | return; 454 | } 455 | 456 | if ( propertyValue.length === 2 ) { 457 | property = propertyValue[ 1 ]; // Set property to value after namespace 458 | if ( property in subProperty ) { // If has valid subproperty 459 | node = {}; 460 | node[ subProperty[ property ] ] = content; 461 | roots[ property ] = node; 462 | } else { 463 | node = content; 464 | } 465 | // If the property already exists, make the array of contents 466 | if ( meta[ property ] ) { 467 | if ( meta[ property ] instanceof Array ) { 468 | meta[ property ].push( node ); 469 | } else { 470 | meta[ property ] = [ meta[ property ], node ]; 471 | } 472 | } else { 473 | meta[ property ] = node; 474 | } 475 | } else if ( propertyValue.length === 3 ) { // Property part of a vertical 476 | // i.e. image, audio - as properties, not values, these should be lower case 477 | subProp = propertyValue[ 1 ].toLowerCase(); 478 | // i.e. height, width - as properties, not values, these should be lower case 479 | property = propertyValue[ 2 ].toLowerCase(); 480 | // If root for subproperty exists, and there isn't already a property 481 | // called that in there already i.e. height, add property and content. 
482 | if ( roots[ subProp ] && !roots[ subProp ][ property ] ) { 483 | // As properties, not values, these should be lower case 484 | roots[ subProp ][ property ] = content.toLowerCase(); 485 | } 486 | } else { 487 | return; // Discard values with length <2 and >3 as invalid 488 | } 489 | 490 | // Check for "type" property and add to namespace if so 491 | // If any of these type occur in order before the type attribute is defined, 492 | // they'll be skipped; spec requires they be placed below type definition. 493 | // For nested types (e.g. video.movie) the OG protocol uses the super type 494 | // (e.g. movie) as the new namespace. 495 | if ( property === 'type' ) { 496 | namespace.push( content.split( '.' )[ 0 ].toLowerCase() ); // Add the type to the acceptable namespace list - as a property, should be lower case 497 | } 498 | } ); 499 | if ( Object.keys( meta ).length === 0 ) { 500 | reject( reason ); 501 | } 502 | if ( meta.type ) { 503 | // Make type case insensitive as this may be used programmatically 504 | meta.type = meta.type.toLowerCase(); 505 | } 506 | resolve( meta ); 507 | } ); 508 | }; 509 | 510 | /** 511 | * Scrapes schema.org microdata given Cheerio loaded html object 512 | * 513 | * @param {Object} chtml Cheerio object with html loaded 514 | * @return {Object} promise of schema.org microdata object 515 | */ 516 | exports.parseSchemaOrgMicrodata = function ( chtml ) { 517 | return new Promise( ( resolve, reject ) => { 518 | if ( !chtml ) { 519 | reject( new Error( 'Undefined argument' ) ); 520 | } 521 | 522 | const meta = microdata.toJson( chtml.html() ); 523 | if ( !meta || !meta.items || !meta.items[ 0 ] ) { 524 | reject( new Error( 'No schema.org metadata found in page' ) ); 525 | } 526 | resolve( meta ); 527 | } ); 528 | }; 529 | 530 | /** 531 | * Scrapes twitter microdata given Cheerio html object 532 | * 533 | * @param {Object} chtml html Cheerio object 534 | * @return {Object} promise of twitter metadata object 535 | */ 536 | 
exports.parseTwitter = function ( chtml ) { 537 | return new Promise( ( resolve, reject ) => { 538 | if ( !chtml ) { 539 | reject( new Error( 'Undefined argument' ) ); 540 | } 541 | 542 | const meta = {}; 543 | const metaTags = chtml( 'meta' ); 544 | 545 | // These properties can either be strings or objects 546 | const dualStateSubProperties = { 547 | image: 'url', 548 | player: 'url', 549 | creator: '@username' 550 | }; 551 | 552 | metaTags.each( function () { 553 | const element = chtml( this ); 554 | let name = element.attr( 'name' ); 555 | 556 | let property; 557 | const content = element.attr( 'content' ); 558 | let node; 559 | 560 | // Exit if not a twitter tag or content is missing 561 | if ( !name || !content ) { 562 | return; 563 | } else { 564 | name = name.toLowerCase().split( ':' ); 565 | property = name[ 1 ]; 566 | } 567 | 568 | // Exit if tag not twitter metadata 569 | if ( name[ 0 ] !== 'twitter' ) { 570 | return; 571 | } 572 | 573 | // Handle nested properties 574 | if ( name.length > 2 ) { 575 | const subProperty = name[ 2 ]; 576 | 577 | // Upgrade the property to an object if it needs to be 578 | if ( property in dualStateSubProperties && 579 | !( meta[ property ] instanceof Object ) ) { 580 | node = {}; 581 | node[ dualStateSubProperties[ property ] ] = meta[ property ]; 582 | // Clear out the existing string as we just placed it into our new node 583 | meta[ property ] = []; 584 | } else { 585 | // Either create a new node or ammend the existing one 586 | node = meta[ property ] ? meta[ property ] : {}; 587 | } 588 | 589 | // Differentiate betweeen twice and thrice nested properties 590 | // Not the prettiest solution, but twitter metadata guidelines are fairly strict, 591 | // so it's not nessesary to anticipate strange data. 
592 | if ( name.length === 3 ) { 593 | node[ subProperty ] = content; 594 | } else if ( name.length === 4 ) { 595 | // Solve twitter:player:stream:content_type where stream needs to be an obj 596 | if ( subProperty.toLowerCase() === 'stream' ) { 597 | node[ subProperty ] = { url: node[ subProperty ] }; 598 | } else { 599 | // Either create a new subnode or amend the existing one 600 | node[ subProperty ] = node[ subProperty ] ? node[ subProperty ] : {}; 601 | } 602 | node[ subProperty ][ name[ 3 ] ] = content; 603 | } else { 604 | // Something is malformed, so exit 605 | return; 606 | } 607 | } else { 608 | node = content; 609 | } 610 | 611 | // Create array if property exists and is not a nested object 612 | if ( meta[ property ] && !( meta[ property ] instanceof Object ) ) { 613 | if ( meta[ property ] instanceof Array ) { 614 | meta[ property ].push( node ); 615 | } else { 616 | meta[ property ] = [ meta[ property ], node ]; 617 | } 618 | } else { 619 | meta[ property ] = node; 620 | } 621 | } ); 622 | 623 | if ( Object.keys( meta ).length === 0 ) { 624 | reject( new Error( 'No twitter metadata found on this page' ) ); 625 | } 626 | 627 | resolve( meta ); 628 | } ); 629 | }; 630 | 631 | /** 632 | * Scrapes prism metadata given Cheerio html object 633 | * 634 | * @param {Object} chtml html Cheerio object 635 | * @return {Object} promise of prism metadata object 636 | */ 637 | exports.parsePrism = function ( chtml ) { 638 | return new Promise( ( resolve, reject ) => { 639 | if ( !chtml ) { 640 | reject( new Error( 'Undefined argument' ) ); 641 | } 642 | 643 | const meta = {}; 644 | const metaTags = chtml( 'meta' ); 645 | 646 | const reason = new Error( 'No PRISM metadata found in page' ); 647 | 648 | if ( !metaTags || metaTags.length === 0 ) { 649 | reject( reason ); 650 | } 651 | 652 | metaTags.each( function () { 653 | const element = chtml( this ); 654 | let name = element.attr( 'name' ); 655 | const content = element.attr( 'content' ); 656 | 657 | if ( !name 
|| !content ) { 658 | return; 659 | } else { 660 | name = name.split( '.' ); 661 | } 662 | 663 | // If the name does not have the prism prefix, exit 664 | if ( name[ 0 ].toLowerCase() !== 'prism' ) { 665 | return; 666 | } 667 | 668 | // Set the name to the value after the prefix 669 | name = name[ 1 ]; 670 | // Set the first character to lower case 671 | name = name.charAt( 0 ).toLowerCase() + name.slice( 1 ); 672 | 673 | // If the name already exists, make an array of the contents 674 | if ( meta[ name ] ) { 675 | if ( meta[ name ] instanceof Array ) { 676 | meta[ name ].push( content ); 677 | } else { 678 | meta[ name ] = [ meta[ name ], content ]; 679 | } 680 | } else { 681 | meta[ name ] = content; 682 | } 683 | } ); 684 | 685 | if ( Object.keys( meta ).length === 0 ) { 686 | reject( reason ); 687 | } 688 | 689 | resolve( meta ); 690 | } ); 691 | }; 692 | 693 | /** 694 | * Global exportable list of scraping promises with string keys 695 | * 696 | * @type {Object} 697 | */ 698 | exports.metadataFunctions = { 699 | bePress: exports.parseBEPress, 700 | coins: exports.parseCOinS, 701 | dublinCore: exports.parseDublinCore, 702 | eprints: exports.parseEprints, 703 | general: exports.parseGeneral, 704 | highwirePress: exports.parseHighwirePress, 705 | jsonLd: exports.parseJsonLd, 706 | openGraph: exports.parseOpenGraph, 707 | schemaOrg: exports.parseSchemaOrgMicrodata, 708 | twitter: exports.parseTwitter, 709 | prism: exports.parsePrism 710 | }; 711 | --------------------------------------------------------------------------------