├── test ├── index.js ├── utils │ └── assert.js ├── static │ ├── turtle_movie.json │ ├── turtle_movie.html │ ├── turtle_article_errors.html │ ├── turtle_article.json │ ├── turtle_article_case.html │ └── turtle_article.html ├── static.js ├── errors.js ├── scraping.js └── parsing.js ├── .jshintignore ├── .travis.yml ├── .gitignore ├── .eslintrc.json ├── .jshintrc ├── .github └── workflows │ └── node.js.yml ├── LICENSE.md ├── package.json ├── README.md ├── index.js └── lib └── index.js /test/index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | -------------------------------------------------------------------------------- /.jshintignore: -------------------------------------------------------------------------------- 1 | coverage 2 | node_modules 3 | test 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "4" 4 | - "6" 5 | - "8" 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | coverage 2 | node_modules 3 | npm-debug.log 4 | .eslintcache 5 | .nyc_output 6 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "extends": [ 4 | "wikimedia/server" 5 | ], 6 | "rules": { 7 | "camelcase": "off", 8 | "no-console": "off", 9 | "no-process-exit": "off", 10 | "no-shadow": "off", 11 | "no-underscore-dangle": "off", 12 | "no-use-before-define": "off", 13 | "es-x/no-hashbang": "off", 14 | "n/no-process-exit": "off", 15 | "jsdoc/newline-after-description": "off" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /.jshintrc: 
-------------------------------------------------------------------------------- 1 | { 2 | "bitwise": true, 3 | "laxbreak": true, 4 | "curly": true, 5 | "eqeqeq": true, 6 | "immed": true, 7 | "latedef": "nofunc", 8 | "newcap": true, 9 | "noarg": true, 10 | "noempty": true, 11 | "nonew": true, 12 | "regexp": false, 13 | "undef": true, 14 | "strict": true, 15 | "trailing": true, 16 | "smarttabs": true, 17 | "multistr": true, 18 | "node": true, 19 | "nomen": false, 20 | "loopfunc": true, 21 | "esnext": true 22 | } 23 | -------------------------------------------------------------------------------- /test/utils/assert.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const { use } = require( 'chai' ); 4 | 5 | module.exports = use( ( _chai ) => { 6 | const { assert } = _chai; 7 | 8 | assert.fails = ( promise ) => { 9 | 10 | let failed = false; 11 | 12 | function trackFailure( e ) { 13 | failed = true; 14 | return e; 15 | } 16 | 17 | function check() { 18 | if ( !failed ) { 19 | throw new Error( 'expected error was not thrown' ); 20 | } 21 | } 22 | return promise.catch( trackFailure ).then( check ); 23 | 24 | }; 25 | 26 | } ).assert; 27 | -------------------------------------------------------------------------------- /.github/workflows/node.js.yml: -------------------------------------------------------------------------------- 1 | # This workflow will do a clean installation of node dependencies, cache/restore them, build the source code and run tests across different versions of node 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-nodejs 3 | 4 | name: Node.js CI 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | strategy: 18 | matrix: 19 | node-version: [18.x, 20.x, 22.x, 24.x] 20 | # See supported Node.js release schedule at 
https://nodejs.org/en/about/releases/ 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Use Node.js ${{ matrix.node-version }} 25 | uses: actions/setup-node@v3 26 | with: 27 | node-version: ${{ matrix.node-version }} 28 | cache: 'npm' 29 | - run: npm ci 30 | - run: npm run build --if-present 31 | - run: npm test 32 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Marielle Volz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "html-metadata", 3 | "version": "3.0.1", 4 | "description": "Scrapes metadata of several different standards", 5 | "main": "index.js", 6 | "dependencies": { 7 | "cheerio": "1.0.0-rc.12", 8 | "microdata-node": "^2.0.0" 9 | }, 10 | "devDependencies": { 11 | "chai": "^4.3.0", 12 | "eslint-config-wikimedia": "0.29.0", 13 | "mocha": "^11.1.0", 14 | "mocha-lcov-reporter": "^1.3.0", 15 | "nock": "^13.3.0", 16 | "nyc": "^17.1.0" 17 | }, 18 | "scripts": { 19 | "test": "npm run lint && mocha", 20 | "lint": "eslint --cache --max-warnings 0 --ext .js,.json .", 21 | "lint:fix": "eslint --fix .", 22 | "coverage": "nyc --reporter=lcov _mocha" 23 | }, 24 | "engines": { 25 | "node": ">=18" 26 | }, 27 | "keywords": [ 28 | "bepress", 29 | "coins", 30 | "dublin core", 31 | "eprints", 32 | "highwire press", 33 | "json-ld", 34 | "open graph", 35 | "metadata", 36 | "microdata", 37 | "prism", 38 | "twitter cards", 39 | "web scraper" 40 | ], 41 | "repository": { 42 | "type": "git", 43 | "url": "https://github.com/wikimedia/html-metadata.git" 44 | }, 45 | "author": "Marielle Volz ", 46 | "contributors": [ 47 | "Krzysztof Zbudniewek ", 48 | "Geoffrey Mon ", 49 | "Scimonster " 50 | ], 51 | "license": "MIT", 52 | "bugs": { 53 | "url": "https://github.com/wikimedia/html-metadata/issues" 54 | }, 55 | "homepage": "https://github.com/wikimedia/html-metadata" 56 | } 57 | -------------------------------------------------------------------------------- /test/static/turtle_movie.json: -------------------------------------------------------------------------------- 1 | { 2 | "dublinCore": { 3 | "title": "Turtles of the Jungle", 4 | "creator": "http://www.example.com/turtlelvr", 5 | "description": "A 2008 film about jungle turtles.", 6 | "date": "2012-02-04 12:00:00", 7 | "type": "Image.Moving" 8 | }, 9 | 
"general": { 10 | "appleTouchIcons": [ 11 | { 12 | "href": "movieturtleapple.png" 13 | }, 14 | { 15 | "href": "movieturtleapple2.png", 16 | "sizes": "72x72" 17 | } 18 | ], 19 | "author": "Turtle Lvr", 20 | "authorlink": "http://examples.com/turtlelvr", 21 | "canonical": "http://example.com/turtles", 22 | "description": "Exposition on the awesomeness of turtles", 23 | "icons": [ 24 | { 25 | "href": "movieturtle.png", 26 | "type": "image/png" 27 | }, 28 | { 29 | "href": "movieturtle2.png", 30 | "sizes": "18x18" 31 | } 32 | ], 33 | "publisher": "https://mediawiki.org", 34 | "robots": "we welcome our robot overlords", 35 | "shortlink": "http://example.com/c", 36 | "title": "Turtles are AWESOME!!1 | Awesome Turtles Website", 37 | "lang": "en" 38 | }, 39 | "openGraph": { 40 | "locale": "en_US", 41 | "type": "video.movie", 42 | "title": "Turtles of the Jungle", 43 | "description": "A 2008 film about jungle turtles.", 44 | "url": "http://example.com", 45 | "site_name": "Awesome Turtle Movies Website", 46 | "image": [ { 47 | "url": "http://example.com/turtle.jpg" 48 | }, { 49 | "url": "http://example.com/shell.jpg" 50 | } ], 51 | "tag": [ "turtle", "movie", "awesome" ], 52 | "director": "http://www.example.com/PhilTheTurtle", 53 | "actor": [ "http://www.example.com/PatTheTurtle", "http://www.example.com/SaminaTheTurtle" ], 54 | "writer": "http://www.example.com/TinaTheTurtle", 55 | "release_date": "2015-01-14T19:14:27+00:00", 56 | "duration": "1000000" 57 | }, 58 | "twitter": { 59 | "card": "summary", 60 | "site": "@Turtlessssssssss", 61 | "creator": "@Turtlessssssssss", 62 | "url": "http://www.example.com/turtles", 63 | "title": "Turtles of the Jungle", 64 | "description": "A 2008 film about jungle turtles.", 65 | "player": { 66 | "url": "http://www.example.com/turtles/player", 67 | "width": "400", 68 | "height": "400", 69 | "stream": { 70 | "url": "http://www.example.com/turtles/turtle.mp4", 71 | "content_type": "video/mp4" 72 | } 73 | } 74 | }, 75 | "prism": { 76 | 
"publicationName": "Turtles of the Jungle", 77 | "publicationDate": "2012-02-04", 78 | "copyright": "2012 Turtles Society", 79 | "rightsAgent": "permissions@turtles.com", 80 | "url": "https://www.turtles.com" 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /test/static.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | /** 4 | * Tests using files contained in ./static 5 | */ 6 | 7 | const assert = require( './utils/assert.js' ); 8 | const cheerio = require( 'cheerio' ); 9 | const meta = require( '../index' ); 10 | 11 | // mocha defines to avoid eslint breakage 12 | /* global describe, it */ 13 | 14 | describe( 'static tests', () => { 15 | let $; 16 | const fs = require( 'fs' ); 17 | let expected; 18 | 19 | describe( 'static files', () => { 20 | it( 'should get correct info from turtle movie file', () => { 21 | expected = JSON.parse( fs.readFileSync( './test/static/turtle_movie.json' ) ); 22 | $ = cheerio.load( fs.readFileSync( './test/static/turtle_movie.html' ) ); 23 | return meta.parseAll( $ ).then( ( results ) => { 24 | assert.deepEqual( results, expected ); 25 | } ); 26 | } ); 27 | 28 | it( 'should get correct info from turtle article file', () => { 29 | expected = JSON.parse( fs.readFileSync( './test/static/turtle_article.json' ) ); 30 | $ = cheerio.load( fs.readFileSync( './test/static/turtle_article.html' ) ); 31 | return meta.parseAll( $ ).then( ( results ) => { 32 | assert.deepEqual( results, expected ); 33 | } ); 34 | } ); 35 | 36 | it( 'should be case insensitive on turtle article file', () => { 37 | expected = JSON.parse( fs.readFileSync( './test/static/turtle_article.json' ) ); 38 | $ = cheerio.load( fs.readFileSync( './test/static/turtle_article_case.html' ) ); 39 | return meta.parseAll( $ ).then( ( results ) => { 40 | assert.deepEqual( results, expected ); 41 | } ); 42 | } ); 43 | 44 | it( 'should be case insensitive on turtle article 
file', () => { 45 | expected = JSON.parse( fs.readFileSync( './test/static/turtle_article.json' ) ); 46 | $ = cheerio.load( fs.readFileSync( './test/static/turtle_article_case.html' ) ); 47 | return meta.parseAll( $ ).then( ( results ) => { 48 | assert.deepEqual( results, expected ); 49 | } ); 50 | } ); 51 | } ); 52 | 53 | describe( 'loadFromString', () => { 54 | it( 'should get correct info using loadFromString method from turtle movie file ', () => { 55 | expected = JSON.parse( fs.readFileSync( './test/static/turtle_movie.json' ) ); 56 | const html = fs.readFileSync( './test/static/turtle_movie.html' ); 57 | return meta.loadFromString( html ).then( ( results ) => { 58 | assert.deepEqual( results, expected ); 59 | } ); 60 | } ); 61 | 62 | it( 'should get correct info using loadFromString method for self closing tag', () => { 63 | const html = '
'; 64 | const expected = { schemaOrg: { items: [ { properties: { priceCurrency: [ 'PLN' ], price: [ '139.90' ] } } ] } }; 65 | return meta.loadFromString( html ).then( ( results ) => { 66 | assert.deepEqual( results, expected ); 67 | } ); 68 | } ); 69 | } ); 70 | 71 | } ); 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | html-metadata 2 | ============= 3 | [![npm](https://img.shields.io/npm/v/html-metadata.svg)](https://www.npmjs.com/package/html-metadata) 4 | > MetaData html scraper and parser for Node.js (supports Promises only. Callbacks were deprecated in 3.0.0) 5 | 6 | The aim of this library is to be a comprehensive source for extracting all html embedded metadata. Currently it supports Schema.org microdata using a third party library, a native BEPress, Dublin Core, Highwire Press, JSON-LD, Open Graph, Twitter, EPrints, PRISM, and COinS implementation, and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag, or meta description tags). 7 | 8 | Planned is support for RDFa, AGLS, and other yet unheard of metadata types. Contributions and requests for other metadata types welcome! 
9 | 10 | ## Install 11 | 12 | npm install html-metadata 13 | 14 | ## Usage 15 | 16 | ```js 17 | var scrape = require('html-metadata'); 18 | 19 | var url = "http://blog.woorank.com/2013/04/dublin-core-metadata-for-seo-and-usability/"; 20 | 21 | scrape(url).then(function(metadata){ 22 | console.log(metadata); 23 | }); 24 | ``` 25 | 26 | The scrape method used here invokes the parseAll() method, which uses all the available methods registered in method metadataFunctions(), and are available for use separately as well, for example: 27 | 28 | ```js 29 | var cheerio = require('cheerio'); 30 | var parseDublinCore = require('html-metadata').parseDublinCore; 31 | 32 | var url = "http://blog.woorank.com/2013/04/dublin-core-metadata-for-seo-and-usability/"; 33 | 34 | fetch(url).then(function(response){ 35 | $ = cheerio.load(response.body); 36 | return parseDublinCore($).then(function(metadata){ 37 | console.log(metadata); 38 | }); 39 | }); 40 | ``` 41 | 42 | Options dictionary: 43 | 44 | You can also pass an [options dictionary](https://developer.mozilla.org/en-US/docs/Web/API/RequestInit) as the first argument containing extra parameters. Some websites require the user-agent or cookies to be set in order to get the response. This is identical to the RequestInit dictionary except that it should also contain the requested url as part of the dictionary. 
45 | 46 | ``` 47 | var scrape = require('html-metadata'); 48 | 49 | var options = { 50 | url: "http://example.com", 51 | headers: { 52 | 'User-Agent': 'webscraper' 53 | } 54 | }; 55 | 56 | scrape(options, function(error, metadata){ 57 | console.log(metadata); 58 | }); 59 | ``` 60 | 61 | The method parseGeneral obtains the following general metadata: 62 | 63 | ```html 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | ``` 77 | 78 | ## Tests 79 | 80 | ```npm test``` runs the mocha tests 81 | 82 | ```npm run-script coverage``` runs the tests and reports code coverage 83 | 84 | ## Contributing 85 | 86 | Contributions welcome! All contibutions should use [Promises](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise) instead of callbacks. 87 | -------------------------------------------------------------------------------- /test/static/turtle_movie.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Turtles are AWESOME!!1 | Awesome Turtles Website 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /test/static/turtle_article_errors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 8 | 9 | 10 | 11 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 
73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /test/errors.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | /** 4 | * Tests expecting promises to reject 5 | */ 6 | 7 | const cheerio = require( 'cheerio' ); 8 | const meta = require( '../index' ); 9 | const assert = require( './utils/assert.js' ); 10 | const fs = require( 'fs' ); 11 | 12 | // mocha defines to avoid eslint breakage 13 | /* global describe, it */ 14 | 15 | describe( 'errors', function () { 16 | 17 | this.timeout( 40000 ); 18 | 19 | function fetchBody( url ) { 20 | // res.body is a ReadableStream of a Uint8Array, but we just want the string 21 | // eslint-disable-next-line n/no-unsupported-features/node-builtins 22 | return fetch( url ).then( ( res ) => res.text() ); 23 | } 24 | 25 | it( 'should not find schema.org metadata, reject promise', () => { 26 | const url = 'http://example.com'; 27 | return fetchBody( url ) 28 | .then( ( body ) => { 29 | const $ = cheerio.load( body ); 30 | const prom = meta.parseSchemaOrgMicrodata( $ ); 31 | return assert.fails( prom ); 32 | } ); 33 | } ); 34 | 35 | it( 'should not find BE Press metadata, reject promise', () => { 36 | const url = 'http://example.com'; 37 | return fetchBody( url ) 38 | .then( ( body ) => { 39 | const $ = cheerio.load( body ); 40 | const prom = meta.parseBEPress( $ ); 41 | return assert.fails( prom ); 42 | } ); 43 | } ); 44 | 45 | it( 'should not find coins metadata, reject promise', () => { 46 | const url = 'http://example.com'; 47 | return fetchBody( url ) 48 | .then( ( body ) => { 49 | const $ = cheerio.load( body ); 50 | const prom = meta.parseCOinS( $ ); 51 | return assert.fails( prom ); 52 | } ); 53 | } ); 54 | 55 | it( 'should not find dublin core metadata, reject promise', () => { 56 | const url = 
'http://www.laprovence.com/article/actualites/3411272/marseille-un-proche-du-milieu-corse-abattu-par-balles-en-plein-jour.html'; 57 | return fetchBody( url ) 58 | .then( ( body ) => { 59 | const $ = cheerio.load( body ); 60 | const prom = meta.parseDublinCore( $ ); 61 | return assert.fails( prom ); 62 | } ); 63 | } ); 64 | 65 | it( 'should not find highwire press metadata, reject promise', () => { 66 | const url = 'http://example.com'; 67 | return fetchBody( url ) 68 | .then( ( body ) => { 69 | const $ = cheerio.load( body ); 70 | const prom = meta.parseHighwirePress( $ ); 71 | return assert.fails( prom ); 72 | } ); 73 | } ); 74 | 75 | it( 'should not find open graph metadata, reject promise', () => { 76 | const url = 'http://www.example.com'; 77 | return fetchBody( url ) 78 | .then( ( body ) => { 79 | const $ = cheerio.load( body ); 80 | const prom = meta.parseOpenGraph( $ ); 81 | return assert.fails( prom ); 82 | } ); 83 | } ); 84 | 85 | it( 'should not find eprints metadata, reject promise', () => { 86 | const url = 'http://example.com'; 87 | return fetchBody( url ) 88 | .then( ( body ) => { 89 | const $ = cheerio.load( body ); 90 | const prom = meta.parseEprints( $ ); 91 | return assert.fails( prom ); 92 | } ); 93 | } ); 94 | 95 | it( 'should not find twitter metadata, reject promise', () => { 96 | const url = 'http://example.com'; 97 | return fetchBody( url ) 98 | .then( ( body ) => { 99 | const $ = cheerio.load( body ); 100 | const prom = meta.parseTwitter( $ ); 101 | return assert.fails( prom ); 102 | } ); 103 | } ); 104 | 105 | it( 'should not find JSON-LD, reject promise', () => { 106 | const url = 'http://example.com'; 107 | return fetchBody( url ) 108 | .then( ( body ) => { 109 | const $ = cheerio.load( body ); 110 | const prom = meta.parseJsonLd( $ ); 111 | return assert.fails( prom ); 112 | } ); 113 | } ); 114 | 115 | it( 'should reject parseALL promise for entire error file', () => { 116 | const $ = cheerio.load( fs.readFileSync( 
'./test/static/turtle_article_errors.html' ) ); 117 | return assert.fails( meta.parseAll( $ ) ); 118 | } ); 119 | 120 | it( 'should reject promise with undefined cheerio object', () => { 121 | const prom = meta.parseOpenGraph( undefined ); 122 | return assert.fails( prom ); 123 | } ); 124 | 125 | it( 'should reject promise with non-string title', () => { 126 | const prom = meta.parseCOinSTitle( {} ); 127 | return assert.fails( prom ); 128 | } ); 129 | 130 | it( 'should reject promise with string with no keys', () => { 131 | const prom = meta.parseCOinSTitle( '' ); 132 | return assert.fails( prom ); 133 | } ); 134 | 135 | it( 'should reject promise with string with bad keys', () => { 136 | const prom = meta.parseCOinSTitle( 'badkey.a&badkey.b' ); 137 | return assert.fails( prom ); 138 | } ); 139 | 140 | } ); 141 | -------------------------------------------------------------------------------- /test/static/turtle_article.json: -------------------------------------------------------------------------------- 1 | { 2 | "bePress": { 3 | "series_title": "Turtles", 4 | "author": "Turtle Lvr", 5 | "author_institution": "Mediawiki", 6 | "title": "Turtles are AWESOME!!1", 7 | "date": "2012", 8 | "pdf_url": "http://www.example.com/turtlelvr/pdf", 9 | "abstract_html_url": "http://www.example.com/turtlelvr", 10 | "publisher": "Turtles Society", 11 | "online_date": "2012/02/04" 12 | }, 13 | "coins": [ { 14 | "ctx_ver": "Z39.88-2004", 15 | "rft_id": "info:doi/http://dx.doi.org/10.5555/12345678", 16 | "rfr_id": "info:sid/crossref.org:search", 17 | "rft_val_fmt": "info:ofi/fmt:kev:mtx:journal", 18 | "rft": { 19 | "atitle": "Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory", 20 | "jtitle": "Journal of Psychoceramics", 21 | "date": "2008", 22 | "volume": "5", 23 | "issue": "11", 24 | "spage": "1", 25 | "epage": "3", 26 | "aufirst": "Josiah", 27 | "aulast": "Carberry", 28 | "genre": "article", 29 | "au": [ "Josiah Carberry" ] 30 | } 31 | } ], 32 | "dublinCore": 
{ 33 | "title": "Turtles are AWESOME!!1", 34 | "creator": "http://www.example.com/turtlelvr", 35 | "description": "Exposition on the awesomeness of turtles", 36 | "date": "2012-02-04 12:00:00", 37 | "type": "Text.Article" 38 | }, 39 | "general": { 40 | "appleTouchIcons": [ 41 | { 42 | "href": "turtleapple.png", 43 | "sizes": "72x72" 44 | }, 45 | { 46 | "href": "turtleapple2.png" 47 | } 48 | ], 49 | "author": "Turtle Lvr", 50 | "authorlink": "http://examples.com/turtlelvr", 51 | "canonical": "http://example.com/turtles", 52 | "description": "Exposition on the awesomeness of turtles", 53 | "dir": "ltr", 54 | "icons": [ 55 | { 56 | "href": "turtle.png", 57 | "sizes": "18x18", 58 | "type": "image/png" 59 | }, 60 | { 61 | "href": "turtle2.png", 62 | "type": "image/png" 63 | } 64 | ], 65 | "publisher": "https://mediawiki.org", 66 | "robots": "we welcome our robot overlords", 67 | "shortlink": "http://example.com/c", 68 | "title": "Turtles are AWESOME!!1 | Awesome Turtles Website", 69 | "lang": "en" 70 | }, 71 | "highwirePress": { 72 | "journal_title": "Turtles", 73 | "issn": "1234-5678", 74 | "doi": "10.1000/123", 75 | "publication_date": "2012-02-04", 76 | "title": "Turtles are AWESOME!!1", 77 | "author": "Turtle Lvr", 78 | "author_institution": "Mediawiki", 79 | "volume": "150", 80 | "issue": "1", 81 | "firstpage": "123", 82 | "lastpage": "456", 83 | "publisher": "Turtles Society", 84 | "abstract": "Exposition on the awesomeness of turtles." 
85 | }, 86 | "jsonLd": { 87 | "@context": "http://schema.org", 88 | "@type": "Organization", 89 | "url": "https://www.turtles.com" 90 | }, 91 | "openGraph": { 92 | "locale": "en_US", 93 | "type": "article", 94 | "title": "Turtles are AWESOME!!1", 95 | "description": "Exposition on the awesomeness of turtles", 96 | "url": "http://example.com", 97 | "site_name": "Awesome Turtles Website", 98 | "image": [ { 99 | "url": "http://example.com/turtle.jpg", 100 | "secure_url": "https://secure.example.com/turtle.jpg", 101 | "type": "image/jpeg", 102 | "width": "400", 103 | "height": "300" 104 | }, { 105 | "url": "http://example.com/shell.jpg", 106 | "width": "200", 107 | "height": "150" 108 | } ], 109 | "audio": { 110 | "url": "http://example.com/sound.mp3", 111 | "secure_url": "https://secure.example.com/sound.mp3", 112 | "type": "audio/mpeg" 113 | }, 114 | "tag": [ "turtles", "are", "awesome" ], 115 | "section": [ "Turtles are tough", "Turtles are flawless", "Turtles are cute" ], 116 | "published_time": "2012-02-04T12:00:00+00:00", 117 | "modified_time": "2015-01-14T19:14:27+00:00", 118 | "author": "http://examples.com/turtlelvr", 119 | "publisher": "http://mediawiki.org" 120 | }, 121 | "eprints": { 122 | "title": "Turtles are AWESOME!!1", 123 | "creators_name": "http://www.example.com/turtlelvr", 124 | "abstract": "Exposition on the awesomeness of turtles", 125 | "datestamp": "2012-02-04 12:00:00", 126 | "type": "article" 127 | }, 128 | "twitter": { 129 | "card": "summary", 130 | "site": "@Turtlessssssssss", 131 | "creator": [ "@Turtlessssssssss", "@Turtlezzzzzzzzzz" ], 132 | "url": "http://www.example.com/turtles", 133 | "title": "Turtles are AWESOME!!1", 134 | "description": "Exposition on the awesomeness of turtles", 135 | "image": { 136 | "url": "http://example.com/turtles.jpg", 137 | "alt": "It's a bunch of turtles!" 
138 | }, 139 | "app": { 140 | "url": { 141 | "iphone": "turtle://", 142 | "googleplay": "turtle://" 143 | }, 144 | "id": { 145 | "iphone": "000", 146 | "googleplay": "superturtlearticle.androidapp" 147 | } 148 | } 149 | }, 150 | "prism": { 151 | "issn": "1234-5678", 152 | "publicationName": "Turtles Society", 153 | "publicationDate": "2012-02-04", 154 | "startingPage": "123", 155 | "copyright": "2012 Turtles Society", 156 | "rightsAgent": "permissions@turtles.com", 157 | "url": "https://www.turtles.com", 158 | "doi": "10.1000/123" 159 | }, 160 | "schemaOrg": { 161 | "items": [ 162 | { 163 | "properties": { 164 | "archivedAt": [ 165 | "http://www.archive.org/turtlearticle" 166 | ], 167 | "headline": [ 168 | "Turtles are AWESOME!!1" 169 | ], 170 | "author": [ 171 | "Turtle Lvr" 172 | ], 173 | "wordCount": [ 174 | "10" 175 | ] 176 | } 177 | } 178 | ] 179 | } 180 | 181 | } 182 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * https://github.com/wikimedia/html-metadata 3 | * 4 | * This file wraps all exportable functions so that they 5 | * can be used with Promises. 
6 | */ 7 | 8 | 'use strict'; 9 | 10 | /* 11 | Import modules 12 | */ 13 | const cheerio = require( 'cheerio' ); 14 | 15 | const index = require( './lib/index.js' ); 16 | 17 | /** 18 | * Default exported function that takes a url string or 19 | * request library options dictionary and returns a 20 | * Promise for all available metadata 21 | * 22 | * @param {Object} urlOrOpts url String or options dictionary 23 | * @return {Object} Promise for metadata 24 | */ 25 | exports = module.exports = function ( urlOrOpts ) { 26 | return new Promise( ( resolve, reject ) => { 27 | let url, opts; 28 | if ( urlOrOpts instanceof Object ) { 29 | if ( urlOrOpts.uri ) { 30 | url = urlOrOpts.uri; 31 | } 32 | opts = urlOrOpts; 33 | } else if ( typeof urlOrOpts === 'string' ) { 34 | url = urlOrOpts; 35 | } 36 | if ( !url ) { 37 | reject( new Error( 'No uri supplied in argument' ) ); 38 | } else { 39 | resolve( 40 | // eslint-disable-next-line n/no-unsupported-features/node-builtins 41 | fetch( url, opts ).then( 42 | ( response ) => response.text().then( 43 | ( body ) => index.parseAll( cheerio.load( body ) ) 44 | ) 45 | ) 46 | ); 47 | } 48 | } ); 49 | }; 50 | 51 | /** 52 | * Exported function that takes html string and 53 | * returns a Promise for all available metadata 54 | * 55 | * @param {string} html html String HTML of the page 56 | * @return {Object} Promise for metadata 57 | */ 58 | exports.loadFromString = function ( html ) { 59 | return index.parseAll( cheerio.load( html ) ); 60 | }; 61 | 62 | /** 63 | * Returns Object containing all available datatypes, keyed 64 | * using the same keys as in metadataFunctions. 
65 | * 66 | * @param {Object} chtml html Cheerio object to parse 67 | * @return {Object} Promise for metadata 68 | */ 69 | exports.parseAll = function ( chtml ) { 70 | return index.parseAll( chtml ); 71 | }; 72 | 73 | /** 74 | * Scrapes BE Press metadata given html object 75 | * 76 | * @param {Object} chtml html Cheerio object 77 | * @return {Object} Promise for metadata 78 | */ 79 | exports.parseBEPress = function ( chtml ) { 80 | return index.parseBEPress( chtml ); 81 | }; 82 | 83 | /** 84 | * Scrapes embedded COinS data given Cheerio loaded html object 85 | * 86 | * @param {Object} chtml html Cheerio object 87 | * @return {Object} Promise for metadata 88 | */ 89 | exports.parseCOinS = function ( chtml ) { 90 | return index.parseCOinS( chtml ); 91 | }; 92 | 93 | /** 94 | * Parses value of COinS title tag 95 | * 96 | * @param {string} title String corresponding to value of title tag in span element 97 | * @return {Object} Promise for metadata 98 | */ 99 | exports.parseCOinSTitle = function ( title ) { 100 | return index.parseCOinSTitle( title ); 101 | }; 102 | 103 | /** 104 | * Scrapes Dublin Core data given Cheerio loaded html object 105 | * 106 | * @param {Object} chtml html Cheerio object 107 | * @return {Object} Promise for metadata 108 | */ 109 | exports.parseDublinCore = function ( chtml ) { 110 | return index.parseDublinCore( chtml ); 111 | }; 112 | 113 | /** 114 | * Scrapes EPrints data given Cheerio loaded html object 115 | * 116 | * @param {Object} chtml html Cheerio object 117 | * @return {Object} Promise for metadata 118 | */ 119 | exports.parseEprints = function ( chtml ) { 120 | return index.parseEprints( chtml ); 121 | }; 122 | 123 | /** 124 | * Scrapes general metadata terms given Cheerio loaded html object 125 | * 126 | * @param {Object} chtml html Cheerio object 127 | * @return {Object} Promise for metadata 128 | */ 129 | exports.parseGeneral = function ( chtml ) { 130 | return index.parseGeneral( chtml ); 131 | }; 132 | 133 | /** 134 | * Scrapes 
Highwire Press metadata given html object 135 | * 136 | * @param {Object} chtml html Cheerio object 137 | * @return {Object} Promise for metadata 138 | */ 139 | exports.parseHighwirePress = function ( chtml ) { 140 | return index.parseHighwirePress( chtml ); 141 | }; 142 | 143 | /** 144 | * Retrieves JSON-LD for given html object 145 | * 146 | * @param {Object} chtml html Cheerio object 147 | * @return {Object} Promise for JSON-LD 148 | */ 149 | exports.parseJsonLd = function ( chtml ) { 150 | return index.parseJsonLd( chtml ); 151 | }; 152 | 153 | /** 154 | * Scrapes OpenGraph data given html object 155 | * 156 | * @param {Object} chtml html Cheerio object 157 | * @return {Object} Promise for metadata 158 | */ 159 | exports.parseOpenGraph = function ( chtml ) { 160 | return index.parseOpenGraph( chtml ); 161 | }; 162 | 163 | /** 164 | * Scrapes schema.org microdata given Cheerio loaded html object 165 | * 166 | * @param {Object} chtml html Cheerio object 167 | * @return {Object} Promise for metadata 168 | */ 169 | exports.parseSchemaOrgMicrodata = function ( chtml ) { 170 | return index.parseSchemaOrgMicrodata( chtml ); 171 | }; 172 | 173 | /** 174 | * Scrapes Twitter data given html object 175 | * 176 | * @param {Object} chtml html Cheerio object 177 | * @return {Object} Promise for metadata 178 | */ 179 | exports.parseTwitter = function ( chtml ) { 180 | return index.parseTwitter( chtml ); 181 | }; 182 | 183 | /** 184 | * Scrapes PRISM data given html object 185 | * 186 | * @param {Object} chtml html Cheerio object 187 | * @return {Object} Promise for metadata 188 | */ 189 | exports.parsePrism = function ( chtml ) { 190 | return index.parsePrism( chtml ); 191 | }; 192 | 193 | /** 194 | * Global exportable list of scraping promises with string keys 195 | * 196 | * @type {Object} 197 | */ 198 | exports.metadataFunctions = index.metadataFunctions; 199 | 200 | /* 201 | Export the version 202 | */ 203 | 204 | exports.version = require( './package' ).version; 205 | 
-------------------------------------------------------------------------------- /test/scraping.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const meta = require( '../index' ); 4 | const assert = require( 'assert' ); 5 | const cheerio = require( 'cheerio' ); 6 | 7 | // mocha defines to avoid eslint breakage 8 | /* global describe, it */ 9 | 10 | describe( 'scraping', function () { 11 | 12 | this.timeout( 100000 ); 13 | 14 | const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'; 15 | const acceptHeader = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'; 16 | 17 | function getWithHeaders( url ) { 18 | // eslint-disable-next-line n/no-unsupported-features/node-builtins 19 | return fetch( url, { 20 | method: 'GET', 21 | headers: { 22 | 'User-Agent': userAgent, 23 | Accept: acceptHeader 24 | } 25 | // res.body is a ReadableStream of a Uint8Array, but we just want the string 26 | } ).then( ( res ) => res.text() ); 27 | } 28 | 29 | describe( 'parseAll function', () => { 30 | 31 | describe( 'Promise style', () => { 32 | it( 'should resolve promise from woorank with headers', () => { 33 | const url = 'https://www.woorank.com/en/blog/dublin-core-metadata-for-seo-and-usability'; 34 | return meta( { uri: url, headers: { 'User-Agent': userAgent, Accept: acceptHeader } } ) 35 | .then( ( result ) => { 36 | assert.ok( result, 'Expected result to be truthy' ); 37 | } ) 38 | .catch( ( e ) => { 39 | console.error( 'Error in woorank test:', e ); 40 | throw e; 41 | } ); 42 | } ); 43 | 44 | it( 'should resolve promise from blog.schema.org without headers', () => { 45 | const url = 'http://blog.schema.org'; 46 | return meta( url ) 47 | .then( ( result ) => { 48 | assert.ok( result, 'Expected result to be truthy' ); 49 | } ) 50 | .catch( ( e ) => { 51 | console.error( 'Error in blog.schema.org test:', e ); 52 | 
throw e; 53 | } ); 54 | } ); 55 | 56 | it( 'should throw error if no uri supplied', () => meta() 57 | .then( () => { 58 | assert.fail( 'Should have rejected the promise' ); 59 | } ) 60 | .catch( ( e ) => { 61 | assert.ok( e instanceof Error, 'Error should be an Error object' ); 62 | assert.strictEqual( e.message, 'No uri supplied in argument', 'Error message should match expected message' ); 63 | } ) 64 | ); 65 | 66 | it( 'should not have any undefined values', () => { 67 | const url = 'http://web.archive.org/web/20220127144804/https://www.cnet.com/special-reports/vr101/'; 68 | return getWithHeaders( url ).then( ( body ) => { 69 | const chtml = cheerio.load( body ); 70 | return meta.parseAll( chtml ) 71 | .then( ( results ) => { 72 | Object.keys( results ).forEach( ( metadataType ) => { 73 | Object.keys( results[ metadataType ] ).forEach( ( key ) => { 74 | assert.notStrictEqual( results[ metadataType ][ key ], undefined, `${ metadataType }.${ key } should not be undefined` ); 75 | } ); 76 | } ); 77 | } ); 78 | } ); 79 | } ); 80 | 81 | } ); 82 | 83 | describe( 'Await style', () => { 84 | 85 | it( 'should support await implementation with headers', async () => { 86 | const url = 'http://blog.schema.org'; 87 | const result = await meta( { uri: url, headers: { 'User-Agent': userAgent, Accept: acceptHeader } } ); 88 | assert.ok( result, 'Expected result to be truthy' ); 89 | } ); 90 | 91 | it( 'should support await implementation without headers', async () => { 92 | const url = 'http://blog.schema.org'; 93 | const result = await meta( url ); 94 | assert.ok( result, 'Expected result to be truthy' ); 95 | } ); 96 | 97 | it( 'should throw error if no uri is supplied with async/await', async () => { 98 | try { 99 | await meta(); 100 | assert.fail( 'Should have thrown an error' ); 101 | } catch ( e ) { 102 | assert.ok( e instanceof Error, 'Error should be an Error object' ); 103 | assert.strictEqual( e.message, 'No uri supplied in argument', 'Error message should match 
expected message' ); 104 | } 105 | } ); 106 | } ); 107 | 108 | } ); 109 | 110 | describe( 'Individual metadata functions', () => { 111 | it( 'should get BE Press metadata tags', () => { 112 | const url = 'http://biostats.bepress.com/harvardbiostat/paper154/'; 113 | return getWithHeaders( url ).then( ( body ) => { 114 | const expectedAuthors = [ 'Claggett, Brian', 'Xie, Minge', 'Tian, Lu' ]; 115 | const expectedAuthorInstitutions = [ 'Harvard', 'Rutgers University - New Brunswick/Piscataway', 'Stanford University School of Medicine' ]; 116 | const chtml = cheerio.load( body ); 117 | return meta.parseBEPress( chtml ) 118 | .then( ( results ) => { 119 | assert.deepStrictEqual( results.author, expectedAuthors ); 120 | assert.deepStrictEqual( 121 | results.author_institution, 122 | expectedAuthorInstitutions 123 | ); 124 | [ 'series_title', 'author', 'author_institution', 'title', 'date', 'pdf_url', 125 | 'abstract_html_url', 'publisher', 'online_date' ].forEach( ( key ) => { 126 | assert.ok( results[ key ], `Expected to find the ${ key } key in the response!` ); 127 | } ); 128 | } ); 129 | } ); 130 | } ); 131 | 132 | it( 'should get COinS metadata', () => { 133 | const url = 'https://en.wikipedia.org/wiki/Viral_phylodynamics'; 134 | return getWithHeaders( url ).then( ( body ) => { 135 | const chtml = cheerio.load( body ); 136 | return meta.parseCOinS( chtml ) 137 | .then( ( results ) => { 138 | assert.ok( Array.isArray( results ), `Expected Array, got ${ typeof results }` ); 139 | assert.ok( results.length > 0, 'Expected Array with at least 1 item' ); 140 | assert.ok( results[ 0 ].rft, 'Expected first item of Array to contain key rft' ); 141 | } ); 142 | } ); 143 | } ); 144 | 145 | it( 'should get EPrints metadata', () => { 146 | const url = 'http://eprints.gla.ac.uk/113711/'; 147 | return getWithHeaders( url ).then( ( body ) => { 148 | const chtml = cheerio.load( body ); 149 | const expectedAuthors = [ 'Gatherer, Derek', 'Kohl, Alain' ]; 150 | 151 | return 
meta.parseEprints( chtml ) 152 | .then( ( results ) => { 153 | assert.deepStrictEqual( results.creators_name, expectedAuthors ); 154 | [ 'eprintid', 'datestamp', 'title', 'abstract', 'issn', 'creators_name', 'publication', 'citation' ].forEach( ( key ) => { 155 | assert.ok( results[ key ], `Expected to find the ${ key } key in the response!` ); 156 | } ); 157 | } ); 158 | } ); 159 | } ); 160 | 161 | it( 'should get general metadata', () => { 162 | const expected = 'Example Domain'; 163 | const url = 'http://example.com'; 164 | return getWithHeaders( url ).then( ( body ) => { 165 | const chtml = cheerio.load( body ); 166 | return meta.parseGeneral( chtml ).then( ( results ) => { 167 | assert.strictEqual( results.title, expected ); 168 | } ); 169 | } ); 170 | } ); 171 | } ); 172 | 173 | } ); 174 | -------------------------------------------------------------------------------- /test/parsing.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | /** 4 | * Tests using parsing methods only 5 | */ 6 | 7 | const assert = require( './utils/assert.js' ); 8 | const meta = require( '../index' ); 9 | 10 | // mocha defines to avoid eslint breakage 11 | /* global describe, it */ 12 | 13 | describe( 'parsing', () => { 14 | 15 | it( 'should get correct structure from decoded string', () => { 16 | const title = 'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&rft.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&rft.jtitle=Journal+of+Psychoceramics&rft.date=2008&rft.volume=5&rft.issue=11&rft.spage=1&rft.epage=3&rft.aufirst=Josiah&rft.aulast=Carberry&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&'; 17 | const expected = { 18 | ctx_ver: 'Z39.88-2004', 19 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12345678', 20 | rfr_id: 'info:sid/crossref.org:search', 21 | rft_val_fmt: 
'info:ofi/fmt:kev:mtx:journal', 22 | rft: { 23 | atitle: 'Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory', 24 | jtitle: 'Journal of Psychoceramics', 25 | date: '2008', 26 | volume: '5', 27 | issue: '11', 28 | spage: '1', 29 | epage: '3', 30 | aufirst: 'Josiah', 31 | aulast: 'Carberry', 32 | genre: 'article' 33 | } 34 | }; 35 | 36 | return meta.parseCOinSTitle( title ).then( ( results ) => { 37 | assert.deepEqual( results, expected ); 38 | } ); 39 | } ); 40 | 41 | it( 'should get correct structure from html encoded string', () => { 42 | const title = 'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&rft.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&rft.jtitle=Journal+of+Psychoceramics&rft.date=2008&rft.volume=5&rft.issue=11&rft.spage=1&rft.epage=3&rft.aufirst=Josiah&rft.aulast=Carberry&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&'; 43 | const expected = { 44 | ctx_ver: 'Z39.88-2004', 45 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12345678', 46 | rfr_id: 'info:sid/crossref.org:search', 47 | rft_val_fmt: 'info:ofi/fmt:kev:mtx:journal', 48 | rft: { 49 | atitle: 'Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory', 50 | jtitle: 'Journal of Psychoceramics', 51 | date: '2008', 52 | volume: '5', 53 | issue: '11', 54 | spage: '1', 55 | epage: '3', 56 | aufirst: 'Josiah', 57 | aulast: 'Carberry', 58 | genre: 'article' 59 | } 60 | }; 61 | 62 | return meta.parseCOinSTitle( title ).then( ( results ) => { 63 | assert.deepEqual( results, expected ); 64 | } ); 65 | } ); 66 | 67 | it( 'should not add rft object when there are no valid keys', () => { 68 | const title = 
'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&badkey.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&badkey.jtitle=Journal+of+Psychoceramics&badkey.date=2008&badkey.volume=5&badkey.issue=11&badkey.spage=1&badkey.epage=3&badkey.aufirst=Josiah&badkey.aulast=Carberry&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&badkey.genre=article&badkey.au=Josiah+Carberry'; 69 | const expected = { 70 | ctx_ver: 'Z39.88-2004', 71 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12345678', 72 | rfr_id: 'info:sid/crossref.org:search', 73 | rft_val_fmt: 'info:ofi/fmt:kev:mtx:journal' 74 | }; 75 | 76 | return meta.parseCOinSTitle( title ).then( ( results ) => { 77 | assert.deepEqual( results, expected ); 78 | } ); 79 | } ); 80 | 81 | it( 'should not replace encoded + symbol in doi', () => { 82 | const title = 'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12%2B345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&badkey.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&badkey.jtitle=Journal+of+Psychoceramics&badkey.date=2008&badkey.volume=5&badkey.issue=11&badkey.spage=1&badkey.epage=3&badkey.aufirst=Josiah&badkey.aulast=Carberry&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&badkey.genre=article&badkey.au=Josiah+Carberry'; 83 | const expected = { 84 | ctx_ver: 'Z39.88-2004', 85 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12+345678', 86 | rfr_id: 'info:sid/crossref.org:search', 87 | rft_val_fmt: 'info:ofi/fmt:kev:mtx:journal' 88 | }; 89 | 90 | return meta.parseCOinSTitle( title ).then( ( results ) => { 91 | assert.deepEqual( results, expected ); 92 | } ); 93 | } ); 94 | 95 | it( 'should add list for au field', () => { 96 | const title = 
'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.au=Josiah+Carberry&rft.au=Random+Name&rft.au=Name+of+an+organisation'; 97 | const expected = { 98 | ctx_ver: 'Z39.88-2004', 99 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12345678', 100 | rfr_id: 'info:sid/crossref.org:search', 101 | rft_val_fmt: 'info:ofi/fmt:kev:mtx:journal', 102 | rft: { 103 | genre: 'article', 104 | au: [ 105 | 'Josiah Carberry', 106 | 'Random Name', 107 | 'Name of an organisation' 108 | ] 109 | } 110 | }; 111 | 112 | return meta.parseCOinSTitle( title ).then( ( results ) => { 113 | assert.deepEqual( results, expected ); 114 | } ); 115 | } ); 116 | 117 | it( 'should add list for issn and aucorp field', () => { 118 | const title = 'rft.genre=article&rft.issn=1234-5678&rft.issn=2222-3333&rft.aucorp=Name+of+an+organisation'; 119 | const expected = { 120 | rft: { 121 | genre: 'article', 122 | aucorp: [ 123 | 'Name of an organisation' 124 | ], 125 | issn: [ 126 | '1234-5678', 127 | '2222-3333' 128 | ] 129 | } 130 | }; 131 | 132 | return meta.parseCOinSTitle( title ).then( ( results ) => { 133 | assert.deepEqual( results, expected ); 134 | } ); 135 | } ); 136 | 137 | it( 'should ignore bad hierarchical keys', () => { 138 | const title = 'ctx_ver=Z39.88-2004&rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&rfr_id=info%3Asid%2Fcrossref.org%3Asearch&badkey.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&badkey.jtitle=Journal+of+Psychoceramics&badkey.date=2008&badkey.volume=5&badkey.issue=11&badkey.spage=1&badkey.epage=3&badkey.aufirst=Josiah&badkey.aulast=Carberry&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&badkey.au=Josiah+Carberry'; 139 | const expected = { 140 | ctx_ver: 'Z39.88-2004', 141 | rft_id: 'info:doi/http://dx.doi.org/10.5555/12345678', 142 | rfr_id: 
'info:sid/crossref.org:search', 143 | rft_val_fmt: 'info:ofi/fmt:kev:mtx:journal', 144 | rft: { 145 | genre: 'article' 146 | } 147 | }; 148 | 149 | return meta.parseCOinSTitle( title ).then( ( results ) => { 150 | assert.deepEqual( results, expected ); 151 | } ); 152 | } ); 153 | 154 | } ); 155 | -------------------------------------------------------------------------------- /test/static/turtle_article_case.html: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | Turtles are AWESOME!!1 | Awesome Turtles Website 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 |
153 |

Turtles are AWESOME!!1

154 | 155 | 156 | Turtle Article Archive 157 |
158 | 159 | 160 | 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /test/static/turtle_article.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Turtles are AWESOME!!1 | Awesome Turtles Website 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 |
151 |

Turtles are AWESOME!!1

152 | 153 | 154 | Turtle Article Archive 155 |
156 | 157 | 158 | 164 | 165 | 166 | 177 | 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const microdata = require( 'microdata-node' ); // Schema.org microdata 4 | 5 | /** 6 | * Returns Object containing all available datatypes, keyed 7 | * using the same keys as in metadataFunctions. 8 | * 9 | * @param {Object} chtml html Cheerio object to parse 10 | * @return {Object} Promise for metadata 11 | */ 12 | exports.parseAll = function ( chtml ) { 13 | // Array of keys corresponding to position of promise 14 | const keys = Object.keys( exports.metadataFunctions ); 15 | const meta = {}; // Metadata keyed by keys in exports.metadataFunctions 16 | // Array of promises for metadata of each type in exports.metadataFunctions 17 | const arr = keys.map( ( key ) => exports.metadataFunctions[ key ]( chtml ) ); 18 | 19 | let result; // Result in for loop over results 20 | let key; // Key corresponding to location of result 21 | 22 | return Promise.all( arr.map( ( promise ) => promise.then( 23 | // Create a promise that will always resolve with either the result or the error 24 | ( value ) => ( { status: 'fulfilled', value } ), 25 | ( error ) => ( { status: 'rejected', reason: error } ) 26 | ) 27 | ) ) 28 | .then( ( results ) => { 29 | Object.keys( results ).forEach( ( r ) => { 30 | result = results[ r ]; 31 | key = keys[ r ]; 32 | if ( result && result.status === 'fulfilled' && result.value ) { 33 | meta[ key ] = result.value; 34 | } 35 | } ); 36 | if ( Object.keys( meta ).length === 0 ) { 37 | throw new Error( 'No metadata found in page' ); 38 | } 39 | return meta; 40 | } ); 41 | }; 42 | 43 | /** 44 | * Base scraper for tags, used by some other parsing functions 45 | * 46 | * @param {Object} chtml html Cheerio object 47 | * @param {string[]} tags tag types to process 48 | * @param {string} reason 
message when metadata is not found 49 | * @param {Function} getProperty function that gets the property of an element 50 | * @param {Function} getContent function that gets the content of an element 51 | * @return {Object} promise of metadata object 52 | */ 53 | exports.parseBase = function ( chtml, tags, reason, getProperty, getContent ) { 54 | return new Promise( ( resolve, reject ) => { 55 | const meta = {}; 56 | const metaTags = chtml( tags.join() ); 57 | 58 | if ( !metaTags || metaTags.length === 0 ) { 59 | reject( new Error( reason ) ); 60 | } 61 | 62 | metaTags.each( function () { 63 | const element = chtml( this ); 64 | const property = getProperty( element ); 65 | const content = getContent( element ); 66 | 67 | // If lacks property or content, skip 68 | if ( !property || !content ) { 69 | return; 70 | } 71 | 72 | // If the property already exists, make the array of contents 73 | if ( meta[ property ] ) { 74 | if ( meta[ property ] instanceof Array ) { 75 | meta[ property ].push( content ); 76 | } else { 77 | meta[ property ] = [ meta[ property ], content ]; 78 | } 79 | } else { 80 | meta[ property ] = content; 81 | } 82 | } ); 83 | 84 | if ( !Object.keys( meta ).length ) { 85 | reject( new Error( reason ) ); 86 | } 87 | 88 | resolve( meta ); 89 | } ); 90 | }; 91 | 92 | /** 93 | * Scrapes BE Press metadata given html object 94 | * 95 | * @param {Object} chtml html Cheerio object 96 | * @return {Object} promise of BE Press metadata object 97 | */ 98 | exports.parseBEPress = function ( chtml ) { 99 | return exports.parseBase( 100 | chtml, 101 | [ 'meta' ], 102 | 'No BE Press metadata found in page', 103 | ( element ) => { 104 | const content = element.attr( 'content' ); 105 | const name = element.attr( 'name' ); 106 | 107 | // If the element isn't a BE Press property or if content is missing, skip it 108 | if ( !name || !content || ( name.slice( 0, 17 ).toLowerCase() !== 'bepress_citation_' ) ) { 109 | return; 110 | } 111 | 112 | return name.slice( 17 
).toLowerCase(); 113 | }, 114 | ( element ) => element.attr( 'content' ) 115 | ); 116 | }; 117 | 118 | /** 119 | * Scrapes COinS data given Cheerio loaded html object 120 | * 121 | * @param {Object} chtml html Cheerio object 122 | * @return {Object} Promise for COinS metadata 123 | */ 124 | exports.parseCOinS = function ( chtml ) { 125 | let title; 126 | const metadata = []; 127 | const tags = chtml( 'span[class=Z3988]' ); 128 | const promArray = []; 129 | 130 | // Add promises for parsed title tags to an Array 131 | tags.each( function () { 132 | title = chtml( this ).attr( 'title' ); 133 | promArray.push( exports.parseCOinSTitle( title ) ); 134 | } ); 135 | 136 | // Once promises have resolved, add any successfully parsed titles to the metadata Array 137 | return Promise.all( promArray.map( ( promise ) => promise.then( 138 | ( value ) => ( { status: 'fulfilled', value } ), 139 | ( error ) => ( { status: 'rejected', reason: error } ) 140 | ) ) ).then( ( results ) => { 141 | let result; 142 | for ( const r in results ) { 143 | result = results[ r ]; 144 | if ( result && result.status === 'fulfilled' && result.value ) { 145 | metadata.push( result.value ); 146 | } 147 | } 148 | if ( !metadata.length ) { 149 | throw new Error( 'No COinS metadata found' ); 150 | } else { 151 | return metadata; 152 | } 153 | } ); 154 | }; 155 | 156 | /** 157 | * Parses value of COinS title tag 158 | * 159 | * @param {string} title String corresponding to value of title tag in span element 160 | * @return {Object} Promise for CoinS metadata 161 | */ 162 | exports.parseCOinSTitle = function ( title ) { 163 | return new Promise( ( resolve, reject ) => { 164 | const metadata = {}; 165 | const rft = {}; 166 | let value; 167 | let key; 168 | if ( typeof title !== 'string' ) { 169 | reject( new Error( 'Provided value must be a string; Got ' + typeof title ) ); 170 | } 171 | title = title.replace( /&/g, '&' ); // Allows function to take the raw html string 172 | title = title.split( '&' ); 173 
| title.forEach( ( element ) => { 174 | element = element.split( '=' ); 175 | if ( element.length !== 2 ) { 176 | return; 177 | } // Invalid element 178 | key = element[ 0 ].toLowerCase(); // Be case-insensitive for properties 179 | value = decodeURIComponent( element[ 1 ].replace( /\+/g, '%20' ) ); // Replace + with encoded space since they aren't getting decoded as spaces 180 | key = key.split( '.' ); // Split hierarchical keys 181 | if ( key.length === 1 ) { // Top level key 182 | metadata[ key[ 0 ] ] = value; 183 | return; 184 | } 185 | if ( key.length === 2 ) { // Split key e.g. rft.date 186 | if ( key[ 0 ] !== 'rft' ) { 187 | return; 188 | } // Invalid hierarchical key 189 | // Keys that may have multiple values - return in list format 190 | if ( key[ 1 ] === 'au' || key[ 1 ] === 'isbn' || key[ 1 ] === 'issn' || key[ 1 ] === 'eissn' || key[ 1 ] === 'aucorp' ) { 191 | if ( !rft[ key[ 1 ] ] ) { 192 | rft[ key[ 1 ] ] = []; 193 | } 194 | rft[ key[ 1 ] ].push( value ); 195 | return; 196 | } 197 | // Add rft value to rft key - this will overwrite duplicates, if they exist 198 | rft[ key[ 1 ] ] = value; 199 | } 200 | } ); 201 | if ( Object.keys( rft ).length ) { // Add rft object if it is not empty 202 | metadata.rft = rft; 203 | } 204 | if ( !Object.keys( metadata ).length ) { 205 | reject( new Error( 'No COinS in provided string' ) ); 206 | } 207 | if ( metadata.rft && metadata.rft.genre ) { 208 | // Genre should be case insensitive as this field may be used programmatically 209 | metadata.rft.genre = metadata.rft.genre.toLowerCase(); 210 | } 211 | resolve( metadata ); 212 | } ); 213 | }; 214 | 215 | /** 216 | * Scrapes Dublin Core data given Cheerio loaded html object 217 | * 218 | * @param {Object} chtml html Cheerio object 219 | * @return {Object} Promise for DC metadata 220 | */ 221 | exports.parseDublinCore = function ( chtml ) { 222 | return exports.parseBase( 223 | chtml, 224 | [ 'meta', 'link' ], 225 | 'No Dublin Core metadata found in page', 226 | ( 
element ) => { 227 | const isLink = element[ 0 ].name === 'link'; 228 | const nameAttr = element.attr( isLink ? 'rel' : 'name' ); 229 | const value = element.attr( isLink ? 'href' : 'content' ); 230 | 231 | // If the element isn't a Dublin Core property or if value is missing, skip it 232 | if ( !nameAttr || !value || 233 | ( nameAttr.slice( 0, 3 ).toUpperCase() !== 'DC.' && 234 | nameAttr.slice( 0, 8 ).toUpperCase() !== 'DCTERMS.' ) ) { 235 | return; 236 | } 237 | 238 | const property = nameAttr.slice( Math.max( 0, nameAttr.lastIndexOf( '.' ) + 1 ) ).toLowerCase(); 239 | 240 | return property; 241 | }, 242 | ( element ) => { 243 | const isLink = element[ 0 ].name === 'link'; 244 | return element.attr( isLink ? 'href' : 'content' ); 245 | } 246 | ); 247 | }; 248 | 249 | /** 250 | * Scrapes EPrints data given Cheerio loaded html object 251 | * 252 | * @param {Object} chtml html Cheerio object 253 | * @return {Object} Promise for EPrints metadata 254 | */ 255 | exports.parseEprints = function ( chtml ) { 256 | return exports.parseBase( 257 | chtml, 258 | [ 'meta' ], 259 | 'No EPrints metadata found in page', 260 | ( element ) => { 261 | const nameAttr = element.attr( 'name' ); 262 | const content = element.attr( 'content' ); 263 | 264 | // If the element isn't an EPrints property or content is missing, skip it 265 | if ( !nameAttr || !content || nameAttr.slice( 0, 8 ).toLowerCase() !== 'eprints.' ) { 266 | return; 267 | } 268 | 269 | let property = nameAttr.slice( Math.max( 0, nameAttr.lastIndexOf( '.' 
) + 1 ) ); 270 | 271 | // Lowercase property 272 | property = property.toLowerCase(); 273 | return property; 274 | }, 275 | ( element ) => element.attr( 'content' ) 276 | ).then( ( results ) => { 277 | if ( results.type ) { 278 | results.type = results.type.toLowerCase(); // Standardise 'type' field to lowercase 279 | } 280 | return results; 281 | } ); 282 | }; 283 | 284 | /** 285 | * Scrapes general metadata terms given Cheerio loaded html object 286 | * 287 | * @param {Object} chtml html Cheerio object 288 | * @return {Object} Promise for general metadata 289 | */ 290 | exports.parseGeneral = function ( chtml ) { 291 | return new Promise( ( resolve, reject ) => { 292 | const clutteredMeta = { 293 | appleTouchIcons: chtml( 'link[rel=apple-touch-icon i]' ).map( ( i, e ) => ( { 294 | href: e.attribs.href, 295 | sizes: e.attribs.sizes 296 | } ) ).get(), // apple-touch-icon 297 | icons: chtml( 'link[rel="shortcut icon" i], link[rel="icon" i]' ).map( ( i, e ) => ( { 298 | href: e.attribs.href, 299 | sizes: e.attribs.sizes, 300 | type: e.attribs.type 301 | } ) ).get(), // icon 302 | author: chtml( 'meta[name=author i]' ).first().attr( 'content' ), // author 303 | authorlink: chtml( 'link[rel=author i]' ).first().attr( 'href' ), // author link 304 | canonical: chtml( 'link[rel=canonical i]' ).first().attr( 'href' ), // canonical link 305 | description: chtml( 'meta[name=description i]' ).attr( 'content' ), // meta description 306 | publisher: chtml( 'link[rel=publisher i]' ).first().attr( 'href' ), // publisher link 307 | robots: chtml( 'meta[name=robots i]' ).first().attr( 'content' ), // robots 308 | shortlink: chtml( 'link[rel=shortlink i]' ).first().attr( 'href' ), // short link 309 | title: chtml( 'title' ).first().text(), // title tag 310 | lang: chtml( 'html' ).first().attr( 'lang' ) || chtml( 'html' ).first().attr( 'xml:lang' ), // lang <html lang=""> or <html xml:lang=""> 311 | dir: chtml( 'html' ).first().attr( 'dir' ) // dir <html dir=""> 312 | }; 313 | 314 | 
// Copy key-value pairs with defined values to meta 315 | const meta = {}; 316 | let value; 317 | let notEmpty = false; 318 | Object.keys( clutteredMeta ).forEach( ( key ) => { 319 | notEmpty = false; 320 | value = clutteredMeta[ key ]; 321 | let innerValue; 322 | if ( value && typeof value === 'object' ) { 323 | let i; 324 | for ( i = 0; i < Object.keys( value ).length; i++ ) { 325 | const definedValue = {}; 326 | // eslint-disable-next-line no-loop-func 327 | Object.keys( value[ i ] ).forEach( ( objectProperty ) => { 328 | innerValue = value[ i ][ objectProperty ]; 329 | if ( innerValue ) { 330 | definedValue[ objectProperty ] = innerValue; 331 | notEmpty = true; 332 | } 333 | } ); 334 | value[ i ] = definedValue; 335 | } 336 | } else { 337 | notEmpty = true; 338 | } 339 | if ( value && notEmpty ) { // Only add if has value 340 | meta[ key ] = value; 341 | } 342 | } ); 343 | 344 | // Reject promise if meta is empty 345 | if ( Object.keys( meta ).length === 0 ) { 346 | reject( new Error( 'No general metadata found in page' ) ); 347 | } 348 | 349 | // Resolve on meta 350 | resolve( meta ); 351 | } ); 352 | }; 353 | 354 | /** 355 | * Scrapes Highwire Press metadata given html object 356 | * 357 | * @param {Object} chtml html Cheerio object 358 | * @return {Object} promise of highwire press metadata object 359 | */ 360 | exports.parseHighwirePress = function ( chtml ) { 361 | return exports.parseBase( 362 | chtml, 363 | [ 'meta' ], 364 | 'No Highwire Press metadata found in page', 365 | ( element ) => { 366 | const nameAttr = element.attr( 'name' ); 367 | const content = element.attr( 'content' ); 368 | 369 | // If the element isn't a Highwire Press property, skip it 370 | if ( !nameAttr || !content || ( nameAttr.slice( 0, 9 ).toLowerCase() !== 'citation_' ) ) { 371 | return; 372 | } 373 | 374 | return nameAttr.slice( Math.max( 0, nameAttr.indexOf( '_' ) + 1 ) ).toLowerCase(); 375 | }, 376 | ( element ) => element.attr( 'content' ) 377 | ); 378 | }; 379 | 380 | /** 
381 | * Returns JSON-LD provided by page given HTML object 382 | * 383 | * @param {Object} chtml html Cheerio object 384 | * @return {Object} Promise for JSON-LD 385 | */ 386 | exports.parseJsonLd = function ( chtml ) { 387 | return new Promise( ( resolve, reject ) => { 388 | const json = []; 389 | const jsonLd = chtml( 'script[type="application/ld+json"]' ); 390 | 391 | jsonLd.each( function () { 392 | let contents; 393 | try { 394 | contents = JSON.parse( this.children[ 0 ].data ); 395 | } catch ( e ) { 396 | // Fail silently, just in case there are valid tags 397 | return; 398 | } 399 | if ( contents ) { 400 | json.push( contents ); 401 | } else { 402 | return; 403 | } 404 | } ); 405 | 406 | if ( json.length === 0 ) { 407 | reject( new Error( 'No JSON-LD valid script tags present on page' ) ); 408 | } 409 | 410 | resolve( json.length > 1 ? json : json[ 0 ] ); 411 | } ); 412 | }; 413 | 414 | /** 415 | * Scrapes OpenGraph data given html object 416 | * 417 | * @param {Object} chtml html Cheerio object 418 | * @return {Object} promise of open graph metadata object 419 | */ 420 | exports.parseOpenGraph = function ( chtml ) { 421 | return new Promise( ( resolve, reject ) => { 422 | let property; 423 | let node; 424 | const meta = {}; 425 | const metaTags = chtml( 'meta' ); 426 | const namespace = [ 'og', 'fb' ]; 427 | const subProperty = { 428 | image: 'url', 429 | video: 'url', 430 | audio: 'url' 431 | }; 432 | const roots = {}; // Object to store roots of different type i.e. 
image, audio 433 | let subProp; // Current subproperty of interest 434 | const reason = new Error( 'No openGraph metadata found in page' ); 435 | 436 | if ( !metaTags || metaTags.length === 0 ) { 437 | reject( reason ); 438 | } 439 | 440 | metaTags.each( function () { 441 | const element = chtml( this ); 442 | let propertyValue = element.attr( 'property' ); 443 | const content = element.attr( 'content' ); 444 | 445 | if ( !propertyValue || !content ) { 446 | return; 447 | } else { 448 | propertyValue = propertyValue.toLowerCase().split( ':' ); 449 | } 450 | 451 | // If the property isn't in namespace, exit 452 | if ( !namespace.includes( propertyValue[ 0 ] ) ) { 453 | return; 454 | } 455 | 456 | if ( propertyValue.length === 2 ) { 457 | property = propertyValue[ 1 ]; // Set property to value after namespace 458 | if ( property in subProperty ) { // If has valid subproperty 459 | node = {}; 460 | node[ subProperty[ property ] ] = content; 461 | roots[ property ] = node; 462 | } else { 463 | node = content; 464 | } 465 | // If the property already exists, make the array of contents 466 | if ( meta[ property ] ) { 467 | if ( meta[ property ] instanceof Array ) { 468 | meta[ property ].push( node ); 469 | } else { 470 | meta[ property ] = [ meta[ property ], node ]; 471 | } 472 | } else { 473 | meta[ property ] = node; 474 | } 475 | } else if ( propertyValue.length === 3 ) { // Property part of a vertical 476 | // i.e. image, audio - as properties, not values, these should be lower case 477 | subProp = propertyValue[ 1 ].toLowerCase(); 478 | // i.e. height, width - as properties, not values, these should be lower case 479 | property = propertyValue[ 2 ].toLowerCase(); 480 | // If root for subproperty exists, and there isn't already a property 481 | // called that in there already i.e. height, add property and content. 
482 | if ( roots[ subProp ] && !roots[ subProp ][ property ] ) { 483 | // As properties, not values, these should be lower case 484 | roots[ subProp ][ property ] = content.toLowerCase(); 485 | } 486 | } else { 487 | return; // Discard values with length <2 and >3 as invalid 488 | } 489 | 490 | // Check for "type" property and add to namespace if so 491 | // If any of these type occur in order before the type attribute is defined, 492 | // they'll be skipped; spec requires they be placed below type definition. 493 | // For nested types (e.g. video.movie) the OG protocol uses the super type 494 | // (e.g. movie) as the new namespace. 495 | if ( property === 'type' ) { 496 | namespace.push( content.split( '.' )[ 0 ].toLowerCase() ); // Add the type to the acceptable namespace list - as a property, should be lower case 497 | } 498 | } ); 499 | if ( Object.keys( meta ).length === 0 ) { 500 | reject( reason ); 501 | } 502 | if ( meta.type ) { 503 | // Make type case insensitive as this may be used programmatically 504 | meta.type = meta.type.toLowerCase(); 505 | } 506 | resolve( meta ); 507 | } ); 508 | }; 509 | 510 | /** 511 | * Scrapes schema.org microdata given Cheerio loaded html object 512 | * 513 | * @param {Object} chtml Cheerio object with html loaded 514 | * @return {Object} promise of schema.org microdata object 515 | */ 516 | exports.parseSchemaOrgMicrodata = function ( chtml ) { 517 | return new Promise( ( resolve, reject ) => { 518 | if ( !chtml ) { 519 | reject( new Error( 'Undefined argument' ) ); 520 | } 521 | 522 | const meta = microdata.toJson( chtml.html() ); 523 | if ( !meta || !meta.items || !meta.items[ 0 ] ) { 524 | reject( new Error( 'No schema.org metadata found in page' ) ); 525 | } 526 | resolve( meta ); 527 | } ); 528 | }; 529 | 530 | /** 531 | * Scrapes twitter microdata given Cheerio html object 532 | * 533 | * @param {Object} chtml html Cheerio object 534 | * @return {Object} promise of twitter metadata object 535 | */ 536 | 
exports.parseTwitter = function ( chtml ) { 537 | return new Promise( ( resolve, reject ) => { 538 | if ( !chtml ) { 539 | reject( new Error( 'Undefined argument' ) ); 540 | } 541 | 542 | const meta = {}; 543 | const metaTags = chtml( 'meta' ); 544 | 545 | // These properties can either be strings or objects 546 | const dualStateSubProperties = { 547 | image: 'url', 548 | player: 'url', 549 | creator: '@username' 550 | }; 551 | 552 | metaTags.each( function () { 553 | const element = chtml( this ); 554 | let name = element.attr( 'name' ); 555 | 556 | let property; 557 | const content = element.attr( 'content' ); 558 | let node; 559 | 560 | // Exit if not a twitter tag or content is missing 561 | if ( !name || !content ) { 562 | return; 563 | } else { 564 | name = name.toLowerCase().split( ':' ); 565 | property = name[ 1 ]; 566 | } 567 | 568 | // Exit if tag not twitter metadata 569 | if ( name[ 0 ] !== 'twitter' ) { 570 | return; 571 | } 572 | 573 | // Handle nested properties 574 | if ( name.length > 2 ) { 575 | const subProperty = name[ 2 ]; 576 | 577 | // Upgrade the property to an object if it needs to be 578 | if ( property in dualStateSubProperties && 579 | !( meta[ property ] instanceof Object ) ) { 580 | node = {}; 581 | node[ dualStateSubProperties[ property ] ] = meta[ property ]; 582 | // Clear out the existing string as we just placed it into our new node 583 | meta[ property ] = []; 584 | } else { 585 | // Either create a new node or ammend the existing one 586 | node = meta[ property ] ? meta[ property ] : {}; 587 | } 588 | 589 | // Differentiate betweeen twice and thrice nested properties 590 | // Not the prettiest solution, but twitter metadata guidelines are fairly strict, 591 | // so it's not nessesary to anticipate strange data. 
592 | if ( name.length === 3 ) { 593 | node[ subProperty ] = content; 594 | } else if ( name.length === 4 ) { 595 | // Solve twitter:player:stream:content_type where stream needs to be an obj 596 | if ( subProperty.toLowerCase() === 'stream' ) { 597 | node[ subProperty ] = { url: node[ subProperty ] }; 598 | } else { 599 | // Either create a new subnode or amend the existing one 600 | node[ subProperty ] = node[ subProperty ] ? node[ subProperty ] : {}; 601 | } 602 | node[ subProperty ][ name[ 3 ] ] = content; 603 | } else { 604 | // Something is malformed, so exit 605 | return; 606 | } 607 | } else { 608 | node = content; 609 | } 610 | 611 | // Create array if property exists and is not a nested object 612 | if ( meta[ property ] && !( meta[ property ] instanceof Object ) ) { 613 | if ( meta[ property ] instanceof Array ) { 614 | meta[ property ].push( node ); 615 | } else { 616 | meta[ property ] = [ meta[ property ], node ]; 617 | } 618 | } else { 619 | meta[ property ] = node; 620 | } 621 | } ); 622 | 623 | if ( Object.keys( meta ).length === 0 ) { 624 | reject( new Error( 'No twitter metadata found on this page' ) ); 625 | } 626 | 627 | resolve( meta ); 628 | } ); 629 | }; 630 | 631 | /** 632 | * Scrapes prism metadata given Cheerio html object 633 | * 634 | * @param {Object} chtml html Cheerio object 635 | * @return {Object} promise of prism metadata object 636 | */ 637 | exports.parsePrism = function ( chtml ) { 638 | return new Promise( ( resolve, reject ) => { 639 | if ( !chtml ) { 640 | reject( new Error( 'Undefined argument' ) ); 641 | } 642 | 643 | const meta = {}; 644 | const metaTags = chtml( 'meta' ); 645 | 646 | const reason = new Error( 'No PRISM metadata found in page' ); 647 | 648 | if ( !metaTags || metaTags.length === 0 ) { 649 | reject( reason ); 650 | } 651 | 652 | metaTags.each( function () { 653 | const element = chtml( this ); 654 | let name = element.attr( 'name' ); 655 | const content = element.attr( 'content' ); 656 | 657 | if ( !name 
|| !content ) { 658 | return; 659 | } else { 660 | name = name.split( '.' ); 661 | } 662 | 663 | // If the name does not have the prism prefix, exit 664 | if ( name[ 0 ].toLowerCase() !== 'prism' ) { 665 | return; 666 | } 667 | 668 | // Set the name to the value after the prefix 669 | name = name[ 1 ]; 670 | // Set the first character to lower case 671 | name = name.charAt( 0 ).toLowerCase() + name.slice( 1 ); 672 | 673 | // If the name already exists, make an array of the contents 674 | if ( meta[ name ] ) { 675 | if ( meta[ name ] instanceof Array ) { 676 | meta[ name ].push( content ); 677 | } else { 678 | meta[ name ] = [ meta[ name ], content ]; 679 | } 680 | } else { 681 | meta[ name ] = content; 682 | } 683 | } ); 684 | 685 | if ( Object.keys( meta ).length === 0 ) { 686 | reject( reason ); 687 | } 688 | 689 | resolve( meta ); 690 | } ); 691 | }; 692 | 693 | /** 694 | * Global exportable list of scraping promises with string keys 695 | * 696 | * @type {Object} 697 | */ 698 | exports.metadataFunctions = { 699 | bePress: exports.parseBEPress, 700 | coins: exports.parseCOinS, 701 | dublinCore: exports.parseDublinCore, 702 | eprints: exports.parseEprints, 703 | general: exports.parseGeneral, 704 | highwirePress: exports.parseHighwirePress, 705 | jsonLd: exports.parseJsonLd, 706 | openGraph: exports.parseOpenGraph, 707 | schemaOrg: exports.parseSchemaOrgMicrodata, 708 | twitter: exports.parseTwitter, 709 | prism: exports.parsePrism 710 | }; 711 | --------------------------------------------------------------------------------