├── package.cjs.json ├── .prettierrc ├── RELEASING.md ├── .mocharc.json ├── .gitignore ├── tsconfig.build-cjs.json ├── .github ├── renovate.json └── workflows │ ├── release-github.yml │ ├── release-npm.yml │ └── test-javascript.yml ├── .nycrc.json ├── tsconfig.build-esm.json ├── tsconfig.build.json ├── CONTRIBUTING.md ├── tsconfig.json ├── .eslintrc.json ├── LICENSE ├── package.json ├── CHANGELOG.md ├── test ├── acceptanceTest.ts └── microdataTest.ts ├── README.md └── src └── index.ts /package.cjs.json: -------------------------------------------------------------------------------- 1 | {"type": "commonjs"} 2 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "trailingComma": "es5", 3 | "semi": false, 4 | "singleQuote": true 5 | } 6 | -------------------------------------------------------------------------------- /RELEASING.md: -------------------------------------------------------------------------------- 1 | See [.github/RELEASING](https://github.com/cucumber/.github/blob/main/RELEASING.md). 
2 | -------------------------------------------------------------------------------- /.mocharc.json: -------------------------------------------------------------------------------- 1 | { 2 | "loader": "ts-node/esm", 3 | "extension": ["ts"], 4 | "recursive": true 5 | } 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | .idea/ 3 | .nyc_output/ 4 | coverage/ 5 | node_modules/ 6 | yarn.lock 7 | *.log 8 | *.iml 9 | -------------------------------------------------------------------------------- /tsconfig.build-cjs.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.build.json", 3 | "compilerOptions": { 4 | "outDir": "dist/cjs", 5 | "target": "ES5", 6 | "module": "CommonJS", 7 | }, 8 | } 9 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "github>cucumber/renovate-config", 5 | ":automergeMajor" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /.nycrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extension": [ 3 | ".ts" 4 | ], 5 | "exclude": [ 6 | "coverage", 7 | "dist", 8 | "src/**/*.d.ts" 9 | ], 10 | "reporter": [ 11 | "html", 12 | "text" 13 | ], 14 | "all": true 15 | } -------------------------------------------------------------------------------- /tsconfig.build-esm.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.build.json", 3 | "compilerOptions": { 4 | "lib": [ 5 | "ES2019" 6 | ], 7 | "target": "ES6", 8 | "module": "ES6", 9 | "outDir": "dist/esm" 10 | }, 11 | } 
12 | -------------------------------------------------------------------------------- /tsconfig.build.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "compilerOptions": { 4 | "composite": true, 5 | "declaration": true, 6 | "declarationMap": true, 7 | "sourceMap": true, 8 | "rootDir": ".", 9 | "noEmit": false 10 | }, 11 | "include": [ 12 | "src", 13 | "test" 14 | ], 15 | } -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to @cucumber/microdata 2 | 3 | ## Making changes 4 | 5 | * Use TDD 6 | * Update `CHANGELOG.md` when you make a significant change 7 | 8 | ## Release process 9 | 10 | Update links in `CHANGELOG.md` and commit. Then: 11 | 12 | npm version NEW_VERSION 13 | npm publish --access public 14 | git push && git push --tags 15 | -------------------------------------------------------------------------------- /.github/workflows/release-github.yml: -------------------------------------------------------------------------------- 1 | name: Release GitHub 2 | 3 | on: 4 | push: 5 | branches: [release/*] 6 | 7 | jobs: 8 | create-github-release: 9 | name: Create GitHub Release and Git tag 10 | runs-on: ubuntu-latest 11 | environment: Release 12 | permissions: 13 | contents: write 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: cucumber/action-create-github-release@v1.1.1 17 | with: 18 | github-token: ${{ secrets.GITHUB_TOKEN }} 19 | -------------------------------------------------------------------------------- /.github/workflows/release-npm.yml: -------------------------------------------------------------------------------- 1 | name: Release NPM 2 | 3 | on: 4 | push: 5 | branches: [release/*] 6 | 7 | jobs: 8 | publish-npm: 9 | name: Publish NPM module 10 | runs-on: ubuntu-latest 11 | environment: Release 12 | steps: 13 | - 
uses: actions/checkout@v4 14 | - uses: actions/setup-node@v4 15 | with: 16 | node-version: '22' 17 | cache: 'npm' 18 | cache-dependency-path: package-lock.json 19 | - run: npm install-test 20 | - uses: cucumber/action-publish-npm@v1.1.1 21 | with: 22 | npm-token: ${{ secrets.NPM_TOKEN }} 23 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": ".", 4 | "declaration": true, 5 | "sourceMap": true, 6 | "allowJs": false, 7 | "resolveJsonModule": true, 8 | "esModuleInterop": true, 9 | "noImplicitAny": true, 10 | "downlevelIteration": true, 11 | "skipLibCheck": true, 12 | "strictNullChecks": true, 13 | "experimentalDecorators": true, 14 | "module": "ESNext", 15 | "lib": [ 16 | "ES6", 17 | "dom" 18 | ], 19 | "target": "ES6", 20 | "moduleResolution": "node", 21 | "allowSyntheticDefaultImports": true, 22 | "noEmit": true 23 | }, 24 | "include": ["src", "test"] 25 | } 26 | -------------------------------------------------------------------------------- /.github/workflows/test-javascript.yml: -------------------------------------------------------------------------------- 1 | name: test-javascript 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - renovate/** 8 | pull_request: 9 | branches: 10 | - main 11 | workflow_call: 12 | 13 | jobs: 14 | test-javascript: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | os: 20 | - macos-latest 21 | - ubuntu-latest 22 | node-version: ['16.11', 17, 18, 20] 23 | 24 | steps: 25 | - name: set git core.autocrlf to 'input' 26 | run: git config --global core.autocrlf input 27 | - uses: actions/checkout@v4 28 | - name: with Node.js ${{ matrix.node-version }} on ${{ matrix.os }} 29 | uses: actions/setup-node@v4 30 | with: 31 | node-version: ${{ matrix.node-version }} 32 | cache: 'npm' 33 | cache-dependency-path: package-lock.json 34 | - run: 
npm install-test 35 | - run: npm run eslint 36 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": { 3 | "browser": true, 4 | "node": true 5 | }, 6 | "parser": "@typescript-eslint/parser", 7 | "parserOptions": { 8 | "project": "tsconfig.json", 9 | "sourceType": "module" 10 | }, 11 | "plugins": [ 12 | "node", 13 | "@typescript-eslint" 14 | ], 15 | "extends": [ 16 | "eslint:recommended", 17 | "plugin:@typescript-eslint/eslint-recommended", 18 | "plugin:@typescript-eslint/recommended", 19 | "plugin:prettier/recommended" 20 | ], 21 | "rules": { 22 | "node/no-extraneous-import": "error", 23 | "@typescript-eslint/ban-ts-ignore": "off", 24 | "@typescript-eslint/no-use-before-define": "off", 25 | "@typescript-eslint/explicit-function-return-type": "off", 26 | "@typescript-eslint/interface-name-prefix": "off", 27 | "@typescript-eslint/member-delimiter-style": "off", 28 | "@typescript-eslint/no-explicit-any": "off" 29 | }, 30 | "overrides": [ 31 | { 32 | "files": ["test/*"], 33 | "rules": { 34 | "@typescript-eslint/no-non-null-assertion": "off" 35 | } 36 | } 37 | ] 38 | } 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Cucumber Ltd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 
included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@cucumber/microdata", 3 | "version": "2.2.0", 4 | "description": "Extract WHATWG microdata from a DOM", 5 | "type": "module", 6 | "main": "dist/cjs/src/index.js", 7 | "types": "dist/cjs/src/index.d.ts", 8 | "files": [ 9 | "dist/cjs", 10 | "dist/esm" 11 | ], 12 | "module": "dist/esm/src/index.js", 13 | "jsnext:main": "dist/esm/src/index.js", 14 | "exports": { 15 | ".": { 16 | "import": "./dist/esm/src/index.js", 17 | "require": "./dist/cjs/src/index.js" 18 | } 19 | }, 20 | "repository": "git+https://github.com/cucumber/microdata.git", 21 | "author": "Aslak Hellesøy", 22 | "license": "MIT", 23 | "scripts": { 24 | "build:cjs": "tsc --build tsconfig.build-cjs.json && shx cp package.cjs.json dist/cjs/package.json", 25 | "build:esm": "tsc --build tsconfig.build-esm.json", 26 | "build": "npm run build:cjs && npm run build:esm", 27 | "test": "mocha && npm run test:cjs", 28 | "test:cjs": "npm run build:cjs && mocha --no-config dist/cjs/test", 29 | "prepublishOnly": "npm run build", 30 | "eslint-fix": "eslint --ext ts --max-warnings 0 --fix src test", 31 | "eslint": "eslint --ext ts --max-warnings 0 src test" 32 | }, 33 | "devDependencies": { 34 | "@types/jsdom": "21.1.7", 35 | "@types/mocha": "10.0.10", 36 | 
"@types/node": "22.15.30", 37 | "@typescript-eslint/eslint-plugin": "8.4.0", 38 | "@typescript-eslint/parser": "8.4.0", 39 | "eslint": "8.57.1", 40 | "eslint-config-prettier": "10.1.5", 41 | "eslint-plugin-node": "11.1.0", 42 | "eslint-plugin-prettier": "5.4.1", 43 | "jsdom": "26.1.0", 44 | "mocha": "11.6.0", 45 | "prettier": "3.5.3", 46 | "schema-dts": "1.1.5", 47 | "shx": "0.4.0", 48 | "ts-node": "10.9.2", 49 | "typescript": "5.8.3" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/) 6 | and this project adheres to [Semantic Versioning](http://semver.org/). 7 | 8 | ## [Unreleased] 9 | 10 | ## [2.2.0] - 2025-05-01 11 | ### Fixed 12 | - `extractValue` can return booleans too. 13 | 14 | ## [2.1.0] - 2022-07-27 15 | ### Added 16 | - Support for `track`, `data` and `meter` elements 17 | 18 | ### Fixed 19 | - Properties with `itemtype="http://schema.org/Boolean"` are parsed correctly. 20 | 21 | ## [2.0.1] - 2022-06-22 22 | ### Fixed 23 | - Allow `itemProp` to have an empty string ([#126](https://github.com/cucumber/microdata/issues/126), [#127](https://github.com/cucumber/microdata/pull/127)) 24 | 25 | ## [2.0.0] - 2022-04-28 26 | ### Added 27 | - Package as both ESM and CommonJS module ([#66](https://github.com/cucumber/microdata/pull/66)) 28 | 29 | ### Changed 30 | - The return type of `microdata` changed from `T` to `T | null` ([#108](https://github.com/cucumber/microdata/pull/108)) 31 | 32 | ## [1.5.0] 33 | ### Added 34 | - Added `toArray` utility function to convert children into an array of 0, 1 or more objects 35 | 36 | ### Changed 37 | - Primitive types using `itemtype="http://schema.org/Integer"` etc. with `http:` protocol are no longer recognised. 
38 | Starting with this version it only recognises `itemtype="https://schema.org/Integer"` etc. with `https:` protocol. 39 | See [this post](https://stackoverflow.com/questions/55242400/itemtype-with-http-or-better-https) for more details. 40 | 41 | ## [1.4.0] 42 | ### Added 43 | - Improve types to accept document as a scope [#8](https://github.com/cucumber/microdata/pull/8) 44 | 45 | ## [1.3.0] 46 | ### Added 47 | - Support all specific [DataType](https://schema.org/DataType) subtypes. 48 | 49 | ## [1.2.1] 50 | ### Fixed 51 | - Better error messages when attributes are missing 52 | 53 | ## [1.2.0] - 2020-04-02 54 | ### Added 55 | - An optional `(element: Element) => string | undefined` function can be passed as the last argument to 56 | provide a custom function to look up element values. This is useful for extracting values from non-standard 57 | elements, such as a [CodeMirror](https://codemirror.net/) editor. 58 | 59 | ## [1.1.0] - 2019-03-16 60 | ### Changed 61 | - `microdata` and `microdataAll` return generic types 62 | 63 | ## [1.0.1] - 2019-03-16 64 | ### Fixed 65 | - Trim text content on multiple lines 66 | 67 | ## [1.0.0] - 2019-03-16 68 | ### Added 69 | - First release 70 | 71 | [Unreleased]: https://github.com/cucumber/microdata/compare/v2.2.0...HEAD 72 | [2.2.0]: https://github.com/cucumber/microdata/compare/v2.1.0...v2.2.0 73 | [2.1.0]: https://github.com/cucumber/microdata/compare/v2.0.1...v2.1.0 74 | [2.0.1]: https://github.com/cucumber/microdata/compare/v2.0.0...v2.0.1 75 | [2.0.0]: https://github.com/cucumber/microdata/compare/v1.5.0...v2.0.0 76 | [1.5.0]: https://github.com/cucumber/microdata/compare/v1.4.0...v1.5.0 77 | [1.4.0]: https://github.com/cucumber/microdata/compare/v1.3.0...v1.4.0 78 | [1.3.0]: https://github.com/cucumber/microdata/compare/v1.2.1...v1.3.0 79 | [1.2.1]: https://github.com/cucumber/microdata/compare/v1.2.0...v1.2.1 80 | [1.2.0]: https://github.com/cucumber/microdata/compare/v1.1.0...v1.2.0 81 | [1.1.0]: 
https://github.com/cucumber/microdata/compare/v1.0.1...v1.1.0 82 | [1.0.1]: https://github.com/cucumber/microdata/compare/v1.0.0...v1.0.1 83 | [1.0.0]: https://github.com/cucumber/microdata/releases/tag/v1.0.0 84 | -------------------------------------------------------------------------------- /test/acceptanceTest.ts: -------------------------------------------------------------------------------- 1 | import { JSDOM } from 'jsdom' 2 | import { microdata } from '../src/index.js' 3 | import { LocalBusiness, Person } from 'schema-dts' 4 | import assert from 'assert' 5 | 6 | describe('microdata', () => { 7 | context( 8 | 'acceptance tests from https://github.com/schemaorg/schemaorg/blob/master/data/examples.txt', 9 | () => { 10 | it('makes a https://schema.org/Person', () => { 11 | const html = ` 12 |
13 | Jane Doe 14 | Photo of Jane Doe 15 | 16 | Professor 17 |
18 | 19 | 20341 Whitworth Institute 20 | 405 N. Whitworth 21 | 22 | Seattle, 23 | WA 24 | 98052 25 |
26 | (425) 123-4567 27 | 28 | jane-doe@xyz.edu 29 | 30 | Jane's home page: 31 | 32 | 33 | Graduate students: 34 | 35 | Alice Jones 36 | 37 | Bob Smith 38 |
39 | ` 40 | const expected: Person = { 41 | '@type': 'Person', 42 | address: { 43 | '@type': 'PostalAddress', 44 | addressLocality: 'Seattle', 45 | addressRegion: 'WA', 46 | postalCode: '98052', 47 | streetAddress: '20341 Whitworth Institute 405 N. Whitworth', 48 | }, 49 | colleague: [ 50 | 'https://www.xyz.edu/students/alicejones.html', 51 | 'https://www.xyz.edu/students/bobsmith.html', 52 | ], 53 | email: 'mailto:jane-doe@xyz.edu', 54 | image: 'janedoe.jpg', 55 | jobTitle: 'Professor', 56 | name: 'Jane Doe', 57 | telephone: '(425) 123-4567', 58 | url: 'https://www.janedoe.com', 59 | } 60 | assertMicrodata(html, expected) 61 | }) 62 | 63 | it('makes a https://schema.org/LocalBusiness', () => { 64 | const html = ` 65 |
66 |

Beachwalk Beachwear & Giftware

67 | A superb collection of fine gifts and clothing 68 | to accent your stay in Mexico Beach. 69 |
70 | 3102 Highway 98 71 | Mexico Beach, 72 | FL 73 |
74 | Phone: 850-648-4200 75 |
76 | ` 77 | const expected: LocalBusiness = { 78 | '@type': 'LocalBusiness', 79 | address: { 80 | '@type': 'PostalAddress', 81 | addressLocality: 'Mexico Beach', 82 | addressRegion: 'FL', 83 | streetAddress: '3102 Highway 98', 84 | }, 85 | description: 86 | 'A superb collection of fine gifts and clothing to accent your stay in Mexico Beach.', 87 | name: 'Beachwalk Beachwear & Giftware', 88 | telephone: '850-648-4200', 89 | } 90 | assertMicrodata(html, expected) 91 | }) 92 | } 93 | ) 94 | }) 95 | 96 | function assertMicrodata(html: string, expected: any) { 97 | const doc = new JSDOM(html).window.document.documentElement 98 | const itemscope = doc.querySelector(`[itemscope]`) 99 | assert(itemscope) 100 | const itemtype = itemscope.getAttribute('itemtype') 101 | assert(itemtype) 102 | assert.deepStrictEqual(microdata(itemtype, doc), expected) 103 | } 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![test-javascript](https://github.com/cucumber/microdata/actions/workflows/test-javascript.yml/badge.svg) 2 | 3 | # Microdata 4 | 5 | This zero-dependency library converts a DOM to [Microdata](https://html.spec.whatwg.org/multipage/microdata.html). 6 | 7 | It can be used to extract "interesting" pieces of information from a DOM, such as [Person](https://schema.org/Person), 8 | [Order](https://schema.org/Order), [MusicEvent](https://schema.org/MusicEvent) etc. 9 | 10 | All you need to do is to add the appropriate `itemscope`, `itemtype` and `itemprop` attributes to your HTML, and this library 11 | will be able to extract the data. 12 | 13 | The library supports [all schema.org types](https://schema.org/docs/full.html), and also allows custom Microdata types. 14 | 15 | The returned Microdata uses the [JSON-LD](https://json-ld.org/) format. 
16 | 17 | ## Installation 18 | 19 | npm install @cucumber/microdata 20 | 21 | ## Example 22 | 23 | Given a sample DOM: 24 | 25 | ```html 26 | 27 |
28 | Jane Doe 29 |
30 | ``` 31 | 32 | We can extract the `Person` on that page to a [JSON-LD](https://json-ld.org/) compliant JavaScript object: 33 | 34 | ```javascript 35 | const { microdata } = require('@cucumber/microdata') 36 | 37 | const person = microdata('https://schema.org/Person', document) 38 | console.log(person.name) // "Jane Doe" 39 | ``` 40 | 41 | If you are using TypeScript you can cast the result to a type from [schema-dts](https://github.com/google/schema-dts): 42 | 43 | ```typescript 44 | import { microdata } from '@cucumber/microdata' 45 | import { Person } from 'schema-dts' 46 | 47 | const person = microdata('https://schema.org/Person', document) as Person 48 | if (typeof person === 'string') throw new Error('Expected a Person object') 49 | console.log(person.name) // "Jane Doe" 50 | ``` 51 | 52 | ## Custom value extraction 53 | 54 | In some cases you may want finer grained control over how to extract values from the DOM. For example, 55 | you may have a [CodeMirror](https://codemirror.net/) editor sitting inside of an element: 56 | 57 | ```html 58 |
59 | 60 |
61 | ``` 62 | 63 | You can pass a custom `extractValue` function as the last argument to `microdata` or `microdataAll`: 64 | 65 | ```typescript 66 | const data = microdata( 67 | someSchemaType, 68 | someElement, 69 | element => element.querySelector('.CodeMirror')?.CodeMirror?.getValue() 70 | ) 71 | ``` 72 | 73 | This function may return `undefined`. In that case, the default lookup mechanisms will be used. 74 | 75 | ## Custom types 76 | 77 | We recommend using the official types defined by schema.org if you can. Sometimes however, you may want to 78 | define your own types if the official types are insufficient. 79 | 80 | You can see an example of how this is done in [test/microdataTest.ts](test/microdataTest.ts). 81 | 82 | ## Usage in testing 83 | 84 | This library can be used to write assertions against web pages. 85 | It works with any UI library as it only inspects the DOM. The only requirement 86 | is that the HTML has Microdata in it. 87 | 88 | Here is an example from a hypothetical TODO list application: 89 | 90 | ```typescript 91 | import { microdata } from '@cucumber/microdata' 92 | 93 | const itemList = microdata('https://schema.org/ItemList', element) as ItemList 94 | const todos = itemList.itemListElement as Text[] 95 | assert.deepStrictEqual(todos, ['Get milk', 'Feed dog']) 96 | ``` 97 | 98 | ## Arrays 99 | 100 | Some microdata `itemScope`s allow `itemProp` elements that can be specified more than once. 101 | For example, if an `ItemList` has two or more `itemListElement` children, then the `itemListElement` 102 | field in the JSON-LD object will be an `Array`. 103 | 104 | However, if there is only one child, it will have the value of that child rather than an array with one element. 105 | 106 | And if there are none, the value of that field will be `undefined`. 107 | 108 | The `toArray` function of this library will convert a value to an array with 0, 1 or more elements so you 109 | don't need to worry about this. 
110 | 111 | ```typescript 112 | import { microdata, toArray } from '@cucumber/microdata' 113 | 114 | const itemList = microdata('https://schema.org/ItemList', element) as ItemList 115 | const todos = toArray(itemList.itemListElement) as Text[] 116 | assert.deepStrictEqual(todos, ['Get milk', 'Feed dog']) 117 | ``` 118 | 119 | ## Credit 120 | 121 | This library is based on the excellent, but abandoned [microdata](https://github.com/nathan7/microdata). It's been ported to TypeScript, and some bug fixes have 122 | been applied to make it compliant with JSON-LD. 123 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | export function microdataAll( 2 | itemtype: string, 3 | scope: Scope, 4 | extractValue: ExtractValue = () => undefined 5 | ): ReadonlyArray { 6 | const itemScopes = scope.querySelectorAll( 7 | `[itemscope][itemtype="${itemtype}"]` 8 | ) 9 | return Array.from(itemScopes).map((scope) => extract(scope, extractValue)) 10 | } 11 | 12 | export function microdata( 13 | itemtype: string, 14 | scope: Scope, 15 | extractValue: ExtractValue = () => undefined 16 | ): T | null { 17 | const itemScope = scope.querySelector(`[itemscope][itemtype="${itemtype}"]`) 18 | return itemScope === null ? null : extract(itemScope, extractValue) 19 | } 20 | 21 | /** 22 | * Converts an object to an array 23 | * @param o an object, array, null or undefined 24 | * @return an array of 0, 1 or more elements 25 | */ 26 | export function toArray( 27 | o: T | readonly T[] | undefined | null 28 | ): readonly T[] { 29 | if (o === null || o === undefined) return [] 30 | return Array.isArray(o) ? 
o : [o as T] 31 | } 32 | 33 | function extract(scope: Element, extractValue: ExtractValue): T { 34 | const itemType = scope.getAttribute('itemtype') 35 | 36 | if (itemType === null) { 37 | throw new Error(`Missing itemtype on element ${scope.outerHTML}`) 38 | } 39 | 40 | const microdata = { '@type': new URL(itemType).pathname.slice(1) } 41 | const children = Array.from(scope.children) 42 | let child: Element | undefined = undefined 43 | 44 | while ((child = children.shift())) { 45 | const key = child.getAttribute('itemprop') 46 | if (key) { 47 | add(microdata, key, value(child, extractValue)) 48 | } 49 | if (child.getAttribute('itemscope') === null) 50 | prepend(children, child.children) 51 | } 52 | 53 | return microdata as unknown as T 54 | } 55 | 56 | function add(microdata: any, key: string, value: any) { 57 | if (value === null) return 58 | 59 | const prop = microdata[key] 60 | if (prop == null) microdata[key] = value 61 | else if (Array.isArray(prop)) prop.push(value) 62 | else microdata[key] = [prop, value] 63 | } 64 | 65 | function value(element: Element, extractValue: ExtractValue) { 66 | if (element.getAttribute('itemscope') !== null) { 67 | return extract(element, extractValue) 68 | } 69 | const attributeName = attributeNameByTagName[element.tagName.toLowerCase()] 70 | const extractedValue = extractValue(element) 71 | const rawValue = 72 | extractedValue === undefined 73 | ? attributeName 74 | ? 
element.getAttribute(attributeName) 75 | : element.textContent 76 | : extractedValue 77 | 78 | if (rawValue === null) { 79 | throw new Error(`Unable to extract value`) 80 | } 81 | 82 | if (typeof rawValue === 'boolean') { 83 | return rawValue 84 | } 85 | 86 | const stringValue = rawValue 87 | .trim() 88 | .split(/\n/) 89 | .map((s) => s.trim()) 90 | .join(' ') 91 | const itemType = element.getAttribute('itemtype') 92 | switch (itemType) { 93 | case null: 94 | return stringValue 95 | case 'https://schema.org/Text': 96 | case 'https://schema.org/DateTime': 97 | case 'https://schema.org/Date': 98 | case 'https://schema.org/Time': 99 | case 'https://schema.org/CssSelectorType': 100 | case 'https://schema.org/PronounceableText': 101 | case 'https://schema.org/URL': 102 | case 'https://schema.org/XPathType': 103 | return stringValue 104 | case 'https://schema.org/Number': 105 | case 'https://schema.org/Float': 106 | case 'https://schema.org/Integer': 107 | return Number(stringValue) 108 | case 'https://schema.org/Boolean': 109 | return stringValue === 'true' 110 | case 'https://schema.org/False': 111 | return false 112 | case 'https://schema.org/True': 113 | return true 114 | default: 115 | throw new Error( 116 | `Unable to extract value. 
Change itemtype to a primitive type or add itemscope on element ${element.outerHTML}` 117 | ) 118 | } 119 | } 120 | 121 | function prepend(target: Element[], addition: HTMLCollection) { 122 | ;[].unshift.apply(target, [].slice.call(addition)) 123 | } 124 | 125 | // https://html.spec.whatwg.org/multipage/microdata.html#values 126 | const attributeNameByTagName: { [key: string]: string } = { 127 | meta: 'content', 128 | audio: 'src', 129 | embed: 'src', 130 | iframe: 'src', 131 | img: 'src', 132 | source: 'src', 133 | track: 'src', 134 | video: 'src', 135 | a: 'href', 136 | area: 'href', 137 | link: 'href', 138 | object: 'data', 139 | data: 'value', 140 | meter: 'value', 141 | time: 'datetime', 142 | } 143 | 144 | type ExtractValue = (element: Element) => string | boolean | undefined | null 145 | type Scope = Document | Element 146 | -------------------------------------------------------------------------------- /test/microdataTest.ts: -------------------------------------------------------------------------------- 1 | import { JSDOM } from 'jsdom' 2 | import { microdata, toArray } from '../src/index.js' 3 | import { 4 | BreadcrumbList, 5 | CreativeWork, 6 | Event, 7 | ListItem, 8 | Person, 9 | Text, 10 | } from 'schema-dts' 11 | import assert from 'assert' 12 | 13 | type Tree = { 14 | '@type': 'Tree' 15 | value: Text 16 | children?: TreeList 17 | } 18 | 19 | type TreeList = { 20 | '@type': 'TreeList' 21 | treeListElement: Tree | Tree[] 22 | } 23 | 24 | describe('microdata', () => { 25 | it('converts primitive types', () => { 26 | const dom = new JSDOM(` 27 |
28 |
29 | Maximum attendees: 35. 30 | Ticket: pay at the entrance. 31 |
32 |
33 | `) 34 | const event: Event = microdata( 35 | 'https://schema.org/Event', 36 | dom.window.document.documentElement 37 | )! 38 | 39 | assert.strictEqual(event.maximumAttendeeCapacity, 35) 40 | assert.strictEqual(event.isAccessibleForFree, false) 41 | }) 42 | 43 | it('converts objects with dates', () => { 44 | const dom = new JSDOM(` 45 |
46 |
47 | Maximum attendees: 2020-11-20T11:15:52.927Z. 48 |
49 |
50 | `) 51 | const creativeWork: CreativeWork = microdata( 52 | 'https://schema.org/CreativeWork', 53 | dom.window.document.documentElement 54 | )! 55 | 56 | assert.strictEqual(creativeWork.dateCreated, '2020-11-20T11:15:52.927Z') 57 | }) 58 | 59 | it('creates a Tree using custom types', () => { 60 | const dom = new JSDOM(` 61 |
    62 |
  1. 63 | Europe 64 |
      65 |
    1. 66 | France 67 |
        68 |
      1. 69 | Toulouse 70 |
      2. 71 |
      3. 72 | Paris 73 |
      4. 74 |
      75 |
    2. 76 |
    3. 77 | Spain 78 |
    4. 79 |
    80 |
  2. 81 |
82 | `) 83 | 84 | const expected: Tree = { 85 | '@type': 'Tree', 86 | children: { 87 | '@type': 'TreeList', 88 | treeListElement: [ 89 | { 90 | '@type': 'Tree', 91 | children: { 92 | '@type': 'TreeList', 93 | treeListElement: [ 94 | { 95 | '@type': 'Tree', 96 | value: 'Toulouse', 97 | }, 98 | { 99 | '@type': 'Tree', 100 | value: 'Paris', 101 | }, 102 | ], 103 | }, 104 | value: 'France', 105 | }, 106 | { 107 | '@type': 'Tree', 108 | value: 'Spain', 109 | }, 110 | ], 111 | }, 112 | value: 'Europe', 113 | } 114 | 115 | assert.deepStrictEqual( 116 | microdata( 117 | 'https://schema.cucumber.io/Tree', 118 | dom.window.document.documentElement 119 | ), 120 | expected 121 | ) 122 | }) 123 | 124 | it('can use a custom function to look up value from element', () => { 125 | const dom = new JSDOM(` 126 |
127 |
128 | Ignore this 129 | Aslak 130 |
131 | Hellesøy 132 |
133 | `) 134 | const person = microdata( 135 | 'https://schema.org/Person', 136 | dom.window.document.documentElement, 137 | (element) => element.querySelector('.use-this')?.textContent 138 | )! 139 | 140 | if (typeof person === 'string') throw new Error('Expected a Person object') 141 | 142 | assert.strictEqual(person.givenName, 'Aslak') 143 | assert.strictEqual(person.familyName, 'Hellesøy') 144 | }) 145 | 146 | it('can extract properties with empty strings', () => { 147 | const dom = new JSDOM(` 148 |
149 |
150 | Hellesøy 151 |
152 | `) 153 | const person = microdata( 154 | 'https://schema.org/Person', 155 | dom.window.document.documentElement 156 | )! 157 | 158 | if (typeof person === 'string') throw new Error('Expected a Person object') 159 | 160 | assert.strictEqual(person.givenName, '') 161 | assert.strictEqual(person.familyName, 'Hellesøy') 162 | }) 163 | 164 | it('does not fallback to the default look up when the custom one returns an empty string', () => { 165 | const dom = new JSDOM(` 166 |
167 | 168 |
169 | `) 170 | const person = microdata( 171 | 'https://schema.org/Person', 172 | dom.window.document.documentElement, 173 | (element) => { 174 | if (element.getAttribute('itemprop') === 'givenName') 175 | return element.getAttribute('value') 176 | return undefined 177 | } 178 | )! 179 | 180 | if (typeof person === 'string') throw new Error('Expected a Person object') 181 | 182 | assert.strictEqual(person.givenName, '') 183 | }) 184 | 185 | it('can extract boolean value with extractValue', () => { 186 | const dom = new JSDOM(` 187 |
188 |
189 | A quick explanation about the book 190 |
191 | Y 192 |
193 | `) 194 | 195 | const book = microdata( 196 | 'https://schema.org/Book', 197 | dom.window.document.documentElement, 198 | (element) => { 199 | if (element.getAttribute('itemtype') === 'https://schema.org/Boolean') { 200 | return element.textContent === 'Y' 201 | } 202 | } 203 | ) 204 | 205 | assert.deepStrictEqual(book, { 206 | '@type': 'Book', 207 | abstract: 'A quick explanation about the book', 208 | abridged: true, 209 | }) 210 | }) 211 | 212 | describe('toArray', () => { 213 | it('converts two children to array with two elements', () => { 214 | const dom = new JSDOM(` 215 |
    216 |
  1. 218 | 219 | Dresses 220 | 221 |
  2. 222 |
  3. 224 | 225 | Real Dresses 226 | 227 |
  4. 228 |
229 | `) 230 | const breadcrumbList = microdata( 231 | 'https://schema.org/BreadcrumbList', 232 | dom.window.document.documentElement 233 | )! 234 | 235 | const dressNames = toArray(breadcrumbList.itemListElement).map( 236 | (e: ListItem) => e.name 237 | ) 238 | assert.deepStrictEqual(dressNames, ['Dresses', 'Real Dresses']) 239 | }) 240 | 241 | it('converts one child to array with one element', () => { 242 | const dom = new JSDOM(` 243 |
    244 |
  1. 246 | 247 | Dresses 248 | 249 |
  2. 250 |
251 | `) 252 | const breadcrumbList = microdata( 253 | 'https://schema.org/BreadcrumbList', 254 | dom.window.document.documentElement 255 | )! 256 | 257 | const dressNames = toArray(breadcrumbList.itemListElement).map( 258 | (e: ListItem) => e.name 259 | ) 260 | assert.deepStrictEqual(dressNames, ['Dresses']) 261 | }) 262 | 263 | it('converts no children to array with zero elements', () => { 264 | const dom = new JSDOM(` 265 |
    266 |
267 | `) 268 | const breadcrumbList = microdata( 269 | 'https://schema.org/BreadcrumbList', 270 | dom.window.document.documentElement 271 | )! 272 | 273 | const dresses = toArray(breadcrumbList.itemListElement) as ListItem[] 274 | const dressNames = dresses.map((e: ListItem) => e.name) 275 | assert.deepStrictEqual(dressNames, []) 276 | }) 277 | }) 278 | }) 279 | --------------------------------------------------------------------------------