76 | `
77 | const expected: LocalBusiness = {
78 | '@type': 'LocalBusiness',
79 | address: {
80 | '@type': 'PostalAddress',
81 | addressLocality: 'Mexico Beach',
82 | addressRegion: 'FL',
83 | streetAddress: '3102 Highway 98',
84 | },
85 | description:
86 | 'A superb collection of fine gifts and clothing to accent your stay in Mexico Beach.',
87 | name: 'Beachwalk Beachwear & Giftware',
88 | telephone: '850-648-4200',
89 | }
90 | assertMicrodata(html, expected)
91 | })
92 | }
93 | )
94 | })
95 |
/**
 * Asserts that the microdata found in `html` equals `expected`.
 *
 * The HTML is parsed with JSDOM and the first element carrying an `itemscope`
 * attribute is located. Its `itemtype` is then passed to `microdata`, and the
 * extracted item is compared to `expected` with `assert.deepStrictEqual`.
 *
 * @param html markup containing an `itemscope` element that has an `itemtype`
 * @param expected the object the extracted item must deep-equal
 * @throws AssertionError when no `itemscope` element is found, when it has no
 * (or an empty) `itemtype`, or when the extracted item differs from `expected`
 */
function assertMicrodata(html: string, expected: unknown) {
  const root = new JSDOM(html).window.document.documentElement
  const itemscope = root.querySelector('[itemscope]')
  assert(
    itemscope,
    'Expected the HTML to contain an element with an itemscope attribute'
  )
  const itemtype = itemscope.getAttribute('itemtype')
  assert(
    itemtype,
    'Expected the first itemscope element to have a non-empty itemtype attribute'
  )
  assert.deepStrictEqual(microdata(itemtype, root), expected)
}
104 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # Microdata
4 |
5 | This zero-dependency library converts a DOM to [Microdata](https://html.spec.whatwg.org/multipage/microdata.html).
6 |
7 | It can be used to extract "interesting" pieces of information from a DOM, such as [Person](https://schema.org/Person),
8 | [Order](https://schema.org/Order), [MusicEvent](https://schema.org/MusicEvent) etc.
9 |
10 | All you need to do is to add the appropriate `itemscope`, `itemtype` and `itemprop` attributes to your HTML, and this library
11 | will be able to extract the data.
12 |
13 | The library supports [all schema.org types](https://schema.org/docs/full.html), and also allows custom Microdata types.
14 |
15 | The returned Microdata uses the [JSON-LD](https://json-ld.org/) format.
16 |
17 | ## Installation
18 |
19 | npm install @cucumber/microdata
20 |
21 | ## Example
22 |
23 | Given a sample DOM:
24 |
25 | ```html
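26 | <!DOCTYPE html>
27 | <div itemscope itemtype="https://schema.org/Person">
28 |   <span itemprop="name">Jane Doe</span>
29 | </div>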
30 | ```
31 |
32 | We can extract the `Person` on that page to a [JSON-LD](https://json-ld.org/) compliant JavaScript object:
33 |
34 | ```javascript
35 | const { microdata } = require('@cucumber/microdata')
36 |
37 | const person = microdata('https://schema.org/Person', document)
38 | console.log(person.name) // "Jane Doe"
39 | ```
40 |
41 | If you are using TypeScript you can cast the result to a type from [schema-dts](https://github.com/google/schema-dts):
42 |
43 | ```typescript
44 | import { microdata } from '@cucumber/microdata'
45 | import { Person } from 'schema-dts'
46 |
47 | const person = microdata('https://schema.org/Person', document) as Person
48 | if (typeof person === 'string') throw new Error('Expected a Person object')
49 | console.log(person.name) // "Jane Doe"
50 | ```
51 |
52 | ## Custom value extraction
53 |
54 | In some cases you may want finer grained control over how to extract values from the DOM. For example,
55 | you may have a [CodeMirror](https://codemirror.net/) editor sitting inside of an element:
56 |
57 | ```html
58 |
59 |
60 |
61 | ```
62 |
63 | You can pass a custom `extractValue` function as the last argument to `microdata` or `microdataAll`:
64 |
65 | ```typescript
66 | const data = microdata(
67 | someSchemaType,
68 | someElement,
69 | element => element.querySelector('.CodeMirror')?.CodeMirror?.getValue()
70 | )
71 | ```
72 |
73 | This function may return `undefined`. In that case, the default lookup mechanisms will be used.
74 |
75 | ## Custom types
76 |
77 | We recommend using the official types defined by schema.org if you can. Sometimes however, you may want to
78 | define your own types if the official types are insufficient.
79 |
80 | You can see an example of how this is done in [test/microdataTest.ts](test/microdataTest.ts).
81 |
82 | ## Usage in testing
83 |
84 | This library can be used to write assertions against web pages.
85 | It works with any UI library as it only inspects the DOM. The only requirement
86 | is that the HTML has Microdata in it.
87 |
88 | Here is an example from a hypothetical TODO list application:
89 |
90 | ```typescript
91 | import { microdata } from '@cucumber/microdata'
92 |
93 | const itemList = microdata('https://schema.org/ItemList', element) as ItemList
94 | const todos = itemList.itemListElement as Text[]
95 | assert.deepStrictEqual(todos, ['Get milk', 'Feed dog'])
96 | ```
97 |
98 | ## Arrays
99 |
100 | Some microdata `itemscope`s allow `itemprop` elements that can be specified more than once.
101 | For example, if an `ItemList` has two or more `itemListElement` children, then the `itemListElement`
102 | field in the JSON-LD object will be an `Array`.
103 |
104 | However, if there is only one child, it will have the value of that child rather than an array with one element.
105 |
106 | And if there are none, the value of that field will be undefined.
107 |
108 | The `toArray` function of this library will convert a value to an array with 0, 1 or more elements so you
109 | don't need to worry about this.
110 |
111 | ```typescript
112 | import { microdata, toArray } from '@cucumber/microdata'
113 |
114 | const itemList = microdata('https://schema.org/ItemList', element) as ItemList
115 | const todos = toArray(itemList.itemListElement) as Text[]
116 | assert.deepStrictEqual(todos, ['Get milk', 'Feed dog'])
117 | ```
118 |
119 | ## Credit
120 |
121 | This library is based on the excellent, but abandoned [microdata](https://github.com/nathan7/microdata). It's been ported to TypeScript, and some bug fixes have
122 | been applied to make it compliant with JSON-LD.
123 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
/**
 * Extracts every microdata item of the given type found under `scope`.
 *
 * @typeParam T the shape the caller expects the items to have; it is not
 * verified at runtime
 * @param itemtype the exact `itemtype` attribute value to look for, for
 * example `https://schema.org/Person`
 * @param scope the document or element that is searched with `querySelectorAll`
 * @param extractValue optional hook for custom value extraction; the default
 * always returns `undefined`
 * @return one extracted object per matching `itemscope` element, in the order
 * returned by `querySelectorAll`
 */
export function microdataAll<T>(
  itemtype: string,
  scope: Scope,
  extractValue: ExtractValue = () => undefined
): ReadonlyArray<T> {
  // Escape backslashes and double quotes so that any itemtype string still
  // produces a valid double-quoted attribute selector.
  const escapedItemtype = itemtype.replace(/["\\]/g, '\\$&')
  const itemScopes = scope.querySelectorAll(
    `[itemscope][itemtype="${escapedItemtype}"]`
  )
  return Array.from(itemScopes).map(
    (itemScope): T => extract(itemScope, extractValue)
  )
}
11 |
/**
 * Extracts the first microdata item of the given type found under `scope`.
 *
 * @typeParam T the shape the caller expects the item to have; it is not
 * verified at runtime
 * @param itemtype the exact `itemtype` attribute value to look for, for
 * example `https://schema.org/Person`
 * @param scope the document or element that is searched with `querySelector`
 * @param extractValue optional hook for custom value extraction; the default
 * always returns `undefined`
 * @return the extracted object, or `null` when no matching `itemscope`
 * element exists
 */
export function microdata<T>(
  itemtype: string,
  scope: Scope,
  extractValue: ExtractValue = () => undefined
): T | null {
  // Escape backslashes and double quotes so that any itemtype string still
  // produces a valid double-quoted attribute selector.
  const escapedItemtype = itemtype.replace(/["\\]/g, '\\$&')
  const itemScope = scope.querySelector(
    `[itemscope][itemtype="${escapedItemtype}"]`
  )
  if (itemScope === null) return null
  return extract(itemScope, extractValue)
}
20 |
/**
 * Normalises a value that may be absent, a single element or an array into
 * an array.
 *
 * @typeParam T the element type
 * @param o an object, array, null or undefined
 * @return an empty array for `null`/`undefined`, `o` itself when it already
 * is an array, otherwise a new one-element array containing `o`
 */
export function toArray<T>(
  o: T | readonly T[] | undefined | null
): readonly T[] {
  if (o === null || o === undefined) return []
  return Array.isArray(o) ? o : [o as T]
}
32 |
/**
 * Builds the object for a single `itemscope` element.
 *
 * The `@type` field is the path of the `itemtype` URL without its leading
 * slash (`https://schema.org/Person` becomes `Person`). Descendants are then
 * visited in document order: every element with a non-empty `itemprop`
 * contributes a property, and the walk only descends into elements that do
 * not carry `itemscope` themselves (nested items are handled by `value`).
 *
 * @typeParam T the shape the caller expects; it is not verified at runtime
 * @param scope the element carrying `itemscope` and `itemtype`
 * @param extractValue hook for custom value extraction, passed on to `value`
 * @return the extracted object
 * @throws Error when `scope` has no `itemtype` attribute
 * @throws TypeError when the `itemtype` is rejected by the `URL` constructor
 */
function extract<T>(scope: Element, extractValue: ExtractValue): T {
  const itemType = scope.getAttribute('itemtype')
  if (itemType === null) {
    throw new Error(`Missing itemtype on element ${scope.outerHTML}`)
  }

  const item = { '@type': new URL(itemType).pathname.slice(1) }

  // Work queue of elements still to visit; the front is the next element in
  // document order.
  const pending = Array.from(scope.children)
  let child = pending.shift()
  while (child !== undefined) {
    const key = child.getAttribute('itemprop')
    if (key) {
      add(item, key, value(child, extractValue))
    }
    // Putting the grandchildren at the front keeps the traversal depth-first.
    if (child.getAttribute('itemscope') === null) {
      prepend(pending, child.children)
    }
    child = pending.shift()
  }

  return item as unknown as T
}
55 |
/**
 * Records `value` under `key` on the object being built.
 *
 * The first value for a key is stored as is, a second value turns the
 * property into a two-element array, and further values are appended to that
 * array. A `null` value is ignored.
 *
 * Only own properties count as previous values, so a key that collides with
 * an inherited `Object.prototype` member (for example `constructor`) is
 * stored like any other key.
 *
 * @param microdata the object to modify in place
 * @param key the property name, taken from an `itemprop` attribute
 * @param value the extracted value
 */
function add(microdata: Record<string, unknown>, key: string, value: unknown) {
  if (value === null) return

  const existing = Object.prototype.hasOwnProperty.call(microdata, key)
    ? microdata[key]
    : undefined

  if (existing == null) microdata[key] = value
  else if (Array.isArray(existing)) existing.push(value)
  else microdata[key] = [existing, value]
}
64 |
// itemtype values whose extracted text is returned as a string.
const stringItemTypes: readonly string[] = [
  'https://schema.org/Text',
  'https://schema.org/DateTime',
  'https://schema.org/Date',
  'https://schema.org/Time',
  'https://schema.org/CssSelectorType',
  'https://schema.org/PronounceableText',
  'https://schema.org/URL',
  'https://schema.org/XPathType',
]

// itemtype values whose extracted text is converted with `Number`.
const numberItemTypes: readonly string[] = [
  'https://schema.org/Number',
  'https://schema.org/Float',
  'https://schema.org/Integer',
]

/**
 * Computes the value of a single `itemprop` element.
 *
 * An element that carries `itemscope` is extracted recursively as a nested
 * object. Otherwise the raw value comes from `extractValue`; when that
 * returns `undefined`, it is read from the attribute mapped to the element's
 * tag name, or from `textContent` when the tag has no mapping. A boolean raw
 * value is returned unchanged. A string raw value is trimmed, each of its
 * lines is trimmed, and the lines are joined with single spaces. It is then
 * converted according to the element's `itemtype`.
 *
 * @param element the element carrying `itemprop`
 * @param extractValue hook for custom value extraction
 * @return a nested object, string, number or boolean
 * @throws Error when the raw value is `null`, or when the `itemtype` is not
 * one of the primitive types handled here
 */
function value(element: Element, extractValue: ExtractValue) {
  if (element.getAttribute('itemscope') !== null) {
    return extract(element, extractValue)
  }

  const attributeName = attributeNameByTagName[element.tagName.toLowerCase()]
  let rawValue = extractValue(element)
  if (rawValue === undefined) {
    // The hook declined: use the default lookup.
    if (attributeName) rawValue = element.getAttribute(attributeName)
    else rawValue = element.textContent
  }

  if (rawValue === null) {
    throw new Error(`Unable to extract value`)
  }
  if (typeof rawValue === 'boolean') {
    return rawValue
  }

  const lines = rawValue.trim().split(/\n/)
  const stringValue = lines.map((line) => line.trim()).join(' ')

  const itemType = element.getAttribute('itemtype')
  if (itemType === null || stringItemTypes.includes(itemType)) {
    return stringValue
  }
  if (numberItemTypes.includes(itemType)) {
    return Number(stringValue)
  }
  if (itemType === 'https://schema.org/Boolean') {
    return stringValue === 'true'
  }
  if (itemType === 'https://schema.org/False') {
    return false
  }
  if (itemType === 'https://schema.org/True') {
    return true
  }
  throw new Error(
    `Unable to extract value. Change itemtype to a primitive type or add itemscope on element ${element.outerHTML}`
  )
}
120 |
/**
 * Inserts the elements of `addition` at the front of `target`, keeping their
 * relative order. `target` is modified in place.
 *
 * @param target the array to extend
 * @param addition the elements to put in front
 */
function prepend(target: Element[], addition: HTMLCollection) {
  target.unshift(...Array.from(addition))
}
124 |
// https://html.spec.whatwg.org/multipage/microdata.html#values
// Maps a lower-case tag name to the attribute its value is read from.
// The table is filled from groups keyed by attribute name.
const attributeNameByTagName: { [key: string]: string } = {}

const tagNamesByAttributeName: ReadonlyArray<[string, readonly string[]]> = [
  ['content', ['meta']],
  ['src', ['audio', 'embed', 'iframe', 'img', 'source', 'track', 'video']],
  ['href', ['a', 'area', 'link']],
  ['data', ['object']],
  ['value', ['data', 'meter']],
  ['datetime', ['time']],
]

for (const [attributeName, tagNames] of tagNamesByAttributeName) {
  for (const tagName of tagNames) {
    attributeNameByTagName[tagName] = attributeName
  }
}
143 |
/** The root of a microdata search: a whole document or a single element. */
type Scope = Element | Document

/**
 * Hook for custom value extraction. It receives the element whose value is
 * being computed and returns a string or boolean to use as the raw value,
 * `undefined` to fall back to the default lookup, or `null`, which is
 * reported as an "Unable to extract value" error.
 */
type ExtractValue = (element: Element) => string | boolean | null | undefined
146 |
--------------------------------------------------------------------------------
/test/microdataTest.ts:
--------------------------------------------------------------------------------
1 | import { JSDOM } from 'jsdom'
2 | import { microdata, toArray } from '../src/index.js'
3 | import {
4 | BreadcrumbList,
5 | CreativeWork,
6 | Event,
7 | ListItem,
8 | Person,
9 | Text,
10 | } from 'schema-dts'
11 | import assert from 'assert'
12 |
/**
 * Custom (non-schema.org) list type declared for these tests: it holds one
 * `Tree` or an array of them.
 */
type TreeList = {
  '@type': 'TreeList'
  treeListElement: Tree | Array<Tree>
}

/**
 * Custom (non-schema.org) node type declared for these tests: a value with
 * an optional list of child trees.
 */
type Tree = {
  '@type': 'Tree'
  value: Text
  children?: TreeList
}
23 |
24 | describe('microdata', () => {
25 | it('converts primitive types', () => {
26 | const dom = new JSDOM(`
27 |
28 |
29 | Maximum attendees: 35.
30 | Ticket: pay at the entrance.
31 |
133 | `)
134 | const person = microdata(
135 | 'https://schema.org/Person',
136 | dom.window.document.documentElement,
137 | (element) => element.querySelector('.use-this')?.textContent
138 | )!
139 |
140 | if (typeof person === 'string') throw new Error('Expected a Person object')
141 |
142 | assert.strictEqual(person.givenName, 'Aslak')
143 | assert.strictEqual(person.familyName, 'Hellesøy')
144 | })
145 |
146 | it('can extract properties with empty strings', () => {
147 | const dom = new JSDOM(`
148 |
149 |
150 | Hellesøy
151 |
152 | `)
153 | const person = microdata(
154 | 'https://schema.org/Person',
155 | dom.window.document.documentElement
156 | )!
157 |
158 | if (typeof person === 'string') throw new Error('Expected a Person object')
159 |
160 | assert.strictEqual(person.givenName, '')
161 | assert.strictEqual(person.familyName, 'Hellesøy')
162 | })
163 |
164 | it('does not fallback to the default look up when the custom one returns an empty string', () => {
165 | const dom = new JSDOM(`
166 |
167 |
168 |
169 | `)
170 | const person = microdata(
171 | 'https://schema.org/Person',
172 | dom.window.document.documentElement,
173 | (element) => {
174 | if (element.getAttribute('itemprop') === 'givenName')
175 | return element.getAttribute('value')
176 | return undefined
177 | }
178 | )!
179 |
180 | if (typeof person === 'string') throw new Error('Expected a Person object')
181 |
182 | assert.strictEqual(person.givenName, '')
183 | })
184 |
185 | it('can extract boolean value with extractValue', () => {
186 | const dom = new JSDOM(`
187 |
188 |
189 | A quick explanation about the book
190 |
191 | Y
192 |
193 | `)
194 |
195 | const book = microdata(
196 | 'https://schema.org/Book',
197 | dom.window.document.documentElement,
198 | (element) => {
199 | if (element.getAttribute('itemtype') === 'https://schema.org/Boolean') {
200 | return element.textContent === 'Y'
201 | }
202 | }
203 | )
204 |
205 | assert.deepStrictEqual(book, {
206 | '@type': 'Book',
207 | abstract: 'A quick explanation about the book',
208 | abridged: true,
209 | })
210 | })
211 |
212 | describe('toArray', () => {
213 | it('converts two children to array with two elements', () => {
214 | const dom = new JSDOM(`
215 |
216 |