├── .gitignore ├── src ├── top-user-agents.d.ts ├── file-extension.d.ts ├── randomUserAgent.ts ├── link-type.ts ├── json-response.ts ├── type-checker.ts ├── get-error-message.ts ├── follow-short-url.ts ├── link-type.test.ts ├── type-checker.test.ts ├── types.ts ├── worker.ts ├── scraper.ts └── scraper-rules.ts ├── .prettierrc ├── .editorconfig ├── package.json ├── wrangler.toml ├── test └── index.html ├── README.md └── tsconfig.json /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | -------------------------------------------------------------------------------- /src/top-user-agents.d.ts: -------------------------------------------------------------------------------- 1 | declare module 'top-user-agents' 2 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "singleQuote": true, 3 | "trailingComma": "es5", 4 | "semi": false, 5 | "useTabs": false 6 | } 7 | -------------------------------------------------------------------------------- /src/file-extension.d.ts: -------------------------------------------------------------------------------- 1 | declare module 'file-extension' { 2 | const fileExtension: (value: string) => string 3 | export default fileExtension 4 | } 5 | -------------------------------------------------------------------------------- /src/randomUserAgent.ts: -------------------------------------------------------------------------------- 1 | import uniqueRandomArray from 'unique-random-array' 2 | import userAgents from 'top-user-agents' 3 | 4 | export const randomUserAgent = uniqueRandomArray(userAgents) 5 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | root = true 3 | 4 | [*] 5 | 
indent_style = space 6 | tab_width = 2 7 | end_of_line = lf 8 | charset = utf-8 9 | trim_trailing_whitespace = true 10 | insert_final_newline = true 11 | 12 | [*.yml] 13 | indent_style = space 14 | -------------------------------------------------------------------------------- /src/link-type.ts: -------------------------------------------------------------------------------- 1 | import { LinkType, typeChecker } from './type-checker' 2 | 3 | export const linkType = (link: string, isReaderable?: boolean): LinkType => { 4 | // if known file or site type, return early and use the 5 | // value for the type 6 | const urlIsKnownFileType = typeChecker(link) 7 | if (urlIsKnownFileType) { 8 | return urlIsKnownFileType 9 | } 10 | 11 | if (isReaderable) { 12 | return 'article' 13 | } 14 | 15 | return 'link' 16 | } 17 | -------------------------------------------------------------------------------- /src/json-response.ts: -------------------------------------------------------------------------------- 1 | import { getErrorMessage } from './get-error-message' 2 | 3 | const generateJSONResponse = (obj: any) => { 4 | return new Response(JSON.stringify(obj), { 5 | headers: { 6 | 'content-type': 'application/json;charset=UTF-8', 7 | 'Access-Control-Allow-Origin': '*', 8 | }, 9 | }) 10 | } 11 | 12 | const generateErrorJSONResponse = (error: unknown, url?: string) => { 13 | const errorMessage = getErrorMessage(error) 14 | return generateJSONResponse({ 15 | error: errorMessage, 16 | url, 17 | }) 18 | } 19 | 20 | export { generateJSONResponse, generateErrorJSONResponse } 21 | -------------------------------------------------------------------------------- /src/type-checker.ts: -------------------------------------------------------------------------------- 1 | import fileExtension from 'file-extension' 2 | import { types } from './types' 3 | 4 | export type LinkType = 5 | | 'link' 6 | | 'video' 7 | | 'audio' 8 | | 'recipe' 9 | | 'image' 10 | | 'document' 11 | | 'article' 12 | | 'game' 
13 | | 'book' 14 | | 'event' 15 | | 'product' 16 | | 'note' 17 | | 'file' 18 | export type TypeDictionary = Record 19 | export const typeChecker = (path: string): LinkType | undefined => { 20 | try { 21 | const url = new URL(path) 22 | const hostname = url.hostname.replace('www.', '') 23 | return types[hostname] 24 | } catch (err) { 25 | // swallow the error, no need to do anything 26 | } 27 | 28 | const extension = fileExtension(path) 29 | if (extension) { 30 | return types[extension] 31 | } 32 | 33 | return undefined 34 | } 35 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "metadata-scraper", 3 | "version": "0.0.0", 4 | "private": true, 5 | "author": { 6 | "name": "Zander Martineau", 7 | "url": "https://zander.wtf" 8 | }, 9 | "scripts": { 10 | "deploy": "wrangler deploy", 11 | "start": "wrangler dev", 12 | "test": "vitest", 13 | "serve:test": "serve test -p 1234" 14 | }, 15 | "devDependencies": { 16 | "@cloudflare/workers-types": "^4.20231025.0", 17 | "serve": "^14.2.1", 18 | "typescript": "^5.0.4", 19 | "vite": "^4.5.0", 20 | "vitest": "^0.34.6", 21 | "wrangler": "^3.15.0" 22 | }, 23 | "dependencies": { 24 | "file-extension": "^4.0.5", 25 | "html-entities": "^2.4.0", 26 | "isomorphic-dompurify": "^1.9.0", 27 | "tidy-url": "^1.10.1", 28 | "top-user-agents": "^1.0.66", 29 | "unique-random-array": "^3.0.0" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/get-error-message.ts: -------------------------------------------------------------------------------- 1 | type ErrorWithMessage = { 2 | message: string 3 | } 4 | 5 | const isErrorWithMessage = (error: unknown): error is ErrorWithMessage => { 6 | return ( 7 | typeof error === 'object' && 8 | error !== null && 9 | 'message' in error && 10 | typeof (error as Record).message === 'string' 11 | ) 12 | } 13 | 14 | 
/**
 * Normalises any thrown value into an object that is guaranteed to carry a
 * string `message`.
 *
 * - Values that already have a string `message` are returned untouched.
 * - Plain strings become the message verbatim (no JSON quoting).
 * - Anything else is JSON-serialised, falling back to `String(value)` when
 *   serialisation throws or produces nothing.
 *
 * @param maybeError - whatever was thrown or passed as an error
 * @returns an object with a string `message`
 */
const toErrorWithMessage = (maybeError: unknown): ErrorWithMessage => {
  if (isErrorWithMessage(maybeError)) return maybeError

  // A string is already a human-readable message; JSON.stringify would wrap
  // it in an extra pair of double quotes.
  if (typeof maybeError === 'string') return new Error(maybeError)

  try {
    // JSON.stringify returns `undefined` (not a string) for `undefined`,
    // functions and symbols, hence the `??` fallback.
    return new Error(JSON.stringify(maybeError) ?? String(maybeError))
  } catch {
    // fallback in case there's an error stringifying the maybeError
    // like with circular references for example.
    return new Error(String(maybeError))
  }
}

/**
 * Extracts a printable message from any thrown value and logs the raw value.
 *
 * @param error - whatever was caught
 * @returns the message string
 */
export const getErrorMessage = (error: unknown) => {
  console.log(`🚀 ~ getErrorMessage ~ error:`, error)
  return toErrorWithMessage(error).message
}

// --------------------------------------------------------------------------------
// /src/follow-short-url.ts:
// --------------------------------------------------------------------------------

/** Maximum number of redirect hops that will be followed. */
const MAX_REDIRECTS = 5

export interface FollowShortUrlResponse {
  /** Every URL visited, in order, starting with the one supplied. */
  urls: string[]
  /** The last URL in the chain. */
  unshortened_url: string
}

/**
 * This function follows a short URL and returns the final URL, use
 * https://t.co/wy9S5P0Cd2 as an example.
 *
 * A `HEAD` request with `redirect: 'manual'` is sent to the last entry of
 * `urls`. If the response carries a `Location` header, the target is appended
 * to `urls` and the function recurses.
 *
 * @param urls - chain of URLs visited so far; mutated in place
 * @param redirectCount - number of redirects already followed
 * @returns the full chain plus its last entry as `unshortened_url`
 * @throws Error when following another redirect would exceed `MAX_REDIRECTS`
 * @throws TypeError when a `Location` value cannot be resolved to a URL
 */
export const followShortUrl = async (
  urls: string[],
  redirectCount = 0
): Promise<FollowShortUrlResponse> => {
  const currentUrl = urls[urls.length - 1]
  const fetchResponse = await fetch(currentUrl, {
    headers: {
      // NOTE(review): the standard request header is spelled `Referer`;
      // confirm whether `referrer` is intentional before renaming it.
      referrer: 'http://www.google.com/',
      Accept:
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
    },
    method: 'HEAD',
    redirect: 'manual',
  })

  const location = fetchResponse.headers.get('location')
  if (location) {
    // Only fail when there really is one redirect too many, so a chain of
    // exactly MAX_REDIRECTS hops still resolves.
    if (redirectCount >= MAX_REDIRECTS) {
      throw new Error(`Maximum redirects exceeded.`)
    }
    // `Location` may be relative (e.g. `/path`); resolve it against the URL
    // that produced it so the next request gets an absolute URL.
    urls.push(new URL(location, currentUrl).toString())
    await followShortUrl(urls, redirectCount + 1)
  }

  return {
    urls,
    unshortened_url: urls[urls.length - 1],
  }
}

// --------------------------------------------------------------------------------
// /src/link-type.test.ts:
// --------------------------------------------------------------------------------
import { expect, describe, test } from 'vitest'
import { linkType } from './link-type'

// Unit tests for `linkType`: known sites win over the `isReaderable` hint,
// and unknown URLs are classified as 'link'.
describe('linkType', () => {
  test('video', () => {
    expect(linkType('http://youtube.com')).toBe('video')
    expect(linkType('https://www.youtube.com/watch?v=1Cz8_6aZ248')).toBe(
      'video'
    )
    expect(linkType('https://youtu.be/1Cz8_6aZ248')).toBe('video')
    expect(linkType('https://youtu.be/1Cz8_6aZ248', true)).toBe('video')
    expect(linkType('https://vimeo.com/746423508')).toBe('video')
    expect(linkType('https://vimeo.com/746423508', true)).toBe('video')
  })
  test('audio', () => {
    expect(
      linkType(
        'https://www.mixcloud.com/TheBlessedMadonna/we-still-believe-episode-090-it-couldnt-happen-here/'
      )
    ).toBe('audio')
    expect(
      linkType(
        'https://soundcloud.com/thisislegang/riviera-maya?si=083fb734554e4a1a8d16846be4eb5a2e&utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing'
      )
    ).toBe('audio')
  })
  test('article', () => {
    expect(
      linkType(
        'https://medium.com/@x_TomCooper_x/ukraine-war-4-september-2022-ukrainian-attacks-in-kherson-oblast-ed25239f3116'
      )
    ).toBe('article')
  })
  test('image', () => {
    expect(linkType('https://imgur.com/gallery/zjWRx8y')).toBe('image')
  })
  test('link', () => {
    expect(linkType('https://zander.wtf')).toBe('link')
    expect(linkType('https://lexica.art/')).toBe('link')
    expect(linkType(' https://yoyotricks.com/')).toBe('link')
  })
})

// --------------------------------------------------------------------------------
// /src/type-checker.test.ts:
// --------------------------------------------------------------------------------
import { expect, describe, test } from 'vitest'
import { typeChecker } from './type-checker'

// Unit tests for `typeChecker`: classification by file extension and by
// known hostname; anything unrecognised yields `undefined`.
describe('typeChecker', () => {
  test('video', () => {
    expect(typeChecker('zander.mp4')).toBe('video')
    expect(typeChecker('zander.mov')).toBe('video')
    expect(typeChecker('zander-something.foo.mov')).toBe('video')
    expect(typeChecker('https://www.youtube.com/watch?v=1Cz8_6aZ248')).toBe(
      'video'
    )
    expect(typeChecker('https://youtu.be/1Cz8_6aZ248')).toBe('video')
    expect(typeChecker('https://vimeo.com/746423508')).toBe('video')
  })
  test('image', () => {
    expect(typeChecker('zander.jpg')).toBe('image')
    expect(typeChecker('zander.gif')).toBe('image')
    expect(typeChecker('zander-something.foo.png')).toBe('image')
    expect(typeChecker('https://imgur.com/gallery/zjWRx8y')).toBe('image')
  })
  test('audio', () => {
    expect(typeChecker('zander.aac')).toBe('audio')
    expect(typeChecker('zander.mp3')).toBe('audio')
    // MIDI and Ogg files are audio formats
    expect(typeChecker('zander.mid')).toBe('audio')
    expect(typeChecker('zander.ogg')).toBe('audio')
    expect(
      typeChecker(
        'https://www.mixcloud.com/TheBlessedMadonna/we-still-believe-episode-090-it-couldnt-happen-here/'
      )
    ).toBe('audio')
    expect(
      typeChecker(
        'https://soundcloud.com/thisislegang/riviera-maya?si=083fb734554e4a1a8d16846be4eb5a2e&utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing'
      )
    ).toBe('audio')
  })
  test('document', () => {
    expect(typeChecker('zander.doc')).toBe('document')
    expect(typeChecker('zander.pdf')).toBe('document')
  })
  test('file', () => {
    expect(typeChecker('zander.otf')).toBe('file')
  })
  test('unknown', () => {
    expect(typeChecker('zander.html')).toBe(undefined)
    expect(typeChecker('zander.com')).toBe(undefined)
  })
})

// --------------------------------------------------------------------------------
// /src/types.ts:
// --------------------------------------------------------------------------------
import { TypeDictionary } from './type-checker'

/** Maps a lower-case file extension (without the dot) to its link type. */
export const fileTypes: TypeDictionary = {
  // images
  jpg: 'image',
  jpeg: 'image',
  png: 'image',
  apng: 'image',
  gif: 'image',
  webp: 'image',
  tiff: 'image',
  bmp: 'image',
  heif: 'image',
  svg: 'image',
  psd: 'image',
  dng: 'image',
  icns: 'image',
  avif: 'image',
  ico: 'image',
  // audio
  aac: 'audio',
  aiff: 'audio',
  flac: 'audio',
  m4a: 'audio',
  m4p: 'audio',
  mid: 'audio', // MIDI is an audio format
  mogg: 'audio',
  mp2: 'audio',
  mp3: 'audio',
  oga: 'audio',
  ogg: 'audio', // `.ogg` is registered for audio/ogg; Ogg video uses `.ogv`
  wav: 'audio',
  wma: 'audio',
  // video
  asf: 'video',
  avi: 'video',
  flv: 'video',
  gifv: 'video',
  m2v: 'video',
  m4v: 'video',
  mkv: 'video',
  mov: 'video',
  mp4: 'video',
  mpeg: 'video',
  mpg: 'video',
  ogv: 'video',
  qt: 'video',
  vob: 'video',
  webm: 'video',
  // documents
  doc: 'document',
  docx: 'document',
  md: 'document',
  mdx: 'document',
  odt: 'document',
  pdf: 'document',
  rtf: 'document',
  txt: 'document',
  xml: 'document',
  epub: 'document',
  pptx: 'document',
  // fonts, archives and other downloads
  woff: 'file',
  woff2: 'file',
  eot: 'file',
  zip: 'file',
  dmg: 'file',
  rar: 'file',
  ttf: 'file',
  otf: 'file',
}

/** Maps a hostname (without a leading `www.`) to its link type. */
export const siteTypes: TypeDictionary = {
  'youtube.com': 'video',
  'youtu.be': 'video',
  'vimeo.com': 'video',
  'microsoftstream.com': 'video',
  'tiktok.com': 'video',
  'dailymotion.com': 'video',
  'dai.ly': 'video',
  'imgur.com': 'image',
  'unsplash.com': 'image',
  'medium.com': 'article',
  'dev.to': 'article',
  'spotify.com': 'audio',
  'soundcloud.com': 'audio',
  'bandcamp.com': 'audio',
  'deezer.com': 'audio',
  'tidal.com': 'audio',
  'pandora.com': 'audio',
  'mixcloud.com': 'audio',
  'bbcgoodfood.com': 'recipe',
  'liquor.com': 'recipe',
  'meetup.com': 'event',
  'hopin.com': 'event',
  'amazon.com': 'product',
  'amazon.co.uk': 'product',
}

/**
 * Combined lookup table of file extensions and hostnames.
 *
 * Built on a prototype-less object: keys come from user-supplied URLs, and a
 * plain object would answer lookups such as `types['constructor']` with an
 * inherited `Object.prototype` member instead of `undefined`.
 */
export const types: TypeDictionary = Object.assign(
  Object.create(null),
  fileTypes,
  siteTypes
)

--------------------------------------------------------------------------------
/wrangler.toml:
-------------------------------------------------------------------------------- 1 | name = "zm-scraper" 2 | main = "src/worker.ts" 3 | compatibility_date = "2023-08-21" 4 | 5 | # Variable bindings. These are arbitrary, plaintext strings (similar to environment variables) 6 | # Note: Use secrets to store sensitive data. 7 | # Docs: https://developers.cloudflare.com/workers/platform/environment-variables 8 | # [vars] 9 | # MY_VARIABLE = "production_value" 10 | 11 | # Bind a KV Namespace. Use KV as persistent storage for small key-value pairs. 12 | # Docs: https://developers.cloudflare.com/workers/runtime-apis/kv 13 | # [[kv_namespaces]] 14 | # binding = "MY_KV_NAMESPACE" 15 | # id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 16 | 17 | # Bind an R2 Bucket. Use R2 to store arbitrarily large blobs of data, such as files. 18 | # Docs: https://developers.cloudflare.com/r2/api/workers/workers-api-usage/ 19 | # [[r2_buckets]] 20 | # binding = "MY_BUCKET" 21 | # bucket_name = "my-bucket" 22 | 23 | # Bind a Queue producer. Use this binding to schedule an arbitrary task that may be processed later by a Queue consumer. 24 | # Docs: https://developers.cloudflare.com/queues/get-started 25 | # [[queues.producers]] 26 | # binding = "MY_QUEUE" 27 | # queue = "my-queue" 28 | 29 | # Bind a Queue consumer. Queue Consumers can retrieve tasks scheduled by Producers to act on them. 30 | # Docs: https://developers.cloudflare.com/queues/get-started 31 | # [[queues.consumers]] 32 | # queue = "my-queue" 33 | 34 | # Bind another Worker service. Use this binding to call another Worker without network overhead. 35 | # Docs: https://developers.cloudflare.com/workers/platform/services 36 | # [[services]] 37 | # binding = "MY_SERVICE" 38 | # service = "/api/*" 39 | 40 | # Bind a Durable Object. Durable objects are a scale-to-zero compute primitive based on the actor model. 41 | # Durable Objects can live for as long as needed. Use these when you need a long-running "server", such as in realtime apps. 
42 | # Docs: https://developers.cloudflare.com/workers/runtime-apis/durable-objects 43 | # [[durable_objects.bindings]] 44 | # name = "MY_DURABLE_OBJECT" 45 | # class_name = "MyDurableObject" 46 | 47 | # Durable Object migrations. 48 | # Docs: https://developers.cloudflare.com/workers/learning/using-durable-objects#configure-durable-object-classes-with-migrations 49 | # [[migrations]] 50 | # tag = "v1" 51 | # new_classes = ["MyDurableObject"] 52 | -------------------------------------------------------------------------------- /src/worker.ts: -------------------------------------------------------------------------------- 1 | import { 2 | generateErrorJSONResponse, 3 | generateJSONResponse, 4 | } from './json-response' 5 | import { linkType } from './link-type' 6 | import Scraper from './scraper' 7 | import { TidyURL } from 'tidy-url' 8 | import { scraperRules } from './scraper-rules' 9 | 10 | addEventListener('fetch', (event: FetchEvent) => { 11 | event.respondWith(handleRequest(event.request)) 12 | }) 13 | 14 | type JSONValue = 15 | | string 16 | | number 17 | | boolean 18 | | null 19 | | JSONValue[] 20 | | { [key: string]: JSONValue } 21 | 22 | interface JSONObject { 23 | [k: string]: JSONValue 24 | } 25 | 26 | export type ScrapeResponse = string | string[] | JSONObject 27 | 28 | async function handleRequest(request: Request) { 29 | const searchParams = new URL(request.url).searchParams 30 | const scraper = new Scraper() 31 | let response: Record 32 | let url = searchParams.get('url') 33 | const cleanUrl = searchParams.get('cleanUrl') 34 | 35 | if (!url) { 36 | return generateErrorJSONResponse( 37 | 'Please provide a `url` query parameter, e.g. 
?url=https://example.com' 38 | ) 39 | } 40 | 41 | if (url && !url.match(/^[a-zA-Z]+:\/\//)) { 42 | url = 'https://' + url 43 | } 44 | 45 | try { 46 | const requestedUrl = new URL(url) 47 | 48 | // If the url is a reddit url, use old.reddit.com because it has much 49 | // more information when scraping 50 | if (url.includes('reddit.com')) { 51 | requestedUrl.hostname = 'old.reddit.com' 52 | url = requestedUrl.toString() 53 | } 54 | 55 | await scraper.fetch(url) 56 | } catch (error) { 57 | return generateErrorJSONResponse(error, url) 58 | } 59 | 60 | try { 61 | // Get metadata using the rules defined in `src/scraper-rules.ts` 62 | response = await scraper.getMetadata(scraperRules) 63 | 64 | const unshortenedUrl = scraper.response.url 65 | 66 | // Add cleaned url 67 | if (cleanUrl) { 68 | const cleanedUrl = TidyURL.clean(unshortenedUrl || url) 69 | response.cleaned_url = cleanedUrl.url 70 | } 71 | 72 | // Add unshortened url 73 | response.url = unshortenedUrl 74 | 75 | // Add url type 76 | response.urlType = linkType(url, false) 77 | 78 | // Parse JSON-LD 79 | if (response?.jsonld) { 80 | response.jsonld = JSON.parse(response.jsonld as string) 81 | } 82 | } catch (error) { 83 | return generateErrorJSONResponse(error, url) 84 | } 85 | 86 | return generateJSONResponse(response) 87 | } 88 | -------------------------------------------------------------------------------- /test/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Scraper test 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 34 | 35 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 81 | 87 | 93 | 94 | 95 | 96 |

h1[slot="title"]

97 |

.post-title

98 |

.entry-title

99 |

h1.title a

100 |

h1.title a

101 | 102 | 103 | -------------------------------------------------------------------------------- /src/scraper.ts: -------------------------------------------------------------------------------- 1 | import { decode } from 'html-entities' 2 | import { ScrapeResponse } from './worker' 3 | import { randomUserAgent } from './randomUserAgent' 4 | import { FollowShortUrlResponse, followShortUrl } from './follow-short-url' 5 | import { generateErrorJSONResponse } from './json-response' 6 | 7 | const cleanText = (string: string) => decode(string.trim(), { level: 'html5' }) 8 | 9 | type GetValueOption = { selector: string; attribute?: string } 10 | export type GetMetadataOptions = { 11 | name: string 12 | selectors: GetValueOption[] 13 | multiple: boolean 14 | } 15 | 16 | class Scraper { 17 | rewriter: HTMLRewriter 18 | url: string 19 | response: Response 20 | metadata: ScrapeResponse 21 | unshortenedInfo: FollowShortUrlResponse 22 | 23 | constructor() { 24 | this.rewriter = new HTMLRewriter() 25 | return this 26 | } 27 | 28 | async fetch(url: string): Promise { 29 | this.url = url 30 | this.unshortenedInfo 31 | try { 32 | this.unshortenedInfo = await followShortUrl([url]) 33 | } catch (error) { 34 | return generateErrorJSONResponse(error, url) 35 | } 36 | this.response = await fetch(this.unshortenedInfo.unshortened_url || url, { 37 | headers: { 38 | referrer: 'http://www.google.com/', 39 | 'User-Agent': randomUserAgent(), 40 | Accept: 41 | 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 42 | 'Accept-Language': 'en-US,en;q=0.5', 43 | }, 44 | }) 45 | 46 | const server = this.response.headers.get('server') 47 | 48 | const isThisWorkerErrorNotErrorWithinScrapedSite = 49 | [530, 503, 502, 403, 400].includes(this.response.status) && 50 | (server === 'cloudflare' || !server) /* Workers preview editor */ 51 | 52 | if (isThisWorkerErrorNotErrorWithinScrapedSite) { 53 | throw new Error(`Status ${this.response.status} requesting ${url}`) 54 
| } 55 | 56 | return this.response 57 | } 58 | 59 | async getMetadata( 60 | options: GetMetadataOptions[] 61 | ): Promise> { 62 | let matches: Record = {} 63 | let selectedSelectors: Record = {} 64 | 65 | for (const optionsItem of options) { 66 | const name = optionsItem.name 67 | const isMultiple = optionsItem.multiple 68 | 69 | if (!matches[name]) { 70 | if (isMultiple) { 71 | matches[name] = [] 72 | } else { 73 | matches[name] = '' 74 | } 75 | } 76 | 77 | selectorLoop: for await (const item of optionsItem.selectors) { 78 | const selector = item.selector 79 | let nextText = '' 80 | 81 | if (selectedSelectors[name]) { 82 | break selectorLoop 83 | } 84 | 85 | this.rewriter.on(selector, { 86 | element(element: Element) { 87 | if (item.attribute) { 88 | // Get attribute content value 89 | 90 | const attrText = element.getAttribute(item.attribute) 91 | if (attrText) { 92 | nextText = attrText 93 | 94 | // If multiple, push to array, otherwise set as string 95 | if (isMultiple) { 96 | Array.isArray(matches[name]) && 97 | (matches[name] as string[]).push(cleanText(nextText)) 98 | } else { 99 | if (matches[name] === '') { 100 | matches[name] = cleanText(nextText) 101 | selectedSelectors[name] = true 102 | } 103 | } 104 | } 105 | } else { 106 | nextText = '' 107 | } 108 | }, 109 | text(text) { 110 | // Get text content value 111 | if (!item.attribute) { 112 | nextText += text.text 113 | 114 | if (text.lastInTextNode) { 115 | // If multiple, push to array, otherwise set as string 116 | if (isMultiple) { 117 | Array.isArray(matches[name]) && 118 | (matches[name] as string[]).push(cleanText(nextText)) 119 | } else { 120 | if (matches[name] === '') { 121 | matches[name] = cleanText(nextText) 122 | selectedSelectors[name] = true 123 | } 124 | } 125 | nextText = '' 126 | } 127 | } 128 | }, 129 | }) 130 | } 131 | } 132 | const transformed = this.rewriter.transform(this.response) 133 | await transformed.arrayBuffer() 134 | 135 | return matches 136 | } 137 | } 138 | 139 | export 
default Scraper 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Page Metadata Scraper with Cloudflare Workers 2 | 3 | It uses Cloudflare's `HTMLRewriter` to scrape the page for metadata and returns it as JSON. 4 | 5 | ### Features 6 | 7 | - super fast metadata scraping using rules based on [Metascraper's rules](https://metascraper.js.org/#/?id=how-it-works) to pick out the metadata for a given property. These rules can easily be modified to suit your needs; they are defined in [`src/scraper-rules.ts`](https://github.com/mrmartineau/cloudflare-worker-scraper/blob/main/src/scraper-rules.ts) 8 | - resolves short urls (e.g. https://t.co/wy9S5P0Cd2) and unshortens them 9 | - option to clean url tracking params (e.g. `utm_*` and `fbclid`) using the `cleanUrl` query param (see below) 10 | - tries to infer the link type and returns that in the `urlType` property, which will be one of: `'link' | 'video' | 'audio' | 'recipe' | 'image' | 'document' | 'article' | 'game' | 'book' | 'event' | 'product' | 'note' | 'file'` 11 | - the worker randomises user-agent strings to avoid being blocked 12 | 13 | ### URL parameters: 14 | 15 | - `url` - the URL to scrape 16 | - `cleanUrl` - if `true`, the URL will be cleaned up to remove any tracking params 17 | 18 | Once deployed to **Cloudflare**, add a `url` query param for the URL you want to scrape, e.g. 
19 | 20 | ```sh 21 | # Basic example 22 | https://your-worker-name.cloudflare.com/?url=https://zander.wtf 23 | 24 | # Example with `cleanUrl=true` 25 | https://your-worker-name.cloudflare.com/?cleanUrl=true&url=https://poetsroad.bandcamp.com/?from=search&search_item_id=1141951669&search_item_type=b&search_match_part=%3F&search_page_id=1748155363&search_page_no=1&search_rank=1&search_sig=a9a9cbdfc454df7c2999f097dc8a216b 26 | ``` 27 | 28 | Response: 29 | 30 | From [my website](https://zander.wtf) (https://zander.wtf): 31 | 32 | ```json 33 | { 34 | "title": "Hi! I'm Zander, I make websites", 35 | "description": "Zander Martineau's personal site. I'm a contractor with 15+ years of experience helping companies get products to market, rewriting apps, creating POCs and more. I specialize in front-end but also work full-stack.", 36 | "author": "Zander Martineau", 37 | "image": "https://zander.wtf/opengraph.jpg", 38 | "feeds": [ 39 | "https://zander.wtf/blog.rss.xml", 40 | "https://zander.wtf/links.rss.xml" 41 | ], 42 | "date": "2023-09-07T00:00:00.000Z", 43 | "lang": "en", 44 | "logo": "", 45 | "video": "", 46 | "keywords": "", 47 | "jsonld": "", 48 | "cleaned_url": "https://zander.wtf", 49 | "url": "https://zander.wtf", 50 | "urlType": "link" 51 | } 52 | ``` 53 | 54 | From [a YouTube video](https://www.youtube.com/watch?v=ctEksNz7tqg): 55 | 56 | ```json 57 | { 58 | "title": "World's Best FPV Drone Shot? (extreme mountain biking) - YouTube", 59 | "description": "Dive into the hardest mountain bike race through the eyes of an intense FPV drone shot. 
The @dutchdronegods followed Kade Edwards down the Red Bull Hardline ...", 60 | "author": "", 61 | "image": "https://i.ytimg.com/vi/ctEksNz7tqg/maxresdefault.jpg", 62 | "feeds": [], 63 | "date": "2023-09-19T07:00:07-07:00", 64 | "lang": "en", 65 | "logo": "", 66 | "video": "https://www.youtube.com/embed/ctEksNz7tqg", 67 | "keywords": "red bull, redbull, action sports, extreme sports, sport videos, action, sport, red bull bike, bike, downhill, pov, mtb, pov mtb, urban downhill, urban, downhill mtb, urban downhill racing, racing, DRONE, drone, fpv drone, dutch drone gods, drone shot, hardline, red bull hardline, hardest mountain bike race, hardest race, hard line, hardest drone shot, downhill mountain bike, downhill race, hardest mountain bike, hardest mtb, kade edwards, kade, edwards, wales, welsh, one shot", 68 | "jsonld": { 69 | "@context": "http://schema.org", 70 | "@type": "BreadcrumbList", 71 | "itemListElement": [ 72 | { 73 | "@type": "ListItem", 74 | "position": 1, 75 | "item": { 76 | "@id": "http://www.youtube.com/@redbull", 77 | "name": "Red Bull" 78 | } 79 | } 80 | ] 81 | }, 82 | "url": "https://www.youtube.com/watch?v=ctEksNz7tqg", 83 | "urlType": "video" 84 | } 85 | ``` 86 | 87 | ## Testing 88 | 89 | 1. Run `npm start` 90 | 2. The test file in `test/index.html` can be used to test the worker locally. Run `npm run serve:test` to start a local server and then run a GET request against `http://127.0.0.1:8787/?url=http://localhost:1234` to view the output. 91 | 92 | ### Unit tests 93 | 94 | Run `npm run test` to run the small suite of unit tests. 95 | 96 | ## Improvements and suggestions 97 | 98 | Scraping metadata from a page is a tricky business, so if you have any suggestions or improvements, please [open an issue](https://github.com/mrmartineau/cloudflare-worker-scraper/issues/new) or [submit a PR](https://github.com/mrmartineau/cloudflare-worker-scraper/pulls?q=is:pr+is:open+sort:updated-desc); they are always welcome! 
99 | 100 | --- 101 | 102 | ## License 103 | 104 | [MIT](https://choosealicense.com/licenses/mit/) © [Zander Martineau](https://zander.wtf) 105 | 106 | > Made by Zander • [zander.wtf](https://zander.wtf) • [GitHub](https://github.com/mrmartineau/) • [Mastodon](https://main.elk.zone/toot.cafe/@zander) 107 | -------------------------------------------------------------------------------- /src/scraper-rules.ts: -------------------------------------------------------------------------------- 1 | import { GetMetadataOptions } from './scraper' 2 | 3 | /** 4 | * Scraper rules 5 | * For each rule, the first selector that matches will be used 6 | */ 7 | export const scraperRules: GetMetadataOptions[] = [ 8 | { 9 | name: 'title', 10 | multiple: false, 11 | selectors: [ 12 | { selector: 'meta[name="og:title"]', attribute: 'content' }, 13 | { selector: 'meta[property="og:title"]', attribute: 'content' }, 14 | { selector: 'meta[name=title]', attribute: 'content' }, 15 | { selector: 'meta[name="twitter:title"]', attribute: 'content' }, 16 | { selector: 'meta[property="twitter:title"]', attribute: 'content' }, 17 | { selector: 'title' }, 18 | { selector: 'h1[slot="title"]' }, 19 | { selector: '.post-title' }, 20 | { selector: '.entry-title' }, 21 | { selector: 'h1[class*="title" i] a' }, 22 | { selector: 'h1[class*="title" i]' }, 23 | ], 24 | }, 25 | { 26 | name: 'description', 27 | multiple: false, 28 | selectors: [ 29 | { selector: 'status-body' }, 30 | { selector: 'meta[name="og:description"]', attribute: 'content' }, 31 | { selector: 'meta[property="og:description"]', attribute: 'content' }, 32 | { 33 | selector: 'meta[name="twitter:description"]', 34 | attribute: 'content', 35 | }, 36 | { 37 | selector: 'meta[property="twitter:description"]', 38 | attribute: 'content', 39 | }, 40 | { selector: 'meta[itemprop="description"]', attribute: 'content' }, 41 | { selector: 'meta[name="description"]', attribute: 'content' }, 42 | ], 43 | }, 44 | { 45 | name: 'author', 46 | multiple: 
false, 47 | selectors: [ 48 | { selector: 'link[rel=author]', attribute: 'href' }, 49 | { selector: 'meta[name="author"]', attribute: 'content' }, 50 | { selector: 'meta[name="article:author"]', attribute: 'content' }, 51 | { selector: 'meta[property="article:author"]', attribute: 'content' }, 52 | { selector: '[itemprop*="author" i] [itemprop="name"]' }, 53 | ], 54 | }, 55 | { 56 | name: 'image', 57 | multiple: false, 58 | selectors: [ 59 | { 60 | selector: 'link[rel="image_src"]', 61 | attribute: 'href', 62 | }, 63 | { selector: 'meta[name="og:image"]', attribute: 'content' }, 64 | { selector: 'meta[property="og:image"]', attribute: 'content' }, 65 | { selector: 'meta[name="og:image:url"]', attribute: 'content' }, 66 | { selector: 'meta[property="og:image:url"]', attribute: 'content' }, 67 | { 68 | selector: 'meta[name="og:image:secure_url"]', 69 | attribute: 'content', 70 | }, 71 | { 72 | selector: 'meta[property="og:image:secure_url"]', 73 | attribute: 'content', 74 | }, 75 | { selector: 'meta[name="twitter:image:src"]', attribute: 'content' }, 76 | { 77 | selector: 'meta[property="twitter:image:src"]', 78 | attribute: 'content', 79 | }, 80 | { selector: 'meta[name="twitter:image"]', attribute: 'content' }, 81 | { selector: 'meta[property="twitter:image"]', attribute: 'content' }, 82 | { selector: 'meta[itemprop="image"]', attribute: 'content' }, 83 | ], 84 | }, 85 | { 86 | name: 'feeds', 87 | multiple: true, 88 | selectors: [ 89 | { 90 | selector: 'link[type="application/rss+xml"]', 91 | attribute: 'href', 92 | }, 93 | { selector: 'link[type="application/feed+json"]', attribute: 'href' }, 94 | { selector: 'link[type="application/atom+xml"]', attribute: 'href' }, 95 | ], 96 | }, 97 | { 98 | name: 'date', 99 | multiple: false, 100 | selectors: [ 101 | { selector: 'meta[name="date" i]', attribute: 'content' }, 102 | { selector: '[itemprop*="date" i]', attribute: 'content' }, 103 | { selector: 'time[itemprop*="date" i]', attribute: 'datetime' }, 104 | { selector: 
'time[datetime]', attribute: 'datetime' }, 105 | { selector: 'time' }, 106 | ], 107 | }, 108 | { 109 | name: 'lang', 110 | multiple: false, 111 | selectors: [ 112 | { selector: 'meta[name="og:locale"]', attribute: 'content' }, 113 | { selector: 'meta[property="og:locale"]', attribute: 'content' }, 114 | { selector: 'meta[itemprop="inLanguage"]', attribute: 'content' }, 115 | { selector: 'html', attribute: 'lang' }, 116 | ], 117 | }, 118 | { 119 | name: 'logo', 120 | multiple: false, 121 | selectors: [ 122 | { selector: 'meta[name="og:logo"]', attribute: 'content' }, 123 | { selector: 'meta[property="og:logo"]', attribute: 'content' }, 124 | { selector: 'meta[itemprop="logo"]', attribute: 'content' }, 125 | { selector: 'img[itemprop="logo"]', attribute: 'src' }, 126 | { 127 | selector: 'link[rel="apple-touch-icon-precomposed"]', 128 | attribute: 'href', 129 | }, 130 | ], 131 | }, 132 | { 133 | name: 'video', 134 | multiple: false, 135 | selectors: [ 136 | { 137 | selector: 'meta[name="og:video:secure_url"]', 138 | attribute: 'content', 139 | }, 140 | { 141 | selector: 'meta[property="og:video:secure_url"]', 142 | attribute: 'content', 143 | }, 144 | { selector: 'meta[name="og:video:url"]', attribute: 'content' }, 145 | { selector: 'meta[property="og:video:url"]', attribute: 'content' }, 146 | { selector: 'meta[name="og:video"]', attribute: 'content' }, 147 | { selector: 'meta[property="og:video"]', attribute: 'content' }, 148 | ], 149 | }, 150 | { 151 | name: 'keywords', 152 | multiple: false, 153 | selectors: [ 154 | { 155 | selector: 'meta[name="keywords"]', 156 | attribute: 'content', 157 | }, 158 | ], 159 | }, 160 | { 161 | name: 'jsonld', 162 | multiple: false, 163 | selectors: [ 164 | { 165 | selector: '#content #microformat script[type="application/ld+json"]', 166 | }, 167 | { 168 | selector: 169 | 'ytd-player-microformat-renderer script[type="application/ld+json"]', 170 | }, 171 | { 172 | selector: 'script[type="application/ld+json"]', 173 | }, 174 | ], 175 
| }, 176 | ] 177 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig.json to read more about this file */ 4 | 5 | /* Projects */ 6 | // "incremental": true, /* Enable incremental compilation */ 7 | // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ 8 | // "tsBuildInfoFile": "./", /* Specify the folder for .tsbuildinfo incremental compilation files. */ 9 | // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects */ 10 | // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ 11 | // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ 12 | 13 | /* Language and Environment */ 14 | "target": "es2021" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */, 15 | "lib": [ 16 | "es2021" 17 | ] /* Specify a set of bundled library declaration files that describe the target runtime environment. */, 18 | "jsx": "react" /* Specify what JSX code is generated. */, 19 | // "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */ 20 | // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */ 21 | // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h' */ 22 | // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. 
*/ 23 | // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using `jsx: react-jsx*`. */ 24 | // "reactNamespace": "", /* Specify the object invoked for `createElement`. This only applies when targeting `react` JSX emit. */ 25 | // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ 26 | // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ 27 | 28 | /* Modules */ 29 | "module": "es2022" /* Specify what module code is generated. */, 30 | // "rootDir": "./", /* Specify the root folder within your source files. */ 31 | "moduleResolution": "node" /* Specify how TypeScript looks up a file from a given module specifier. */, 32 | // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ 33 | // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ 34 | // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ 35 | // "typeRoots": [], /* Specify multiple folders that act like `./node_modules/@types`. */ 36 | "types": [ 37 | "@cloudflare/workers-types" 38 | ] /* Specify type package names to be included without being referenced in a source file. */, 39 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 40 | "resolveJsonModule": true /* Enable importing .json files */, 41 | // "noResolve": true, /* Disallow `import`s, `require`s or `<reference>`s from expanding the number of files TypeScript should add to a project. */ 42 | 43 | /* JavaScript Support */ 44 | "allowJs": true /* Allow JavaScript files to be a part of your program. Use the `checkJs` option to get errors from these files. */, 45 | "checkJs": false /* Enable error reporting in type-checked JavaScript files. */, 46 | // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from `node_modules`.
Only applicable with `allowJs`. */ 47 | 48 | /* Emit */ 49 | // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ 50 | // "declarationMap": true, /* Create sourcemaps for d.ts files. */ 51 | // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ 52 | // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ 53 | // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If `declaration` is true, also designates a file that bundles all .d.ts output. */ 54 | // "outDir": "./", /* Specify an output folder for all emitted files. */ 55 | // "removeComments": true, /* Disable emitting comments. */ 56 | "noEmit": true /* Disable emitting files from a compilation. */, 57 | // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ 58 | // "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types */ 59 | // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ 60 | // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ 61 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 62 | // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ 63 | // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ 64 | // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ 65 | // "newLine": "crlf", /* Set the newline character for emitting files. */ 66 | // "stripInternal": true, /* Disable emitting declarations that have `@internal` in their JSDoc comments. 
*/ 67 | // "noEmitHelpers": true, /* Disable generating custom helper functions like `__extends` in compiled output. */ 68 | // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ 69 | // "preserveConstEnums": true, /* Disable erasing `const enum` declarations in generated code. */ 70 | // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ 71 | // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */ 72 | 73 | /* Interop Constraints */ 74 | "isolatedModules": true /* Ensure that each file can be safely transpiled without relying on other imports. */, 75 | "allowSyntheticDefaultImports": true /* Allow 'import x from y' when a module doesn't have a default export. */, 76 | // "esModuleInterop": true /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables `allowSyntheticDefaultImports` for type compatibility. */, 77 | // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ 78 | "forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */, 79 | 80 | /* Type Checking */ 81 | "strict": true /* Enable all strict type-checking options. */, 82 | // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied `any` type. */ 83 | // "strictNullChecks": true, /* When type checking, take into account `null` and `undefined`. */ 84 | // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ 85 | // "strictBindCallApply": true, /* Check that the arguments for `bind`, `call`, and `apply` methods match the original function. */ 86 | "strictPropertyInitialization": false /* Check for class properties that are declared but not set in the constructor.
*/, 87 | // "noImplicitThis": true, /* Enable error reporting when `this` is given the type `any`. */ 88 | // "useUnknownInCatchVariables": true, /* Type catch clause variables as 'unknown' instead of 'any'. */ 89 | // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ 90 | // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ 91 | // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ 92 | // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ 93 | // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ 94 | // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ 95 | // "noUncheckedIndexedAccess": true, /* Include 'undefined' in index signature results */ 96 | // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ 97 | // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type */ 98 | // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ 99 | // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ 100 | 101 | /* Completeness */ 102 | // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ 103 | "skipLibCheck": true /* Skip type checking all .d.ts files. */ 104 | } 105 | } 106 | --------------------------------------------------------------------------------