├── .github ├── raw │ └── 510.jpg └── workflows │ └── build.yaml ├── .gitignore ├── .node-version ├── LICENSE ├── README.md ├── apps └── web │ ├── .eslintrc.json │ ├── .gitignore │ ├── README.md │ ├── next.config.js │ ├── package.json │ ├── pages │ ├── _app.tsx │ ├── _document.tsx │ ├── api │ │ └── scrape.ts │ └── index.tsx │ ├── public │ ├── favicon.ico │ ├── github.svg │ ├── next.svg │ ├── thirteen.svg │ └── vercel.svg │ ├── styles │ ├── Home.module.css │ └── globals.css │ └── tsconfig.json ├── package-lock.json ├── package.json ├── packages └── core │ ├── jest.config.js │ ├── package.json │ ├── src │ ├── image.ts │ ├── index.ts │ ├── parse.ts │ ├── scrape.ts │ └── types.ts │ ├── test │ ├── dummy.html │ └── scraper.test.ts │ └── tsconfig.json └── turbo.json /.github/raw/510.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jasonaibrahim/scraper/cfde57b9f6683faffad3dd39740fabd9ee963c84/.github/raw/510.jpg -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: scraper-js CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - '*' 7 | pull_request: 8 | branches: [ master ] 9 | 10 | jobs: 11 | test: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | strategy: 16 | matrix: 17 | node-version: [ 16.x, 18.x ] 18 | 19 | steps: 20 | - uses: actions/checkout@v3 21 | - name: Build for node version ${{ matrix.node-version }} 22 | uses: actions/setup-node@v3 23 | with: 24 | node-version: ${{ matrix.node-version }} 25 | - run: npm ci 26 | - run: npm test 27 | 28 | build: 29 | 30 | runs-on: ubuntu-latest 31 | 32 | strategy: 33 | matrix: 34 | node-version: [ 16.x, 18.x ] 35 | 36 | steps: 37 | - uses: actions/checkout@v3 38 | - name: Build for node version ${{ matrix.node-version }} 39 | uses: actions/setup-node@v3 40 | with: 41 | node-version: ${{ matrix.node-version }} 42 | - run: 
npm ci 43 | - run: npm run build 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .idea 3 | .turbo 4 | packages/core/dist 5 | -------------------------------------------------------------------------------- /.node-version: -------------------------------------------------------------------------------- 1 | 18.8.0 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Jason Ibrahim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scraper-js 2 | 3 | 4 | 5 | ### Demo 6 | 7 | ## overview 8 | 9 | need thumbnails? scraper is a lightweight node.js package designed to return high quality and highly relevant images from a source url fast. 10 | 11 | ## installation 12 | 13 | ```bash 14 | npm install scraper-js 15 | ``` 16 | 17 | ## use 18 | 19 | ```javascript 20 | import scraper from "scraper-js"; 21 | 22 | const url = 23 | "https://barackobama.medium.com/my-2022-end-of-year-lists-ba76b6278801"; 24 | const result = await scraper.scrape(url); 25 | ``` 26 | 27 | Calling `scrape` returns a `ScrapeResult` object: 28 | 29 | ```typescript 30 | export interface ScrapeResult { 31 | html: string; 32 | images: RankedImage[]; 33 | linkedData: Thing | null; 34 | openGraph: 35 | | scrapeOpenGraphData.successResultObject 36 | | scrapeOpenGraphData.errorResultObject; 37 | featureImage?: RankedImage | null; 38 | } 39 | ``` 40 | 41 | The recommended version of node.js is `18`. 42 | 43 | ## example node server 44 | 45 | ```javascript 46 | // 47 | // scraperapp 48 | // 49 | // thumbnail scraping http server. usage is as follows: 50 | // get the address to scrape from the parameters passed to the url 51 | // e.g. localhost:1337/scrape?url=http://www.reddit.com; address to scrape => http://www.reddit.com 52 | // response will be an array of image urls => [http://image1.jpg, http://image2.jpg, ...]
53 | // 54 | // authored by Jason Ibrahim 55 | // copyright (c) 2015 Jason Ibrahim 56 | // 57 | 58 | // initialize dependencies 59 | const http = require("http"), 60 | url = require("url"), 61 | scraper = require("scraper-js"); 62 | 63 | // set the port 64 | const port = process.env.port || 80; 65 | 66 | // create the server 67 | const server = http 68 | .createServer(function (req, res) { 69 | const scrapereg = new RegExp(/^(\/scrape)/), 70 | query = url.parse(req.url, true).query, 71 | address = query.url; 72 | // only listen for api calls to /scrape 73 | if (!req.url.match(scrapereg)) { 74 | res.writeHead(404); 75 | return res.end("Did you mean /scrape?"); 76 | } 77 | // scraper returns a promise that will resolve an array of `RankedImage` 78 | scraper.scrape(address).then( 79 | function (result) { 80 | res.writeHead(200, { "Access-Control-Allow-Origin": "*" }); 81 | res.end(JSON.stringify(result.images)); 82 | }, 83 | function (error) { 84 | res.writeHead(404); 85 | res.end(JSON.stringify([String(error)])); 86 | } 87 | ); 88 | // if we don't get at least one thumbnail within 8 seconds, quit 89 | setTimeout(function () { 90 | if (!res.headersSent) { 91 | res.writeHead(408); 92 | res.end(JSON.stringify(["timeout."])); 93 | } 94 | }, 8000); 95 | }) 96 | .listen(port); 97 | 98 | console.log("Scraping on", port); 99 | ``` 100 | 101 | ## contributions 102 | 103 | improvements, features, bug fixes and any other type of contribution are welcome to this project. please feel free to extend what has been started here and if it solves a particular problem, please submit a pull request so we can share it with others.
104 | -------------------------------------------------------------------------------- /apps/web/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "next/core-web-vitals" 3 | } 4 | -------------------------------------------------------------------------------- /apps/web/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # next.js 12 | /.next/ 13 | /out/ 14 | 15 | # production 16 | /build 17 | 18 | # misc 19 | .DS_Store 20 | *.pem 21 | 22 | # debug 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | .pnpm-debug.log* 27 | 28 | # local env files 29 | .env*.local 30 | 31 | # vercel 32 | .vercel 33 | 34 | # typescript 35 | *.tsbuildinfo 36 | next-env.d.ts 37 | -------------------------------------------------------------------------------- /apps/web/README.md: -------------------------------------------------------------------------------- 1 | This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app). 2 | 3 | ## Getting Started 4 | 5 | First, run the development server: 6 | 7 | ```bash 8 | npm run dev 9 | # or 10 | yarn dev 11 | ``` 12 | 13 | Open [http://localhost:3000](http://localhost:3000) with your browser to see the result. 14 | 15 | You can start editing the page by modifying `pages/index.tsx`. The page auto-updates as you edit the file. 16 | 17 | [API routes](https://nextjs.org/docs/api-routes/introduction) can be accessed on [http://localhost:3000/api/scrape](http://localhost:3000/api/scrape). This endpoint can be edited in `pages/api/scrape.ts`. 18 | 19 | The `pages/api` directory is mapped to `/api/*`.
Files in this directory are treated as [API routes](https://nextjs.org/docs/api-routes/introduction) instead of React pages. 20 | 21 | This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font. 22 | 23 | ## Learn More 24 | 25 | To learn more about Next.js, take a look at the following resources: 26 | 27 | - [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API. 28 | - [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial. 29 | 30 | You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js/) - your feedback and contributions are welcome! 31 | 32 | ## Deploy on Vercel 33 | 34 | The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js. 35 | 36 | Check out our [Next.js deployment documentation](https://nextjs.org/docs/deployment) for more details. 
37 | -------------------------------------------------------------------------------- /apps/web/next.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('next').NextConfig} */ 2 | const withTM = require("next-transpile-modules")(["@scraper-js/core"]); 3 | const nextConfig = { 4 | reactStrictMode: true, 5 | } 6 | 7 | module.exports = withTM(nextConfig) 8 | -------------------------------------------------------------------------------- /apps/web/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "web", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "dev": "next dev", 7 | "build": "next build", 8 | "start": "next start", 9 | "lint": "next lint" 10 | }, 11 | "dependencies": { 12 | "@next/font": "13.1.1", 13 | "@scraper-js/core": "*", 14 | "@types/node": "18.11.18", 15 | "@types/react": "18.0.26", 16 | "@types/react-dom": "18.0.10", 17 | "eslint": "8.31.0", 18 | "eslint-config-next": "13.1.1", 19 | "next": "13.1.1", 20 | "react": "18.2.0", 21 | "react-dom": "18.2.0", 22 | "typescript": "4.9.4" 23 | }, 24 | "devDependencies": { 25 | "next-transpile-modules": "^10.0.0" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /apps/web/pages/_app.tsx: -------------------------------------------------------------------------------- 1 | import '../styles/globals.css' 2 | import type { AppProps } from 'next/app' 3 | 4 | export default function App({ Component, pageProps }: AppProps) { 5 | return 6 | } 7 | -------------------------------------------------------------------------------- /apps/web/pages/_document.tsx: -------------------------------------------------------------------------------- 1 | import { Html, Head, Main, NextScript } from 'next/document' 2 | 3 | export default function Document() { 4 | return ( 5 | 6 | 7 | 8 |
9 | 10 | 11 | 12 | ) 13 | } 14 | -------------------------------------------------------------------------------- /apps/web/pages/api/scrape.ts: -------------------------------------------------------------------------------- 1 | // Next.js API route support: https://nextjs.org/docs/api-routes/introduction 2 | import type { NextApiRequest, NextApiResponse } from "next"; 3 | import scraper, { ScrapeResult } from "@scraper-js/core"; 4 | 5 | type Data = 6 | | ScrapeResult 7 | | { 8 | error?: string; 9 | }; 10 | 11 | export default async function handler( 12 | req: NextApiRequest, 13 | res: NextApiResponse 14 | ) { 15 | const { url } = req.query; 16 | if (url) { 17 | if (typeof url === "string") { 18 | const result = await scraper.scrape(url); 19 | res.status(200).json(result); 20 | } else { 21 | res.status(400).json({ error: "`url` cannot be an array" }); 22 | } 23 | } else { 24 | res.status(400).json({ error: "`url` is a required field" }); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /apps/web/pages/index.tsx: -------------------------------------------------------------------------------- 1 | import Head from "next/head"; 2 | import Image from "next/image"; 3 | import { Inter } from "@next/font/google"; 4 | import styles from "../styles/Home.module.css"; 5 | import { useState } from "react"; 6 | import type { ScrapeResult } from "@scraper-js/core"; 7 | 8 | const inter = Inter({ subsets: ["latin"] }); 9 | 10 | export default function Home() { 11 | const [input, setInput] = useState(""); 12 | const [error, setError] = useState(""); 13 | const [scrapeResult, setScrapeResult] = useState(null); 14 | 15 | async function onSubmit() { 16 | setError(""); 17 | 18 | try { 19 | const url = new URL(input); 20 | const res = await fetch(`/api/scrape?url=${url.href}`).then( 21 | async (res) => { 22 | if (res.status === 200) { 23 | return res.json(); 24 | } else { 25 | const message = await res.json(); 26 | throw new 
Error(message.error); 27 | } 28 | } 29 | ); 30 | setScrapeResult(res); 31 | } catch (err) { 32 | setError((err as Error).message); 33 | } 34 | } 35 | 36 | return ( 37 | <> 38 | 39 | scraper-js 40 | 44 | 45 | 46 | 47 |
48 |
49 |

scraper-js

50 | 51 | {"GitHub"} 52 | 53 |
54 | 55 |
56 |
57 | setInput(e.target.value)} 63 | /> 64 | 67 |
68 |
69 | 70 | {error && ( 71 |
72 |

{error}

73 |
74 | )} 75 | 76 | {scrapeResult?.featureImage && ( 77 |
78 |

Feature Image

79 |
80 | {"Feature 81 |
82 |
83 | )} 84 | 85 | 114 |
115 | 116 | ); 117 | } 118 | -------------------------------------------------------------------------------- /apps/web/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jasonaibrahim/scraper/cfde57b9f6683faffad3dd39740fabd9ee963c84/apps/web/public/favicon.ico -------------------------------------------------------------------------------- /apps/web/public/github.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /apps/web/public/next.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/web/public/thirteen.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/web/public/vercel.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/web/styles/Home.module.css: -------------------------------------------------------------------------------- 1 | .main { 2 | display: flex; 3 | flex-direction: column; 4 | justify-content: space-between; 5 | align-items: center; 6 | padding: 6rem; 7 | min-height: 100vh; 8 | } 9 | 10 | .description { 11 | display: inherit; 12 | justify-content: inherit; 13 | align-items: inherit; 14 | font-size: 0.85rem; 15 | max-width: var(--max-width); 16 | width: 100%; 17 | z-index: 2; 18 | font-family: var(--font-mono); 19 | } 20 | 21 | .description a { 22 | display: flex; 23 | justify-content: center; 24 | align-items: center; 25 | gap: 0.5rem; 26 | } 27 | 28 | .description p { 29 | position: relative; 30 | margin: 0; 31 | padding: 1rem; 32 | background-color: 
rgba(var(--callout-rgb), 0.5); 33 | border: 1px solid rgba(var(--callout-border-rgb), 0.3); 34 | border-radius: var(--border-radius); 35 | } 36 | 37 | .error { 38 | display: inherit; 39 | justify-content: center; 40 | align-items: inherit; 41 | font-size: 0.85rem; 42 | max-width: var(--max-width); 43 | width: 100%; 44 | z-index: 2; 45 | font-family: var(--font-mono); 46 | color: red; 47 | } 48 | 49 | .error p { 50 | position: relative; 51 | margin: 0; 52 | padding: 1rem; 53 | background-color: rgba(var(--callout-rgb), 0.5); 54 | border: 1px solid rgba(var(--callout-border-rgb), 0.3); 55 | border-radius: var(--border-radius); 56 | } 57 | 58 | .result { 59 | display: flex; 60 | flex-direction: column; 61 | justify-content: center; 62 | align-items: inherit; 63 | font-size: 0.85rem; 64 | max-width: var(--max-width); 65 | width: 100%; 66 | z-index: 2; 67 | font-family: var(--font-mono); 68 | } 69 | 70 | .result p { 71 | position: relative; 72 | margin-bottom: 1rem; 73 | padding: 1rem; 74 | background-color: rgba(var(--callout-rgb), 0.5); 75 | border: 1px solid rgba(var(--callout-border-rgb), 0.3); 76 | border-radius: var(--border-radius); 77 | } 78 | 79 | .code { 80 | font-weight: 700; 81 | font-family: var(--font-mono); 82 | } 83 | 84 | .form { 85 | z-index: 1; 86 | } 87 | 88 | .input { 89 | padding: 0.5rem; 90 | outline: none; 91 | border: none; 92 | } 93 | 94 | .button { 95 | padding: 0.5rem; 96 | background: black; 97 | color: white; 98 | border: none; 99 | border-top-right-radius: 4px; 100 | border-bottom-right-radius: 4px; 101 | } 102 | 103 | .button:active { 104 | transform: scale(0.99); 105 | } 106 | 107 | .grid { 108 | display: grid; 109 | grid-template-columns: repeat(4, minmax(25%, auto)); 110 | width: var(--max-width); 111 | max-width: 100%; 112 | } 113 | 114 | .card { 115 | padding: 1rem 1.2rem; 116 | border-radius: var(--border-radius); 117 | background: rgba(var(--card-rgb), 0); 118 | border: 1px solid rgba(var(--card-border-rgb), 0); 119 | transition: 
background 200ms, border 200ms; 120 | } 121 | 122 | .card span { 123 | display: inline-block; 124 | transition: transform 200ms; 125 | } 126 | 127 | .card h2 { 128 | font-weight: 600; 129 | margin-bottom: 0.7rem; 130 | } 131 | 132 | .card p { 133 | margin: 0; 134 | opacity: 0.6; 135 | font-size: 0.9rem; 136 | line-height: 1.5; 137 | max-width: 30ch; 138 | } 139 | 140 | .center { 141 | display: flex; 142 | justify-content: center; 143 | align-items: center; 144 | position: relative; 145 | padding: 4rem 0; 146 | } 147 | 148 | .center::before { 149 | background: var(--secondary-glow); 150 | border-radius: 50%; 151 | width: 480px; 152 | height: 360px; 153 | margin-left: -400px; 154 | } 155 | 156 | .center::after { 157 | background: var(--primary-glow); 158 | width: 240px; 159 | height: 180px; 160 | z-index: -1; 161 | } 162 | 163 | .center::before, 164 | .center::after { 165 | content: ''; 166 | left: 50%; 167 | position: absolute; 168 | filter: blur(45px); 169 | transform: translateZ(0); 170 | } 171 | 172 | .featureImage { 173 | max-width: 500px; 174 | margin-bottom: 4rem; 175 | } 176 | 177 | .featureImage img { 178 | width: 100%; 179 | } 180 | 181 | .logo, 182 | .thirteen { 183 | position: relative; 184 | } 185 | 186 | .thirteen { 187 | display: flex; 188 | justify-content: center; 189 | align-items: center; 190 | width: 75px; 191 | height: 75px; 192 | padding: 25px 10px; 193 | margin-left: 16px; 194 | transform: translateZ(0); 195 | border-radius: var(--border-radius); 196 | overflow: hidden; 197 | box-shadow: 0px 2px 8px -1px #0000001a; 198 | } 199 | 200 | .thirteen::before, 201 | .thirteen::after { 202 | content: ''; 203 | position: absolute; 204 | z-index: -1; 205 | } 206 | 207 | /* Conic Gradient Animation */ 208 | .thirteen::before { 209 | animation: 6s rotate linear infinite; 210 | width: 200%; 211 | height: 200%; 212 | background: var(--tile-border); 213 | } 214 | 215 | /* Inner Square */ 216 | .thirteen::after { 217 | inset: 0; 218 | padding: 1px; 219 | 
border-radius: var(--border-radius); 220 | background: linear-gradient( 221 | to bottom right, 222 | rgba(var(--tile-start-rgb), 1), 223 | rgba(var(--tile-end-rgb), 1) 224 | ); 225 | background-clip: content-box; 226 | } 227 | 228 | /* Enable hover only on non-touch devices */ 229 | @media (hover: hover) and (pointer: fine) { 230 | .card:hover { 231 | background: rgba(var(--card-rgb), 0.1); 232 | border: 1px solid rgba(var(--card-border-rgb), 0.15); 233 | } 234 | 235 | .card:hover span { 236 | transform: translateX(4px); 237 | } 238 | } 239 | 240 | @media (prefers-reduced-motion) { 241 | .thirteen::before { 242 | animation: none; 243 | } 244 | 245 | .card:hover span { 246 | transform: none; 247 | } 248 | } 249 | 250 | /* Mobile */ 251 | @media (max-width: 700px) { 252 | .content { 253 | padding: 4rem; 254 | } 255 | 256 | .grid { 257 | grid-template-columns: 1fr; 258 | margin-bottom: 120px; 259 | max-width: 320px; 260 | text-align: center; 261 | } 262 | 263 | .card { 264 | padding: 1rem 2.5rem; 265 | } 266 | 267 | .card h2 { 268 | margin-bottom: 0.5rem; 269 | } 270 | 271 | .center { 272 | padding: 8rem 0 6rem; 273 | } 274 | 275 | .center::before { 276 | transform: none; 277 | height: 300px; 278 | } 279 | 280 | .description { 281 | font-size: 0.8rem; 282 | } 283 | 284 | .description a { 285 | padding: 1rem; 286 | } 287 | 288 | .description p, 289 | .description div { 290 | display: flex; 291 | justify-content: center; 292 | position: fixed; 293 | width: 100%; 294 | } 295 | 296 | .description p { 297 | align-items: center; 298 | inset: 0 0 auto; 299 | padding: 2rem 1rem 1.4rem; 300 | border-radius: 0; 301 | border: none; 302 | border-bottom: 1px solid rgba(var(--callout-border-rgb), 0.25); 303 | background: linear-gradient( 304 | to bottom, 305 | rgba(var(--background-start-rgb), 1), 306 | rgba(var(--callout-rgb), 0.5) 307 | ); 308 | background-clip: padding-box; 309 | backdrop-filter: blur(24px); 310 | } 311 | 312 | .description div { 313 | align-items: flex-end; 314 
| pointer-events: none; 315 | inset: auto 0 0; 316 | padding: 2rem; 317 | height: 200px; 318 | background: linear-gradient( 319 | to bottom, 320 | transparent 0%, 321 | rgb(var(--background-end-rgb)) 40% 322 | ); 323 | z-index: 1; 324 | } 325 | } 326 | 327 | /* Tablet and Smaller Desktop */ 328 | @media (min-width: 701px) and (max-width: 1120px) { 329 | .grid { 330 | grid-template-columns: repeat(2, 50%); 331 | } 332 | } 333 | 334 | @media (prefers-color-scheme: dark) { 335 | .vercelLogo { 336 | filter: invert(1); 337 | } 338 | 339 | .logo, 340 | .thirteen img { 341 | filter: invert(1) drop-shadow(0 0 0.3rem #ffffff70); 342 | } 343 | } 344 | 345 | @keyframes rotate { 346 | from { 347 | transform: rotate(360deg); 348 | } 349 | to { 350 | transform: rotate(0deg); 351 | } 352 | } 353 | -------------------------------------------------------------------------------- /apps/web/styles/globals.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --max-width: 1100px; 3 | --border-radius: 12px; 4 | --font-mono: ui-monospace, Menlo, Monaco, 'Cascadia Mono', 'Segoe UI Mono', 5 | 'Roboto Mono', 'Oxygen Mono', 'Ubuntu Monospace', 'Source Code Pro', 6 | 'Fira Mono', 'Droid Sans Mono', 'Courier New', monospace; 7 | 8 | --foreground-rgb: 0, 0, 0; 9 | --background-start-rgb: 214, 219, 220; 10 | --background-end-rgb: 255, 255, 255; 11 | 12 | --primary-glow: conic-gradient( 13 | from 180deg at 50% 50%, 14 | #16abff33 0deg, 15 | #0885ff33 55deg, 16 | #54d6ff33 120deg, 17 | #0071ff33 160deg, 18 | transparent 360deg 19 | ); 20 | --secondary-glow: radial-gradient( 21 | rgba(255, 255, 255, 1), 22 | rgba(255, 255, 255, 0) 23 | ); 24 | 25 | --tile-start-rgb: 239, 245, 249; 26 | --tile-end-rgb: 228, 232, 233; 27 | --tile-border: conic-gradient( 28 | #00000080, 29 | #00000040, 30 | #00000030, 31 | #00000020, 32 | #00000010, 33 | #00000010, 34 | #00000080 35 | ); 36 | 37 | --callout-rgb: 238, 240, 241; 38 | --callout-border-rgb: 172, 175, 176; 
39 | --card-rgb: 180, 185, 188; 40 | --card-border-rgb: 131, 134, 135; 41 | } 42 | 43 | @media (prefers-color-scheme: dark) { 44 | :root { 45 | --foreground-rgb: 255, 255, 255; 46 | --background-start-rgb: 0, 0, 0; 47 | --background-end-rgb: 0, 0, 0; 48 | 49 | --primary-glow: radial-gradient(rgba(1, 65, 255, 0.4), rgba(1, 65, 255, 0)); 50 | --secondary-glow: linear-gradient( 51 | to bottom right, 52 | rgba(1, 65, 255, 0), 53 | rgba(1, 65, 255, 0), 54 | rgba(1, 65, 255, 0.3) 55 | ); 56 | 57 | --tile-start-rgb: 2, 13, 46; 58 | --tile-end-rgb: 2, 5, 19; 59 | --tile-border: conic-gradient( 60 | #ffffff80, 61 | #ffffff40, 62 | #ffffff30, 63 | #ffffff20, 64 | #ffffff10, 65 | #ffffff10, 66 | #ffffff80 67 | ); 68 | 69 | --callout-rgb: 20, 20, 20; 70 | --callout-border-rgb: 108, 108, 108; 71 | --card-rgb: 100, 100, 100; 72 | --card-border-rgb: 200, 200, 200; 73 | } 74 | } 75 | 76 | * { 77 | box-sizing: border-box; 78 | padding: 0; 79 | margin: 0; 80 | } 81 | 82 | html, 83 | body { 84 | max-width: 100vw; 85 | overflow-x: hidden; 86 | } 87 | 88 | body { 89 | color: rgb(var(--foreground-rgb)); 90 | background: linear-gradient( 91 | to bottom, 92 | transparent, 93 | rgb(var(--background-end-rgb)) 94 | ) 95 | rgb(var(--background-start-rgb)); 96 | } 97 | 98 | a { 99 | color: inherit; 100 | text-decoration: none; 101 | } 102 | 103 | @media (prefers-color-scheme: dark) { 104 | html { 105 | color-scheme: dark; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /apps/web/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es5", 4 | "lib": ["dom", "dom.iterable", "esnext"], 5 | "allowJs": true, 6 | "skipLibCheck": true, 7 | "strict": true, 8 | "forceConsistentCasingInFileNames": true, 9 | "noEmit": true, 10 | "esModuleInterop": true, 11 | "module": "esnext", 12 | "moduleResolution": "node", 13 | "resolveJsonModule": true, 14 | 
"isolatedModules": true, 15 | "jsx": "preserve", 16 | "incremental": true 17 | }, 18 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"], 19 | "exclude": ["node_modules"] 20 | } 21 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scraper-js", 3 | "version": "2.0.0", 4 | "description": "From the Bay to LA, scraper will collect all of the images from the url you tell it to and return a list of the images that best represent the site.", 5 | "scripts": { 6 | "build": "turbo run build", 7 | "test": "turbo run test" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git://github.com/jasonaibrahim/scraper.git" 12 | }, 13 | "keywords": [ 14 | "scrape", 15 | "thumbnails", 16 | "images", 17 | "facebook", 18 | "twitter", 19 | "thumbnail", 20 | "image", 21 | "scraper", 22 | "web", 23 | "crawler", 24 | "image", 25 | "web", 26 | "oakland" 27 | ], 28 | "author": "jason ibrahim", 29 | "license": "MIT", 30 | "bugs": { 31 | "url": "https://github.com/jasonaibrahim/scraper/issues" 32 | }, 33 | "homepage": "https://github.com/jasonaibrahim/scraper", 34 | "workspaces": [ 35 | "apps/*", 36 | "packages/*" 37 | ], 38 | "files": [ 39 | "packages/core/dist", 40 | "README.md", 41 | "LICENSE", 42 | "package.json" 43 | ], 44 | "devDependencies": { 45 | "@jest/globals": "^29.3.1", 46 | "jest": "^29.3.1", 47 | "prettier": "^2.8.1", 48 | "ts-jest": "^29.0.3", 49 | "turbo": "^1.6.3" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /packages/core/jest.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('ts-jest').JestConfigWithTsJest} */ 2 | module.exports = { 3 | preset: 'ts-jest', 4 | testEnvironment: 'node', 5 | transform: { 6 | '^.+\\.(ts|tsx)?$': 'ts-jest', 7 | } 8 | }; 9 | 
-------------------------------------------------------------------------------- /packages/core/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@scraper-js/core", 3 | "scripts": { 4 | "test": "jest", 5 | "build": "rm -rf dist; tsc" 6 | }, 7 | "main": "dist/index.js", 8 | "dependencies": { 9 | "axios": "^1.2.2", 10 | "cheerio": "1.0.0-rc.12", 11 | "lodash": "^4.17.21", 12 | "open-graph-scraper": "^5.0.3" 13 | }, 14 | "devDependencies": { 15 | "@types/jquery": "^3.5.16", 16 | "@types/lodash": "^4.14.191", 17 | "axios-mock-adapter": "^1.21.2", 18 | "dotenv": "^16.0.3", 19 | "schema-dts": "^1.1.0", 20 | "typescript": "^4.9.4" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /packages/core/src/image.ts: -------------------------------------------------------------------------------- 1 | import { ImageMetadata, ImageSource, ParseResult, RankedImage } from './types'; 2 | 3 | export interface ImageExtractOptions { 4 | rankImage?: (imageMetadata: ImageMetadata) => number; 5 | } 6 | 7 | const defaultRankAlgorithm: ImageExtractOptions["rankImage"] = ( 8 | imageMetadata 9 | ) => { 10 | /** 11 | * Immediately down rank images with no `src` attribute; this is a failure condition 12 | */ 13 | if (!imageMetadata.src) { 14 | return -1 15 | } 16 | 17 | let score = 0; 18 | 19 | /** 20 | * Weigh image based on source, with open graph images ranking highest 21 | */ 22 | switch (imageMetadata.sourceType) { 23 | case ImageSource.OpenGraph: 24 | score += 2; 25 | break 26 | case ImageSource.LinkedData: 27 | score += 1; 28 | break; 29 | } 30 | 31 | /** 32 | * Weigh image based on dimensions 33 | */ 34 | if (imageMetadata.width && imageMetadata.height) { 35 | score += 1; 36 | } 37 | 38 | return score; 39 | }; 40 | 41 | export function featureImageFromParseResult( 42 | result: ParseResult, 43 | options: ImageExtractOptions = {} 44 | ): RankedImage | null { 45 | return 
imagesFromParseResult(result, options).shift() ?? null; 46 | } 47 | 48 | export function imagesFromParseResult( 49 | result: ParseResult, 50 | options: ImageExtractOptions = {} 51 | ): Array { 52 | let images: ImageMetadata[] = []; 53 | 54 | if (result.linkedData) { 55 | // @ts-ignore - type is not being inferred but is SchemaValue 56 | const { image } = result.linkedData; 57 | 58 | if (Array.isArray(image)) { 59 | for (const img of image) { 60 | if (typeof img === "string") { 61 | images.push({ 62 | width: 0, 63 | height: 0, 64 | src: img, 65 | sourceType: ImageSource.LinkedData, 66 | }); 67 | } else { 68 | const width = parseInt(img.width ?? 0); 69 | const height = parseInt(img.height ?? 0); 70 | const src = img.contentUrl; 71 | 72 | images.push({ 73 | width, 74 | height, 75 | src, 76 | sourceType: ImageSource.LinkedData, 77 | }); 78 | } 79 | } 80 | } else if (typeof image === "string") { 81 | images.push({ 82 | width: 0, 83 | height: 0, 84 | src: image, 85 | sourceType: ImageSource.LinkedData, 86 | }); 87 | } else { 88 | const width = parseInt(image.width ?? 0); 89 | const height = parseInt(image.height ?? 
0); 90 | const src = image.contentUrl; 91 | 92 | images.push({ 93 | width, 94 | height, 95 | src, 96 | sourceType: ImageSource.LinkedData, 97 | }); 98 | } 99 | } 100 | 101 | if (result.openGraph.ogImage) { 102 | const image = result.openGraph.ogImage; 103 | 104 | if (typeof image === "string") { 105 | /** 106 | * Handle type string 107 | */ 108 | images.push({ 109 | width: 0, 110 | height: 0, 111 | src: image, 112 | sourceType: ImageSource.OpenGraph, 113 | }); 114 | } else if (Array.isArray(image)) { 115 | /** 116 | * Handle Array 117 | */ 118 | for (const img of image) { 119 | let width = 0; 120 | let height = 0; 121 | let src: string; 122 | 123 | if (typeof img === "string") { 124 | src = img; 125 | } else { 126 | src = img.url; 127 | width = parseInt(`${img.width}`); 128 | height = parseInt(`${img.height}`); 129 | } 130 | 131 | images.push({ 132 | ...img, 133 | width, 134 | height, 135 | src, 136 | sourceType: ImageSource.OpenGraph, 137 | }); 138 | } 139 | } else { 140 | /** 141 | * Handle type ImageObject 142 | * 143 | */ 144 | let width = parseInt(`${image.width}`); 145 | let height = parseInt(`${image.height}`); 146 | let src = image.url; 147 | images.push({ 148 | ...image, 149 | width, 150 | height, 151 | src, 152 | sourceType: ImageSource.OpenGraph, 153 | }); 154 | } 155 | } 156 | 157 | const tags = result.document("img"); 158 | for (const tag of tags) { 159 | const width = parseInt(tag.attribs["width"] ?? 0); 160 | const height = parseInt(tag.attribs["height"] ?? 
0); 161 | const src = tag.attribs["src"]; 162 | 163 | images.push({ 164 | width, 165 | height, 166 | src, 167 | sourceType: ImageSource.DOM, 168 | }); 169 | } 170 | 171 | return sortedByRank(images, options.rankImage); 172 | } 173 | 174 | function sortedByRank( 175 | images: Array, 176 | rankImage: ImageExtractOptions["rankImage"] = defaultRankAlgorithm 177 | ): Array { 178 | let ranked: RankedImage[] = []; 179 | 180 | for (const image of images) { 181 | ranked.push({ 182 | ...image, 183 | rank: rankImage!(image), 184 | }); 185 | } 186 | 187 | return ranked.sort((a, b) => b.rank - a.rank); 188 | } 189 | -------------------------------------------------------------------------------- /packages/core/src/index.ts: -------------------------------------------------------------------------------- 1 | import { scrape } from "./scrape"; 2 | 3 | export * from "./scrape"; 4 | 5 | export default { 6 | scrape, 7 | }; 8 | -------------------------------------------------------------------------------- /packages/core/src/parse.ts: -------------------------------------------------------------------------------- 1 | import cheerio, { CheerioAPI } from 'cheerio'; 2 | import scrapeOpenGraphData from 'open-graph-scraper'; 3 | import { ParseResult } from './types'; 4 | 5 | export interface ParseOptions { 6 | parser?: CheerioAPI; 7 | } 8 | export async function parse( 9 | content: string, 10 | { parser = cheerio }: ParseOptions 11 | ): Promise { 12 | if (!content) { 13 | throw new Error("Failed to retrieve page content"); 14 | } 15 | 16 | const $ = parser.load(content); 17 | 18 | /** 19 | * Scrape OpenGraph data from page html 20 | */ 21 | const { result: openGraph } = await scrapeOpenGraphData({ 22 | url: "", 23 | html: content, 24 | }); 25 | 26 | /** 27 | * Scrape LinkedData from html if present. 
28 | */ 29 | const linkedDataElement = $('script[type="application/ld+json"]'); 30 | let linkedData: ParseResult["linkedData"] = null; 31 | try { 32 | linkedData = JSON.parse(linkedDataElement.html()!); 33 | } catch (err) { 34 | console.warn("Failed to retrieve linked data", err); 35 | } 36 | 37 | return { 38 | openGraph, 39 | linkedData, 40 | document: $, 41 | }; 42 | } 43 | -------------------------------------------------------------------------------- /packages/core/src/scrape.ts: -------------------------------------------------------------------------------- 1 | import axios, { AxiosInstance } from 'axios'; 2 | import { Thing } from 'schema-dts'; 3 | import scrapeOpenGraphData from 'open-graph-scraper'; 4 | import { featureImageFromParseResult, ImageExtractOptions, imagesFromParseResult } from './image'; 5 | import { parse, ParseOptions } from './parse'; 6 | import { RankedImage } from './types'; 7 | 8 | export type ScrapeOptions = Pick & 9 | Pick & { 10 | client: AxiosInstance; 11 | }; 12 | 13 | export interface ScrapeResult { 14 | html: string; 15 | images: RankedImage[]; 16 | linkedData: Thing | null; 17 | openGraph: 18 | | scrapeOpenGraphData.successResultObject 19 | | scrapeOpenGraphData.errorResultObject; 20 | featureImage?: RankedImage | null; 21 | } 22 | 23 | export async function scrape( 24 | url: string, 25 | options: ScrapeOptions = { 26 | client: axios.create({ 27 | timeout: 1000, 28 | }), 29 | } 30 | ): Promise { 31 | const { client } = options; 32 | 33 | const response = await client.get(url); 34 | const result = await parse(response.data, { 35 | parser: options.parser, 36 | }); 37 | 38 | return { 39 | featureImage: featureImageFromParseResult(result, { 40 | rankImage: options.rankImage, 41 | }), 42 | images: imagesFromParseResult(result, { 43 | rankImage: options.rankImage, 44 | }), 45 | html: result.document.html(), 46 | linkedData: result.linkedData, 47 | openGraph: result.openGraph, 48 | }; 49 | } 50 | 
--------------------------------------------------------------------------------
/packages/core/src/types.ts:
--------------------------------------------------------------------------------
import { CheerioAPI } from "cheerio";
import { Thing } from "schema-dts";
import type scrapeOpenGraphData from "open-graph-scraper";

/** Result of parsing a page: structured metadata plus the live document. */
export interface ParseResult {
  linkedData: Thing | null;
  openGraph:
    | scrapeOpenGraphData.successResultObject
    | scrapeOpenGraphData.errorResultObject;
  document: CheerioAPI;
}

/** Where an image candidate was discovered on the page. */
export enum ImageSource {
  LinkedData = "linked_data",
  OpenGraph = "opengraph",
  DOM = "dom",
}

/**
 * Normalized image candidate. Extends an open record because OpenGraph
 * entries are spread in with their extra properties preserved.
 */
export interface ImageMetadata extends Record<string, unknown> {
  // 0 when the source did not declare a width
  width: number;
  // 0 when the source did not declare a height
  height: number;
  sourceType: ImageSource;
  src: string;
}

/** An image candidate with its computed ranking score attached. */
export interface RankedImage extends ImageMetadata {
  rank: number;
}
--------------------------------------------------------------------------------
/packages/core/test/dummy.html:
--------------------------------------------------------------------------------
 1 |  2 |  3 |  4 |  5 | 42 |  43 | 
--------------------------------------------------------------------------------
/packages/core/test/scraper.test.ts:
--------------------------------------------------------------------------------
import scraper from "../src";
import { describe, expect, it } from "@jest/globals";
import axios from "axios";
import MockAdapter from "axios-mock-adapter";
import * as fs from "fs";
import * as path from "path";

describe("scraper", () => {
  it("should provide an initialization interface", () => {
    expect(scraper.scrape).toBeDefined();
  });
});

describe("scraping", () => {
  // NOTE(review): this test hits the live network (medium.com) and will be
  // flaky offline or if the article changes — consider mocking it like the
  // "options" suite below.
  it("should scrape metadata from a given url", async () => {
    const url =
      "https://barackobama.medium.com/my-2022-end-of-year-lists-ba76b6278801";
    const result = await scraper.scrape(url);

    expect(result.html).toBeTruthy();
    expect(result.featureImage!.src).toEqual(
      "https://miro.medium.com/max/960/1*Fm3OR_ORrkhUxF_fkKRvsw.png"
    );
    expect(result.openGraph.ogTitle).toEqual("My 2022 End of Year Lists");
    expect(result.linkedData).toBeTruthy();
    expect(result.images.length).toBeGreaterThan(0);
    // BUG FIX: previously compared `.url` on both sides, which passes
    // vacuously (undefined === undefined) when images come from sources
    // without a `url` property; `src` is the canonical RankedImage field.
    expect(result.images[0].src).toEqual(result.featureImage!.src);
  });
});

describe("options", () => {
  it("should allow for override of the http adapter", async () => {
    const customAxios = axios.create();
    const mockHttp = new MockAdapter(customAxios);

    const dummyPageContent = fs.readFileSync(
      path.join(__dirname, "dummy.html"),
      "utf8"
    );
    mockHttp.onGet("fake-url").reply(200, dummyPageContent);

    const { featureImage } = await scraper.scrape("fake-url", {
      client: customAxios,
    });
    expect(featureImage!.src).toEqual(
      "https://miro.medium.com/max/1200/0*_K6j83V2soow_A2c"
    );
  });
});

describe("errors", () => {
  it("should throw an error if a given url is invalid", async () => {
    const badUrls: string[] = [
      "",
      "htp://example.com",
      "fasdfasdf",
      // @ts-ignore
      null,
      // @ts-expect-error
      () => {
        throw new Error();
      },
      // @ts-expect-error
      1e5,
    ];
    for (const badUrl of badUrls) {
      await expect(scraper.scrape(badUrl)).rejects.toBeTruthy();
    }
  });
});
--------------------------------------------------------------------------------
/packages/core/tsconfig.json:
--------------------------------------------------------------------------------
{
  "compilerOptions": {
    "outDir": "./dist/",
    "noImplicitAny": true,
    "module": "commonjs",
    "target": "es2015",
    "jsx": "react",
    "allowJs": true,
    "moduleResolution": "node",
    "declaration": true,
    "esModuleInterop": true,
    "strict": true,
    "allowSyntheticDefaultImports": true
  },
  "include": ["./**/*.ts"],
  "exclude": ["test"]
}
--------------------------------------------------------------------------------
/turbo.json:
--------------------------------------------------------------------------------
{
  "$schema": "https://turbo.build/schema.json",
  "pipeline": {
    "build": {
      "dependsOn": ["^build"],
      "outputs": ["dist/**"]
    },
    "test": {
      "dependsOn": ["build"],
      "outputs": [],
      "inputs": ["src/**/*.ts", "test/**/*.ts"]
    }
  }
}
--------------------------------------------------------------------------------