├── .github
│   ├── raw
│   │   └── 510.jpg
│   └── workflows
│       └── build.yaml
├── .gitignore
├── .node-version
├── LICENSE
├── README.md
├── apps
│   └── web
│       ├── .eslintrc.json
│       ├── .gitignore
│       ├── README.md
│       ├── next.config.js
│       ├── package.json
│       ├── pages
│       │   ├── _app.tsx
│       │   ├── _document.tsx
│       │   ├── api
│       │   │   └── scrape.ts
│       │   └── index.tsx
│       ├── public
│       │   ├── favicon.ico
│       │   ├── github.svg
│       │   ├── next.svg
│       │   ├── thirteen.svg
│       │   └── vercel.svg
│       ├── styles
│       │   ├── Home.module.css
│       │   └── globals.css
│       └── tsconfig.json
├── package-lock.json
├── package.json
├── packages
│   └── core
│       ├── jest.config.js
│       ├── package.json
│       ├── src
│       │   ├── image.ts
│       │   ├── index.ts
│       │   ├── parse.ts
│       │   ├── scrape.ts
│       │   └── types.ts
│       ├── test
│       │   ├── dummy.html
│       │   └── scraper.test.ts
│       └── tsconfig.json
└── turbo.json
/.github/raw/510.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jasonaibrahim/scraper/cfde57b9f6683faffad3dd39740fabd9ee963c84/.github/raw/510.jpg
--------------------------------------------------------------------------------
/.github/workflows/build.yaml:
--------------------------------------------------------------------------------
1 | name: scraper-js CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - '*'
7 | pull_request:
8 | branches: [ master ]
9 |
10 | jobs:
11 | test:
12 |
13 | runs-on: ubuntu-latest
14 |
15 | strategy:
16 | matrix:
17 | node-version: [ 16.x, 18.x ]
18 |
19 | steps:
20 | - uses: actions/checkout@v3
21 | - name: Build for node version ${{ matrix.node-version }}
22 | uses: actions/setup-node@v3
23 | with:
24 | node-version: ${{ matrix.node-version }}
25 | - run: npm ci
26 | - run: npm test
27 |
28 | build:
29 |
30 | runs-on: ubuntu-latest
31 |
32 | strategy:
33 | matrix:
34 | node-version: [ 16.x, 18.x ]
35 |
36 | steps:
37 | - uses: actions/checkout@v3
38 | - name: Build for node version ${{ matrix.node-version }}
39 | uses: actions/setup-node@v3
40 | with:
41 | node-version: ${{ matrix.node-version }}
42 | - run: npm ci
43 | - run: npm run build
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .idea
3 | .turbo
4 | packages/core/dist
5 |
--------------------------------------------------------------------------------
/.node-version:
--------------------------------------------------------------------------------
1 | 18.8.0
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Jason Ibrahim
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # scraper-js
2 |
3 |
4 |
5 | ### Demo
6 |
7 | ## overview
8 |
9 | need thumbnails? scraper is a lightweight node.js package designed to return high quality and highly relevant images from a source url fast.
10 |
11 | ## installation
14 |
15 | ```bash
16 | npm install scraper-js
17 | ```
18 |
19 | ## use
20 |
21 | ```javascript
22 | import scraper from "scraper-js";
23 |
24 | const url =
25 | "https://barackobama.medium.com/my-2022-end-of-year-lists-ba76b6278801";
26 | const result = await scraper.scrape(url);
27 | ```
28 |
29 | Calling `scrape` returns a `ScrapeResult` object:
30 |
31 | ```typescript
32 | export interface ScrapeResult {
33 | html: string;
34 | images: RankedImage[];
35 | linkedData: Thing | null;
36 | openGraph:
37 | | scrapeOpenGraphData.successResultObject
38 | | scrapeOpenGraphData.errorResultObject;
39 | featureImage?: RankedImage | null;
40 | }
41 | ```
42 |
43 | The recommended version of node.js is `18`.
44 |
45 | ## example node server
46 |
47 | ```javascript
48 | //
49 | // scraperapp
50 | //
51 | // thumbnail scraping http server. usage is as follows:
52 | // get the address to scrape from the parameters passed to the url
53 | // e.g. localhost:1337/scrape?url=http://www.reddit.com; address to scrape => http://www.reddit.com
54 | // response will be an array of image urls => [http://image1.jpg, http://image2.jpg, ...]
55 | //
56 | // authored by Jason Ibrahim
57 | // copyright (c) 2015 Jason Ibrahim
58 | //
59 |
60 | // initialize dependencies
61 | const http = require("http"),
62 | https = require("https"),
63 | url = require("url"),
64 | scraper = require("scraper-js");
65 |
66 | // set the port
67 | const port = process.env.port || 80;
68 |
69 | // create the server
70 | const server = http
71 | .createServer(function (req, res) {
72 |     const scrapereg = new RegExp(/^(\/scrape)/),
73 |       query = url.parse(req.url, true).query,
74 |       address = query.url;
76 | // only listen for api calls to /scrape
77 |     if (!req.url.match(scrapereg)) {
78 |       res.writeHead(404);
79 |       return res.end("Did you mean /scrape?");
80 |     }
81 | res.writeHead(200, { "Access-Control-Allow-Origin": "*" });
82 | // scraper returns a promise that will resolve an array of `RankedImage`
83 | scraper.scrape(address).then(
84 | function (result) {
85 | res.end(JSON.stringify(result.images));
86 | },
87 | function (error) {
88 | res.writeHead(404);
89 | res.end(JSON.stringify([error]));
90 | }
91 | );
92 | // if we don't get at least one thumbnail within 8 seconds, quit
93 | setTimeout(function () {
94 | res.writeHead(408);
95 | res.end(JSON.stringify(["timeout."]));
96 | }, 8000);
97 | })
98 | .listen(port);
99 |
100 | console.log("Scraping on", port);
101 | ```
102 |
103 | ## contributions
104 |
105 | improvements, features, bug fixes and any other type of contribution are welcome to this project. please feel free to extend what has been started here and if it solves a particular problem, please submit a pull request so we can share it with others.
106 |
--------------------------------------------------------------------------------
/apps/web/.eslintrc.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "next/core-web-vitals"
3 | }
4 |
--------------------------------------------------------------------------------
/apps/web/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 |
3 | # dependencies
4 | /node_modules
5 | /.pnp
6 | .pnp.js
7 |
8 | # testing
9 | /coverage
10 |
11 | # next.js
12 | /.next/
13 | /out/
14 |
15 | # production
16 | /build
17 |
18 | # misc
19 | .DS_Store
20 | *.pem
21 |
22 | # debug
23 | npm-debug.log*
24 | yarn-debug.log*
25 | yarn-error.log*
26 | .pnpm-debug.log*
27 |
28 | # local env files
29 | .env*.local
30 |
31 | # vercel
32 | .vercel
33 |
34 | # typescript
35 | *.tsbuildinfo
36 | next-env.d.ts
37 |
--------------------------------------------------------------------------------
/apps/web/README.md:
--------------------------------------------------------------------------------
1 | This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).
2 |
3 | ## Getting Started
4 |
5 | First, run the development server:
6 |
7 | ```bash
8 | npm run dev
9 | # or
10 | yarn dev
11 | ```
12 |
13 | Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
14 |
15 | You can start editing the page by modifying `pages/index.tsx`. The page auto-updates as you edit the file.
16 |
17 | [API routes](https://nextjs.org/docs/api-routes/introduction) can be accessed on [http://localhost:3000/api/scrape](http://localhost:3000/api/scrape). This endpoint can be edited in `pages/api/scrape.ts`.
18 |
19 | The `pages/api` directory is mapped to `/api/*`. Files in this directory are treated as [API routes](https://nextjs.org/docs/api-routes/introduction) instead of React pages.
20 |
21 | This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font.
22 |
23 | ## Learn More
24 |
25 | To learn more about Next.js, take a look at the following resources:
26 |
27 | - [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API.
28 | - [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
29 |
30 | You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js/) - your feedback and contributions are welcome!
31 |
32 | ## Deploy on Vercel
33 |
34 | The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.
35 |
36 | Check out our [Next.js deployment documentation](https://nextjs.org/docs/deployment) for more details.
37 |
--------------------------------------------------------------------------------
/apps/web/next.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('next').NextConfig} */
2 | const withTM = require("next-transpile-modules")(["@scraper-js/core"]);
3 | const nextConfig = {
4 | reactStrictMode: true,
5 | }
6 |
7 | module.exports = withTM(nextConfig)
8 |
--------------------------------------------------------------------------------
/apps/web/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "web",
3 | "version": "0.1.0",
4 | "private": true,
5 | "scripts": {
6 | "dev": "next dev",
7 | "build": "next build",
8 | "start": "next start",
9 | "lint": "next lint"
10 | },
11 | "dependencies": {
12 | "@next/font": "13.1.1",
13 | "@scraper-js/core": "*",
14 | "@types/node": "18.11.18",
15 | "@types/react": "18.0.26",
16 | "@types/react-dom": "18.0.10",
17 | "eslint": "8.31.0",
18 | "eslint-config-next": "13.1.1",
19 | "next": "13.1.1",
20 | "react": "18.2.0",
21 | "react-dom": "18.2.0",
22 | "typescript": "4.9.4"
23 | },
24 | "devDependencies": {
25 | "next-transpile-modules": "^10.0.0"
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/apps/web/pages/_app.tsx:
--------------------------------------------------------------------------------
1 | import '../styles/globals.css'
2 | import type { AppProps } from 'next/app'
3 |
4 | export default function App({ Component, pageProps }: AppProps) {
5 | return
6 | }
7 |
--------------------------------------------------------------------------------
/apps/web/pages/_document.tsx:
--------------------------------------------------------------------------------
1 | import { Html, Head, Main, NextScript } from 'next/document'
2 |
3 | export default function Document() {
4 | return (
5 |
6 |
8 |
9 |
10 |
11 |
12 | )
13 | }
14 |
--------------------------------------------------------------------------------
/apps/web/pages/api/scrape.ts:
--------------------------------------------------------------------------------
1 | // Next.js API route support: https://nextjs.org/docs/api-routes/introduction
2 | import type { NextApiRequest, NextApiResponse } from "next";
3 | import scraper, { ScrapeResult } from "@scraper-js/core";
4 |
5 | type Data =
6 | | ScrapeResult
7 | | {
8 | error?: string;
9 | };
10 |
11 | export default async function handler(
12 | req: NextApiRequest,
13 | res: NextApiResponse
14 | ) {
15 | const { url } = req.query;
16 | if (url) {
17 | if (typeof url === "string") {
18 | const result = await scraper.scrape(url);
19 | res.status(200).json(result);
20 | } else {
21 | res.status(400).json({ error: "`url` cannot be an array" });
22 | }
23 | } else {
24 | res.status(400).json({ error: "`url` is a required field" });
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/apps/web/pages/index.tsx:
--------------------------------------------------------------------------------
1 | import Head from "next/head";
2 | import Image from "next/image";
3 | import { Inter } from "@next/font/google";
4 | import styles from "../styles/Home.module.css";
5 | import { useState } from "react";
6 | import type { ScrapeResult } from "@scraper-js/core";
7 |
8 | const inter = Inter({ subsets: ["latin"] });
9 |
10 | export default function Home() {
11 | const [input, setInput] = useState("");
12 | const [error, setError] = useState("");
13 | const [scrapeResult, setScrapeResult] = useState(null);
14 |
15 | async function onSubmit() {
16 | setError("");
17 |
18 | try {
19 | const url = new URL(input);
20 | const res = await fetch(`/api/scrape?url=${url.href}`).then(
21 | async (res) => {
22 | if (res.status === 200) {
23 | return res.json();
24 | } else {
25 | const message = await res.json();
26 | throw new Error(message.error);
27 | }
28 | }
29 | );
30 | setScrapeResult(res);
31 | } catch (err) {
32 | setError((err as Error).message);
33 | }
34 | }
35 |
36 | return (
37 | <>
38 |
39 | scraper-js
40 |
44 |
45 |
46 |
47 |
48 |
54 |
55 |
56 |
57 | setInput(e.target.value)}
63 | />
64 |
67 |
68 |
69 |
70 | {error && (
71 |
74 | )}
75 |
76 | {scrapeResult?.featureImage && (
77 |
78 |
Feature Image
79 |
80 |
81 |
82 |
83 | )}
84 |
85 |
114 |
115 | >
116 | );
117 | }
118 |
--------------------------------------------------------------------------------
/apps/web/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jasonaibrahim/scraper/cfde57b9f6683faffad3dd39740fabd9ee963c84/apps/web/public/favicon.ico
--------------------------------------------------------------------------------
/apps/web/public/github.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apps/web/public/next.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apps/web/public/thirteen.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apps/web/public/vercel.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apps/web/styles/Home.module.css:
--------------------------------------------------------------------------------
1 | .main {
2 | display: flex;
3 | flex-direction: column;
4 | justify-content: space-between;
5 | align-items: center;
6 | padding: 6rem;
7 | min-height: 100vh;
8 | }
9 |
10 | .description {
11 | display: inherit;
12 | justify-content: inherit;
13 | align-items: inherit;
14 | font-size: 0.85rem;
15 | max-width: var(--max-width);
16 | width: 100%;
17 | z-index: 2;
18 | font-family: var(--font-mono);
19 | }
20 |
21 | .description a {
22 | display: flex;
23 | justify-content: center;
24 | align-items: center;
25 | gap: 0.5rem;
26 | }
27 |
28 | .description p {
29 | position: relative;
30 | margin: 0;
31 | padding: 1rem;
32 | background-color: rgba(var(--callout-rgb), 0.5);
33 | border: 1px solid rgba(var(--callout-border-rgb), 0.3);
34 | border-radius: var(--border-radius);
35 | }
36 |
37 | .error {
38 | display: inherit;
39 | justify-content: center;
40 | align-items: inherit;
41 | font-size: 0.85rem;
42 | max-width: var(--max-width);
43 | width: 100%;
44 | z-index: 2;
45 | font-family: var(--font-mono);
46 | color: red;
47 | }
48 |
49 | .error p {
50 | position: relative;
51 | margin: 0;
52 | padding: 1rem;
53 | background-color: rgba(var(--callout-rgb), 0.5);
54 | border: 1px solid rgba(var(--callout-border-rgb), 0.3);
55 | border-radius: var(--border-radius);
56 | }
57 |
58 | .result {
59 | display: flex;
60 | flex-direction: column;
61 | justify-content: center;
62 | align-items: inherit;
63 | font-size: 0.85rem;
64 | max-width: var(--max-width);
65 | width: 100%;
66 | z-index: 2;
67 | font-family: var(--font-mono);
68 | }
69 |
70 | .result p {
71 | position: relative;
72 | margin-bottom: 1rem;
73 | padding: 1rem;
74 | background-color: rgba(var(--callout-rgb), 0.5);
75 | border: 1px solid rgba(var(--callout-border-rgb), 0.3);
76 | border-radius: var(--border-radius);
77 | }
78 |
79 | .code {
80 | font-weight: 700;
81 | font-family: var(--font-mono);
82 | }
83 |
84 | .form {
85 | z-index: 1;
86 | }
87 |
88 | .input {
89 | padding: 0.5rem;
90 | outline: none;
91 | border: none;
92 | }
93 |
94 | .button {
95 | padding: 0.5rem;
96 | background: black;
97 | color: white;
98 | border: none;
99 | border-top-right-radius: 4px;
100 | border-bottom-right-radius: 4px;
101 | }
102 |
103 | .button:active {
104 | transform: scale(0.99);
105 | }
106 |
107 | .grid {
108 | display: grid;
109 | grid-template-columns: repeat(4, minmax(25%, auto));
110 | width: var(--max-width);
111 | max-width: 100%;
112 | }
113 |
114 | .card {
115 | padding: 1rem 1.2rem;
116 | border-radius: var(--border-radius);
117 | background: rgba(var(--card-rgb), 0);
118 | border: 1px solid rgba(var(--card-border-rgb), 0);
119 | transition: background 200ms, border 200ms;
120 | }
121 |
122 | .card span {
123 | display: inline-block;
124 | transition: transform 200ms;
125 | }
126 |
127 | .card h2 {
128 | font-weight: 600;
129 | margin-bottom: 0.7rem;
130 | }
131 |
132 | .card p {
133 | margin: 0;
134 | opacity: 0.6;
135 | font-size: 0.9rem;
136 | line-height: 1.5;
137 | max-width: 30ch;
138 | }
139 |
140 | .center {
141 | display: flex;
142 | justify-content: center;
143 | align-items: center;
144 | position: relative;
145 | padding: 4rem 0;
146 | }
147 |
148 | .center::before {
149 | background: var(--secondary-glow);
150 | border-radius: 50%;
151 | width: 480px;
152 | height: 360px;
153 | margin-left: -400px;
154 | }
155 |
156 | .center::after {
157 | background: var(--primary-glow);
158 | width: 240px;
159 | height: 180px;
160 | z-index: -1;
161 | }
162 |
163 | .center::before,
164 | .center::after {
165 | content: '';
166 | left: 50%;
167 | position: absolute;
168 | filter: blur(45px);
169 | transform: translateZ(0);
170 | }
171 |
172 | .featureImage {
173 | max-width: 500px;
174 | margin-bottom: 4rem;
175 | }
176 |
177 | .featureImage img {
178 | width: 100%;
179 | }
180 |
181 | .logo,
182 | .thirteen {
183 | position: relative;
184 | }
185 |
186 | .thirteen {
187 | display: flex;
188 | justify-content: center;
189 | align-items: center;
190 | width: 75px;
191 | height: 75px;
192 | padding: 25px 10px;
193 | margin-left: 16px;
194 | transform: translateZ(0);
195 | border-radius: var(--border-radius);
196 | overflow: hidden;
197 | box-shadow: 0px 2px 8px -1px #0000001a;
198 | }
199 |
200 | .thirteen::before,
201 | .thirteen::after {
202 | content: '';
203 | position: absolute;
204 | z-index: -1;
205 | }
206 |
207 | /* Conic Gradient Animation */
208 | .thirteen::before {
209 | animation: 6s rotate linear infinite;
210 | width: 200%;
211 | height: 200%;
212 | background: var(--tile-border);
213 | }
214 |
215 | /* Inner Square */
216 | .thirteen::after {
217 | inset: 0;
218 | padding: 1px;
219 | border-radius: var(--border-radius);
220 | background: linear-gradient(
221 | to bottom right,
222 | rgba(var(--tile-start-rgb), 1),
223 | rgba(var(--tile-end-rgb), 1)
224 | );
225 | background-clip: content-box;
226 | }
227 |
228 | /* Enable hover only on non-touch devices */
229 | @media (hover: hover) and (pointer: fine) {
230 | .card:hover {
231 | background: rgba(var(--card-rgb), 0.1);
232 | border: 1px solid rgba(var(--card-border-rgb), 0.15);
233 | }
234 |
235 | .card:hover span {
236 | transform: translateX(4px);
237 | }
238 | }
239 |
240 | @media (prefers-reduced-motion) {
241 | .thirteen::before {
242 | animation: none;
243 | }
244 |
245 | .card:hover span {
246 | transform: none;
247 | }
248 | }
249 |
250 | /* Mobile */
251 | @media (max-width: 700px) {
252 | .content {
253 | padding: 4rem;
254 | }
255 |
256 | .grid {
257 | grid-template-columns: 1fr;
258 | margin-bottom: 120px;
259 | max-width: 320px;
260 | text-align: center;
261 | }
262 |
263 | .card {
264 | padding: 1rem 2.5rem;
265 | }
266 |
267 | .card h2 {
268 | margin-bottom: 0.5rem;
269 | }
270 |
271 | .center {
272 | padding: 8rem 0 6rem;
273 | }
274 |
275 | .center::before {
276 | transform: none;
277 | height: 300px;
278 | }
279 |
280 | .description {
281 | font-size: 0.8rem;
282 | }
283 |
284 | .description a {
285 | padding: 1rem;
286 | }
287 |
288 | .description p,
289 | .description div {
290 | display: flex;
291 | justify-content: center;
292 | position: fixed;
293 | width: 100%;
294 | }
295 |
296 | .description p {
297 | align-items: center;
298 | inset: 0 0 auto;
299 | padding: 2rem 1rem 1.4rem;
300 | border-radius: 0;
301 | border: none;
302 | border-bottom: 1px solid rgba(var(--callout-border-rgb), 0.25);
303 | background: linear-gradient(
304 | to bottom,
305 | rgba(var(--background-start-rgb), 1),
306 | rgba(var(--callout-rgb), 0.5)
307 | );
308 | background-clip: padding-box;
309 | backdrop-filter: blur(24px);
310 | }
311 |
312 | .description div {
313 | align-items: flex-end;
314 | pointer-events: none;
315 | inset: auto 0 0;
316 | padding: 2rem;
317 | height: 200px;
318 | background: linear-gradient(
319 | to bottom,
320 | transparent 0%,
321 | rgb(var(--background-end-rgb)) 40%
322 | );
323 | z-index: 1;
324 | }
325 | }
326 |
327 | /* Tablet and Smaller Desktop */
328 | @media (min-width: 701px) and (max-width: 1120px) {
329 | .grid {
330 | grid-template-columns: repeat(2, 50%);
331 | }
332 | }
333 |
334 | @media (prefers-color-scheme: dark) {
335 | .vercelLogo {
336 | filter: invert(1);
337 | }
338 |
339 | .logo,
340 | .thirteen img {
341 | filter: invert(1) drop-shadow(0 0 0.3rem #ffffff70);
342 | }
343 | }
344 |
345 | @keyframes rotate {
346 | from {
347 | transform: rotate(360deg);
348 | }
349 | to {
350 | transform: rotate(0deg);
351 | }
352 | }
353 |
--------------------------------------------------------------------------------
/apps/web/styles/globals.css:
--------------------------------------------------------------------------------
1 | :root {
2 | --max-width: 1100px;
3 | --border-radius: 12px;
4 | --font-mono: ui-monospace, Menlo, Monaco, 'Cascadia Mono', 'Segoe UI Mono',
5 | 'Roboto Mono', 'Oxygen Mono', 'Ubuntu Monospace', 'Source Code Pro',
6 | 'Fira Mono', 'Droid Sans Mono', 'Courier New', monospace;
7 |
8 | --foreground-rgb: 0, 0, 0;
9 | --background-start-rgb: 214, 219, 220;
10 | --background-end-rgb: 255, 255, 255;
11 |
12 | --primary-glow: conic-gradient(
13 | from 180deg at 50% 50%,
14 | #16abff33 0deg,
15 | #0885ff33 55deg,
16 | #54d6ff33 120deg,
17 | #0071ff33 160deg,
18 | transparent 360deg
19 | );
20 | --secondary-glow: radial-gradient(
21 | rgba(255, 255, 255, 1),
22 | rgba(255, 255, 255, 0)
23 | );
24 |
25 | --tile-start-rgb: 239, 245, 249;
26 | --tile-end-rgb: 228, 232, 233;
27 | --tile-border: conic-gradient(
28 | #00000080,
29 | #00000040,
30 | #00000030,
31 | #00000020,
32 | #00000010,
33 | #00000010,
34 | #00000080
35 | );
36 |
37 | --callout-rgb: 238, 240, 241;
38 | --callout-border-rgb: 172, 175, 176;
39 | --card-rgb: 180, 185, 188;
40 | --card-border-rgb: 131, 134, 135;
41 | }
42 |
43 | @media (prefers-color-scheme: dark) {
44 | :root {
45 | --foreground-rgb: 255, 255, 255;
46 | --background-start-rgb: 0, 0, 0;
47 | --background-end-rgb: 0, 0, 0;
48 |
49 | --primary-glow: radial-gradient(rgba(1, 65, 255, 0.4), rgba(1, 65, 255, 0));
50 | --secondary-glow: linear-gradient(
51 | to bottom right,
52 | rgba(1, 65, 255, 0),
53 | rgba(1, 65, 255, 0),
54 | rgba(1, 65, 255, 0.3)
55 | );
56 |
57 | --tile-start-rgb: 2, 13, 46;
58 | --tile-end-rgb: 2, 5, 19;
59 | --tile-border: conic-gradient(
60 | #ffffff80,
61 | #ffffff40,
62 | #ffffff30,
63 | #ffffff20,
64 | #ffffff10,
65 | #ffffff10,
66 | #ffffff80
67 | );
68 |
69 | --callout-rgb: 20, 20, 20;
70 | --callout-border-rgb: 108, 108, 108;
71 | --card-rgb: 100, 100, 100;
72 | --card-border-rgb: 200, 200, 200;
73 | }
74 | }
75 |
76 | * {
77 | box-sizing: border-box;
78 | padding: 0;
79 | margin: 0;
80 | }
81 |
82 | html,
83 | body {
84 | max-width: 100vw;
85 | overflow-x: hidden;
86 | }
87 |
88 | body {
89 | color: rgb(var(--foreground-rgb));
90 | background: linear-gradient(
91 | to bottom,
92 | transparent,
93 | rgb(var(--background-end-rgb))
94 | )
95 | rgb(var(--background-start-rgb));
96 | }
97 |
98 | a {
99 | color: inherit;
100 | text-decoration: none;
101 | }
102 |
103 | @media (prefers-color-scheme: dark) {
104 | html {
105 | color-scheme: dark;
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/apps/web/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "es5",
4 | "lib": ["dom", "dom.iterable", "esnext"],
5 | "allowJs": true,
6 | "skipLibCheck": true,
7 | "strict": true,
8 | "forceConsistentCasingInFileNames": true,
9 | "noEmit": true,
10 | "esModuleInterop": true,
11 | "module": "esnext",
12 | "moduleResolution": "node",
13 | "resolveJsonModule": true,
14 | "isolatedModules": true,
15 | "jsx": "preserve",
16 | "incremental": true
17 | },
18 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"],
19 | "exclude": ["node_modules"]
20 | }
21 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "scraper-js",
3 | "version": "2.0.0",
4 | "description": "From the Bay to LA, scraper will collect all of the images from the url you tell it to and return a list of the images that best represent the site.",
5 | "scripts": {
6 | "build": "turbo run build",
7 | "test": "turbo run test"
8 | },
9 | "repository": {
10 | "type": "git",
11 | "url": "git://github.com/jasonaibrahim/scraper.git"
12 | },
13 | "keywords": [
14 | "scrape",
15 | "thumbnails",
16 | "images",
17 | "facebook",
18 | "twitter",
19 | "thumbnail",
20 | "image",
21 | "scraper",
22 | "web",
23 | "crawler",
24 | "image",
25 | "web",
26 | "oakland"
27 | ],
28 | "author": "jason ibrahim",
29 | "license": "MIT",
30 | "bugs": {
31 | "url": "https://github.com/jasonaibrahim/scraper/issues"
32 | },
33 | "homepage": "https://github.com/jasonaibrahim/scraper",
34 | "workspaces": [
35 | "apps/*",
36 | "packages/*"
37 | ],
38 | "files": [
39 | "packages/core/dist",
40 | "README.md",
41 | "LICENSE",
42 | "package.json"
43 | ],
44 | "devDependencies": {
45 | "@jest/globals": "^29.3.1",
46 | "jest": "^29.3.1",
47 | "prettier": "^2.8.1",
48 | "ts-jest": "^29.0.3",
49 | "turbo": "^1.6.3"
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/packages/core/jest.config.js:
--------------------------------------------------------------------------------
/** @type {import('ts-jest').JestConfigWithTsJest} */
// Jest config for the core package: run TypeScript tests in a plain node
// environment (no browser DOM is needed for the scraping logic).
module.exports = {
  preset: 'ts-jest',
  testEnvironment: 'node',
  transform: {
    // Compile .ts/.tsx sources and tests on the fly with ts-jest.
    '^.+\\.(ts|tsx)?$': 'ts-jest',
  }
};
9 |
--------------------------------------------------------------------------------
/packages/core/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@scraper-js/core",
3 | "scripts": {
4 | "test": "jest",
5 | "build": "rm -rf dist; tsc"
6 | },
7 | "main": "dist/index.js",
8 | "dependencies": {
9 | "axios": "^1.2.2",
10 | "cheerio": "1.0.0-rc.12",
11 | "lodash": "^4.17.21",
12 | "open-graph-scraper": "^5.0.3"
13 | },
14 | "devDependencies": {
15 | "@types/jquery": "^3.5.16",
16 | "@types/lodash": "^4.14.191",
17 | "axios-mock-adapter": "^1.21.2",
18 | "dotenv": "^16.0.3",
19 | "schema-dts": "^1.1.0",
20 | "typescript": "^4.9.4"
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/packages/core/src/image.ts:
--------------------------------------------------------------------------------
1 | import { ImageMetadata, ImageSource, ParseResult, RankedImage } from './types';
2 |
3 | export interface ImageExtractOptions {
4 | rankImage?: (imageMetadata: ImageMetadata) => number;
5 | }
6 |
7 | const defaultRankAlgorithm: ImageExtractOptions["rankImage"] = (
8 | imageMetadata
9 | ) => {
10 | /**
11 | * Immediately down rank images with no `src` attribute; this is a failure condition
12 | */
13 | if (!imageMetadata.src) {
14 | return -1
15 | }
16 |
17 | let score = 0;
18 |
19 | /**
20 | * Weigh image based on source, with open graph images ranking highest
21 | */
22 | switch (imageMetadata.sourceType) {
23 | case ImageSource.OpenGraph:
24 | score += 2;
25 | break
26 | case ImageSource.LinkedData:
27 | score += 1;
28 | break;
29 | }
30 |
31 | /**
32 | * Weigh image based on dimensions
33 | */
34 | if (imageMetadata.width && imageMetadata.height) {
35 | score += 1;
36 | }
37 |
38 | return score;
39 | };
40 |
41 | export function featureImageFromParseResult(
42 | result: ParseResult,
43 | options: ImageExtractOptions = {}
44 | ): RankedImage | null {
45 | return imagesFromParseResult(result, options).shift() ?? null;
46 | }
47 |
48 | export function imagesFromParseResult(
49 | result: ParseResult,
50 | options: ImageExtractOptions = {}
51 | ): Array {
52 | let images: ImageMetadata[] = [];
53 |
54 | if (result.linkedData) {
55 | // @ts-ignore - type is not being inferred but is SchemaValue
56 | const { image } = result.linkedData;
57 |
58 | if (Array.isArray(image)) {
59 | for (const img of image) {
60 | if (typeof img === "string") {
61 | images.push({
62 | width: 0,
63 | height: 0,
64 | src: img,
65 | sourceType: ImageSource.LinkedData,
66 | });
67 | } else {
68 | const width = parseInt(img.width ?? 0);
69 | const height = parseInt(img.height ?? 0);
70 | const src = img.contentUrl;
71 |
72 | images.push({
73 | width,
74 | height,
75 | src,
76 | sourceType: ImageSource.LinkedData,
77 | });
78 | }
79 | }
80 | } else if (typeof image === "string") {
81 | images.push({
82 | width: 0,
83 | height: 0,
84 | src: image,
85 | sourceType: ImageSource.LinkedData,
86 | });
87 | } else {
88 | const width = parseInt(image.width ?? 0);
89 | const height = parseInt(image.height ?? 0);
90 | const src = image.contentUrl;
91 |
92 | images.push({
93 | width,
94 | height,
95 | src,
96 | sourceType: ImageSource.LinkedData,
97 | });
98 | }
99 | }
100 |
101 | if (result.openGraph.ogImage) {
102 | const image = result.openGraph.ogImage;
103 |
104 | if (typeof image === "string") {
105 | /**
106 | * Handle type string
107 | */
108 | images.push({
109 | width: 0,
110 | height: 0,
111 | src: image,
112 | sourceType: ImageSource.OpenGraph,
113 | });
114 | } else if (Array.isArray(image)) {
115 | /**
116 | * Handle Array
117 | */
118 | for (const img of image) {
119 | let width = 0;
120 | let height = 0;
121 | let src: string;
122 |
123 | if (typeof img === "string") {
124 | src = img;
125 | } else {
126 | src = img.url;
127 | width = parseInt(`${img.width}`);
128 | height = parseInt(`${img.height}`);
129 | }
130 |
131 | images.push({
132 | ...img,
133 | width,
134 | height,
135 | src,
136 | sourceType: ImageSource.OpenGraph,
137 | });
138 | }
139 | } else {
140 | /**
141 | * Handle type ImageObject
142 | *
143 | */
144 | let width = parseInt(`${image.width}`);
145 | let height = parseInt(`${image.height}`);
146 | let src = image.url;
147 | images.push({
148 | ...image,
149 | width,
150 | height,
151 | src,
152 | sourceType: ImageSource.OpenGraph,
153 | });
154 | }
155 | }
156 |
157 | const tags = result.document("img");
158 | for (const tag of tags) {
159 | const width = parseInt(tag.attribs["width"] ?? 0);
160 | const height = parseInt(tag.attribs["height"] ?? 0);
161 | const src = tag.attribs["src"];
162 |
163 | images.push({
164 | width,
165 | height,
166 | src,
167 | sourceType: ImageSource.DOM,
168 | });
169 | }
170 |
171 | return sortedByRank(images, options.rankImage);
172 | }
173 |
174 | function sortedByRank(
175 | images: Array,
176 | rankImage: ImageExtractOptions["rankImage"] = defaultRankAlgorithm
177 | ): Array {
178 | let ranked: RankedImage[] = [];
179 |
180 | for (const image of images) {
181 | ranked.push({
182 | ...image,
183 | rank: rankImage!(image),
184 | });
185 | }
186 |
187 | return ranked.sort((a, b) => b.rank - a.rank);
188 | }
189 |
--------------------------------------------------------------------------------
/packages/core/src/index.ts:
--------------------------------------------------------------------------------
import { scrape } from "./scrape";

export * from "./scrape";

// Default export mirrors the named API so consumers can use either
// `import scraper from ...; scraper.scrape(url)` or the named `scrape`.
export default {
  scrape,
};
8 |
--------------------------------------------------------------------------------
/packages/core/src/parse.ts:
--------------------------------------------------------------------------------
1 | import cheerio, { CheerioAPI } from 'cheerio';
2 | import scrapeOpenGraphData from 'open-graph-scraper';
3 | import { ParseResult } from './types';
4 |
/**
 * Options for parse().
 */
export interface ParseOptions {
  // HTML parser override; when omitted, parse() falls back to cheerio.
  parser?: CheerioAPI;
}
8 | export async function parse(
9 | content: string,
10 | { parser = cheerio }: ParseOptions
11 | ): Promise {
12 | if (!content) {
13 | throw new Error("Failed to retrieve page content");
14 | }
15 |
16 | const $ = parser.load(content);
17 |
18 | /**
19 | * Scrape OpenGraph data from page html
20 | */
21 | const { result: openGraph } = await scrapeOpenGraphData({
22 | url: "",
23 | html: content,
24 | });
25 |
26 | /**
27 | * Scrape LinkedData from html if present.
28 | */
29 | const linkedDataElement = $('script[type="application/ld+json"]');
30 | let linkedData: ParseResult["linkedData"] = null;
31 | try {
32 | linkedData = JSON.parse(linkedDataElement.html()!);
33 | } catch (err) {
34 | console.warn("Failed to retrieve linked data", err);
35 | }
36 |
37 | return {
38 | openGraph,
39 | linkedData,
40 | document: $,
41 | };
42 | }
43 |
--------------------------------------------------------------------------------
/packages/core/src/scrape.ts:
--------------------------------------------------------------------------------
1 | import axios, { AxiosInstance } from 'axios';
2 | import { Thing } from 'schema-dts';
3 | import scrapeOpenGraphData from 'open-graph-scraper';
4 | import { featureImageFromParseResult, ImageExtractOptions, imagesFromParseResult } from './image';
5 | import { parse, ParseOptions } from './parse';
6 | import { RankedImage } from './types';
7 |
8 | export type ScrapeOptions = Pick &
9 | Pick & {
10 | client: AxiosInstance;
11 | };
12 |
/**
 * Result of scrape(): everything extracted from the fetched page.
 */
export interface ScrapeResult {
  // Full HTML of the fetched page as rendered by the parser.
  html: string;
  // All discovered images, highest-ranked first.
  images: RankedImage[];
  // JSON-LD data when present and parseable, otherwise null.
  linkedData: Thing | null;
  // OpenGraph scrape result (success or error shape).
  openGraph:
    | scrapeOpenGraphData.successResultObject
    | scrapeOpenGraphData.errorResultObject;
  // Top-ranked image, or null when no images were found.
  featureImage?: RankedImage | null;
}
22 |
23 | export async function scrape(
24 | url: string,
25 | options: ScrapeOptions = {
26 | client: axios.create({
27 | timeout: 1000,
28 | }),
29 | }
30 | ): Promise {
31 | const { client } = options;
32 |
33 | const response = await client.get(url);
34 | const result = await parse(response.data, {
35 | parser: options.parser,
36 | });
37 |
38 | return {
39 | featureImage: featureImageFromParseResult(result, {
40 | rankImage: options.rankImage,
41 | }),
42 | images: imagesFromParseResult(result, {
43 | rankImage: options.rankImage,
44 | }),
45 | html: result.document.html(),
46 | linkedData: result.linkedData,
47 | openGraph: result.openGraph,
48 | };
49 | }
50 |
--------------------------------------------------------------------------------
/packages/core/src/types.ts:
--------------------------------------------------------------------------------
1 | import { CheerioAPI } from "cheerio";
2 | import { Thing } from "schema-dts";
3 | import type scrapeOpenGraphData from "open-graph-scraper";
4 |
/**
 * Structured output of parse(): JSON-LD linked data (null when absent or
 * unparseable), the OpenGraph scrape result (success or error shape),
 * and the loaded cheerio document for further DOM queries.
 */
export interface ParseResult {
  linkedData: Thing | null;
  openGraph:
    | scrapeOpenGraphData.successResultObject
    | scrapeOpenGraphData.errorResultObject;
  document: CheerioAPI;
}
12 |
/**
 * Where on the page an image was discovered. Tags each ImageMetadata so
 * ranking algorithms can weigh candidates by their source.
 */
export enum ImageSource {
  LinkedData = "linked_data",
  OpenGraph = "opengraph",
  DOM = "dom",
}
18 | export interface ImageMetadata extends Record {
19 | width: number;
20 | height: number;
21 | sourceType: ImageSource;
22 | src: string;
23 | }
24 |
/**
 * An ImageMetadata entry augmented with its numeric rank; higher ranks
 * sort earlier in results returned by imagesFromParseResult.
 */
export interface RankedImage extends ImageMetadata {
  rank: number;
}
28 |
--------------------------------------------------------------------------------
/packages/core/test/dummy.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
42 |
43 |
--------------------------------------------------------------------------------
/packages/core/test/scraper.test.ts:
--------------------------------------------------------------------------------
1 | import scraper from "../src";
2 | import { describe, expect, it } from "@jest/globals";
3 | import axios from "axios";
4 | import MockAdapter from "axios-mock-adapter";
5 | import * as fs from "fs";
6 | import * as path from "path";
7 |
// Smoke test: the package's default export exposes the scrape() API.
describe("scraper", () => {
  it("should provide an initialization interface", () => {
    expect(scraper.scrape).toBeDefined();
  });
});
13 |
14 | describe("scraping", () => {
15 | it("should scrape metadata from a given url", async () => {
16 | const url =
17 | "https://barackobama.medium.com/my-2022-end-of-year-lists-ba76b6278801";
18 | const result = await scraper.scrape(url);
19 |
20 | expect(result.html).toBeTruthy();
21 | expect(result.featureImage!.src).toEqual(
22 | "https://miro.medium.com/max/960/1*Fm3OR_ORrkhUxF_fkKRvsw.png"
23 | );
24 | expect(result.openGraph.ogTitle).toEqual("My 2022 End of Year Lists");
25 | expect(result.linkedData).toBeTruthy();
26 | expect(result.images.length).toBeGreaterThan(0);
27 | expect(result.images[0].url).toEqual(result.featureImage!.url);
28 | });
29 | });
30 |
31 | describe("options", () => {
32 | it("should allow for override of the http adapter", async () => {
33 | const customAxios = axios.create();
34 | const mockHttp = new MockAdapter(customAxios);
35 |
36 | const dummyPageContent = fs.readFileSync(
37 | path.join(__dirname, "dummy.html"),
38 | "utf8"
39 | );
40 | mockHttp.onGet("fake-url").reply(200, dummyPageContent);
41 |
42 | const { featureImage } = await scraper.scrape("fake-url", {
43 | client: customAxios,
44 | });
45 | expect(featureImage!.src).toEqual(
46 | "https://miro.medium.com/max/1200/0*_K6j83V2soow_A2c"
47 | );
48 | });
49 | });
50 |
// Invalid inputs of various shapes must reject rather than resolve.
// The deliberately ill-typed entries (null, function, number) are kept
// under @ts-ignore/@ts-expect-error to exercise runtime validation.
describe("errors", () => {
  it("should throw an error if a given url is invalid", async () => {
    let badUrls: string[] = [
      "",
      "htp://example.com",
      "fasdfasdf",
      // @ts-ignore
      null,
      // @ts-expect-error
      () => {
        throw new Error();
      },
      // @ts-expect-error
      1e5,
    ];
    // Sequential on purpose: each rejection is asserted independently.
    for (const badUrl of badUrls) {
      await expect(scraper.scrape(badUrl)).rejects.toBeTruthy();
    }
  });
});
71 |
--------------------------------------------------------------------------------
/packages/core/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "outDir": "./dist/",
4 | "noImplicitAny": true,
5 | "module": "commonjs",
6 | "target": "es2015",
7 | "jsx": "react",
8 | "allowJs": true,
9 | "moduleResolution": "node",
10 | "declaration": true,
11 | "esModuleInterop": true,
12 | "strict": true,
13 | "allowSyntheticDefaultImports": true
14 | },
15 | "include": ["./**/*.ts"],
16 | "exclude": ["test"]
17 | }
18 |
--------------------------------------------------------------------------------
/turbo.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://turbo.build/schema.json",
3 | "pipeline": {
4 | "build": {
5 | "dependsOn": ["^build"],
6 | "outputs": ["dist/**"]
7 | },
8 | "test": {
9 | "dependsOn": ["build"],
10 | "outputs": [],
11 | "inputs": ["src/**/*.ts", "test/**/*.ts"]
12 | }
13 | }
14 | }
15 |
--------------------------------------------------------------------------------