├── .editorconfig ├── .github └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .npmignore ├── .vscode ├── extensions.json └── settings.json ├── LICENSE ├── README.md ├── biome.json ├── build.mjs ├── bun.lockb ├── package.json ├── src └── index.ts ├── test └── index.test.ts └── tsconfig.json /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | [*] 3 | indent_style = space 4 | indent_size = 2 5 | end_of_line = lf 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | test: 14 | runs-on: ${{ matrix.os }} 15 | 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest, macos-latest] 19 | fail-fast: false 20 | 21 | steps: 22 | - id: checkout 23 | name: Checkout 24 | uses: actions/checkout@v3 25 | - id: setup-bun 26 | name: Setup Bun 27 | uses: oven-sh/setup-bun@v1 28 | with: 29 | bun-version: latest 30 | - id: install-deps 31 | name: Install dependencies 32 | run: | 33 | bun install 34 | npx playwright install 35 | - id: test 36 | name: Run test 37 | run: | 38 | bun test 39 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | with: 14 | fetch-depth: 0 15 | 16 | - uses: actions/setup-node@v3 17 | with: 18 | node-version: 16.x 19 | 20 | - run: npx changelogithub 21 | env: 22 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | storage 2 | node_modules 3 | dist 4 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | storage 2 | node_modules 3 | src 4 | tsconfig.json 5 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "GitHub.copilot", 4 | "EditorConfig.EditorConfig", 5 | "streetsidesoftware.code-spell-checker", 6 | "biomejs.biome", 7 | "bradlc.vscode-tailwindcss", 8 | "ms-playwright.playwright", 9 | "Prisma.prisma" 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.defaultFormatter": "biomejs.biome", 3 | "editor.formatOnSave": true, 4 | "editor.codeActionsOnSave": { 5 | "source.fixAll": "explicit" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Robert Soriano 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Yomuco 2 | 3 | Yomuco – A simple web crawling library for Node.js 4 | 5 | ## Setup 6 | 7 | ```bash 8 | bun add yomuco 9 | ``` 10 | 11 | ## Getting Started 12 | 13 | ```typescript 14 | import { getBrowser, getUrls, getContent } from "yomuco"; 15 | ``` 16 | 17 | ### Set up a browser instance using Playwright 18 | 19 | ```typescript 20 | const browser = await getBrowser(); 21 | 22 | browser.close(); 23 | ``` 24 | 25 | ### Retrieve URLs from a page 26 | 27 | ```typescript 28 | const browser = await getBrowser(); 29 | 30 | const urls = await getUrls({ 31 | context: await browser.newContext(), 32 | fromUrl: "https://react.dev", 33 | maxDepth: 1, 34 | }); 35 | 36 | browser.close(); 37 | ``` 38 | 39 | ### Fetch a page content 40 | 41 | ```typescript 42 | const browser = await getBrowser(); 43 | 44 | for (const url of urls) { 45 | const content = await getContent({ 46 | context: await browser.newContext(), 47 | url, 48 | selector: "main", 49 | }); 50 | } 51 | 52 | browser.close(); 53 | ``` 54 | 55 | ## Format 56 | 57 | ```bash 58 | bunx @biomejs/biome format --write . 59 | ``` 60 | 61 | ## Lint 62 | 63 | ```bash 64 | bunx @biomejs/biome lint --apply . 65 | ``` 66 | 67 | ## Test 68 | 69 | ```bash 70 | bun test 71 | ``` 72 | 73 | ## License 74 | 75 | MIT 76 | -------------------------------------------------------------------------------- /biome.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://biomejs.dev/schemas/1.7.3/schema.json", 3 | "organizeImports": { 4 | "enabled": true 5 | }, 6 | "files": { 7 | "ignore": ["dist/**"] 8 | }, 9 | "linter": { 10 | "enabled": true, 11 | "rules": { 12 | "recommended": true 13 | } 14 | }, 15 | "formatter": { 16 | "enabled": true, 17 | "indentStyle": "space" 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /build.mjs: -------------------------------------------------------------------------------- 1 | import dts from "bun-plugin-dts"; 2 | 3 | await Bun.build({ 4 | entrypoints: ["./src/index.ts"], 5 | outdir: "./dist", 6 | minify: true, 7 | plugins: [dts()], 8 | external: ["*"], 9 | }); 10 | -------------------------------------------------------------------------------- /bun.lockb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andraindrops/yomuco/d487c7d0146dbcad9b1ed0736bca45ef3f264541/bun.lockb -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "yomuco", 3 | "version": "0.0.8", 4 | "main": "dist/index.js", 5 | "types": "dist/index.d.ts", 6 | "description": "", 7 | "scripts": { 8 | "build": "bun run build.mjs", 9 | "prepublishOnly": "bun run build" 10 | }, 11 | "files": ["dist"], 12 | "keywords": ["crawler"], 13 | "license": "MIT", 14 | "homepage": "https://github.com/andraindrops/yomuco#readme", 15 | "repository": { 16 | "type": "git", 17 | "url": "git+https://github.com/andraindrops/yomuco.git" 18 | }, 19 | "bugs": "https://github.com/andraindrops/yomuco/issues", 20 | "author": "Junichi Takahashi ", 21 | "devDependencies": { 22 | "@biomejs/biome": "^1.7.3", 23 | "@types/bun": "^1.1.1", 24 | "bun-plugin-dts": "^0.2.3", 25 | "typescript": "^5.4.5" 26 | }, 27 | "dependencies": { 28 | "playwright": "^1.44.0" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import playwright from "playwright"; 2 | import * as path from "node:path"; 3 | 4 | export const getBrowser = async () => { 5 | return await playwright.chromium.launch(); 6 | }; 7 | 8 | export const getUrls = async ({ 9 | context, 10 | fromUrl, 11 | maxDepth, 12 | }: { 13 | context: playwright.BrowserContext; 14 | fromUrl: string; 15 | maxDepth: number; 16 | }) => { 17 | const visitedUrls = new Set<[number, string]>(); 18 | 19 | const crawlURL = async ({ url, depth }: { url: string; depth: number }) => { 20 | if (visitedUrls.has([depth, url])) { 21 | return; 22 | } 23 | 24 | visitedUrls.add([depth, url]); 25 | 26 | if (depth >= maxDepth) { 27 | return; 28 | } 29 | 30 | try { 31 | await new Promise((resolve) => setTimeout(resolve, 100)); 32 | 33 | const page = await context.newPage(); 34 | await page.goto(url); 35 | 36 | for (const link of await page.$$("a")) { 37 | const href = await link.getAttribute("href"); 38 | 39 | if (href == null) { 40 | continue; 41 | } 42 | 43 | const absoluteUrl = new URL(href, url); 44 | 45 | if (new URL(absoluteUrl).host === new URL(fromUrl).host) { 46 | const extname = path.extname(absoluteUrl.href); 47 | 48 | if ( 49 | extname === "" || 50 | extname === ".html" || 51 | extname === ".htm" || 52 | extname === ".php" || 53 | extname === ".jsp" || 54 | extname === ".asp" || 55 | extname === ".aspx" 56 | ) { 57 | await crawlURL({ url: absoluteUrl.href, depth: depth + 1 }); 58 | } 59 | } 60 | } 61 | } catch (e: unknown) { 62 | if (e instanceof Error) { 63 | console.error(e.message); 64 | } 65 | } 66 | }; 67 | 68 | await crawlURL({ url: fromUrl, depth: 0 }); 69 | 70 | await context.close(); 71 | 72 | return [...new Set([...visitedUrls].map(([, url]) => url))]; 73 | }; 74 | 75 | export const getContent = async ({ 76 | context, 77 | url, 78 | selector, 79 | }: { 80 | context: playwright.BrowserContext; 81 | url: string; 82 | selector: string; 83 | }) => { 84 | const page = await context.newPage(); 85 | await page.goto(url); 86 | 87 | const body = await page.locator(selector).textContent(); 88 | 89 | await context.close(); 90 | 91 | return body; 92 | }; 93 | -------------------------------------------------------------------------------- /test/index.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, beforeAll, test, expect } from "bun:test"; 2 | 3 | import { getBrowser, getUrls, getContent } from "../src"; 4 | 5 | describe("crawler test", () => { 6 | beforeAll(() => { 7 | const topPage = 8 | '

hello

abc
'; 9 | 10 | const aPage = 11 | '

a

a-anext
'; 12 | const bPage = 13 | '

a

b-anext
'; 14 | const cPage = 15 | '

a

c-anext
'; 16 | 17 | Bun.serve({ 18 | fetch(req) { 19 | const url = new URL(req.url); 20 | if (url.pathname === "/") { 21 | return new Response(topPage, { 22 | headers: { "Content-Type": "text/html" }, 23 | }); 24 | } 25 | if (url.pathname === "/a") { 26 | return new Response(aPage, { 27 | headers: { "Content-Type": "text/html" }, 28 | }); 29 | } 30 | if (url.pathname === "/b") { 31 | return new Response(bPage, { 32 | headers: { "Content-Type": "text/html" }, 33 | }); 34 | } 35 | if (url.pathname === "/c") { 36 | return new Response(cPage, { 37 | headers: { "Content-Type": "text/html" }, 38 | }); 39 | } 40 | return new Response("404!"); 41 | }, 42 | }); 43 | }); 44 | 45 | test("getUrls - depth 1", async () => { 46 | const browser = await getBrowser(); 47 | const urls = await getUrls({ 48 | context: await browser.newContext(), 49 | fromUrl: "http://127.0.0.1:3000/", 50 | maxDepth: 1, 51 | }); 52 | 53 | expect(urls).toEqual([ 54 | "http://127.0.0.1:3000/", 55 | "http://127.0.0.1:3000/a", 56 | "http://127.0.0.1:3000/b", 57 | "http://127.0.0.1:3000/c", 58 | ]); 59 | }); 60 | 61 | test("getUrls - depth 2", async () => { 62 | const browser = await getBrowser(); 63 | const urls = await getUrls({ 64 | context: await browser.newContext(), 65 | fromUrl: "http://127.0.0.1:3000/", 66 | maxDepth: 2, 67 | }); 68 | 69 | expect(urls).toEqual([ 70 | "http://127.0.0.1:3000/", 71 | "http://127.0.0.1:3000/a", 72 | "http://127.0.0.1:3000/a-a", 73 | "http://127.0.0.1:3000/b", 74 | "http://127.0.0.1:3000/b-a", 75 | "http://127.0.0.1:3000/c", 76 | "http://127.0.0.1:3000/c-a", 77 | ]); 78 | }); 79 | 80 | test("getContent", async () => { 81 | const browser = await getBrowser(); 82 | const content = await getContent({ 83 | context: await browser.newContext(), 84 | url: "http://127.0.0.1:3000/", 85 | selector: "main", 86 | }); 87 | expect(content).toBe("helloabc"); 88 | }); 89 | }); 90 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2020", 4 | "module": "esnext", 5 | "strict": true, 6 | "esModuleInterop": true, 7 | "moduleResolution": "node", 8 | "skipLibCheck": true, 9 | "noUnusedLocals": true, 10 | "noImplicitAny": true, 11 | "allowJs": true, 12 | "noEmit": true, 13 | "outDir": "dist", 14 | "resolveJsonModule": true 15 | } 16 | } 17 | --------------------------------------------------------------------------------