├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── bun.lockb ├── index.ts ├── package.json ├── sqlite-html-shim.d.ts ├── sqlite-http-shim.d.ts └── tsconfig.json /.dockerignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | Dockerfile* 3 | docker-compose* 4 | .dockerignore 5 | .git 6 | .gitignore 7 | README.md 8 | LICENSE 9 | .vscode 10 | Makefile 11 | helm-charts 12 | .env 13 | .editorconfig 14 | .idea 15 | coverage* 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore 2 | 3 | # Logs 4 | 5 | logs 6 | *.log 7 | npm-debug.log* 8 | yarn-debug.log* 9 | yarn-error.log* 10 | lerna-debug.log* 11 | .pnpm-debug.log* 12 | 13 | # Caches 14 | 15 | .cache 16 | 17 | # Diagnostic reports (https://nodejs.org/api/report.html) 18 | 19 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 20 | 21 | # Runtime data 22 | 23 | pids 24 | *.pid 25 | *.seed 26 | *.pid.lock 27 | 28 | # Directory for instrumented libs generated by jscoverage/JSCover 29 | 30 | lib-cov 31 | 32 | # Coverage directory used by tools like istanbul 33 | 34 | coverage 35 | *.lcov 36 | 37 | # nyc test coverage 38 | 39 | .nyc_output 40 | 41 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 42 | 43 | .grunt 44 | 45 | # Bower dependency directory (https://bower.io/) 46 | 47 | bower_components 48 | 49 | # node-waf configuration 50 | 51 | .lock-wscript 52 | 53 | # Compiled binary addons (https://nodejs.org/api/addons.html) 54 | 55 | build/Release 56 | 57 | # Dependency directories 58 | 59 | node_modules/ 60 | jspm_packages/ 61 | 62 | # Snowpack dependency directory (https://snowpack.dev/) 63 | 64 | web_modules/ 65 | 66 | # TypeScript cache 67 | 68 | *.tsbuildinfo 69 | 70 | # Optional npm cache directory 71 | 72 | .npm 
73 | 74 | # Optional eslint cache 75 | 76 | .eslintcache 77 | 78 | # Optional stylelint cache 79 | 80 | .stylelintcache 81 | 82 | # Microbundle cache 83 | 84 | .rpt2_cache/ 85 | .rts2_cache_cjs/ 86 | .rts2_cache_es/ 87 | .rts2_cache_umd/ 88 | 89 | # Optional REPL history 90 | 91 | .node_repl_history 92 | 93 | # Output of 'npm pack' 94 | 95 | *.tgz 96 | 97 | # Yarn Integrity file 98 | 99 | .yarn-integrity 100 | 101 | # dotenv environment variable files 102 | 103 | .env 104 | .env.development.local 105 | .env.test.local 106 | .env.production.local 107 | .env.local 108 | 109 | # parcel-bundler cache (https://parceljs.org/) 110 | 111 | .parcel-cache 112 | 113 | # Next.js build output 114 | 115 | .next 116 | out 117 | 118 | # Nuxt.js build / generate output 119 | 120 | .nuxt 121 | dist 122 | 123 | # Gatsby files 124 | 125 | # Comment in the public line in if your project uses Gatsby and not Next.js 126 | 127 | # https://nextjs.org/blog/next-9-1#public-directory-support 128 | 129 | # public 130 | 131 | # vuepress build output 132 | 133 | .vuepress/dist 134 | 135 | # vuepress v2.x temp and cache directory 136 | 137 | .temp 138 | 139 | # Docusaurus cache and generated files 140 | 141 | .docusaurus 142 | 143 | # Serverless directories 144 | 145 | .serverless/ 146 | 147 | # FuseBox cache 148 | 149 | .fusebox/ 150 | 151 | # DynamoDB Local files 152 | 153 | .dynamodb/ 154 | 155 | # TernJS port file 156 | 157 | .tern-port 158 | 159 | # Stores VSCode versions used for testing VSCode extensions 160 | 161 | .vscode-test 162 | 163 | # yarn v2 164 | 165 | .yarn/cache 166 | .yarn/unplugged 167 | .yarn/build-state.yml 168 | .yarn/install-state.gz 169 | .pnp.* 170 | 171 | # IntelliJ based IDEs 172 | .idea 173 | 174 | # Finder (MacOS) folder config 175 | .DS_Store 176 | index.html 177 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # use the official Bun 
image 2 | FROM --platform=linux/x86_64 oven/bun:1 3 | WORKDIR /app 4 | 5 | COPY . . 6 | 7 | RUN bun install --frozen-lockfile 8 | RUN apt-get update 9 | RUN apt-get install ca-certificates curl -y 10 | RUN update-ca-certificates 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bun SQLite Scraper 2 | 3 | Because why not 4 | 5 | ### Build and install deps 6 | 7 | ```sh 8 | docker build -t bun-sqlite-scraper . 9 | ``` 10 | 11 | ### Run Container 12 | 13 | ```sh 14 | docker run --rm -it -v "$PWD":/app bun-sqlite-scraper /bin/bash 15 | ``` 16 | 17 | ### Execute 18 | 19 | ```sh 20 | bun run index.ts 21 | ``` 22 | -------------------------------------------------------------------------------- /bun.lockb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Eckhardt-D/bun-sqlite-scraper/1a6d5b3b4b5c7cdb119a397d582b42b24c4dd479/bun.lockb -------------------------------------------------------------------------------- /index.ts: -------------------------------------------------------------------------------- 1 | /* https://github.com/Eckhardt-D/bun-sqlite-scraper */ 2 | 3 | import sql from 'sql-template-tag' 4 | import { Database } from 'bun:sqlite' 5 | import * as sqlite_http from 'sqlite-http' 6 | import * as sqlite_html from 'sqlite-html' 7 | 8 | const db = new Database(':memory:') 9 | 10 | db.loadExtension(sqlite_http.getLoadablePath()) 11 | db.loadExtension(sqlite_html.getLoadablePath()) 12 | 13 | const query = sql` 14 | SELECT text, html_attribute_get(html, 'a', 'href') AS href 15 | FROM html_each(http_get_body('https://text.npr.org'), 'a') 16 | `; 17 | 18 | const rows = db.prepare(query.sql).all(); 19 | 20 | 21 | console.table(rows) 22 | 23 | /* 24 | 25 | ---------------------- 26 | | text | href | 27 | ---------------------- 28 | | title | /1234 | 29 | 
---------------------- 30 | 31 | */ 32 | 33 | 34 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bun-sqlite-scraper", 3 | "description": "Because why not", 4 | "module": "index.ts", 5 | "type": "module", 6 | "author": "Eckhardt-D", 7 | "license": "MIT", 8 | "scripts": { 9 | "test": "bun run index.ts", 10 | "docker:build": "docker build -t bun-sqlite-scraper .", 11 | "docker:run": "docker run --rm -it -v \"$PWD\":/app bun-sqlite-scraper /bin/bash" 12 | }, 13 | "devDependencies": { 14 | "@types/bun": "latest" 15 | }, 16 | "peerDependencies": { 17 | "typescript": "^5.0.0" 18 | }, 19 | "dependencies": { 20 | "sql-template-tag": "^5.2.1", 21 | "sqlite-html": "^0.1.3", 22 | "sqlite-http": "^0.1.1" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /sqlite-html-shim.d.ts: -------------------------------------------------------------------------------- 1 | declare module 'sqlite-html' { 2 | export function getLoadablePath(): string; 3 | } 4 | -------------------------------------------------------------------------------- /sqlite-http-shim.d.ts: -------------------------------------------------------------------------------- 1 | declare module 'sqlite-http' { 2 | export function getLoadablePath(): string; 3 | } 4 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | // Enable latest features 4 | "lib": ["ESNext", "DOM"], 5 | "target": "ESNext", 6 | "module": "ESNext", 7 | "moduleDetection": "force", 8 | "jsx": "react-jsx", 9 | "allowJs": true, 10 | 11 | // Bundler mode 12 | "moduleResolution": "bundler", 13 | "allowImportingTsExtensions": true, 14 | "verbatimModuleSyntax": true, 15 | "noEmit": true, 16 | 17 | // Best 
practices 18 | "strict": true, 19 | "skipLibCheck": true, 20 | "noFallthroughCasesInSwitch": true, 21 | 22 | // Some stricter flags (disabled by default) 23 | "noUnusedLocals": false, 24 | "noUnusedParameters": false, 25 | "noPropertyAccessFromIndexSignature": false 26 | } 27 | } 28 | --------------------------------------------------------------------------------