├── .eslintignore ├── .eslintrc.json ├── .gitignore ├── LICENSE ├── README.md ├── bun.lockb ├── components.json ├── next.config.mjs ├── package.json ├── postcss.config.mjs ├── public ├── android-chrome-192x192.png ├── android-chrome-512x512.png ├── apple-touch-icon.png ├── favicon-16x16.png ├── favicon-32x32.png ├── favicon.ico ├── next.svg ├── og-image.png ├── usage-1.png ├── usage-2.png ├── usage-3.png └── vercel.svg ├── src ├── app │ ├── api │ │ └── [[...route]] │ │ │ └── route.ts │ ├── favicon.ico │ ├── globals.css │ ├── layout.tsx │ └── page.tsx ├── components │ ├── attribute-fields.tsx │ ├── browse-template.tsx │ ├── execution-time-result.tsx │ ├── input-field.tsx │ ├── json-result-view.tsx │ ├── layout │ │ ├── footer.tsx │ │ ├── model-select.tsx │ │ ├── navbar.tsx │ │ ├── theme-provider.tsx │ │ └── theme-toggler-button.tsx │ ├── property-field.tsx │ ├── property-items-field.tsx │ ├── property-object-field.tsx │ ├── result-section.tsx │ ├── select-field.tsx │ ├── skeleton.tsx │ ├── template-form.tsx │ ├── text-area-field.tsx │ └── ui │ │ ├── accordion.tsx │ │ ├── button.tsx │ │ ├── card.tsx │ │ ├── checkbox.tsx │ │ ├── dialog.tsx │ │ ├── form.tsx │ │ ├── input.tsx │ │ ├── label.tsx │ │ ├── select.tsx │ │ └── textarea.tsx ├── controllers │ └── extract-controller.ts ├── errors │ ├── request-timeout-error.ts │ └── validation-error.ts ├── hooks │ └── use-debounce.tsx ├── lib │ ├── constants.ts │ ├── context-utils.ts │ ├── embed-utils.ts │ ├── env.ts │ ├── error-utils.ts │ ├── langchain-setup.ts │ ├── llm-utils.ts │ ├── string-utils.ts │ ├── time-utils.ts │ ├── types.ts │ ├── utils.ts │ └── web-scraper.ts ├── middlewares │ └── rate-limiter-middleware.ts ├── routes │ ├── extract-route.ts │ └── vectorstore-routes.ts ├── schemas │ └── template-schema.ts └── store │ ├── model-store.ts │ └── template-store.ts ├── tailwind.config.ts └── tsconfig.json /.eslintignore: -------------------------------------------------------------------------------- 1 | src/components/ui/ 2 
| 3 | next.config.mjs 4 | src/lib/web-scraper.ts -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["next/core-web-vitals", "airbnb", "airbnb-typescript"], 3 | "parserOptions": { 4 | "project": "./tsconfig.json" 5 | }, 6 | "plugins": ["align-assignments"], 7 | "rules": { 8 | // to align json keys 9 | "key-spacing": [ 10 | "warn", 11 | { 12 | "align": { 13 | "beforeColon": true, 14 | "afterColon": true, 15 | "on": "colon" 16 | } 17 | } 18 | ], 19 | "@typescript-eslint/quotes": "off", 20 | "react/react-in-jsx-scope": "off", 21 | "react/jsx-props-no-spreading": "off", 22 | "import/prefer-default-export": "off", 23 | "react/no-array-index-key": "off", 24 | "no-multi-spaces": "off", 25 | // for variable alignment 26 | "align-assignments/align-assignments": "warn", 27 | "react/require-default-props": "off", 28 | "max-len": "off", 29 | "@typescript-eslint/no-use-before-define": "off", 30 | "indent": [ 31 | "error", 32 | 2, 33 | { 34 | "SwitchCase": 1 35 | } 36 | ], 37 | "react/jsx-curly-spacing": [ 38 | "warn", 39 | { 40 | "when": "always", 41 | "children": true, 42 | "spacing": { 43 | "objectLiterals": "never" 44 | } 45 | } 46 | ] 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | .yarn/install-state.gz 8 | 9 | # testing 10 | /coverage 11 | 12 | # next.js 13 | /.next/ 14 | /out/ 15 | 16 | # production 17 | /build 18 | 19 | # misc 20 | .DS_Store 21 | *.pem 22 | 23 | # debug 24 | npm-debug.log* 25 | yarn-debug.log* 26 | yarn-error.log* 27 | 28 | # local env files 29 | .env*.local 30 | .env* 31 | 32 | # vercel 33 | .vercel 34 | 35 | # typescript 36 | *.tsbuildinfo 37 | next-env.d.ts 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Imam Septian Adi Wijaya 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JSON-Shift 2 | 3 | JSON-Shift is a tool designed to simplify web scraping and information extraction. It transforms website content into structured JSON format based on user-defined attributes. 4 | 5 | ## Table of Contents 6 | 7 | - [Key Features](#key-features) 8 | - [Setup Project](#setup-project) 9 | - [How This Works](#how-this-works) 10 | - [Usage Examples](#usage-example) 11 | - [Extract array of mangas object from manga reading website](#extract-array-of-mangas-object-from-manga-reading-website) 12 | - [Extract Person Information from Wiki](#extract-person-information-from-wiki) 13 | - [Extract Indonesian News](#extract-indonesian-news) 14 | - [Technologies Used](#technologies-used) 15 | - [License](#license) 16 | 17 | ## Key Features 18 | 19 | To extract the information requested by the user, this project utilizes: 20 | 21 | - Web scraping using Puppeteer 22 | - Embedding with Cohere 23 | - Large Language Model integration (currently using Gemini and Groq free plans) 24 | - Intelligent content filtering 25 | - Vector store for efficient similarity search 26 | - Langchain to integrate data/context to Large Language Model 27 | - Gemini and Groq LLM 28 | - Customizable JSON output 29 | 30 | ## Setup Project 31 | 32 | - Clone this project 33 | - Install project dependencies 34 | 35 | ```bash 36 | bun install 37 | 38 | # or 39 | 40 | npm install 41 | ``` 42 | 43 | - Create a `.env` file and make sure you set all required env variables.
You can check `@/lib/env.ts` to see which `.env` variables must be set 44 | 45 | Check [src/lib/env.ts](src/lib/env.ts) for .env variable requirements 46 | 47 | ```typescript 48 | import { z } from "zod"; 49 | 50 | const envSchema = z.object({ 51 | // llm providers 52 | // https://console.groq.com/keys 53 | GROQ_API_KEY: z.string().min(1), 54 | // https://aistudio.google.com/app/apikey 55 | GOOGLE_AI_STUDIO_API_KEY: z.string().min(1), 56 | // embedding, get from https://dashboard.cohere.com/api-keys 57 | COHERE_API_KEY: z.string().min(1), 58 | // upstash vectorstore, get from https://console.upstash.com/ 59 | UPSTASH_VECTOR_REST_URL: z.string().url(), 60 | UPSTASH_VECTOR_REST_TOKEN: z.string().min(1), 61 | // upstash redis, get from https://console.upstash.com/ 62 | UPSTASH_REDIS_REST_URL: z.string().url(), 63 | UPSTASH_REDIS_REST_TOKEN: z.string().min(1), 64 | // just put random string or generate with command `openssl rand -base64 32` 65 | CLEAR_UPSTASH_VECTOR_STORE_TOKEN: z.string().min(1), 66 | BASE_URL: z.string().url(), 67 | NODE_ENV: z 68 | .union([ 69 | z.literal("development"), 70 | z.literal("testing"), 71 | z.literal("production"), 72 | ]) 73 | .default("development"), 74 | }); 75 | export const env = envSchema.parse(process.env); 76 | ``` 77 | 78 | - Run the project in development mode 79 | 80 | ```bash 81 | bun dev 82 | 83 | # or 84 | 85 | npm run dev 86 | ``` 87 | 88 | ## How This Works 89 | 90 | 1. Users provide a URL and define desired JSON attributes. 91 | 2. The backend scrapes the website, filtering out unnecessary elements like: 92 | 93 | - `nav` 94 | - `footer` 95 | - `header` 96 | - `aside` 97 | - `script` 98 | - `style` 99 | - `noscript` 100 | - `iframe` 101 | 102 | 3. Extracted content is split and stored in a vector database. 103 | 4. Relevant chunks are retrieved using similarity search. 104 | 5. An LLM processes the data to generate the requested JSON output.
105 | 106 | > **Note:** We store user form or template data in localStorage, so user can reuse it without need to refill the form 107 | 108 | ## Usage Example 109 | 110 | ### Extract array of mangas object from manga reading website 111 | 112 | ![Manga website scrape](/public/usage-1.png) 113 | 114 | **Request Body** 115 | 116 | ```json 117 | { 118 | "id": "1c8ab1fa-303d-4000-8e2c-70d22ea5b528", 119 | "url": "https://tcbscans.me", 120 | "name": "manga scraper", 121 | "attributes": [ 122 | { 123 | "name": "mangas", 124 | "description": "array of manga object", 125 | "type": "array", 126 | "items": { 127 | "type": "object", 128 | "properties": [ 129 | { 130 | "name": "name", 131 | "description": "manga name", 132 | "type": "string" 133 | }, 134 | { 135 | "name": "chapter", 136 | "description": "manga chapter number", 137 | "type": "string" 138 | }, 139 | { 140 | "name": "thumbnail", 141 | "description": "manga thumbnail image url", 142 | "type": "string" 143 | }, 144 | { 145 | "name": "url", 146 | "description": "url to read the manga chapter", 147 | "type": "string" 148 | } 149 | ] 150 | } 151 | }, 152 | { 153 | "name": "last_updated_at", 154 | "description": "time of latest manga update", 155 | "type": "string" 156 | } 157 | ], 158 | "latestResult": { 159 | "output": { 160 | "mangas": [ 161 | { 162 | "name": "My Hero Academia", 163 | "chapter": "430", 164 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/mhaDescriptionv2.png", 165 | "url": "https://tcbscans.me/chapters/7777/my-hero-academia-chapter-430" 166 | }, 167 | { 168 | "name": "Black Clover", 169 | "chapter": "370.371", 170 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/site_cover_bc1.png", 171 | "url": "https://tcbscans.me/chapters/7723/black-clover-chapter-370and371" 172 | }, 173 | { 174 | "name": "Haikyuu!! 
(New Special)", 175 | "chapter": "3", 176 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/haikyu45-1200px.jpeg", 177 | "url": "https://tcbscans.me/chapters/7654/haikyu-special-chapter-3" 178 | }, 179 | { 180 | "name": "Black Clover Gaiden: Quartet Knights", 181 | "chapter": "40", 182 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/hbc.png", 183 | "url": "https://tcbscans.me/chapters/7651/black-clover-gaiden-quartet-knights-chapter-40" 184 | }, 185 | { 186 | "name": "Jujutsu Kaisen", 187 | "chapter": "267", 188 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/jjkkk.png", 189 | "url": "https://tcbscans.me/chapters/7790/jujutsu-kaisen-chapter-267" 190 | }, 191 | { 192 | "name": "One Piece", 193 | "chapter": "1124", 194 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/op_1009_00-Cover-redraw-fin-wm-lvl-1.png", 195 | "url": "https://tcbscans.me/chapters/7789/one-piece-chapter-1124" 196 | }, 197 | { 198 | "name": "Chainsaw Man", 199 | "chapter": "174", 200 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/cmt2.jpg", 201 | "url": "https://tcbscans.me/chapters/7787/chainsaw-man-chapter-174" 202 | }, 203 | { 204 | "name": "My Hero Academia One-Shot: You're Next!!", 205 | "chapter": "1", 206 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/PV_pic.png", 207 | "url": "https://tcbscans.me/chapters/7782/my-hero-academia-one-shot-you-re-next-chapter-1" 208 | } 209 | ], 210 | "last_updated_at": "2 days ago" 211 | } 212 | }, 213 | "updatedAt": "2024-08-25T14:50:36.284Z", 214 | "ignoreCache": false, 215 | "model": "mixtral-8x7b-32768" 216 | } 217 | ``` 218 | 219 | **Output** 220 | 221 | ```json 222 | { 223 | "output": { 224 | "mangas": [ 225 | { 226 | "name": "My Hero Academia", 227 | "chapter": "430", 228 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/mhaDescriptionv2.png", 229 | "url": "https://tcbscans.me/chapters/7777/my-hero-academia-chapter-430" 230 | }, 231 | { 232 | 
"name": "Black Clover", 233 | "chapter": "370.371", 234 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/site_cover_bc1.png", 235 | "url": "https://tcbscans.me/chapters/7723/black-clover-chapter-370and371" 236 | }, 237 | { 238 | "name": "Haikyuu!! (New Special)", 239 | "chapter": "3", 240 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/haikyu45-1200px.jpeg", 241 | "url": "https://tcbscans.me/chapters/7654/haikyu-special-chapter-3" 242 | }, 243 | { 244 | "name": "Black Clover Gaiden: Quartet Knights", 245 | "chapter": "40", 246 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/hbc.png", 247 | "url": "https://tcbscans.me/chapters/7651/black-clover-gaiden-quartet-knights-chapter-40" 248 | }, 249 | { 250 | "name": "Jujutsu Kaisen", 251 | "chapter": "267", 252 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/jjkkk.png", 253 | "url": "https://tcbscans.me/chapters/7790/jujutsu-kaisen-chapter-267" 254 | }, 255 | { 256 | "name": "One Piece", 257 | "chapter": "1124", 258 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/op_1009_00-Cover-redraw-fin-wm-lvl-1.png", 259 | "url": "https://tcbscans.me/chapters/7789/one-piece-chapter-1124" 260 | }, 261 | { 262 | "name": "Chainsaw Man", 263 | "chapter": "174", 264 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/cmt2.jpg", 265 | "url": "https://tcbscans.me/chapters/7787/chainsaw-man-chapter-174" 266 | }, 267 | { 268 | "name": "My Hero Academia One-Shot: You're Next!!", 269 | "chapter": "1", 270 | "thumbnail": "https://cdn.onepiecechapters.com/file/CDN-M-A-N/PV_pic.png", 271 | "url": "https://tcbscans.me/chapters/7782/my-hero-academia-one-shot-you-re-next-chapter-1" 272 | } 273 | ], 274 | "last_updated_at": "2 days ago" 275 | } 276 | } 277 | ``` 278 | 279 | ### Extract Person Information from Wiki 280 | 281 | ![Dota player scrape](/public/usage-2.png) 282 | 283 | **Request Body** 284 | 285 | ```json 286 | { 287 | "id": 
"e0654277-89b6-4a7b-a071-a788fdbb6636", 288 | "url": "https://liquipedia.net/dota2/Gorgc", 289 | "name": "dota player detail", 290 | "attributes": [ 291 | { 292 | "name": "name", 293 | "description": "player real name", 294 | "type": "string" 295 | }, 296 | { 297 | "name": "ign", 298 | "description": "player In game name", 299 | "type": "string" 300 | }, 301 | { 302 | "name": "earnings", 303 | "description": "players earnings from dota competitive scene", 304 | "type": "number" 305 | }, 306 | { 307 | "name": "nationality", 308 | "description": "player nationality", 309 | "type": "string" 310 | }, 311 | { 312 | "name": "picture", 313 | "description": "image url of player picture", 314 | "type": "string" 315 | } 316 | ], 317 | "latestResult": { 318 | "output": { 319 | "name": "Janne Stefanovski", 320 | "ign": "Gorgc", 321 | "earnings": 14839, 322 | "nationality": "Sweden", 323 | "picture": "https://liquipedia.net/commons/images/thumb/0/0f/Gorgc_WESG_2016.jpg/600px-Gorgc_WESG_2016.jpg" 324 | } 325 | }, 326 | "createdAt": "2024-08-25T14:59:46.509Z", 327 | "updatedAt": "2024-08-25T14:59:46.509Z", 328 | "ignoreCache": false, 329 | "model": "mixtral-8x7b-32768" 330 | } 331 | ``` 332 | 333 | **Output** 334 | 335 | ```json 336 | { 337 | "output": { 338 | "name": "Janne Stefanovski", 339 | "ign": "Gorgc", 340 | "earnings": 14839, 341 | "nationality": "Sweden", 342 | "picture": "https://liquipedia.net/commons/images/thumb/0/0f/Gorgc_WESG_2016.jpg/600px-Gorgc_WESG_2016.jpg" 343 | } 344 | } 345 | ``` 346 | 347 | ### Extract indonesian news 348 | 349 | ![Extract indonesian news](/public/usage-3.png) 350 | 351 | **Request Body** 352 | 353 | ```json 354 | { 355 | "id": "f6086055-324d-4b81-be68-b2c220b83b1f", 356 | "url": "https://www.kaskus.co.id/thread/66c916cb5a6daedab1041d6c/netizen-curiga-skandal-azizah-salsha-hanya-pengalihan-isu-polemik-putusan-mk?ref=homelanding&med=hot_thread&style=thumb", 357 | "name": "news scraper", 358 | "attributes": [ 359 | { 360 | "name": "judul", 
361 | "description": "judul dari berita pada website", 362 | "type": "string" 363 | }, 364 | { 365 | "name": "rangkuman", 366 | "description": "2 sampai 3 kalimat rangkuman mengenai berita pada website", 367 | "type": "string" 368 | }, 369 | { 370 | "name": "gambar", 371 | "description": "link untuk gambar thumbnaill berita", 372 | "type": "string" 373 | }, 374 | { 375 | "name": "penulis", 376 | "description": "objek yang berisi detail profil penulis berita", 377 | "type": "object", 378 | "properties": [ 379 | { 380 | "name": "nama", 381 | "description": "nama akun penulis", 382 | "type": "string" 383 | }, 384 | { 385 | "name": "total_post", 386 | "description": "jumlah postingan penulis", 387 | "type": "number" 388 | } 389 | ] 390 | } 391 | ], 392 | "latestResult": { 393 | "output": { 394 | "judul": "Netizen Curiga! Skandal Azizah Salsha Hanya Pengalihan Isu Polemik Putusan MK!", 395 | "rangkuman": "Kasus dugaan perselingkuhan istri Pratama Arhan, Azizah Salsha, dengan pacar selebgram Rachel Vennya, Salim Nauderer, memancing spekulasi di ranah digital. Netizen memulai kampanye #KawalPutusanMK untuk menjaga fokus terhadap isu yang dinilai lebih krusial.", 396 | "gambar": "https://s.kaskus.id/images/2024/08/23/10600510_202408231058540652.jpg", 397 | "penulis": { 398 | "nama": "TS harrywjyy", 399 | "total_post": 3 400 | } 401 | } 402 | }, 403 | "createdAt": "2024-08-25T15:08:10.227Z", 404 | "updatedAt": "2024-08-25T15:08:10.227Z", 405 | "ignoreCache": false, 406 | "model": "mixtral-8x7b-32768" 407 | } 408 | ``` 409 | 410 | **Output** 411 | 412 | ```json 413 | { 414 | "output": { 415 | "judul": "Netizen Curiga! Skandal Azizah Salsha Hanya Pengalihan Isu Polemik Putusan MK!", 416 | "rangkuman": "Kasus dugaan perselingkuhan istri Pratama Arhan, Azizah Salsha, dengan pacar selebgram Rachel Vennya, Salim Nauderer, memancing spekulasi di ranah digital. 
Netizen memulai kampanye #KawalPutusanMK untuk menjaga fokus terhadap isu yang dinilai lebih krusial.", 417 | "gambar": "https://s.kaskus.id/images/2024/08/23/10600510_202408231058540652.jpg", 418 | "penulis": { 419 | "nama": "TS harrywjyy", 420 | "total_post": 3 421 | } 422 | } 423 | } 424 | ``` 425 | 426 | ## Technologies Used 427 | 428 | - Web Scraping: Puppeteer 429 | - Embedding: Cohere 430 | - Langchain 431 | - LLM: Gemini, Groq 432 | - Vector Store: Upstash 433 | - Development: TypeScript, Bun 434 | 435 | ## License 436 | 437 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 438 | -------------------------------------------------------------------------------- /bun.lockb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/bun.lockb -------------------------------------------------------------------------------- /components.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://ui.shadcn.com/schema.json", 3 | "style": "default", 4 | "rsc": true, 5 | "tsx": true, 6 | "tailwind": { 7 | "config": "tailwind.config.ts", 8 | "css": "src/app/globals.css", 9 | "baseColor": "slate", 10 | "cssVariables": true, 11 | "prefix": "" 12 | }, 13 | "aliases": { 14 | "components": "@/components", 15 | "utils": "@/lib/utils" 16 | } 17 | } -------------------------------------------------------------------------------- /next.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('next').NextConfig} */ 2 | const nextConfig = { 3 | reactStrictMode: true, 4 | // env: env, 5 | experimental: { 6 | serverComponentsExternalPackages: ["puppeteer-core", "@sparticuz/chromium"], 7 | }, 8 | }; 9 | 10 | export default nextConfig; 11 | 
-------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "json-shift", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "dev": "next dev", 7 | "build": "next build", 8 | "start": "next start", 9 | "lint": "next lint" 10 | }, 11 | "dependencies": { 12 | "@hookform/resolvers": "^3.9.0", 13 | "@langchain/cohere": "^0.2.2", 14 | "@langchain/community": "^0.2.28", 15 | "@langchain/core": "^0.2.28", 16 | "@langchain/google-genai": "^0.0.26", 17 | "@langchain/groq": "^0.0.16", 18 | "@langchain/openai": "^0.2.6", 19 | "@radix-ui/react-accordion": "^1.2.0", 20 | "@radix-ui/react-checkbox": "^1.1.1", 21 | "@radix-ui/react-dialog": "^1.1.1", 22 | "@radix-ui/react-label": "^2.1.0", 23 | "@radix-ui/react-select": "^2.1.1", 24 | "@radix-ui/react-slot": "^1.1.0", 25 | "@sparticuz/chromium": "^123.0.0", 26 | "@uiw/react-json-view": "^2.0.0-alpha.26", 27 | "@upstash/redis": "^1.34.0", 28 | "@upstash/vector": "^1.1.5", 29 | "axios": "^1.7.3", 30 | "class-variance-authority": "^0.7.0", 31 | "clsx": "^2.1.1", 32 | "hono": "^4.5.4", 33 | "hono-rate-limiter": "^0.4.0", 34 | "langchain": "^0.2.17", 35 | "lucide-react": "^0.426.0", 36 | "next": "14.2.6", 37 | "next-themes": "^0.3.0", 38 | "puppeteer": "^22.11.0", 39 | "puppeteer-core": "^22.11.0", 40 | "react": "^18", 41 | "react-dom": "^18", 42 | "react-hook-form": "^7.52.2", 43 | "tailwind-merge": "^2.4.0", 44 | "tailwindcss-animate": "^1.0.7", 45 | "uuid": "^10.0.0", 46 | "zod": "^3.23.8", 47 | "zustand": "^4.5.4" 48 | }, 49 | "devDependencies": { 50 | "@types/node": "^20", 51 | "@types/puppeteer": "^7.0.4", 52 | "@types/react": "^18", 53 | "@types/react-dom": "^18", 54 | "@typescript-eslint/eslint-plugin": "^7.0.0", 55 | "@typescript-eslint/parser": "^7.10.0", 56 | "eslint": "^8", 57 | "eslint-config-airbnb": "^19.0.4", 58 | "eslint-config-airbnb-typescript": "^18.0.0", 59 | 
"eslint-config-next": "14.2.6", 60 | "eslint-plugin-align-assignments": "^1.1.2", 61 | "postcss": "^8", 62 | "tailwindcss": "^3.4.1", 63 | "typescript": "^5" 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /postcss.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('postcss-load-config').Config} */ 2 | const config = { 3 | plugins: { 4 | tailwindcss: {}, 5 | }, 6 | }; 7 | 8 | export default config; 9 | -------------------------------------------------------------------------------- /public/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/public/android-chrome-192x192.png -------------------------------------------------------------------------------- /public/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/public/android-chrome-512x512.png -------------------------------------------------------------------------------- /public/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/public/apple-touch-icon.png -------------------------------------------------------------------------------- /public/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/public/favicon-16x16.png -------------------------------------------------------------------------------- /public/favicon-32x32.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/public/favicon-32x32.png -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/public/favicon.ico -------------------------------------------------------------------------------- /public/next.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/og-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/public/og-image.png -------------------------------------------------------------------------------- /public/usage-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/public/usage-1.png -------------------------------------------------------------------------------- /public/usage-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/public/usage-2.png -------------------------------------------------------------------------------- /public/usage-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/public/usage-3.png -------------------------------------------------------------------------------- /public/vercel.svg: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/app/api/[[...route]]/route.ts: -------------------------------------------------------------------------------- 1 | import { Context, Hono } from "hono"; 2 | 3 | import RequestTimeoutError from "@/errors/request-timeout-error"; 4 | import ValidationError from "@/errors/validation-error"; 5 | import { env } from "@/lib/env"; 6 | import extractRoute from "@/routes/extract-route"; 7 | import vectorStoreRoute from "@/routes/vectorstore-routes"; 8 | import { cors } from "hono/cors"; 9 | import { StatusCode } from "hono/utils/http-status"; 10 | import { handle } from "hono/vercel"; 11 | 12 | /** 13 | * Set maximum duration for serverless function 14 | */ 15 | export const maxDuration = 60; 16 | 17 | /** 18 | * Initialize Hono app with base path 19 | */ 20 | const app = new Hono().basePath("/api"); 21 | 22 | app.use( 23 | "/api/*", 24 | cors({ 25 | origin : env.NODE_ENV === "production" ? 
env.BASE_URL : "*", 26 | allowMethods : ["POST", "GET", "OPTIONS"], 27 | maxAge : 600, 28 | credentials : true, 29 | }), 30 | ); 31 | 32 | const errorResponse = (c: Context, status: StatusCode, error: any) => c.json( 33 | { 34 | code : error?.code || "INTERNAL_SERVER_ERROR", 35 | title : error.name.replace(/([A-Z])/g, " $1").trim(), 36 | message : error.message, 37 | details : error.details, 38 | }, 39 | status, 40 | ); 41 | 42 | /** 43 | * Global error handler 44 | * @param {Error} err - The error object 45 | * @param {Context} c - The Hono context 46 | * @returns {Response} JSON response with error details 47 | */ 48 | app.onError((err, c) => { 49 | if (err instanceof ValidationError) { 50 | return errorResponse(c, 422, err); 51 | } 52 | if (err instanceof RequestTimeoutError) { 53 | return errorResponse(c, 504, err); 54 | } 55 | return errorResponse(c, 500, new Error("Internal Server Error")); 56 | }); 57 | 58 | app.route("/extract", extractRoute); 59 | app.route("/vectorstore", vectorStoreRoute); 60 | 61 | export const GET = handle(app); 62 | export const POST = handle(app); 63 | -------------------------------------------------------------------------------- /src/app/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imamseptian/json-shift/849262fa5287c39f114b90dca0eebfdeb80ef0f8/src/app/favicon.ico -------------------------------------------------------------------------------- /src/app/globals.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | @layer base { 6 | :root { 7 | --background: 0 0% 100%; 8 | --foreground: 222.2 84% 4.9%; 9 | --card: 0 0% 100%; 10 | --card-foreground: 222.2 84% 4.9%; 11 | --popover: 0 0% 100%; 12 | --popover-foreground: 222.2 84% 4.9%; 13 | --primary: 222.2 47.4% 11.2%; 14 | --primary-foreground: 210 40% 98%; 15 | --secondary: 210 40% 96.1%; 16 
| --secondary-foreground: 222.2 47.4% 11.2%; 17 | --muted: 210 40% 96.1%; 18 | --muted-foreground: 215.4 16.3% 46.9%; 19 | --accent: 210 40% 96.1%; 20 | --accent-foreground: 222.2 47.4% 11.2%; 21 | --destructive: 0 84.2% 60.2%; 22 | --destructive-foreground: 210 40% 98%; 23 | --border: 214.3 31.8% 91.4%; 24 | --input: 214.3 31.8% 91.4%; 25 | --ring: 222.2 84% 4.9%; 26 | --radius: 0.3rem; 27 | --chart-1: 12 76% 61%; 28 | --chart-2: 173 58% 39%; 29 | --chart-3: 197 37% 24%; 30 | --chart-4: 43 74% 66%; 31 | --chart-5: 27 87% 67%; 32 | --success: 221.2 83.2% 53.3%; 33 | --success-foreground: 210 40% 98%; 34 | } 35 | 36 | .dark { 37 | --background: 222.2 84% 4.9%; 38 | --foreground: 210 40% 98%; 39 | --card: 222.2 84% 4.9%; 40 | --card-foreground: 210 40% 98%; 41 | --popover: 222.2 84% 4.9%; 42 | --popover-foreground: 210 40% 98%; 43 | --primary: 210 40% 98%; 44 | --primary-foreground: 222.2 47.4% 11.2%; 45 | --secondary: 217.2 32.6% 17.5%; 46 | --secondary-foreground: 210 40% 98%; 47 | --muted: 217.2 32.6% 17.5%; 48 | --muted-foreground: 215 20.2% 65.1%; 49 | --accent: 217.2 32.6% 17.5%; 50 | --accent-foreground: 210 40% 98%; 51 | --destructive: 0 62.8% 30.6%; 52 | --destructive-foreground: 210 40% 98%; 53 | --border: 217.2 32.6% 17.5%; 54 | --input: 217.2 32.6% 17.5%; 55 | --ring: 212.7 26.8% 83.9; 56 | --chart-1: 220 70% 50%; 57 | --chart-2: 160 60% 45%; 58 | --chart-3: 30 80% 55%; 59 | --chart-4: 280 65% 60%; 60 | --chart-5: 340 75% 55%; 61 | --success: 217.2 91.2% 59.8%; 62 | --success-foreground: 222.2 47.4% 11.2%; 63 | } 64 | } 65 | 66 | @layer base { 67 | * { 68 | @apply border-border; 69 | } 70 | body { 71 | @apply bg-background text-foreground; 72 | } 73 | } 74 | 75 | .sidebar { 76 | width: 400px; 77 | transition: margin-left 0.3s ease-in-out; 78 | } 79 | 80 | .sidebar-open { 81 | margin-left: 0; 82 | } 83 | 84 | .sidebar-close { 85 | margin-left: -400px; 86 | } 87 | -------------------------------------------------------------------------------- 
/src/app/layout.tsx: --------------------------------------------------------------------------------
import Footer from "@/components/layout/footer";
import Navbar from "@/components/layout/navbar";
import { ThemeProvider } from "@/components/layout/theme-provider";
import { env } from "@/lib/env";
import { cn } from "@/lib/utils";
import type { Metadata } from "next";
import { Inter } from "next/font/google";
import "./globals.css";

// Inter typeface from next/font/google, restricted to the Latin subset.
const inter = Inter({ subsets: ["latin"] });

/**
 * Site-wide metadata exported from the root layout.
 *
 * Covers the document title (with a `%s | JsonShift` template for titled
 * pages), description, keywords, Open Graph and Twitter cards, crawler
 * directives, icons and language alternates. The social-card image URLs are
 * absolute, built from `env.BASE_URL`.
 */
export const metadata: Metadata = {
  title: {
    default  : "JsonShift | AI-Powered Web Data Extraction to JSON",
    template : "%s | JsonShift",
  },
  description:
    "JsonShift utilizes scraping, embedding, and LLM technologies to extract and convert web content into structured JSON format based on user inputs. Simplify data extraction with our powerful tools.",
  // Each keyword is listed once.
  keywords: [
    "JSON",
    "data extraction",
    "AI-powered data extraction",
    "web scraping",
    "groq",
    "gemini",
    "cohere",
    "embedding",
    "scraper",
    "langchain",
    "jsonshift",
    "web content to JSON",
    "LLM data processing",
    "website data extraction",
    "JSON generation",
    "web data to JSON",
    "structured data",
    "data transformation",
  ],
  authors    : [{ name: "Imam Septian Adi Wijaya" }],
  creator    : "Imam Septian Adi Wijaya",
  // NOTE(review): newer Next.js releases deprecate `themeColor` inside
  // `metadata` in favour of a separate `viewport` export — confirm the
  // installed version before moving it.
  themeColor : [
    { media: "(prefers-color-scheme: dark)", color: "black" },
    { media: "(prefers-color-scheme: light)", color: "white" },
  ],
  openGraph: {
    type     : "website",
    locale   : "en_US",
    url      : env.BASE_URL,
    siteName : "JsonShift",
    title    : "JsonShift | AI-Powered Web Data Extraction to JSON",
    description:
      "Experience the power of AI and advanced scraping with JsonShift to transform web content into customizable JSON outputs. Efficiently extract data tailored to your needs.",
    images: [
      {
        url    : `${env.BASE_URL}/og-image.png`,
        width  : 1200,
        height : 630,
        alt    : "JsonShift - AI-Powered Data Extraction",
      },
    ],
  },
  // `site` is deliberately omitted: a card must not be attributed to a
  // placeholder handle. Add it once a real account exists.
  twitter: {
    card  : "summary_large_image",
    title : "JsonShift | AI-Powered Web Data Extraction to JSON",
    description:
      "Transform web content into structured JSON outputs with JsonShift's AI-powered tools. Extract data quickly and accurately based on user-defined inputs.",
    images: [
      {
        url    : `${env.BASE_URL}/og-image.png`,
        width  : 1200,
        height : 630,
        alt    : "JsonShift - AI-Powered Data Extraction",
      },
    ],
  },
  robots: {
    index     : true,
    follow    : true,
    googleBot : {
      index               : true,
      follow              : true,
      // -1 places no limit on video preview length or snippet length.
      "max-video-preview" : -1,
      "max-image-preview" : "large",
      "max-snippet"       : -1,
    },
  },
  icons: {
    icon     : "/favicon.ico",
    shortcut : "/favicon-16x16.png",
    apple    : "/apple-touch-icon.png",
  },
  // NOTE(review): these alternates advertise `/en-US` and `/es-ES`; verify
  // that localized routes actually exist, otherwise crawlers are pointed at
  // missing pages.
  alternates: {
    languages: {
      "en-US" : "/en-US",
      "es-ES" : "/es-ES",
    },
  },
};

export default function RootLayout({
  children,
}: Readonly<{
  children: React.ReactNode;
}>) {
  return (
119 | 120 |
{ children }
121 |
123 |
124 | 125 | 126 | ); 127 | } 128 | -------------------------------------------------------------------------------- /src/app/page.tsx: -------------------------------------------------------------------------------- 1 | 'use client'; 2 | 3 | import { 4 | useEffect, useRef, useState, 5 | } from "react"; 6 | import { UseFormReturn } from "react-hook-form"; 7 | import { v4 as uuid } from "uuid"; 8 | 9 | import BrowseTemplate from "@/components/browse-template"; 10 | import { ExecutionTime } from "@/components/execution-time-result"; 11 | import ResultSection from "@/components/result-section"; 12 | import TemplateForm from "@/components/template-form"; 13 | import { Button } from "@/components/ui/button"; 14 | import { applyValidationErrorsToForm } from "@/lib/error-utils"; 15 | import { ErrorObject } from "@/lib/types"; 16 | import { Template } from "@/schemas/template-schema"; 17 | import { useModelStore } from "@/store/model-store"; 18 | import { useTemplateStore } from "@/store/template-store"; 19 | 20 | /** 21 | * Homepage component for template management and execution 22 | * @returns {JSX.Element} The rendered Homepage component 23 | */ 24 | export default function Homepage(): JSX.Element { 25 | const { 26 | selectedTemplate, 27 | setSelectedTemplate, 28 | addTemplate, 29 | updateTemplate, 30 | } = useTemplateStore(); 31 | const { model: selectedModel } = useModelStore(); 32 | 33 | const resultRef = useRef(null); 34 | const [objectResult, setObjectResult] = useState(null); 35 | const [isSubmitting, setIsSubmitting] = useState(false); 36 | const [errorObject, setErrorObject] = useState(null); 37 | const [executionTime, setExecutionTime] = useState({ 38 | scrapeExecutionTime : null, 39 | embeddingTime : null, 40 | llmProcessingTime : null, 41 | }); 42 | 43 | useEffect(() => { 44 | setObjectResult(selectedTemplate?.latestResult ?? 
null); 45 | }, [selectedTemplate]); 46 | 47 | /** 48 | * Handles form submission and API interaction 49 | * @param {Template} formValues - The form values to be submitted 50 | * @param {UseFormReturn