├── .gitignore
├── LICENSE
├── README.md
├── package.json
└── src
    └── index.js


/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | .knowledge/
3 | .env
4 | .DS_Store
5 | coverage/
6 | *.log
7 | test.mjs
8 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) [year] [fullname]
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 | 
3 | - `@openinterface/knowledge` npm package repo
4 | - say hi 👋 [@n_raidenai](https://x.com/n_raidenai)
5 | 
6 | ## knowledge
7 | 
8 | - agent tool that autonomously learns how to use APIs, SDKs, infra tools, ...
9 | - collects documentation for RAG; it supports
10 |   - crawling docs websites
11 |   - crawling github repos for readmes, and npm packages by name
12 |   - searching the web for use cases (via serper) from a single query
13 |   - parsing openapi/swagger definitions from urls
14 | - automatically manages vectorizing, embedding, indexing and concurrency
15 | - has a local index powered by `@electric-sql/pglite` and `pgvector`
16 | - (wip) post-processes collected documents to clean up and improve formatting
17 | - (wip) stores in remote index dbs (like supabase, weaviate, ...)
18 | 
19 | ## Installation
20 | 
21 | ```bash
22 | npm install @openinterface/knowledge
23 | ```
24 | 
25 | ## Usage
26 | 
27 | make a `.env` file and ensure it has these values
28 | 
29 | ```env
30 | OPENAI_API_KEY = "REPLACE_KEY" # required
31 | 
32 | SERPER_API_KEY = "REPLACE_KEY" # to enable knowledge.collect.learn feature
33 | SERPER_SEARCH_QUERIES = 2 # search queries per learn operation (if enabled)
34 | 
35 | GITHUB_API_KEY = "REPLACE_KEY" # to enable knowledge.collect.github feature
36 | 
37 | #PROXY_URL = http://your_proxy_url:port # optional, for scraping / crawling pages
38 | ```
39 | 
40 | import the package as follows
41 | 
42 | ```javascript
43 | import knowledge from '@openinterface/knowledge';
44 | ```
45 | 
46 | ## Use Case Examples
47 | 
48 | ```javascript
49 | import knowledge from '@openinterface/knowledge';
50 | 
51 | // ====================================================================
52 | // FETCHING DOCS / DATA + EMBEDDING RESULTS + INDEXING IN VECTOR DB
53 | // ====================================================================
54 | 
55 | // collect + index documentation by crawling a website's docs
56 | await knowledge.collect.crawl({
57 |   url: 'https://docs.railway.app/',
58 |   vectorize: true,
59 |   index: {
60 |     local: true,
61 |     // postgres : false, // remote index not implemented yet
62 |     // weaviate : false, // remote index not implemented yet
63 |   },
64 | });
65 | 
66 | // collect + index tutorials/articles/docs by googling a use case (needs serper key in .env)
67 | await knowledge.collect.learn({
68 |   query: 'setup and deploy graphql with node',
69 |   vectorize: true,
70 |   index: { local: true },
71 | });
72 | 
73 | // collect + index readmes from a github repo (needs github key in .env)
74 | await knowledge.collect.github({
75 |   url: 'https://github.com/resend/react-email',
76 |   vectorize: true,
77 |   index: { local: true },
78 | });
79 | // collect + index readmes from an npm package, by crawling its linked github repo (needs github key in .env)
80 | await knowledge.collect.npm({
81 |   name: 'react-confetti',
82 |   vectorize: true,
83 |   index: { local: true },
84 | });
85 | 
86 | // collect + index every {method,route} combination from an openapi specification file url (can be yaml or json)
87 | await knowledge.collect.openapi({
88 |   url: 'https://raw.githubusercontent.com/resend/resend-openapi/refs/heads/main/resend.yaml',
89 |   vectorize: true,
90 |   index: { local: true },
91 | });
92 | 
93 | 
94 | // ====================================================================
95 | // QUERYING THE COLLECTED DATA
96 | // ====================================================================
97 | 
98 | // search example
99 | const retrieved = await knowledge.index.query.local({
100 |   query: "create graphql schemas for invoices",
101 |   amount: 4
102 | })
103 | /*
104 |   -> retrieved :
105 |   [
106 |     {
107 |       uid,
108 |       data: {
109 |         meta: {...}
110 |         content: "... documentation content ..."
111 |       },
112 |     },
113 |     ...
114 |   ]
115 | */
116 | 
117 | // RAG example
118 | const answer = await knowledge.index.ask.local({
119 |   query: `make a new nodejs project that :
120 | 
121 |   > makes a local vectra index
122 |   > indexes from a csv list of my clients , which is 'name,email,phone,task_description'
123 |   > write test cases ;
124 | 
125 |   no typescript, and use type : module
126 | 
127 |   answer with the new , entire project codebase , with every file needed (including any example), in format :
128 |   \`\`\`yaml
129 |   repo:
130 |    - path: "" # full file path
131 |      content: "" # full file content
132 |    - ...
133 |   \`\`\``,
134 |   model: `o1-mini`
135 | })
136 | console.dir({answer})
137 | ```
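
the local query helper (`lib.index.query.local` in `src/index.js`) also takes a few optional parameters not shown in the search example above : `amount` (number of results, default 6), `threshold` (a minimum similarity score over the inner-product distance, default 0.0) and `embedding` (a precomputed query vector, to skip the embedding call). a minimal sketch based on those defaults :

```javascript
// query the local index with the optional knobs
const results = await knowledge.index.query.local({
  query: "send a transactional email with react-email", // example query
  amount: 3,      // return at most 3 matches (default 6)
  threshold: 0.2, // keep only matches scoring above 0.2 (default 0.0)
  // embedding: [...], // optionally pass a precomputed vector instead of embedding `query`
});

for (const { uid, data } of results) {
  if (!data) continue; // uid is indexed but its vector file was not loaded from .knowledge/vectors
  console.dir({ uid, meta: data.meta });
}
```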
138 | 
139 | ## Potential Issues
140 | 
141 | - if you use the local index features (which depend on `@electric-sql/pglite` and `@electric-sql/pglite/pgvector`) in a cloud / dockerized environment, you might run into issues :
142 |   the npm installer for pgvector does not handle the full installation by default
143 | - it should, however, work without problems in local / browser environments
144 | 
145 | ## WIP
146 | 
147 | - post-processing retrieved documents (clean up and reformat with LLM)
148 | - indexing in remote vector databases (supabase, weaviate)
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "@openinterface/knowledge",
3 |   "version": "0.0.11",
4 |   "description": "ai agent knowledge base management tool",
5 |   "main": "src/index.js",
6 |   "type": "module",
7 |   "scripts": {
8 |     "start": "node src/index.js",
9 |     "test": "echo \"No tests yet\" && exit 0"
10 |   },
11 |   "repository": {
12 |     "type": "git",
13 |     "url": "git+https://github.com/raidendotai/knowledge.git"
14 |   },
15 |   "keywords": [
16 |     "knowledge",
17 |     "crawler",
18 |     "vectorize",
19 |     "index",
20 |     "LLM",
21 |     "embeddings"
22 |   ],
23 |   "author": "openinterface",
24 |   "license": "MIT",
25 |   "bugs": {
26 |     "url": "https://github.com/raidendotai/knowledge/issues"
27 |   },
28 |   "homepage": "https://github.com/raidendotai/knowledge#readme",
29 |   "dependencies": {
30 |     "@electric-sql/pglite": "^0.2.11",
31 |     "@readme/openapi-parser": "^2.6.0",
32 |     "@sindresorhus/slugify": "^2.2.1",
33 |     "async-retry": "^1.3.3",
34 |     "axios": "^1.7.7",
35 |     "cheerio": "^1.0.0",
36 |     "cli-progress": "^3.12.0",
37 |     "dotenv": "^16.4.5",
38 |     "fs-extra": "^11.2.0",
39 |     "https-proxy-agent": "^7.0.5",
40 |     "node-html-markdown": "^1.3.0",
41 |     "openai": "*",
42 |     "p-queue": "^7.4.1",
43 |     "tiktoken": "^1.0.17",
44 |     "yaml": "^2.5.1"
45 |   }
46 | }
47 | 
--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 | import * as cheerio from "cheerio";
3 | import PQueue from "p-queue";
4 | import retry from "async-retry";
5 | import crypto from "crypto";
6 | import { URL } from "url";
7 | import cliProgress from "cli-progress";
8 | import path from "path";
9 | import fs from "fs/promises";
10 | import OpenAI from "openai";
11 | import yaml from "yaml";
12 | import dotenv from "dotenv";
13 | import OpenAPIParser from "@readme/openapi-parser";
14 | import slugify from "@sindresorhus/slugify";
15 | import { PGlite } from "@electric-sql/pglite";
16 | import { vector } from "@electric-sql/pglite/vector";
17 | 
18 | import pkg from "node-html-markdown";
19 | const { NodeHtmlMarkdown, NodeHtmlMarkdownOptions } = pkg;
20 | const nhm = new NodeHtmlMarkdown({});
21 | 
22 | dotenv.config();
23 | import { get_encoding } from "tiktoken";
24 | const enc = get_encoding("cl100k_base");
25 | 
26 | let openai;
27 | try {
28 |   openai = new OpenAI({
29 |     apiKey: process.env.OPENAI_API_KEY,
30 |   });
31 | } catch (e) {
32 |   console.error(e);
33 | }
34 | 
35 | let LOCAL_PG_INSTANCE;
36 | let LOCAL_KNOWLEDGE_DB = {};
37 | 
38 | const KNOWLEDGE_DIR = path.join(process.cwd(), ".knowledge");
39 | const DB_DIR = path.join(KNOWLEDGE_DIR, "db");
40 | const VECTORS_DIR = path.join(KNOWLEDGE_DIR, "vectors");
41 | const INDEX_DIR = path.join(KNOWLEDGE_DIR, "index");
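// layout of the .knowledge/ working directory used by this module (derived from the code below) :
//   db/      -> collected documents saved as yaml, grouped per collector (crawl/, learn/, openapi/) under a raw/ subfolder
//   vectors/ -> one json file per document : { meta, content, vector, uid, ... }, loaded into LOCAL_KNOWLEDGE_DB at startup
//   index/   -> the local PGlite database holding the pgvector `embeddings` table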
42 | 
43 | try {
44 |   const files = await fs.readdir(VECTORS_DIR, { withFileTypes: true });
45 |   const loadJsonFiles = async (dir) => {
46 |     const entries = await fs.readdir(dir, { withFileTypes: true });
47 |     for (const entry of entries) {
48 |       const fullPath = path.join(dir, entry.name);
49 |       if (entry.isDirectory()) {
50 |         await loadJsonFiles(fullPath);
51 |       } else if (entry.isFile() && entry.name.endsWith('.json')) {
52 |         try {
53 |           const fileContent = await fs.readFile(fullPath, 'utf-8');
54 |           const jsonData = JSON.parse(fileContent);
55 |           const { meta, uid, content } = jsonData;
56 |           LOCAL_KNOWLEDGE_DB[uid] = { meta, content };
57 |         } catch (e) {
58 |           console.error(e)
59 |         }
60 |       }
61 |     }
62 |   };
63 | 
64 |   await loadJsonFiles(VECTORS_DIR);
65 | } catch (e) {
66 |   console.error(`> no previous vectors db data to load`);
67 | }
68 | 
69 | const initDirs = async () => {
70 |   await fs.mkdir(DB_DIR, { recursive: true });
71 |   await fs.mkdir(VECTORS_DIR, { recursive: true });
72 |   await fs.mkdir(INDEX_DIR, { recursive: true });
73 | };
74 | 
75 | await initDirs();
76 | 
77 | const queues = {
78 |   search: new PQueue({ concurrency: 5 }),
79 |   llm: new PQueue({ concurrency: 2 }),
80 |   embed: new PQueue({ concurrency: 10 }),
81 | };
82 | 
83 | const SERPER_API_KEY = process.env.SERPER_API_KEY;
84 | if (!SERPER_API_KEY) {
85 |   // serper is only required for knowledge.collect.learn , so warn instead of exiting
86 |   console.warn("Warning: SERPER_API_KEY is not set in .env file; knowledge.collect.learn will not work.");
87 | }
88 | 
89 | const serperInstance = axios.create({
90 |   baseURL: "https://google.serper.dev",
91 |   headers: {
92 |     "X-API-KEY": SERPER_API_KEY,
93 |     "Content-Type": "application/json",
94 |   },
95 |   timeout: 10000,
96 | });
97 | 
98 | const PROXY_URL = process.env.PROXY_URL || null;
99 | const getAxiosInstance = async () => {
100 |   const axiosConfig = {
101 |     timeout: 10000,
102 |     headers: {
103 |       "User-Agent":
104 |         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
105 |         "AppleWebKit/537.36 (KHTML, like Gecko) " +
106 |         "Chrome/113.0.0.0 Safari/537.36",
107 |       "Accept-Language": "en-US,en;q=0.9",
108 |     },
109 |     validateStatus: (status) => status >= 200 && status < 400,
110 |   };
111 | 
112 |   if (PROXY_URL) {
113 |     axiosConfig.proxy = false; // Disable default proxy handling
114 |     // https-proxy-agent v7 exposes the agent as a named export (no default export)
115 |     const { HttpsProxyAgent } = await import("https-proxy-agent");
116 |     // the same agent is reused for plain-http requests routed through the proxy
117 |     axiosConfig.httpsAgent = new HttpsProxyAgent(PROXY_URL);
118 |     axiosConfig.httpAgent = new HttpsProxyAgent(PROXY_URL);
119 | 
120 |   }
121 | 
122 |   return axios.create(axiosConfig);
123 | };
124 | const axiosInstance = await getAxiosInstance();
125 | const crawlQueue = new PQueue({ concurrency: 10 });
126 | 
127 | const _chunkify = (array, size) => {
128 |   const chunks = [];
129 |   for (let i = 0; i < array.length; i += size) {
130 |     chunks.push(array.slice(i, i + size));
131 |   }
132 |   return chunks;
133 | };
134 | 
135 | const lib = {
136 |   utils: {
137 |     search: async ({ query }) => {
138 |       return queues.search.add(() =>
139 |         retry(
140 |           async () => {
141 |             const response = await serperInstance.post("/search", { q: query });
142 |             return response.data;
143 |           },
144 |           {
145 |             retries: 3,
146 |             factor: 2,
147 |             minTimeout: 1000,
148 |             onRetry: (err, attempt) => {
149 |               console.warn(
150 |                 `Search retry ${attempt} for query "${query}" due to ${err.message}`,
151 |               );
152 |             },
153 |           },
154 |         ),
155 |       );
156 |     },
157 |     llm: async ({
158 |       model = "gpt-4o-mini",
159 |       messages,
160 |       stream = process.stdout,
161 |     }) => {
162 |       return queues.llm.add(() =>
163 |         retry(
164 |           async () => {
165 |             let opts = {
166 | 
model, 167 | messages, 168 | } 169 | if (!model.startsWith('o1')) { 170 | opts.stream = true 171 | opts.stream_options = { include_usage: true } 172 | const streaming = await openai.chat.completions.create(opts); 173 | 174 | let text = ""; 175 | for await (const chunk of streaming) { 176 | const content = chunk.choices[0]?.delta?.content || ""; 177 | if (content) { 178 | text += content; 179 | stream.write(content); 180 | } 181 | } 182 | stream.write(`\n`); 183 | return text.trim(); 184 | } else { 185 | stream.write(`\no1 model thinking (stream disabled) ...`); 186 | const intervalId = setInterval(() => { stream.write(' ...') }, 1000); 187 | const response = await openai.chat.completions.create(opts); 188 | clearInterval(intervalId); 189 | stream.write(`\n`); 190 | return response.choices[0]?.message?.content.trim() || ""; 191 | } 192 | }, 193 | { 194 | retries: 3, 195 | factor: 2, 196 | minTimeout: 1000, 197 | onRetry: (err, attempt) => { 198 | console.warn( 199 | `LLM retry ${attempt} due to ${err.message}`, 200 | ); 201 | }, 202 | }, 203 | ), 204 | ); 205 | }, 206 | embed: async ({ texts, model = "text-embedding-3-small" }) => { 207 | const maxTokens = 8192; // Set maximum tokens limit 208 | 209 | const sliceTexts = (texts) => { 210 | return texts.map((text) => { 211 | const tokens = enc.encode(text); // Tokenize the text 212 | const txt = new TextDecoder().decode( 213 | enc.decode(tokens.slice(0, maxTokens)), 214 | ); 215 | return txt; // Slice to max tokens and decode back to text 216 | }); 217 | }; 218 | 219 | texts = sliceTexts(texts); 220 | 221 | return queues.embed.add(() => 222 | retry( 223 | async () => { 224 | const response = await openai.embeddings.create({ 225 | model, 226 | input: texts, 227 | encoding_format: "float", 228 | }); 229 | return { 230 | vectors: response.data 231 | .sort((a, b) => a.index - b.index) 232 | .map((e) => e.embedding), 233 | usage: { model, ...response.usage }, 234 | }; 235 | }, 236 | { 237 | retries: 3, 238 | factor: 2, 239 | minTimeout: 1000, 240 | onRetry: (err, attempt) => { 241 | console.warn(`embed retry ${attempt} due to ${err.message}`); 242 | }, 243 | }, 244 | ), 245 | ); 246 | }, 247 | process: { 248 | html: async ({ url, proxy = false, use_puppeteer = false }) => { }, 249 | typescript: async ({ name }) => { }, 250 | }, 251 | }, 252 | collect: { 253 | crawl: async ({ 254 | url, 255 | proxy = false, 256 | post_process = false, 257 | use_puppeteer = false, 258 | vectorize = false, 259 | index = false, 260 | }) => { 261 | const visited = new Set(); 262 | const docs = []; 263 | const queue = new PQueue({ concurrency: 5 }); 264 | 265 | const root_slug = url.split("://").slice(1).join("://").split("/")[0]; 266 | 267 | // Progress bar setup 268 | const progressBar = new cliProgress.SingleBar({ 269 | format: "Crawling |{bar}| {value}/{total} Pages", 270 | barCompleteChar: "\u2588", 271 | barIncompleteChar: "\u2591", 272 | hideCursor: true, 273 | }); 274 | progressBar.start(1, 0); 275 | 276 | // URL normalization function 277 | const normalizeUrl = (inputUrl) => { 278 | try { 279 | const parsedUrl = new URL(inputUrl); 280 | parsedUrl.hash = ""; // Remove fragment 281 | parsedUrl.pathname = parsedUrl.pathname.replace(/\/$/, ""); // Remove trailing slash 282 | return parsedUrl.toString(); 283 | } catch (e) { 284 | return null; // Invalid URL 285 | } 286 | }; 287 | 288 | // Process a single URL 289 | const processUrl = async (currentUrl) => { 290 | const normalizedUrl = normalizeUrl(currentUrl); 291 | if (!normalizedUrl) { 292 | console.warn(`Invalid 
URL skipped: ${currentUrl}`); 293 | return; 294 | } 295 | 296 | try { 297 | const response = await retry( 298 | async () => { 299 | return await axiosInstance.get(normalizedUrl); 300 | }, 301 | { 302 | retries: 4, 303 | factor: 2, 304 | minTimeout: 2000, 305 | onRetry: (err, attempt) => { 306 | console.warn( 307 | `> retry ${attempt} for ${normalizedUrl} due to ${err.message}`, 308 | ); 309 | }, 310 | }, 311 | ); 312 | 313 | const contentType = response.headers['content-type'] || ''; 314 | if (!contentType.includes('text/html')) { 315 | // console.warn(`Non-HTML content skipped: ${normalizedUrl}`); 316 | return; 317 | } 318 | 319 | const html = response.data; 320 | const $ = cheerio.load(html); 321 | const meta = { 322 | title: $("title").text() || "No title available", 323 | description: 324 | $('meta[name="description"]').attr("content") || 325 | "No description available", 326 | url: normalizedUrl, 327 | }; 328 | const bodyHtml = $("body").html(); // Get the body content 329 | const markdown = nhm.translate(bodyHtml); // Convert body content to markdown 330 | console.log(` > ${normalizedUrl}`); 331 | docs.push({ url: normalizedUrl, content: markdown }); 332 | if (markdown.length) { 333 | const slug = slugify( 334 | normalizedUrl.split("://").slice(1).join("://"), 335 | ); 336 | const dir = path.join(DB_DIR, "crawl", root_slug, "raw"); 337 | await fs.mkdir(dir, { recursive: true }); 338 | await fs.writeFile( 339 | path.join(dir, `${slug}.yaml`), 340 | yaml.stringify({ 341 | meta, 342 | content: markdown, 343 | }), 344 | "utf-8", 345 | ); 346 | } 347 | 348 | // Find all internal links 349 | const baseUrl = new URL(url); 350 | $("a[href]").each((_, elem) => { 351 | const href = $(elem).attr("href") || ""; 352 | if ( 353 | href.startsWith("mailto:") || 354 | href.startsWith("tel:") || 355 | href.startsWith("javascript:") 356 | ) { 357 | return; // Skip non-HTTP links 358 | } 359 | try { 360 | const link = new URL(href, normalizedUrl); 361 | if (link.origin === baseUrl.origin) { 362 | // Only process same-origin links 363 | const normalizedLink = normalizeUrl(link.toString()); 364 | if (normalizedLink && !visited.has(normalizedLink)) { 365 | visited.add(normalizedLink); 366 | queue.add(() => processUrl(normalizedLink)); 367 | progressBar.increment(); 368 | progressBar.setTotal(progressBar.getTotal() + 1); 369 | } 370 | } 371 | } catch (e) { 372 | // Ignore invalid URLs 373 | } 374 | }); 375 | } catch (error) { 376 | console.error(`Failed to process ${normalizedUrl}: ${error.message}`); 377 | } 378 | }; 379 | 380 | // Start crawling from the root URL 381 | const startingUrl = normalizeUrl(url); 382 | if (!startingUrl) { 383 | throw new Error("Invalid root URL provided."); 384 | } 385 | visited.add(startingUrl); 386 | queue.add(() => processUrl(startingUrl)); 387 | 388 | await queue.onIdle(); 389 | progressBar.stop(); 390 | 391 | // Clear the console after the progress bar finishes 392 | console.clear(); 393 | 394 | console.dir({ crawl: { url, done: true } }); 395 | 396 | // Post-process if enabled 397 | if (post_process) { 398 | await lib.post_process({}); 399 | } 400 | 401 | if (vectorize) { 402 | await lib.vectorize({ root: `crawl/${root_slug}` }); 403 | 404 | if (index) { 405 | const index_methods = Object.keys(index).filter(key => index[key] !== false); 406 | await Promise.all( 407 | index_methods.map(async (index_method) => { 408 | await lib.index.create[index_method]({ root: `crawl/${root_slug}` }); 409 | }) 410 | ) 411 | } 412 | } 413 | 414 | return; 415 | }, 416 | learn: async ({ 417 
| query, 418 | proxy = false, 419 | post_process = false, 420 | use_puppeteer = false, 421 | vectorize = false, 422 | index = false, 423 | }) => { 424 | // Generate search queries using LLM 425 | const searchPrompt = `Generate a list of exactly ${process.env.SERPER_SEARCH_QUERIES} search queries to find information about:\n"${query}"\n\nin text format. For example:\n\`\`\`txt\n- example query- another query\n\`\`\`\n\n> do not wrap the search queries between quotes , should be raw text , one query per line\nyou are to write a total of : ${process.env.SERPER_SEARCH_QUERIES} search queries`; 426 | const searchQuerieResponse = await lib.utils.llm({ 427 | messages: [{ role: "user", content: searchPrompt }], 428 | }); 429 | console.dir({ searchQuerieResponse }); 430 | 431 | const queries = searchQuerieResponse 432 | .split("\n") 433 | .map((l) => l.trim()) 434 | .filter((line) => line.startsWith("-")) 435 | .map((line) => line.split("-").slice(1).join("-").trim()) 436 | .filter((q) => q.length > 0) 437 | .slice(0, parseInt(process.env.SERPER_SEARCH_QUERIES)); 438 | // console.dir({queries}) 439 | 440 | // Perform searches 441 | const searchResults = []; 442 | for (const q of queries) { 443 | const result = await lib.utils.search({ query: q }); 444 | searchResults.push(result); 445 | } 446 | 447 | // Collect HTML content from search results 448 | const docs = []; 449 | for (const res of searchResults) { 450 | if (res.organic) { 451 | // Assuming Serper returns organic results 452 | for (const item of res.organic) { 453 | docs.push({ url: item.link }); 454 | } 455 | } 456 | } 457 | 458 | const root_slug = slugify(query); 459 | // Save and process the documents as needed 460 | for (const doc of docs) { 461 | try { 462 | const response = await retry( 463 | async () => { 464 | return await axiosInstance.get(doc.url); 465 | }, 466 | { 467 | retries: 2, 468 | factor: 2, 469 | minTimeout: 1000, 470 | onRetry: (err, attempt) => { 471 | console.warn( 472 | `Retry ${attempt} for ${doc.url} due to ${err.message}`, 473 | ); 474 | }, 475 | }, 476 | ); 477 | 478 | const html = response.data; 479 | const $ = cheerio.load(html); 480 | const meta = { 481 | title: $("title").text() || "No title available", 482 | description: 483 | $('meta[name="description"]').attr("content") || 484 | "No description available", 485 | url: doc.url, 486 | }; 487 | const bodyHtml = $("body").html(); // Get the body content 488 | const markdown = nhm.translate(bodyHtml); // Convert body content to markdown 489 | console.log(` > ${doc.url}`); 490 | if (markdown.length) { 491 | const slug = slugify(doc.url.split("://").slice(1).join("://")); 492 | const dir = path.join(DB_DIR, "learn", root_slug, "raw"); 493 | await fs.mkdir(dir, { recursive: true }); 494 | await fs.writeFile( 495 | path.join(dir, `${slug}.yaml`), 496 | yaml.stringify({ 497 | meta, 498 | content: markdown, 499 | }), 500 | "utf-8", 501 | ); 502 | } 503 | } catch (error) { 504 | console.error(`Failed to process ${doc.url}: ${error.message}`); 505 | } 506 | } 507 | 508 | console.dir({ learn: { query, done: true } }); 509 | 510 | if (post_process) { 511 | await lib.post_process({}); 512 | } 513 | 514 | if (vectorize) { 515 | await lib.vectorize({ root: `learn/${root_slug}` }); 516 | if (index) { 517 | const index_methods = Object.keys(index).filter(key => index[key] !== false); 518 | await Promise.all( 519 | index_methods.map(async (index_method) => { 520 | await lib.index.create[index_method]({ root: `learn/${root_slug}` }); 521 | }) 522 | ) 523 | } 524 | } 525 | 526 | 
return; 527 | }, 528 | openapi: async ({ 529 | url, 530 | proxy = false, 531 | post_process = false, 532 | vectorize = false, 533 | index = false, 534 | }) => { 535 | const _circularReplacer = () => { 536 | const visited = new WeakSet(); 537 | return (key, value) => { 538 | if (typeof value === "object" && value !== null) { 539 | if (visited.has(value)) { 540 | return; 541 | } 542 | visited.add(value); 543 | } 544 | return value; 545 | }; 546 | }; 547 | 548 | function openapi3(query) { 549 | const api = query.data; 550 | if (api.components) delete api.components; 551 | const _api = { 552 | openapi: api.openapi, 553 | servers: api.servers, 554 | info: api.info, 555 | tags: api.tags ? api.tags : false, 556 | }; 557 | const paths = { ...api.paths }; 558 | const api_functions = Object.keys(paths).map((path_key) => { 559 | return Object.keys(paths[path_key]).map((method_key) => { 560 | let descriptions = []; 561 | if (paths[path_key].summary) 562 | descriptions.push(paths[path_key].summary); 563 | if (paths[path_key].description) 564 | descriptions.push(paths[path_key].description); 565 | if (paths[path_key][method_key].summary) 566 | descriptions.push(paths[path_key][method_key].summary); 567 | if (paths[path_key][method_key].description) 568 | descriptions.push(paths[path_key][method_key].description); 569 | 570 | let category = false; 571 | try { 572 | category = api.info["x-apisguru-categories"][0]; 573 | } catch (e) { 574 | false; 575 | } 576 | 577 | const openapi_specs = { 578 | ..._api, 579 | paths: { 580 | [path_key]: { 581 | [method_key]: paths[path_key][method_key], 582 | }, 583 | }, 584 | }; 585 | 586 | const _specs_string = JSON.stringify(openapi_specs).toLowerCase(); 587 | 588 | const auth = [ 589 | `Auth`, 590 | `Bearer`, 591 | `X-API-Key`, 592 | `X-Api-Key`, 593 | `BasicAuth`, 594 | `ApiKeyAuth`, 595 | `OpenID`, 596 | `OAuth2`, 597 | ].some((_auth_substring) => 598 | _specs_string 599 | .toLowerCase() 600 | .includes(_auth_substring.toLowerCase()), 601 | ); 602 | 603 | const _apiFunction = paths[path_key][method_key].operationId 604 | ? paths[path_key][method_key].operationId 605 | : `${path_key} : ${method_key}`; 606 | const _apiFunctionDescription = descriptions.join(`\n\n`); 607 | const _apiFunctionVectorize = 608 | `api info:\n` + 609 | `- name : ${api.info.title}\n` + 610 | `- description : ${api.info.description ? api.info.description.trim() : "" 611 | }\n` + 612 | `---\n` + 613 | `function name:\n` + 614 | `- ${_apiFunction.trim()}\n` + 615 | `---\n` + 616 | `function description:\n` + 617 | `- ${_apiFunctionDescription.trim()}\n` + 618 | `--\n` + 619 | `function route:\n` + 620 | `${Object.keys( 621 | openapi_specs.paths[Object.keys(openapi_specs.paths)[0]], 622 | )} : ${Object.keys(openapi_specs.paths)[0]}`; 623 | return { 624 | meta: { 625 | format: `openapi`, 626 | api: api.info.title, 627 | info: api.info.description ? api.info.description : ``, 628 | description: _apiFunctionVectorize, 629 | function: _apiFunction, 630 | urls: query.url ? [query.url] : [], 631 | }, 632 | content: yaml.stringify({ 633 | format: `openapi`, 634 | api: { 635 | name: api.info.title, 636 | description: api.info.description ? 
api.info.description : ``, 637 | }, 638 | function: _apiFunction, 639 | description: _apiFunctionDescription, 640 | category, 641 | auth, 642 | implement: { 643 | usage: [], 644 | openapi: openapi_specs, 645 | }, 646 | }), 647 | }; 648 | }); 649 | }); 650 | return api_functions.flat(); 651 | } 652 | 653 | async function run(query, attempt = 0) { 654 | if (attempt === 2) return false; 655 | 656 | if (!query.data) { 657 | if (!query.url) return false; 658 | query.data = (await axios.get(query.url)).data; 659 | query.data = 660 | typeof query.data === "string" 661 | ? yaml.parse(query.data) 662 | : query.data; 663 | } 664 | 665 | try { 666 | query.data = await OpenAPIParser.validate(query.data); 667 | } catch (e) { 668 | console.error(e, "fallback"); 669 | query.data = await OpenAPIParser.parse(query.data); 670 | } 671 | query.data = JSON.parse( 672 | JSON.stringify(query.data, _circularReplacer()), 673 | ); 674 | if (query.data.swagger) { 675 | if (!query.url) return false; 676 | attempt++; 677 | return await run( 678 | { 679 | url: `https://converter.swagger.io/api/convert?url=${query.url}`, 680 | }, 681 | attempt, 682 | ); 683 | } 684 | return openapi3(query); 685 | } 686 | 687 | let parsed_openapi; 688 | try { 689 | parsed_openapi = await run({ url }); 690 | } catch (error) { 691 | console.error(`failed openapi parse : ${url}: ${error.message}`); 692 | return []; 693 | } 694 | 695 | const root_slug = slugify(url.split("://").slice(1).join("://")); 696 | for (let api_fn of parsed_openapi) { 697 | const { meta, content } = api_fn; 698 | const slug = slugify(api_fn.meta.function); 699 | const dir = path.join(DB_DIR, "openapi", root_slug, "raw"); 700 | await fs.mkdir(dir, { recursive: true }); 701 | await fs.writeFile( 702 | path.join(dir, `${slug}.yaml`), 703 | yaml.stringify({ 704 | meta, 705 | content, 706 | }), 707 | "utf-8", 708 | ); 709 | } 710 | 711 | if (post_process) { 712 | await lib.post_process({}); 713 | } 714 | if (vectorize) { 715 | await lib.vectorize({ root: `openapi/${root_slug}` }); 716 | if (index) { 717 | const index_methods = Object.keys(index).filter(key => index[key] !== false); 718 | await Promise.all( 719 | index_methods.map(async (index_method) => { 720 | await lib.index.create[index_method]({ root: `openapi/${root_slug}` }); 721 | }) 722 | ) 723 | } 724 | } 725 | }, 726 | github: async ({ 727 | url, 728 | depth = 3, 729 | proxy = false, 730 | post_process = false, 731 | vectorize = false, 732 | index = false, 733 | }) => { 734 | const GITHUB_API_KEY = process.env.GITHUB_API_KEY; 735 | if (!GITHUB_API_KEY) { 736 | console.error("Error: GITHUB_API_KEY is not set in .env file."); 737 | return []; 738 | } 739 | 740 | const axiosInstance = axios.create({ 741 | headers: { 742 | Authorization: `${GITHUB_API_KEY}`, 743 | }, 744 | }); 745 | 746 | // Helper function to fetch content with retry 747 | async function fetchContentWithRetry(fetchUrl) { 748 | return await retry( 749 | async (bail) => { 750 | try { 751 | const response = await axiosInstance.get(fetchUrl); 752 | return response.data; 753 | } catch (error) { 754 | if (error.response && error.response.status < 500) { 755 | bail(new Error(`Non-retryable error: ${error.message}`)); 756 | return; 757 | } 758 | throw error; 759 | } 760 | }, 761 | { retries: 5 }, 762 | ); 763 | } 764 | 765 | // Recursive function to traverse directories up to specified depth 766 | async function traverseRepo(path, currentDepth) { 767 | if (currentDepth > depth) return []; 768 | 769 | const apiUrl = 
`https://api.github.com/repos/${owner}/${repo}/contents/${path}`; 770 | let contents; 771 | try { 772 | contents = await fetchContentWithRetry(apiUrl); 773 | } catch (error) { 774 | console.error( 775 | `Failed to fetch contents of ${path}: ${error.message}`, 776 | ); 777 | return []; 778 | } 779 | 780 | let readmes = []; 781 | 782 | for (const item of contents) { 783 | if (item.type === "file" && item.name.toLowerCase() === "readme.md") { 784 | const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${defaultBranch}/${item.path}`; 785 | try { 786 | const readmeContent = await fetchContentWithRetry(rawUrl); 787 | readmes.push({ git_path: item.path, content: readmeContent }); 788 | } catch (error) { 789 | console.error( 790 | `Failed to fetch README at ${item.path}: ${error.message}`, 791 | ); 792 | } 793 | } else if (item.type === "dir") { 794 | const subReadmes = await traverseRepo(item.path, currentDepth + 1); 795 | readmes = readmes.concat(subReadmes); 796 | } 797 | } 798 | 799 | return readmes; 800 | } 801 | 802 | // Parse the repo URL to extract owner and repo 803 | let owner, repo; 804 | try { 805 | const parsedUrl = new URL(url); 806 | const pathSegments = parsedUrl.pathname 807 | .split("/") 808 | .filter((seg) => seg.length); 809 | if (pathSegments.length < 2) { 810 | throw new Error("Invalid GitHub repository URL."); 811 | } 812 | owner = pathSegments[0]; 813 | repo = pathSegments[1].replace(/\.git$/, ""); 814 | } catch (error) { 815 | console.error(`Invalid URL provided: ${error.message}`); 816 | return []; 817 | } 818 | 819 | // Fetch repository metadata to get default branch 820 | let defaultBranch = "main"; 821 | try { 822 | const repoUrl = `https://api.github.com/repos/${owner}/${repo}`; 823 | const repoData = await fetchContentWithRetry(repoUrl); 824 | defaultBranch = repoData.default_branch; 825 | } catch (error) { 826 | console.error(`Failed to fetch repository metadata: ${error.message}`); 827 | return []; 828 | } 829 | 830 | // Start traversing the repository 831 | const readmeList = await traverseRepo("", 1); 832 | 833 | const root_slug = `github.${owner}.${repo}`; 834 | for (let readme_file of readmeList) { 835 | const { git_path, content } = readme_file; 836 | if (content.length) { 837 | const slug = slugify(git_path); 838 | const dir = path.join(DB_DIR, "crawl", root_slug, "raw"); 839 | await fs.mkdir(dir, { recursive: true }); 840 | await fs.writeFile( 841 | path.join(dir, `${slug}.yaml`), 842 | yaml.stringify({ 843 | meta: { 844 | owner, 845 | repo, 846 | path: git_path, 847 | }, 848 | content, 849 | }), 850 | "utf-8", 851 | ); 852 | } 853 | } 854 | 855 | if (post_process) { 856 | await lib.post_process({}); 857 | } 858 | 859 | if (vectorize) { 860 | await lib.vectorize({ root: `crawl/${root_slug}` }); 861 | if (index) { 862 | const index_methods = Object.keys(index).filter(key => index[key] !== false); 863 | await Promise.all( 864 | index_methods.map(async (index_method) => { 865 | await lib.index.create[index_method]({ root: `crawl/${root_slug}` }); 866 | }) 867 | ) 868 | } 869 | } 870 | }, 871 | npm: async ({ 872 | name, 873 | proxy = false, 874 | post_process = false, 875 | vectorize = false, 876 | index = false, 877 | }) => { 878 | // Fetch package README using npm registry API 879 | // Fetch TypeScript definitions and process them 880 | try { 881 | const response = await axios.get(`https://registry.npmjs.org/${name}`); 882 | const latest = response.data["dist-tags"].latest; 883 | const git_url = 884 | 
response.data.versions[latest].homepage.split("#readme")[0]; 885 | console.dir({ latest, git_url }); 886 | if (git_url) 887 | return await lib.collect.github({ 888 | url: git_url, 889 | proxy, 890 | post_process, 891 | vectorize, 892 | index, 893 | }); 894 | return; 895 | } catch (error) { 896 | console.error( 897 | `Failed to fetch npm README for ${name}: ${error.message}`, 898 | ); 899 | return []; 900 | } 901 | }, 902 | }, 903 | post_process: async ({ }) => { 904 | // Iterate over each entry and generate cleaner Markdown using LLM 905 | const entries = await fs.readdir(DB_DIR, { withFileTypes: true }); 906 | for (const entry of entries) { 907 | if (entry.isDirectory()) { 908 | const contentPath = path.join(DB_DIR, entry.name, "content.md"); 909 | try { 910 | const content = await fs.readFile(contentPath, "utf-8"); 911 | const cleanerPrompt = `Improve the following markdown:\n\n${content}`; 912 | const cleaner = await lib.utils.llm({ prompt: cleanerPrompt }); 913 | await fs.writeFile(contentPath, cleaner, "utf-8"); 914 | } catch (error) { 915 | console.error( 916 | `Failed to post-process ${entry.name}: ${error.message}`, 917 | ); 918 | } 919 | } 920 | } 921 | return { status: "> postprocessing done" }; 922 | }, 923 | vectorize: async ({ root }) => { 924 | const processedDir = path.join(DB_DIR, root, "processed"); 925 | const rawDir = path.join(DB_DIR, root, "raw"); 926 | const dirToRead = (await fs.stat(processedDir).catch(() => false)) 927 | ? processedDir 928 | : rawDir; 929 | const entries = await fs.readdir(dirToRead, { withFileTypes: true }); 930 | const dataset = await Promise.all( 931 | entries.map(async (entry) => { 932 | const filepath = path.join(dirToRead, entry.name); 933 | const filecontent = await fs.readFile(filepath, "utf8"); 934 | const data = yaml.parse(filecontent); 935 | 936 | const uid = crypto 937 | .createHash("sha512") 938 | .update(data.content) 939 | .digest("hex"); 940 | return { 941 | scope: root, 942 | filepath, 943 | filename: entry.name, 944 | uid, 945 | ...data, 946 | vector_text: `${yaml.stringify(data.meta)}\n---\n\n${data.content.trim()}`, 947 | }; 948 | }), 949 | ); 950 | const batches = _chunkify(dataset, 15); 951 | await Promise.all( 952 | batches.map(async (chunk, chunk_index) => { 953 | // console.dir({chunk , chunk_index}) 954 | const vectors = ( 955 | await lib.utils.embed({ 956 | texts: chunk.map((entry) => entry.vector_text), 957 | }) 958 | ).vectors; 959 | await Promise.all( 960 | vectors.map(async (vector, idx) => { 961 | const item = chunk[idx]; 962 | const vectorDir = path.join(VECTORS_DIR, root); 963 | await fs.mkdir(vectorDir, { recursive: true }); 964 | await fs.writeFile( 965 | path.join(VECTORS_DIR, root, `${item.uid}.json`), 966 | JSON.stringify({ 967 | ...item, 968 | vector, 969 | }), 970 | "utf-8", 971 | ); 972 | if (LOCAL_KNOWLEDGE_DB) { 973 | LOCAL_KNOWLEDGE_DB[item.uid] = { meta: item.meta, content: item.content } 974 | } 975 | }), 976 | ); 977 | }), 978 | ); 979 | console.log(`> vectorized : ${root}`); 980 | }, 981 | index: { 982 | create: { 983 | local: async ({ root }) => { 984 | if (!LOCAL_PG_INSTANCE) { 985 | const metaDb = new PGlite(INDEX_DIR, { 986 | extensions: { 987 | vector, 988 | }, 989 | }); 990 | await metaDb.waitReady; 991 | LOCAL_PG_INSTANCE = metaDb; 992 | } 993 | try { 994 | await LOCAL_PG_INSTANCE.exec(` 995 | create extension if not exists vector; 996 | -- drop table if exists embeddings; -- Uncomment this line to reset the database 997 | create table if not exists embeddings ( 998 | id bigint primary key 
generated always as identity, 999 | uid text not null unique, 1000 | embedding vector (1536) 1001 | ); 1002 | 1003 | create index on embeddings using hnsw (embedding vector_ip_ops); 1004 | `); 1005 | } catch (e) { 1006 | console.error(e); 1007 | } 1008 | const entries = await fs.readdir(path.join(VECTORS_DIR, root)); 1009 | const jsonFiles = entries.filter((file) => file.endsWith(".json")); 1010 | const dataset = (await Promise.all( 1011 | jsonFiles.map(async (file) => { 1012 | try { 1013 | 1014 | const filePath = path.join(VECTORS_DIR, root, file); 1015 | const content = await fs.readFile(filePath, "utf-8"); 1016 | return JSON.parse(content); 1017 | } catch (e) { 1018 | console.error(e) 1019 | } 1020 | return false 1021 | }), 1022 | )).filter(e => e); 1023 | const chunks = _chunkify(dataset, 50); 1024 | for (let chunk of chunks) { 1025 | // Filter out entries that already exist in the database 1026 | const existingUids = await LOCAL_PG_INSTANCE.query(` 1027 | SELECT uid FROM embeddings WHERE uid IN (${chunk.map((entry) => `'${entry.uid}'`).join(", ")}); 1028 | `); 1029 | const existingUidSet = new Set( 1030 | existingUids.rows.map((row) => row.uid), 1031 | ); 1032 | 1033 | const newEntries = chunk.filter( 1034 | (entry) => !existingUidSet.has(entry.uid), 1035 | ); 1036 | if (newEntries.length > 0) { 1037 | const pg_entries = newEntries 1038 | .map((entry) => { 1039 | return `\t('${entry.uid}','${JSON.stringify(entry.vector)}')`; 1040 | }) 1041 | .join(",\n"); 1042 | 1043 | await LOCAL_PG_INSTANCE.exec(` 1044 | insert into embeddings (uid, embedding) values 1045 | ${pg_entries}; 1046 | `); 1047 | } 1048 | 1049 | console.dir( 1050 | await LOCAL_PG_INSTANCE.query(`SELECT COUNT(*) FROM embeddings;`), 1051 | { depth: null }, 1052 | ); 1053 | } 1054 | }, 1055 | supabase: async ({ }) => { }, 1056 | weaviate: async ({ }) => { }, 1057 | }, 1058 | query: { 1059 | local: async ({ query, embedding = false, threshold = 0.0, amount = 6 }) => { 1060 | if (!LOCAL_PG_INSTANCE) { 1061 | const metaDb = new PGlite(INDEX_DIR, { 1062 | extensions: { 1063 | vector, 1064 | }, 1065 | }); 1066 | await metaDb.waitReady; 1067 | LOCAL_PG_INSTANCE = metaDb; 1068 | } 1069 | const query_vector = embedding ? embedding : (await lib.utils.embed({ texts: [query] })).vectors[0] 1070 | const res = await LOCAL_PG_INSTANCE.query( 1071 | ` 1072 | select * from embeddings 1073 | where embeddings.embedding <#> $1 < $2 1074 | order by embeddings.embedding <#> $1 1075 | limit $3; 1076 | `, 1077 | [JSON.stringify(query_vector), -Number(threshold), Number(amount)] 1078 | ) 1079 | return res.rows.map(item => { 1080 | return { 1081 | uid: item.uid, 1082 | data: LOCAL_KNOWLEDGE_DB[item.uid] ? LOCAL_KNOWLEDGE_DB[item.uid] : false, 1083 | } 1084 | }) 1085 | }, 1086 | supabase: async ({ }) => { }, 1087 | weaviate: async ({ }) => { }, 1088 | }, 1089 | ask: { 1090 | local: async ({ query, model = "gpt-4o" }) => { 1091 | 1092 | const retrieved = await lib.index.query.local({ 1093 | query, 1094 | amount: 7 1095 | }) 1096 | const messages = [ 1097 | { 1098 | role: 'user', 1099 | content: `# FOUND REFERENCES : 1100 | 1101 | ${retrieved.map(entry => yaml.stringify(entry.data)).join('\n---\n')} 1102 | ------ 1103 | 1104 | # USER QUERY : 1105 | 1106 | ${query}` 1107 | } 1108 | ] 1109 | 1110 | return await lib.utils.llm({ messages, model }); 1111 | } 1112 | }, 1113 | }, 1114 | }; 1115 | 1116 | export default lib; 1117 | --------------------------------------------------------------------------------