├── .env.example ├── .gitignore ├── index.js ├── lib ├── Docs2Vector.d.ts └── Docs2Vector.js ├── package.json ├── readme.md └── script.js /.env.example: -------------------------------------------------------------------------------- 1 | GITHUB_TOKEN=your_github_token 2 | UPSTASH_VECTOR_REST_URL=your_upstash_vector_url 3 | UPSTASH_VECTOR_REST_TOKEN=your_upstash_vector_token 4 | 5 | # OPENAI_API_KEY=your_openai_api_key # Optional - if not provided, will use Upstash embeddings 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | .env 3 | temp_repo/ 4 | .idea 5 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | import Docs2Vector from "./lib/Docs2Vector.js"; 2 | 3 | export default Docs2Vector; -------------------------------------------------------------------------------- /lib/Docs2Vector.d.ts: -------------------------------------------------------------------------------- 1 | declare class Docs2Vector { 2 | private vectorUrl: string; 3 | private vectorToken: string; 4 | private openAiApiKey: string; 5 | private githubToken: string; 6 | private namespace: string; 7 | private index: any; 8 | private embeddings: any; 9 | private octokit: any; 10 | private textSplitter: any; 11 | 12 | constructor(); 13 | init(): Promise; 14 | reindex(): Promise; 15 | search(query: string, limit?: number): Promise; 16 | addDocument(content: string, metadata: any): Promise; 17 | 18 | private static generateId(content: string): string; 19 | private cloneRepository(repoUrl: string): Promise; 20 | private findMarkdownFiles(dir: string): Promise; 21 | private processFile(filePath: string): Promise>; 32 | 33 | run(repoUrl: string): Promise; 34 | } 35 | 36 | export default Docs2Vector; 37 | 
import dotenv from 'dotenv';
import { Octokit } from '@octokit/rest';
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { Document } from "@langchain/core/documents";
import { OpenAIEmbeddings } from "@langchain/openai";
import { Index } from "@upstash/vector";
import { promises as fs } from 'fs';
import path from 'path';
import { simpleGit } from 'simple-git';
import crypto from 'crypto';
import { fileURLToPath } from 'url';

// ES modules fix for __dirname
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Load environment variables from .env
dotenv.config();

// Name of the scratch directory (under the current working directory) that
// repositories are cloned into.
const TEMP_REPO_DIR = 'temp_repo';

// Maximum number of records sent to Upstash Vector in a single upsert call.
const UPSERT_BATCH_SIZE = 100;

// Files with these extensions are treated as documentation.
const MARKDOWN_FILE_PATTERN = /\.(md|mdx)$/;

/**
 * Clones a Git repository, splits its Markdown/MDX files into text chunks and
 * upserts those chunks into an Upstash Vector index.
 *
 * Configuration is read from environment variables in the constructor:
 * - `UPSTASH_VECTOR_REST_URL` / `UPSTASH_VECTOR_REST_TOKEN`: the target index.
 * - `OPENAI_API_KEY` (optional): when set, a vector is computed for every
 *   chunk with OpenAI embeddings; otherwise records are sent in the data-only
 *   format (text without a vector).
 * - `GITHUB_TOKEN` (optional): when set, an Octokit client is created. No
 *   method in this file uses that client; cloning goes through simple-git.
 */
class Docs2Vector {
  constructor() {
    // Load parameters from environment variables
    this.vectorUrl = process.env.UPSTASH_VECTOR_REST_URL;
    this.vectorToken = process.env.UPSTASH_VECTOR_REST_TOKEN;
    this.openAiApiKey = process.env.OPENAI_API_KEY;
    this.githubToken = process.env.GITHUB_TOKEN;
    // Set by run() from the repository URL before anything is upserted.
    this.namespace = "";

    // Initialize Upstash Vector
    this.index = new Index({
      url: this.vectorUrl,
      token: this.vectorToken
    });

    // Optionally initialize OpenAI embeddings
    if (this.openAiApiKey) {
      this.embeddings = new OpenAIEmbeddings({
        openAIApiKey: this.openAiApiKey
      });
    }

    // Optionally initialize GitHub API client
    if (this.githubToken) {
      this.octokit = new Octokit({
        auth: this.githubToken
      });
    }

    // Initialize text splitter
    this.textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000,
      chunkOverlap: 200,
      separators: ["\n\n", "\n", " ", ""]
    });
  }

  /**
   * Builds the record ID for a chunk as the MD5 hex digest of its text.
   *
   * The ID depends on the text only, so re-processing unchanged content
   * overwrites the existing record instead of duplicating it. The flip side
   * is that identical text appearing in two files maps to a single record.
   *
   * @param {string} content Chunk text.
   * @returns {string} 32-character hexadecimal digest.
   */
  static #generateId(content) {
    return crypto.createHash('md5').update(content).digest('hex');
  }

  /**
   * Derives the Upstash namespace from a repository URL: the last path
   * segment without a trailing `.git`.
   *
   * Trailing slashes are ignored, and only a `.git` *suffix* is removed, so
   * names that merely contain ".git" (e.g. `user.github.io`) stay intact.
   *
   * @param {string} repoUrl Repository URL passed to run().
   * @returns {string} Namespace name.
   */
  static #deriveNamespace(repoUrl) {
    const lastSegment = repoUrl.replace(/\/+$/, '').split('/').pop();
    return lastSegment.replace(/\.git$/, '');
  }

  /**
   * Clones the repository into `<cwd>/temp_repo`, replacing any previous
   * contents of that directory.
   *
   * @param {string} repoUrl URL handed to `git clone`.
   * @returns {Promise<string>} Absolute path of the clone.
   * @throws Rethrows the clone error after removing the partial checkout.
   */
  async #cloneRepository(repoUrl) {
    const tempDir = path.join(process.cwd(), TEMP_REPO_DIR);

    // Clean up temporary directory if it exists
    await fs.rm(tempDir, { recursive: true, force: true });

    try {
      await simpleGit().clone(repoUrl, tempDir);
    } catch (error) {
      // Do not leave a half-written checkout behind when the clone fails.
      await fs.rm(tempDir, { recursive: true, force: true });
      throw error;
    }

    return tempDir;
  }

  /**
   * Recursively collects the paths of all `.md` / `.mdx` files below `dir`.
   * Directories whose name starts with a dot (e.g. `.git`) are not entered.
   *
   * @param {string} dir Directory to scan.
   * @returns {Promise<string[]>} Paths of the matching files.
   */
  async #findMarkdownFiles(dir) {
    const entries = await fs.readdir(dir, { withFileTypes: true });
    const markdownFiles = [];

    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);

      if (entry.isDirectory()) {
        // Hidden directories are skipped entirely; they must never reach the
        // file branch below, even when their name ends in ".md".
        if (!entry.name.startsWith('.')) {
          const nested = await this.#findMarkdownFiles(fullPath);
          markdownFiles.push(...nested);
        }
      } else if (MARKDOWN_FILE_PATTERN.test(entry.name)) {
        markdownFiles.push(fullPath);
      }
    }

    return markdownFiles;
  }

  /**
   * Reads one Markdown file, splits it into chunks and turns every chunk into
   * a record ready for `index.upsert`.
   *
   * Each record has an `id`, the chunk text in `data`, and `metadata` with
   * `fileName`, `filePath`, `fileType` and `timestamp`. When OpenAI embeddings
   * are configured, the record also carries a `vector`.
   *
   * @param {string} filePath Path of the file to process.
   * @param {string} [repoDir] Root that `metadata.filePath` is made relative
   *   to. Defaults to the current working directory.
   * @returns {Promise<object[]>} One record per chunk; empty when the file
   *   yields no chunks.
   */
  async #processFile(filePath, repoDir = process.cwd()) {
    const content = await fs.readFile(filePath, 'utf-8');
    const doc = new Document({ pageContent: content });
    const chunks = await this.textSplitter.splitDocuments([doc]);

    // Nothing to embed or store (e.g. an empty file).
    if (chunks.length === 0) {
      return [];
    }

    const texts = chunks.map(chunk => chunk.pageContent);

    // One embeddings request per file instead of one per chunk.
    const vectors = this.embeddings
      ? await this.embeddings.embedDocuments(texts)
      : null;

    const metadata = {
      fileName: path.basename(filePath),
      filePath: path.relative(repoDir, filePath),
      fileType: path.extname(filePath).substring(1),
      timestamp: Date.now()
    };

    return texts.map((text, position) => {
      const record = {
        id: Docs2Vector.#generateId(text),
        metadata: { ...metadata },
        data: text
      };

      if (vectors) {
        record.vector = vectors[position];
      }

      return record;
    });
  }

  /**
   * Processes a repository end to end: clone, find Markdown files, chunk
   * them, upsert the chunks into the namespace derived from the repository
   * name, and remove the clone.
   *
   * The clone is removed whether or not processing succeeds.
   *
   * @param {string} repoUrl URL of the Git repository to process.
   * @returns {Promise<void>}
   * @throws Rethrows any error after logging its message.
   */
  async run(repoUrl) {
    try {
      console.log(`Processing repository: ${repoUrl}`);
      this.namespace = Docs2Vector.#deriveNamespace(repoUrl);
      console.log(`Using namespace: ${this.namespace}`);

      // Clone repository
      const repoDir = await this.#cloneRepository(repoUrl);

      try {
        // Find markdown files
        const markdownFiles = await this.#findMarkdownFiles(repoDir);
        console.log(`Found ${markdownFiles.length} markdown files`);

        // Process markdown files
        const allChunks = [];
        for (const file of markdownFiles) {
          console.log(`Processing file: ${file}`);
          const chunks = await this.#processFile(file, repoDir);
          allChunks.push(...chunks);
        }

        // Store embeddings in Upstash Vector
        console.log(`Storing ${allChunks.length} chunks in Upstash Vector`);
        const totalBatches = Math.ceil(allChunks.length / UPSERT_BATCH_SIZE);
        for (let i = 0; i < allChunks.length; i += UPSERT_BATCH_SIZE) {
          const batch = allChunks.slice(i, i + UPSERT_BATCH_SIZE);
          await this.index.upsert(batch, { namespace: this.namespace });
          console.log(`Processed batch ${Math.floor(i / UPSERT_BATCH_SIZE) + 1} of ${totalBatches}`);
        }
      } finally {
        // Clean up cloned repository, also when processing failed part-way.
        await fs.rm(repoDir, { recursive: true, force: true });
      }

      console.log('Processing completed successfully');
    } catch (error) {
      console.error('Error:', error.message);
      throw error;
    }
  }
}

export default Docs2Vector;
-------------------------------------------------------------------------------- 1 | { 2 | "name": "@upstash/docs2vector", 3 | "version": "1.0.2", 4 | "description": "A tool to process markdown files from GitHub repositories and store them in Upstash Vector", 5 | "main": "index.js", 6 | "type": "module", 7 | "files": [ 8 | "lib", 9 | "index.js" 10 | ], 11 | "scripts": { 12 | "start": "node index.js" 13 | }, 14 | "keywords": [ 15 | "github", 16 | "markdown", 17 | "vector-database", 18 | "upstash", 19 | "embeddings" 20 | ], 21 | "author": "", 22 | "license": "MIT", 23 | "publishConfig": { 24 | "access": "public" 25 | }, 26 | "dependencies": { 27 | "@langchain/core": "0.2.21", 28 | "@langchain/openai": "0.0.14", 29 | "@octokit/rest": "20.0.2", 30 | "@upstash/docs2vector": "^1.0.0", 31 | "@upstash/vector": "^1.0.4", 32 | "dotenv": "16.4.1", 33 | "langchain": "0.3.9", 34 | "simple-git": "3.22.0" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # GitHub Docs Vectorizer 2 | 3 | This tool processes the Markdown files of your GitHub documentation repo and inserts them into an Upstash Vector database, so you can build document search systems, AI-driven documentation assistants, or knowledge bases. 4 | 5 | ## Features 6 | - Recursively finds all Markdown (`.md`) and MDX (`.mdx`) files in any GitHub repository 7 | - Chunks documents using LangChain's RecursiveCharacterTextSplitter 8 | - Supports both OpenAI and Upstash embeddings 9 | - Stores document chunks and metadata in Upstash Vector for semantic retrieval 10 | 11 | ## Prerequisites 12 | - Node.js (v16 or higher) 13 | - GitHub personal access token (required for repository access) 14 | - Upstash Vector database account (to store vectors) 15 | - OpenAI API key (optional, for generating embeddings) 16 | 17 | ## How to Find Your GitHub Token 18 | 19 |
20 | Click to expand instructions for getting your GitHub token 21 | 22 | 1. Go to [GitHub.com](https://github.com) and sign in to your account 23 | 2. Click on your profile picture in the top-right corner 24 | 3. Go to `Settings` > `Developer settings` > `Personal access tokens` > `Tokens (classic)` 25 | 4. Click `Generate new token` > `Generate new token (classic)` 26 | 5. Give your token a descriptive name in the "Note" field 27 | 6. Select the following scopes: 28 | - `repo` (Full control of private repositories) 29 | - `read:org` (Read organization data) 30 | 7. Click `Generate token` 31 |
32 | 33 | ## Installation Guide 34 | 35 | 1. Clone the repository or create a new directory: 36 | ```bash 37 | mkdir github-docs-vectorizer 38 | cd github-docs-vectorizer 39 | ``` 40 | 41 | 2. Ensure the following files are included in your directory: 42 | - `script.js`: The main script for processing 43 | - `package.json`: Manages project dependencies 44 | - `.env`: Contains your environment variables (explained below) 45 | 46 | 3. Install dependencies: 47 | ```bash 48 | npm install 49 | ``` 50 | 51 | 4. Set up a `.env` file in the root directory of your project with your credentials: 52 | ```env 53 | # Required for accessing GitHub repositories 54 | GITHUB_TOKEN=your_github_token 55 | 56 | # Required for storing vectors in Upstash 57 | UPSTASH_VECTOR_REST_URL=your_upstash_vector_url 58 | UPSTASH_VECTOR_REST_TOKEN=your_upstash_vector_token 59 | 60 | # Optional: Provide if using OpenAI embeddings 61 | OPENAI_API_KEY=your_openai_api_key 62 | ``` 63 | 64 | ## Usage 65 | 66 | Run the script by providing the GitHub repository URL as an argument: 67 | 68 | ```bash 69 | node script.js https://github.com/username/repository 70 | ``` 71 | 72 | Example: 73 | ```bash 74 | node script.js https://github.com/facebook/react 75 | ``` 76 | 77 | The script will: 78 | 1. Clone the specified repository 79 | 2. Find all Markdown files 80 | 3. Split content into chunks 81 | 4. Generate embeddings (using either OpenAI or Upstash) 82 | 5. Store the chunks in your Upstash Vector database 83 | 6. Clean up temporary files 84 | 85 | ## Configuration 86 | 87 | ### Embedding Options 88 | 89 | ### Supported Embedding Providers 90 | 91 | 1. OpenAI Embeddings (default if API key is provided) 92 | - Requires `OPENAI_API_KEY` in `.env` 93 | - Uses OpenAI's text-embedding-ada-002 model 94 | - You need to choose `custom` as embedding model while creating Vector Index 95 | 96 | 2. 
Upstash Embeddings (used when OpenAI API key is not provided) 97 | - No additional configuration needed 98 | - Uses Upstash's built-in embedding service 99 | - You need to choose an embedding model when creating the Vector Index 100 | 101 | ### Customizing Document Chunking 102 | 103 | To adjust how documents are split into chunks, update the text splitter configuration in the `Docs2Vector` constructor in `lib/Docs2Vector.js`: 104 | 105 | ```javascript 106 | this.textSplitter = new RecursiveCharacterTextSplitter({ 107 | chunkSize: 1000, // Adjust chunk size as needed 108 | chunkOverlap: 200 // Adjust overlap as needed 109 | }); 110 | ``` 111 | 112 | ## SDK 113 | 114 | Use the SDK to trigger the functionality programmatically. 115 | 116 | ```shell 117 | npm install @upstash/docs2vector dotenv 118 | ``` 119 | 120 | ```javascript 121 | import dotenv from 'dotenv'; 122 | import Docs2Vector from "@upstash/docs2vector"; 123 | 124 | // Load environment variables 125 | dotenv.config(); 126 | 127 | async function main() { 128 | console.time('Processing Time'); 129 | try { 130 | // Step 1: Define the GitHub repository URL 131 | const githubRepoUrl = 'YOUR_GITHUB_URL'; 132 | 133 | // Print start message 134 | console.log(`Starting processing for the repository: ${githubRepoUrl}`); 135 | 136 | // Step 2: Initialize the Docs2Vector SDK 137 | const converter = new Docs2Vector(); 138 | 139 | // Step 3: Run the processing flow with Docs2Vector's `run` method 140 | await converter.run(githubRepoUrl); 141 | 142 | // Print success message 143 | console.log(`Successfully processed repository: ${githubRepoUrl}`); 144 | console.log('Vectors stored in Upstash Vector database.'); 145 | console.timeEnd('Processing Time'); 146 | } catch (error) { 147 | console.timeEnd('Processing Time'); 148 | console.error('An error occurred while processing the repository:', error.message); 149 | } 150 | } 151 | 152 | main(); 153 | ``` 154 | 155 | 156 | ## Metadata 157 | 158 | Metadata accompanies each stored chunk for improved context: 159 | - Original
file name 160 | - File type (Markdown or MDX) 161 | - Relative file path in the repository 162 | - Document source for the specific chunk of text 163 | 164 | ## Error Handling 165 | The script is designed to handle errors gracefully in the following cases: 166 | - Invalid repository URLs provided 167 | - Missing or incorrect credentials 168 | - Unable to access or read the required files 169 | - Connectivity or network-related problems 170 | - Network problems 171 | 172 | In case of errors, the script will: 173 | 1. Log the error message 174 | 2. Clean up any temporary files 175 | 3. Exit with a non-zero status code 176 | 177 | ## Contributing 178 | 179 | Feel free to submit issues and enhancement requests! 180 | 181 | ## License 182 | 183 | MIT License - feel free to use this tool for any purpose. 184 | 185 | ## Credits 186 | 187 | This tool uses the following open-source packages: 188 | - **LangChain**: Handles document processing and vector store integration 189 | - **Octokit**: Facilitates interactions with the GitHub API 190 | - **simple-git**: Manages operations on Git repositories 191 | - **Upstash Vector**: Enables seamless storage and retrieval of document vectors 192 | -------------------------------------------------------------------------------- /script.js: -------------------------------------------------------------------------------- 1 | import Docs2Vector from './lib/Docs2Vector.js'; 2 | import dotenv from 'dotenv'; 3 | 4 | // Load environment variables 5 | dotenv.config(); 6 | 7 | async function main() { 8 | console.time('Processing Time'); 9 | try { 10 | // Step 1: Define the GitHub repository URL 11 | const githubRepoUrl = 'https://github.com/upstash/docs2vector'; 12 | 13 | // Print start message 14 | console.log(`Starting processing for the repository: ${githubRepoUrl}`); 15 | 16 | // Step 2: Initialize the Docs2Vector SDK 17 | const converter = new Docs2Vector(); 18 | 19 | // Step 3: Run the processing flow with Docs2Vector's `run` method 20 | 
await converter.run(githubRepoUrl); 21 | 22 | // Print success message 23 | console.log(`Successfully processed repository: ${githubRepoUrl}`); 24 | console.log('Vectors stored in Upstash Vector database.'); 25 | console.timeEnd('Processing Time'); 26 | } catch (error) { 27 | console.timeEnd('Processing Time'); 28 | console.error('An error occurred while processing the repository:', error.message); 29 | } 30 | } 31 | 32 | main(); --------------------------------------------------------------------------------