├── .eslintignore ├── .eslintrc ├── .github └── workflows │ └── npm-publish.yml ├── .gitignore ├── .husky ├── commit-msg └── pre-commit ├── .vscode ├── extensions.json ├── settings.json └── tasks.json ├── CITATION.cff ├── LICENSE ├── README.md ├── commitlint.config.js ├── embedchain ├── __tests__ │ └── readme.test.ts ├── chunkers │ ├── BaseChunker.ts │ ├── PdfFile.ts │ ├── QnaPair.ts │ ├── WebPage.ts │ └── index.ts ├── embedchain.ts ├── index.ts ├── loaders │ ├── BaseLoader.ts │ ├── LocalQnaPair.ts │ ├── PdfFile.ts │ ├── WebPage.ts │ └── index.ts ├── models │ ├── ChunkResult.ts │ ├── DataDict.ts │ ├── DataType.ts │ ├── FormattedResult.ts │ ├── Input.ts │ ├── LoaderResult.ts │ ├── Metadata.ts │ ├── Method.ts │ ├── QnAPair.ts │ └── index.ts ├── utils.ts └── vectordb │ ├── BaseVectorDb.ts │ ├── ChromaDb.ts │ └── index.ts ├── index.js ├── jest.config.js ├── lint-staged.config.js ├── package-lock.json ├── package.json ├── tsconfig.build.json └── tsconfig.json /.eslintignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | // Configuration for JavaScript files 3 | "extends": [ 4 | "airbnb-base", 5 | "plugin:prettier/recommended" 6 | ], 7 | "rules": { 8 | "prettier/prettier": [ 9 | "error", 10 | { 11 | "singleQuote": true, 12 | "endOfLine": "auto" 13 | } 14 | ] 15 | }, 16 | "overrides": [ 17 | // Configuration for TypeScript files 18 | { 19 | "files": ["**/*.ts", "**/__tests__/*.test.ts"], 20 | "plugins": [ 21 | "@typescript-eslint", 22 | "unused-imports", 23 | "simple-import-sort" 24 | ], 25 | "extends": [ 26 | "airbnb-typescript", 27 | "plugin:prettier/recommended" 28 | ], 29 | "parserOptions": { 30 | "project": "./tsconfig.json" 31 | }, 32 | "rules": { 33 | "prettier/prettier": [ 34 | "error", 35 | { 36 | "singleQuote": true, 37 | 
"endOfLine": "auto" 38 | } 39 | ], 40 | "@typescript-eslint/comma-dangle": "off", // Avoid conflict rule between Eslint and Prettier 41 | "@typescript-eslint/consistent-type-imports": "error", // Ensure `import type` is used when it's necessary 42 | "import/prefer-default-export": "off", // Named export is easier to refactor automatically 43 | "simple-import-sort/imports": "error", // Import configuration for `eslint-plugin-simple-import-sort` 44 | "simple-import-sort/exports": "error", // Export configuration for `eslint-plugin-simple-import-sort` 45 | "@typescript-eslint/no-unused-vars": "off", 46 | "react/jsx-filename-extension": "off", // Gives error 47 | "unused-imports/no-unused-imports": "error", 48 | "unused-imports/no-unused-vars": [ 49 | "error", 50 | { "argsIgnorePattern": "^_" } 51 | ] 52 | } 53 | } 54 | ] 55 | } 56 | -------------------------------------------------------------------------------- /.github/workflows/npm-publish.yml: -------------------------------------------------------------------------------- 1 | name: Node.js Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - uses: actions/setup-node@v3 13 | with: 14 | node-version: 16 15 | - run: npm ci 16 | - run: npm test 17 | - run: npm run build 18 | - uses: actions/upload-artifact@v3 19 | with: 20 | name: dist 21 | path: dist 22 | - uses: actions/upload-artifact@v3 23 | with: 24 | name: types 25 | path: types 26 | 27 | publish-npm: 28 | needs: build 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v3 32 | - uses: actions/setup-node@v3 33 | with: 34 | node-version: 16 35 | registry-url: https://registry.npmjs.org/ 36 | - uses: actions/download-artifact@v3 37 | with: 38 | name: dist 39 | path: dist 40 | - uses: actions/download-artifact@v3 41 | with: 42 | name: types 43 | path: types 44 | - run: npm ci 45 | - run: npm publish 46 | env: 47 | NODE_AUTH_TOKEN: 
${{secrets.npm_token}} 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | .pnpm-debug.log* 9 | 10 | # Diagnostic reports (https://nodejs.org/api/report.html) 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 12 | 13 | # Runtime data 14 | pids 15 | *.pid 16 | *.seed 17 | *.pid.lock 18 | 19 | # Directory for instrumented libs generated by jscoverage/JSCover 20 | lib-cov 21 | 22 | # Coverage directory used by tools like istanbul 23 | coverage 24 | *.lcov 25 | 26 | # nyc test coverage 27 | .nyc_output 28 | 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 30 | .grunt 31 | 32 | # Bower dependency directory (https://bower.io/) 33 | bower_components 34 | 35 | # node-waf configuration 36 | .lock-wscript 37 | 38 | # Compiled binary addons (https://nodejs.org/api/addons.html) 39 | build/Release 40 | 41 | # Dependency directories 42 | node_modules/ 43 | jspm_packages/ 44 | 45 | # Snowpack dependency directory (https://snowpack.dev/) 46 | web_modules/ 47 | 48 | # TypeScript cache 49 | *.tsbuildinfo 50 | 51 | # Optional npm cache directory 52 | .npm 53 | 54 | # Optional eslint cache 55 | .eslintcache 56 | 57 | # Optional stylelint cache 58 | .stylelintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | # Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variable files 76 | .env 77 | .env.development.local 78 | .env.test.local 79 | .env.production.local 80 | .env.local 81 | 82 | # parcel-bundler cache (https://parceljs.org/) 83 | .cache 84 | .parcel-cache 85 | 86 | # Next.js build output 87 | .next 88 | out 
89 | 90 | # Nuxt.js build / generate output 91 | .nuxt 92 | dist 93 | 94 | # Gatsby files 95 | .cache/ 96 | # Comment in the public line in if your project uses Gatsby and not Next.js 97 | # https://nextjs.org/blog/next-9-1#public-directory-support 98 | # public 99 | 100 | # vuepress build output 101 | .vuepress/dist 102 | 103 | # vuepress v2.x temp and cache directory 104 | .temp 105 | .cache 106 | 107 | # Docusaurus cache and generated files 108 | .docusaurus 109 | 110 | # Serverless directories 111 | .serverless/ 112 | 113 | # FuseBox cache 114 | .fusebox/ 115 | 116 | # DynamoDB Local files 117 | .dynamodb/ 118 | 119 | # TernJS port file 120 | .tern-port 121 | 122 | # Stores VSCode versions used for testing VSCode extensions 123 | .vscode-test 124 | 125 | # yarn v2 126 | .yarn/cache 127 | .yarn/unplugged 128 | .yarn/build-state.yml 129 | .yarn/install-state.gz 130 | .pnp.* 131 | 132 | .ideas.md 133 | .todos.md 134 | 135 | # Custom 136 | dist 137 | types 138 | build -------------------------------------------------------------------------------- /.husky/commit-msg: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | . "$(dirname "$0")/_/husky.sh" 3 | 4 | npx --no -- commitlint --edit $1 5 | -------------------------------------------------------------------------------- /.husky/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | . 
"$(dirname "$0")/_/husky.sh" 3 | 4 | # Disable concurent to run `check-types` after ESLint in lint-staged 5 | npx lint-staged --concurrent false 6 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "dbaeumer.vscode-eslint", 4 | "esbenp.prettier-vscode", 5 | "mikestead.dotenv", 6 | "csstools.postcss", 7 | "bradlc.vscode-tailwindcss", 8 | "Orta.vscode-jest" 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "typescript.tsdk": "node_modules/typescript/lib", 3 | "typescript.enablePromptUseWorkspaceTsdk": true, 4 | "[javascript]": { 5 | "editor.tabSize": 2 6 | }, 7 | "[typescript]": { 8 | "editor.tabSize": 2 9 | }, 10 | "[typescriptreact]": { 11 | "editor.tabSize": 2 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "label": "Project wide type checking with TypeScript", 8 | "type": "npm", 9 | "script": "check-types", 10 | "problemMatcher": ["$tsc"], 11 | "group": { 12 | "kind": "build", 13 | "isDefault": true 14 | }, 15 | "presentation": { 16 | "clear": true, 17 | "reveal": "never" 18 | } 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Singh" 5 | given-names: "Taranjeet" 6 | title: "Embedchain" 7 | date-released: 2023-06-25 8 | url: "https://github.com/embedchain/embedchainjs" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # embedchainjs 2 | 3 | [![Discord](https://dcbadge.vercel.app/api/server/CUU9FPhRNt?style=flat)](https://discord.gg/CUU9FPhRNt) 4 | [![Twitter](https://img.shields.io/twitter/follow/embedchain)](https://twitter.com/embedchain) 5 | [![Substack](https://img.shields.io/badge/Substack-%23006f5c.svg?logo=substack)](https://embedchain.substack.com/) 6 | 7 | embedchain is a framework to easily create LLM powered bots over any dataset. embedchainjs is Javascript version of embedchain. If you want a python version, check out [embedchain-python](https://github.com/embedchain/embedchain) 8 | 9 | # 🤝 Let's Talk Embedchain! 10 | 11 | Schedule a [Feedback Session](https://cal.com/taranjeetio/ec) with Taranjeet, the founder, to discuss any issues, provide feedback, or explore improvements. 12 | 13 | # How it works 14 | 15 | It abstracts the entire process of loading dataset, chunking it, creating embeddings and then storing in vector database. 
You can add a single dataset or multiple datasets using the `.add` and `.addLocal` functions and then use the `.query` function to find an answer from the added datasets.
Please make sure to use the right version, otherwise you will see the `ChromaDB` error `TypeError: OpenAIApi.Configuration is not a constructor` 65 | 66 | - Make sure that dotenv package is installed and your `OPENAI_API_KEY` in a file called `.env` in the root folder. You can install dotenv by 67 | 68 | ```js 69 | npm install dotenv 70 | ``` 71 | 72 | - Download and install Docker on your device by visiting [this link](https://www.docker.com/). You will need this to run Chroma vector database on your machine. 73 | 74 | - Run the following commands to setup Chroma container in Docker 75 | 76 | ```bash 77 | git clone https://github.com/chroma-core/chroma.git 78 | cd chroma 79 | docker-compose up -d --build 80 | ``` 81 | 82 | - Once Chroma container has been set up, run it inside Docker 83 | 84 | ## Usage 85 | 86 | - We use OpenAI's embedding model to create embeddings for chunks and ChatGPT API as LLM to get answer given the relevant docs. Make sure that you have an OpenAI account and an API key. If you have dont have an API key, you can create one by visiting [this link](https://platform.openai.com/account/api-keys). 87 | 88 | - Once you have the API key, set it in an environment variable called `OPENAI_API_KEY` 89 | 90 | ```js 91 | // Set this inside your .env file 92 | OPENAI_API_KEY = "sk-xxxx"; 93 | ``` 94 | 95 | - Load the environment variables inside your .js file using the following commands 96 | 97 | ```js 98 | const dotenv = require("dotenv"); 99 | dotenv.config(); 100 | ``` 101 | 102 | - Next import the `App` class from embedchain and use `.add` function to add any dataset. 103 | - Now your app is created. You can use `.query` function to get the answer for any query. 
104 | 105 | ```js 106 | const dotenv = require("dotenv"); 107 | dotenv.config(); 108 | const { App } = require("embedchain"); 109 | 110 | async function testApp() { 111 | const navalChatBot = await App(); 112 | 113 | // Embed Online Resources 114 | await navalChatBot.add("web_page", "https://nav.al/feedback"); 115 | await navalChatBot.add("web_page", "https://nav.al/agi"); 116 | await navalChatBot.add( 117 | "pdf_file", 118 | "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf" 119 | ); 120 | 121 | // Embed Local Resources 122 | await navalChatBot.addLocal("qna_pair", [ 123 | "Who is Naval Ravikant?", 124 | "Naval Ravikant is an Indian-American entrepreneur and investor.", 125 | ]); 126 | 127 | const result = await navalChatBot.query( 128 | "What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?" 129 | ); 130 | console.log(result); 131 | // answer: Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality. 132 | } 133 | 134 | testApp(); 135 | ``` 136 | 137 | - If there is any other app instance in your script or app, you can change the import as 138 | 139 | ```javascript 140 | const { App: EmbedChainApp } = require("embedchain"); 141 | 142 | // or 143 | 144 | const { App: ECApp } = require("embedchain"); 145 | ``` 146 | 147 | ## Format supported 148 | 149 | We support the following formats: 150 | 151 | ### PDF File 152 | 153 | To add any pdf file, use the data_type as `pdf_file`. Eg: 154 | 155 | ```javascript 156 | await app.add("pdf_file", "a_valid_url_where_pdf_file_can_be_accessed"); 157 | ``` 158 | 159 | ### Web Page 160 | 161 | To add any web page, use the data_type as `web_page`. 
Eg: 162 | 163 | ```javascript 164 | await app.add("web_page", "a_valid_web_page_url"); 165 | ``` 166 | 167 | ### QnA Pair 168 | 169 | To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg: 170 | 171 | ```javascript 172 | await app.addLocal("qna_pair", ["Question", "Answer"]); 173 | ``` 174 | 175 | ### More Formats coming soon 176 | 177 | - If you want to add any other format, please create an [issue](https://github.com/embedchain/embedchainjs/issues) and we will add it to the list of supported formats. 178 | 179 | ## Testing 180 | 181 | Before you consume valueable tokens, you should make sure that the embedding you have done works and that it's receiving the correct document from the database. 182 | 183 | For this you can use the `dryRun` method. 184 | 185 | Following the example above, add this to your script: 186 | 187 | ```js 188 | let result = await naval_chat_bot.dryRun("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?");console.log(result); 189 | 190 | ''' 191 | Use the following pieces of context to answer the query at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. 192 | terms of the unseen. And I think that’s critical. That is what humans do uniquely that no other creature, no other computer, no other intelligence—biological or artificial—that we have ever encountered does. And not only do we do it uniquely, but if we were to meet an alien species that also had the power to generate these good explanations, there is no explanation that they could generate that we could not understand. We are maximally capable of understanding. There is no concept out there that is possible in this physical reality that a human being, given sufficient time and resources and 193 | Query: What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts? 
194 | Helpful Answer: 195 | ''' 196 | ``` 197 | 198 | _The embedding is confirmed to work as expected. It returns the right document, even if the question is asked slightly different. No prompt tokens have been consumed._ 199 | 200 | **The dry run will still consume tokens to embed your query, but it is only ~1/15 of the prompt.** 201 | 202 | # How does it work? 203 | 204 | Creating a chat bot over any dataset needs the following steps to happen 205 | 206 | - load the data 207 | - create meaningful chunks 208 | - create embeddings for each chunk 209 | - store the chunks in vector database 210 | 211 | Whenever a user asks any query, following process happens to find the answer for the query 212 | 213 | - create the embedding for query 214 | - find similar documents for this query from vector database 215 | - pass similar documents as context to LLM to get the final answer. 216 | 217 | The process of loading the dataset and then querying involves multiple steps and each steps has nuances of it is own. 218 | 219 | - How should I chunk the data? What is a meaningful chunk size? 220 | - How should I create embeddings for each chunk? Which embedding model should I use? 221 | - How should I store the chunks in vector database? Which vector database should I use? 222 | - Should I store meta data along with the embeddings? 223 | - How should I find similar documents for a query? Which ranking model should I use? 224 | 225 | These questions may be trivial for some but for a lot of us, it needs research, experimentation and time to find out the accurate answers. 226 | 227 | embedchain is a framework which takes care of all these nuances and provides a simple interface to create bots over any dataset. 228 | 229 | In the first release, we are making it easier for anyone to get a chatbot over any dataset up and running in less than a minute. 
All you need to do is create an app instance, add the data sets using `.add` function and then use `.query` function to get the relevant answer. 230 | 231 | # Tech Stack 232 | 233 | embedchain is built on the following stack: 234 | 235 | - [Langchain](https://github.com/hwchase17/langchain) as an LLM framework to load, chunk and index data 236 | - [OpenAI's Ada embedding model](https://platform.openai.com/docs/guides/embeddings) to create embeddings 237 | - [OpenAI's ChatGPT API](https://platform.openai.com/docs/guides/gpt/chat-completions-api) as LLM to get answers given the context 238 | - [Chroma](https://github.com/chroma-core/chroma) as the vector database to store embeddings 239 | 240 | # Team 241 | 242 | ## Author 243 | 244 | - Taranjeet Singh ([@taranjeetio](https://twitter.com/taranjeetio)) 245 | 246 | ## Maintainer 247 | 248 | - [cachho](https://github.com/cachho) 249 | - [sahilyadav902](https://github.com/sahilyadav902) 250 | 251 | ## Citation 252 | 253 | If you utilize this repository, please consider citing it with: 254 | ``` 255 | @misc{embedchain, 256 | author = {Taranjeet Singh}, 257 | title = {Embechain: Framework to easily create LLM powered bots over any dataset}, 258 | year = {2023}, 259 | publisher = {GitHub}, 260 | journal = {GitHub repository}, 261 | howpublished = {\url{https://github.com/embedchain/embedchainjs}}, 262 | } 263 | ``` 264 | -------------------------------------------------------------------------------- /commitlint.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { extends: ['@commitlint/config-conventional'] }; 2 | -------------------------------------------------------------------------------- /embedchain/__tests__/readme.test.ts: -------------------------------------------------------------------------------- 1 | import { EmbedChainApp } from '../embedchain'; 2 | 3 | const mockAdd = jest.fn(); 4 | const mockAddLocal = jest.fn(); 5 | const mockQuery = jest.fn(); 
6 | 7 | jest.mock('../embedchain', () => { 8 | return { 9 | EmbedChainApp: jest.fn().mockImplementation(() => { 10 | return { 11 | add: mockAdd, 12 | addLocal: mockAddLocal, 13 | query: mockQuery, 14 | }; 15 | }), 16 | }; 17 | }); 18 | 19 | describe('Test App', () => { 20 | beforeEach(() => { 21 | jest.clearAllMocks(); 22 | }); 23 | 24 | it('tests the App', async () => { 25 | mockQuery.mockResolvedValue( 26 | 'Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality.' 27 | ); 28 | 29 | const navalChatBot = await new EmbedChainApp(undefined, false); 30 | 31 | // Embed Online Resources 32 | await navalChatBot.add('web_page', 'https://nav.al/feedback'); 33 | await navalChatBot.add('web_page', 'https://nav.al/agi'); 34 | await navalChatBot.add( 35 | 'pdf_file', 36 | 'https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf' 37 | ); 38 | 39 | // Embed Local Resources 40 | await navalChatBot.addLocal('qna_pair', [ 41 | 'Who is Naval Ravikant?', 42 | 'Naval Ravikant is an Indian-American entrepreneur and investor.', 43 | ]); 44 | 45 | const result = await navalChatBot.query( 46 | 'What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?' 
47 | ); 48 | 49 | expect(mockAdd).toHaveBeenCalledWith('web_page', 'https://nav.al/feedback'); 50 | expect(mockAdd).toHaveBeenCalledWith('web_page', 'https://nav.al/agi'); 51 | expect(mockAdd).toHaveBeenCalledWith( 52 | 'pdf_file', 53 | 'https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf' 54 | ); 55 | expect(mockAddLocal).toHaveBeenCalledWith('qna_pair', [ 56 | 'Who is Naval Ravikant?', 57 | 'Naval Ravikant is an Indian-American entrepreneur and investor.', 58 | ]); 59 | expect(mockQuery).toHaveBeenCalledWith( 60 | 'What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?' 61 | ); 62 | expect(result).toBe( 63 | 'Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality.' 64 | ); 65 | }); 66 | }); 67 | -------------------------------------------------------------------------------- /embedchain/chunkers/BaseChunker.ts: -------------------------------------------------------------------------------- 1 | import { createHash } from 'crypto'; 2 | import type { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; 3 | 4 | import type { BaseLoader } from '../loaders'; 5 | import type { Input, LoaderResult } from '../models'; 6 | import type { ChunkResult } from '../models/ChunkResult'; 7 | 8 | class BaseChunker { 9 | textSplitter: RecursiveCharacterTextSplitter; 10 | 11 | constructor(textSplitter: RecursiveCharacterTextSplitter) { 12 | this.textSplitter = textSplitter; 13 | } 14 | 15 | async createChunks(loader: BaseLoader, url: Input): Promise { 16 | const documents: ChunkResult['documents'] = []; 17 | const ids: ChunkResult['ids'] = []; 18 | const datas: LoaderResult = await loader.loadData(url); 19 | const metadatas: ChunkResult['metadatas'] = []; 20 | 21 | const dataPromises = datas.map(async (data) => { 22 | const { content, metaData } = data; 23 | const chunks: string[] = 
await this.textSplitter.splitText(content); 24 | chunks.forEach((chunk) => { 25 | const chunkId = createHash('sha256') 26 | .update(chunk + metaData.url) 27 | .digest('hex'); 28 | ids.push(chunkId); 29 | documents.push(chunk); 30 | metadatas.push(metaData); 31 | }); 32 | }); 33 | 34 | await Promise.all(dataPromises); 35 | 36 | return { 37 | documents, 38 | ids, 39 | metadatas, 40 | }; 41 | } 42 | } 43 | 44 | export { BaseChunker }; 45 | -------------------------------------------------------------------------------- /embedchain/chunkers/PdfFile.ts: -------------------------------------------------------------------------------- 1 | import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; 2 | 3 | import { BaseChunker } from './BaseChunker'; 4 | 5 | interface TextSplitterChunkParams { 6 | chunkSize: number; 7 | chunkOverlap: number; 8 | keepSeparator: boolean; 9 | } 10 | 11 | const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = { 12 | chunkSize: 1000, 13 | chunkOverlap: 0, 14 | keepSeparator: false, 15 | }; 16 | 17 | class PdfFileChunker extends BaseChunker { 18 | constructor() { 19 | const textSplitter = new RecursiveCharacterTextSplitter( 20 | TEXT_SPLITTER_CHUNK_PARAMS 21 | ); 22 | super(textSplitter); 23 | } 24 | } 25 | 26 | export { PdfFileChunker }; 27 | -------------------------------------------------------------------------------- /embedchain/chunkers/QnaPair.ts: -------------------------------------------------------------------------------- 1 | import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; 2 | 3 | import { BaseChunker } from './BaseChunker'; 4 | 5 | interface TextSplitterChunkParams { 6 | chunkSize: number; 7 | chunkOverlap: number; 8 | keepSeparator: boolean; 9 | } 10 | 11 | const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = { 12 | chunkSize: 300, 13 | chunkOverlap: 0, 14 | keepSeparator: false, 15 | }; 16 | 17 | class QnaPairChunker extends BaseChunker { 18 | constructor() { 19 | const 
textSplitter = new RecursiveCharacterTextSplitter( 20 | TEXT_SPLITTER_CHUNK_PARAMS 21 | ); 22 | super(textSplitter); 23 | } 24 | } 25 | 26 | export { QnaPairChunker }; 27 | -------------------------------------------------------------------------------- /embedchain/chunkers/WebPage.ts: -------------------------------------------------------------------------------- 1 | import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; 2 | 3 | import { BaseChunker } from './BaseChunker'; 4 | 5 | interface TextSplitterChunkParams { 6 | chunkSize: number; 7 | chunkOverlap: number; 8 | keepSeparator: boolean; 9 | } 10 | 11 | const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = { 12 | chunkSize: 500, 13 | chunkOverlap: 0, 14 | keepSeparator: false, 15 | }; 16 | 17 | class WebPageChunker extends BaseChunker { 18 | constructor() { 19 | const textSplitter = new RecursiveCharacterTextSplitter( 20 | TEXT_SPLITTER_CHUNK_PARAMS 21 | ); 22 | super(textSplitter); 23 | } 24 | } 25 | 26 | export { WebPageChunker }; 27 | -------------------------------------------------------------------------------- /embedchain/chunkers/index.ts: -------------------------------------------------------------------------------- 1 | import { BaseChunker } from './BaseChunker'; 2 | import { PdfFileChunker } from './PdfFile'; 3 | import { QnaPairChunker } from './QnaPair'; 4 | import { WebPageChunker } from './WebPage'; 5 | 6 | export { BaseChunker, PdfFileChunker, QnaPairChunker, WebPageChunker }; 7 | -------------------------------------------------------------------------------- /embedchain/embedchain.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable max-classes-per-file */ 2 | import type { Collection } from 'chromadb'; 3 | import type { QueryResponse } from 'chromadb/dist/main/types'; 4 | import * as fs from 'fs'; 5 | import { Document } from 'langchain/document'; 6 | import OpenAI from 'openai'; 7 | import * as path from 
'path'; 8 | import { v4 as uuidv4 } from 'uuid'; 9 | 10 | import type { BaseChunker } from './chunkers'; 11 | import { PdfFileChunker, QnaPairChunker, WebPageChunker } from './chunkers'; 12 | import type { BaseLoader } from './loaders'; 13 | import { LocalQnaPairLoader, PdfFileLoader, WebPageLoader } from './loaders'; 14 | import type { 15 | DataDict, 16 | DataType, 17 | FormattedResult, 18 | Input, 19 | LocalInput, 20 | Metadata, 21 | Method, 22 | RemoteInput, 23 | } from './models'; 24 | import { ChromaDB } from './vectordb'; 25 | import type { BaseVectorDB } from './vectordb/BaseVectorDb'; 26 | 27 | const openai = new OpenAI({ 28 | apiKey: process.env.OPENAI_API_KEY, 29 | }); 30 | 31 | class EmbedChain { 32 | dbClient: any; 33 | 34 | // TODO: Definitely assign 35 | collection!: Collection; 36 | 37 | userAsks: [DataType, Input][] = []; 38 | 39 | initApp: Promise; 40 | 41 | collectMetrics: boolean; 42 | 43 | sId: string; // sessionId 44 | 45 | constructor(db?: BaseVectorDB, collectMetrics: boolean = true) { 46 | if (!db) { 47 | this.initApp = this.setupChroma(); 48 | } else { 49 | this.initApp = this.setupOther(db); 50 | } 51 | 52 | this.collectMetrics = collectMetrics; 53 | 54 | // Send anonymous telemetry 55 | this.sId = uuidv4(); 56 | this.sendTelemetryEvent('init'); 57 | } 58 | 59 | async setupChroma(): Promise { 60 | const db = new ChromaDB(); 61 | await db.initDb; 62 | this.dbClient = db.client; 63 | if (db.collection) { 64 | this.collection = db.collection; 65 | } else { 66 | // TODO: Add proper error handling 67 | console.error('No collection'); 68 | } 69 | } 70 | 71 | async setupOther(db: BaseVectorDB): Promise { 72 | await db.initDb; 73 | // TODO: Figure out how we can initialize an unknown database. 
74 | // this.dbClient = db.client; 75 | // this.collection = db.collection; 76 | this.userAsks = []; 77 | } 78 | 79 | static getLoader(dataType: DataType) { 80 | const loaders: { [t in DataType]: BaseLoader } = { 81 | pdf_file: new PdfFileLoader(), 82 | web_page: new WebPageLoader(), 83 | qna_pair: new LocalQnaPairLoader(), 84 | }; 85 | return loaders[dataType]; 86 | } 87 | 88 | static getChunker(dataType: DataType) { 89 | const chunkers: { [t in DataType]: BaseChunker } = { 90 | pdf_file: new PdfFileChunker(), 91 | web_page: new WebPageChunker(), 92 | qna_pair: new QnaPairChunker(), 93 | }; 94 | return chunkers[dataType]; 95 | } 96 | 97 | public async add(dataType: DataType, url: RemoteInput) { 98 | const loader = EmbedChain.getLoader(dataType); 99 | const chunker = EmbedChain.getChunker(dataType); 100 | this.userAsks.push([dataType, url]); 101 | const { documents, countNewChunks } = await this.loadAndEmbed( 102 | loader, 103 | chunker, 104 | url 105 | ); 106 | 107 | if (this.collectMetrics) { 108 | const wordCount = documents.reduce( 109 | (sum, document) => sum + document.split(' ').length, 110 | 0 111 | ); 112 | 113 | this.sendTelemetryEvent('add', { 114 | data_type: dataType, 115 | word_count: wordCount, 116 | chunks_count: countNewChunks, 117 | }); 118 | } 119 | } 120 | 121 | public async addLocal(dataType: DataType, content: LocalInput) { 122 | const loader = EmbedChain.getLoader(dataType); 123 | const chunker = EmbedChain.getChunker(dataType); 124 | this.userAsks.push([dataType, content]); 125 | const { documents, countNewChunks } = await this.loadAndEmbed( 126 | loader, 127 | chunker, 128 | content 129 | ); 130 | 131 | if (this.collectMetrics) { 132 | const wordCount = documents.reduce( 133 | (sum, document) => sum + document.split(' ').length, 134 | 0 135 | ); 136 | 137 | this.sendTelemetryEvent('add_local', { 138 | data_type: dataType, 139 | word_count: wordCount, 140 | chunks_count: countNewChunks, 141 | }); 142 | } 143 | } 144 | 145 | protected async 
loadAndEmbed( 146 | loader: any, 147 | chunker: BaseChunker, 148 | src: Input 149 | ): Promise<{ 150 | documents: string[]; 151 | metadatas: Metadata[]; 152 | ids: string[]; 153 | countNewChunks: number; 154 | }> { 155 | const embeddingsData = await chunker.createChunks(loader, src); 156 | let { documents, ids, metadatas } = embeddingsData; 157 | 158 | const existingDocs = await this.collection.get({ ids }); 159 | const existingIds = new Set(existingDocs.ids); 160 | 161 | if (existingIds.size > 0) { 162 | const dataDict: DataDict = {}; 163 | for (let i = 0; i < ids.length; i += 1) { 164 | const id = ids[i]; 165 | if (!existingIds.has(id)) { 166 | dataDict[id] = { doc: documents[i], meta: metadatas[i] }; 167 | } 168 | } 169 | 170 | if (Object.keys(dataDict).length === 0) { 171 | console.log(`All data from ${src} already exists in the database.`); 172 | return { documents: [], metadatas: [], ids: [], countNewChunks: 0 }; 173 | } 174 | ids = Object.keys(dataDict); 175 | const dataValues = Object.values(dataDict); 176 | documents = dataValues.map(({ doc }) => doc); 177 | metadatas = dataValues.map(({ meta }) => meta); 178 | } 179 | 180 | const countBeforeAddition = await this.count(); 181 | await this.collection.add({ documents, metadatas, ids }); 182 | const countNewChunks = (await this.count()) - countBeforeAddition; 183 | console.log( 184 | `Successfully saved ${src}. New chunks count: ${countNewChunks}` 185 | ); 186 | return { documents, metadatas, ids, countNewChunks }; 187 | } 188 | 189 | static async formatResult( 190 | results: QueryResponse 191 | ): Promise { 192 | return results.documents[0].map((document: any, index: number) => { 193 | const metadata = results.metadatas[0][index] || {}; 194 | // TODO: Add proper error handling 195 | const distance = results.distances ? 
results.distances[0][index] : null; 196 | return [new Document({ pageContent: document, metadata }), distance]; 197 | }); 198 | } 199 | 200 | static async getOpenAiAnswer(prompt: string) { 201 | const messages: OpenAI.Chat.CreateChatCompletionRequestMessage[] = [ 202 | { role: 'user', content: prompt }, 203 | ]; 204 | const response = await openai.chat.completions.create({ 205 | model: 'gpt-3.5-turbo', 206 | messages, 207 | temperature: 0, 208 | max_tokens: 1000, 209 | top_p: 1, 210 | }); 211 | return ( 212 | response.choices[0].message?.content ?? 'Response could not be processed.' 213 | ); 214 | } 215 | 216 | protected async retrieveFromDatabase(inputQuery: string) { 217 | const result = await this.collection.query({ 218 | nResults: 1, 219 | queryTexts: [inputQuery], 220 | }); 221 | const resultFormatted = await EmbedChain.formatResult(result); 222 | const content = resultFormatted[0][0].pageContent; 223 | return content; 224 | } 225 | 226 | static generatePrompt(inputQuery: string, context: any) { 227 | const prompt = `Use the following pieces of context to answer the query at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.\n${context}\nQuery: ${inputQuery}\nHelpful Answer:`; 228 | return prompt; 229 | } 230 | 231 | static async getAnswerFromLlm(prompt: string) { 232 | const answer = await EmbedChain.getOpenAiAnswer(prompt); 233 | return answer; 234 | } 235 | 236 | public async query(inputQuery: string) { 237 | const context = await this.retrieveFromDatabase(inputQuery); 238 | const prompt = EmbedChain.generatePrompt(inputQuery, context); 239 | const answer = await EmbedChain.getAnswerFromLlm(prompt); 240 | this.sendTelemetryEvent('query'); 241 | return answer; 242 | } 243 | 244 | public async dryRun(input_query: string) { 245 | const context = await this.retrieveFromDatabase(input_query); 246 | const prompt = EmbedChain.generatePrompt(input_query, context); 247 | return prompt; 248 | } 249 | 250 | /** 251 | * Count the number of embeddings. 252 | * @returns {Promise}: The number of embeddings. 253 | */ 254 | public count(): Promise { 255 | return this.collection.count(); 256 | } 257 | 258 | protected async sendTelemetryEvent(method: Method, extraMetadata?: object) { 259 | if (!this.collectMetrics) { 260 | return; 261 | } 262 | const url = 'https://api.embedchain.ai/api/v1/telemetry/'; 263 | 264 | // Read package version from filesystem (because it's not in the ts root dir) 265 | const packageJsonPath = path.join(__dirname, '..', 'package.json'); 266 | const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf8')); 267 | 268 | const metadata = { 269 | s_id: this.sId, 270 | version: packageJson.version, 271 | method, 272 | language: 'js', 273 | ...extraMetadata, 274 | }; 275 | 276 | const maxRetries = 3; 277 | 278 | // Retry the fetch 279 | for (let i = 0; i < maxRetries; i += 1) { 280 | try { 281 | // eslint-disable-next-line no-await-in-loop 282 | const response = await fetch(url, { 283 | method: 'POST', 284 | body: JSON.stringify({ metadata }), 285 | }); 286 | 287 | if (response.ok) 
{ 288 | // Break out of the loop if the request was successful 289 | break; 290 | } else { 291 | // Log the unsuccessful response (optional) 292 | console.error( 293 | `Telemetry: Attempt ${i + 1} failed with status:`, 294 | response.status 295 | ); 296 | } 297 | } catch (error) { 298 | // Log the error (optional) 299 | console.error(`Telemetry: Attempt ${i + 1} failed with error:`, error); 300 | } 301 | 302 | // If this was the last attempt, throw an error or handle the failure 303 | if (i === maxRetries - 1) { 304 | console.error('Telemetry: Max retries reached'); 305 | } 306 | } 307 | } 308 | } 309 | 310 | class EmbedChainApp extends EmbedChain { 311 | // The EmbedChain app. 312 | // Has two functions: add and query. 313 | // adds(dataType, url): adds the data from the given URL to the vector db. 314 | // query(query): finds answer to the given query using vector database and LLM. 315 | } 316 | 317 | export { EmbedChainApp }; 318 | -------------------------------------------------------------------------------- /embedchain/index.ts: -------------------------------------------------------------------------------- 1 | import { EmbedChainApp } from './embedchain'; 2 | 3 | export const App = async () => { 4 | const app = new EmbedChainApp(); 5 | await app.initApp; 6 | return app; 7 | }; 8 | -------------------------------------------------------------------------------- /embedchain/loaders/BaseLoader.ts: -------------------------------------------------------------------------------- 1 | import type { Input, LoaderResult } from '../models'; 2 | 3 | export abstract class BaseLoader { 4 | abstract loadData(src: Input): Promise; 5 | } 6 | -------------------------------------------------------------------------------- /embedchain/loaders/LocalQnaPair.ts: -------------------------------------------------------------------------------- 1 | import type { LoaderResult, QnaPair } from '../models'; 2 | import { BaseLoader } from './BaseLoader'; 3 | 4 | class 
LocalQnaPairLoader extends BaseLoader { 5 | // eslint-disable-next-line class-methods-use-this 6 | async loadData(content: QnaPair): Promise { 7 | const [question, answer] = content; 8 | const contentText = `Q: ${question}\nA: ${answer}`; 9 | const metaData = { 10 | url: 'local', 11 | }; 12 | return [ 13 | { 14 | content: contentText, 15 | metaData, 16 | }, 17 | ]; 18 | } 19 | } 20 | 21 | export { LocalQnaPairLoader }; 22 | -------------------------------------------------------------------------------- /embedchain/loaders/PdfFile.ts: -------------------------------------------------------------------------------- 1 | import type { TextContent } from 'pdfjs-dist/types/src/display/api'; 2 | 3 | import type { LoaderResult, Metadata } from '../models'; 4 | import { cleanString } from '../utils'; 5 | import { BaseLoader } from './BaseLoader'; 6 | 7 | const pdfjsLib = require('pdfjs-dist'); 8 | 9 | interface Page { 10 | page_content: string; 11 | } 12 | 13 | class PdfFileLoader extends BaseLoader { 14 | static async getPagesFromPdf(url: string): Promise { 15 | const loadingTask = pdfjsLib.getDocument(url); 16 | const pdf = await loadingTask.promise; 17 | const { numPages } = pdf; 18 | 19 | const promises = Array.from({ length: numPages }, async (_, i) => { 20 | const page = await pdf.getPage(i + 1); 21 | const pageText: TextContent = await page.getTextContent(); 22 | const pageContent: string = pageText.items 23 | .map((item) => ('str' in item ? 
item.str : '')) 24 | .join(' '); 25 | 26 | return { 27 | page_content: pageContent, 28 | }; 29 | }); 30 | 31 | return Promise.all(promises); 32 | } 33 | 34 | // eslint-disable-next-line class-methods-use-this 35 | async loadData(url: string): Promise { 36 | const pages: Page[] = await PdfFileLoader.getPagesFromPdf(url); 37 | const output: LoaderResult = []; 38 | 39 | if (!pages.length) { 40 | throw new Error('No data found'); 41 | } 42 | 43 | pages.forEach((page) => { 44 | let content: string = page.page_content; 45 | content = cleanString(content); 46 | const metaData: Metadata = { 47 | url, 48 | }; 49 | output.push({ 50 | content, 51 | metaData, 52 | }); 53 | }); 54 | return output; 55 | } 56 | } 57 | 58 | export { PdfFileLoader }; 59 | -------------------------------------------------------------------------------- /embedchain/loaders/WebPage.ts: -------------------------------------------------------------------------------- 1 | import axios from 'axios'; 2 | import { JSDOM } from 'jsdom'; 3 | 4 | import { cleanString } from '../utils'; 5 | import { BaseLoader } from './BaseLoader'; 6 | 7 | class WebPageLoader extends BaseLoader { 8 | // eslint-disable-next-line class-methods-use-this 9 | async loadData(url: string) { 10 | const response = await axios.get(url); 11 | const html = response.data; 12 | const dom = new JSDOM(html); 13 | const { document } = dom.window; 14 | const unwantedTags = [ 15 | 'nav', 16 | 'aside', 17 | 'form', 18 | 'header', 19 | 'noscript', 20 | 'svg', 21 | 'canvas', 22 | 'footer', 23 | 'script', 24 | 'style', 25 | ]; 26 | unwantedTags.forEach((tagName) => { 27 | const elements = document.getElementsByTagName(tagName); 28 | Array.from(elements).forEach((element) => { 29 | // eslint-disable-next-line no-param-reassign 30 | (element as HTMLElement).textContent = ' '; 31 | }); 32 | }); 33 | 34 | const output = []; 35 | let content = document.body.textContent; 36 | if (!content) { 37 | throw new Error('Web page content is empty.'); 38 | } 39 | 
content = cleanString(content); 40 | const metaData = { 41 | url, 42 | }; 43 | output.push({ 44 | content, 45 | metaData, 46 | }); 47 | return output; 48 | } 49 | } 50 | 51 | export { WebPageLoader }; 52 | -------------------------------------------------------------------------------- /embedchain/loaders/index.ts: -------------------------------------------------------------------------------- 1 | import { BaseLoader } from './BaseLoader'; 2 | import { LocalQnaPairLoader } from './LocalQnaPair'; 3 | import { PdfFileLoader } from './PdfFile'; 4 | import { WebPageLoader } from './WebPage'; 5 | 6 | export { BaseLoader, LocalQnaPairLoader, PdfFileLoader, WebPageLoader }; 7 | -------------------------------------------------------------------------------- /embedchain/models/ChunkResult.ts: -------------------------------------------------------------------------------- 1 | import type { Metadata } from './Metadata'; 2 | 3 | export type ChunkResult = { 4 | documents: string[]; 5 | ids: string[]; 6 | metadatas: Metadata[]; 7 | }; 8 | -------------------------------------------------------------------------------- /embedchain/models/DataDict.ts: -------------------------------------------------------------------------------- 1 | import type { ChunkResult } from './ChunkResult'; 2 | 3 | type Data = { 4 | doc: ChunkResult['documents'][0]; 5 | meta: ChunkResult['metadatas'][0]; 6 | }; 7 | 8 | export type DataDict = { 9 | [id: string]: Data; 10 | }; 11 | -------------------------------------------------------------------------------- /embedchain/models/DataType.ts: -------------------------------------------------------------------------------- 1 | export type DataType = 'pdf_file' | 'web_page' | 'qna_pair'; 2 | -------------------------------------------------------------------------------- /embedchain/models/FormattedResult.ts: -------------------------------------------------------------------------------- 1 | import type { Document } from 'langchain/document'; 2 | 3 | 
export type FormattedResult = [Document, number | null]; 4 | -------------------------------------------------------------------------------- /embedchain/models/Input.ts: -------------------------------------------------------------------------------- 1 | import type { QnaPair } from './QnAPair'; 2 | 3 | export type RemoteInput = string; 4 | 5 | export type LocalInput = QnaPair; 6 | 7 | export type Input = RemoteInput | LocalInput; 8 | -------------------------------------------------------------------------------- /embedchain/models/LoaderResult.ts: -------------------------------------------------------------------------------- 1 | import type { Metadata } from './Metadata'; 2 | 3 | export type LoaderResult = { content: any; metaData: Metadata }[]; 4 | -------------------------------------------------------------------------------- /embedchain/models/Metadata.ts: -------------------------------------------------------------------------------- 1 | export type Metadata = { 2 | url: string; 3 | }; 4 | -------------------------------------------------------------------------------- /embedchain/models/Method.ts: -------------------------------------------------------------------------------- 1 | export type Method = 'init' | 'query' | 'add' | 'add_local'; 2 | -------------------------------------------------------------------------------- /embedchain/models/QnAPair.ts: -------------------------------------------------------------------------------- 1 | type Question = string; 2 | type Answer = string; 3 | 4 | export type QnaPair = [Question, Answer]; 5 | -------------------------------------------------------------------------------- /embedchain/models/index.ts: -------------------------------------------------------------------------------- 1 | import { DataDict } from './DataDict'; 2 | import { DataType } from './DataType'; 3 | import { FormattedResult } from './FormattedResult'; 4 | import { Input, LocalInput, RemoteInput } from './Input'; 5 | import { LoaderResult 
} from './LoaderResult'; 6 | import { Metadata } from './Metadata'; 7 | import { Method } from './Method'; 8 | import { QnaPair } from './QnAPair'; 9 | 10 | export { 11 | DataDict, 12 | DataType, 13 | FormattedResult, 14 | Input, 15 | LoaderResult, 16 | LocalInput, 17 | Metadata, 18 | Method, 19 | QnaPair, 20 | RemoteInput, 21 | }; 22 | -------------------------------------------------------------------------------- /embedchain/utils.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * This function takes in a string and performs a series of text cleaning operations. 3 | * @param {str} text: The text to be cleaned. This is expected to be a string. 4 | * @returns {str}: The cleaned text after all the cleaning operations have been performed. 5 | */ 6 | export function cleanString(text: string): string { 7 | // Replacement of newline characters: 8 | let cleanedText = text.replace(/\n/g, ' '); 9 | 10 | // Stripping and reducing multiple spaces to single: 11 | cleanedText = cleanedText.trim().replace(/\s+/g, ' '); 12 | 13 | // Removing backslashes: 14 | cleanedText = cleanedText.replace(/\\/g, ''); 15 | 16 | // Replacing hash characters: 17 | cleanedText = cleanedText.replace(/#/g, ' '); 18 | 19 | // Eliminating consecutive non-alphanumeric characters: 20 | // This regex identifies consecutive non-alphanumeric characters (i.e., not a word character [a-zA-Z0-9_] and not a whitespace) in the string 21 | // and replaces each group of such characters with a single occurrence of that character. 22 | // For example, "!!! hello !!!" would become "! hello !". 
23 | cleanedText = cleanedText.replace(/([^\w\s])\1*/g, '$1'); 24 | 25 | return cleanedText; 26 | } 27 | -------------------------------------------------------------------------------- /embedchain/vectordb/BaseVectorDb.ts: -------------------------------------------------------------------------------- 1 | class BaseVectorDB { 2 | initDb: Promise; 3 | 4 | constructor() { 5 | this.initDb = this.getClientAndCollection(); 6 | } 7 | 8 | // eslint-disable-next-line class-methods-use-this 9 | protected async getClientAndCollection(): Promise { 10 | throw new Error('getClientAndCollection() method is not implemented'); 11 | } 12 | } 13 | 14 | export { BaseVectorDB }; 15 | -------------------------------------------------------------------------------- /embedchain/vectordb/ChromaDb.ts: -------------------------------------------------------------------------------- 1 | import type { Collection } from 'chromadb'; 2 | import { ChromaClient, OpenAIEmbeddingFunction } from 'chromadb'; 3 | 4 | import { BaseVectorDB } from './BaseVectorDb'; 5 | 6 | const embedder = new OpenAIEmbeddingFunction({ 7 | openai_api_key: process.env.OPENAI_API_KEY ?? 
const { EmbedChainApp } = require('./embedchain/embedchain');

/**
 * Create and fully initialize an EmbedChain app.
 * @returns {Promise<EmbedChainApp>} an app ready for .add()/.query()
 */
async function App() {
  const app = new EmbedChainApp();
  // Bug fix: the class exposes `initApp` (camelCase, see embedchain.ts and
  // embedchain/index.ts). The old `app.init_app` was always undefined, so
  // callers could use the app before its vector DB setup completed.
  await app.initApp;
  return app;
}

module.exports = { App };
'eslint'], 3 | '**/*.ts?(x)': () => 'npm run check-types', 4 | '*.json': ['prettier --write'], 5 | }; 6 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "embedchain", 3 | "version": "0.0.8", 4 | "description": "embedchain is a framework to easily create LLM powered bots over any dataset", 5 | "main": "dist/index.js", 6 | "types": "types/index.d.ts", 7 | "files": [ 8 | "dist", 9 | "types" 10 | ], 11 | "scripts": { 12 | "build": "tsc -p tsconfig.build.json --listFiles", 13 | "prepare": "husky install", 14 | "test": "jest", 15 | "check-types": "tsc --noEmit --pretty" 16 | }, 17 | "author": "Taranjeet Singh", 18 | "license": "Apache-2.0", 19 | "dependencies": { 20 | "axios": "^1.4.0", 21 | "chromadb": "^1.5.6", 22 | "jsdom": "^22.1.0", 23 | "langchain": "^0.0.136", 24 | "openai": "^4.3.1", 25 | "pdfjs-dist": "^3.8.162", 26 | "uuid": "^9.0.0" 27 | }, 28 | "devDependencies": { 29 | "@commitlint/cli": "^17.1.2", 30 | "@commitlint/config-conventional": "^17.1.0", 31 | "@commitlint/cz-commitlint": "^17.1.2", 32 | "@types/jest": "^29.5.1", 33 | "@types/jsdom": "^21.1.1", 34 | "@typescript-eslint/eslint-plugin": "^5.41.0", 35 | "@typescript-eslint/parser": "^5.41.0", 36 | "eslint": "^8.34.0", 37 | "eslint-config-airbnb-base": "^15.0.0", 38 | "eslint-config-airbnb-typescript": "^17.0.0", 39 | "eslint-config-prettier": "^8.5.0", 40 | "eslint-plugin-import": "^2.27.5", 41 | "eslint-plugin-prettier": "^4.2.1", 42 | "eslint-plugin-simple-import-sort": "^8.0.0", 43 | "eslint-plugin-testing-library": "^5.9.1", 44 | "eslint-plugin-unused-imports": "^2.0.0", 45 | "husky": "^8.0.1", 46 | "jest": "^29.5.0", 47 | "lint-staged": "^13.0.3", 48 | "prettier": "^2.7.1", 49 | "ts-jest": "^29.1.0", 50 | "ts-loader": "^9.4.2", 51 | "typescript": "^5.2.2" 52 | } 53 | } 54 | 
-------------------------------------------------------------------------------- /tsconfig.build.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "exclude": ["embedchain/__tests__"] 4 | } 5 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es6", 4 | "module": "CommonJS", 5 | "strict": true, 6 | "outDir": "dist", 7 | "rootDir": "embedchain", 8 | "sourceMap": true, 9 | "declaration": true, 10 | "declarationDir": "types", 11 | "esModuleInterop": true 12 | }, 13 | "include": ["embedchain/**/*.ts"], 14 | "exclude": ["node_modules", "dist"] 15 | } 16 | --------------------------------------------------------------------------------