├── .eslintignore ├── .eslintrc ├── .github └── workflows │ └── npm-publish.yml ├── .gitignore ├── .husky ├── commit-msg └── pre-commit ├── .vscode ├── extensions.json ├── settings.json └── tasks.json ├── CITATION.cff ├── LICENSE ├── README.md ├── commitlint.config.js ├── embedchain ├── __tests__ │ └── readme.test.ts ├── chunkers │ ├── BaseChunker.ts │ ├── PdfFile.ts │ ├── QnaPair.ts │ ├── WebPage.ts │ └── index.ts ├── embedchain.ts ├── index.ts ├── loaders │ ├── BaseLoader.ts │ ├── LocalQnaPair.ts │ ├── PdfFile.ts │ ├── WebPage.ts │ └── index.ts ├── models │ ├── ChunkResult.ts │ ├── DataDict.ts │ ├── DataType.ts │ ├── FormattedResult.ts │ ├── Input.ts │ ├── LoaderResult.ts │ ├── Metadata.ts │ ├── Method.ts │ ├── QnAPair.ts │ └── index.ts ├── utils.ts └── vectordb │ ├── BaseVectorDb.ts │ ├── ChromaDb.ts │ └── index.ts ├── index.js ├── jest.config.js ├── lint-staged.config.js ├── package-lock.json ├── package.json ├── tsconfig.build.json └── tsconfig.json /.eslintignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | // Configuration for JavaScript files 3 | "extends": [ 4 | "airbnb-base", 5 | "plugin:prettier/recommended" 6 | ], 7 | "rules": { 8 | "prettier/prettier": [ 9 | "error", 10 | { 11 | "singleQuote": true, 12 | "endOfLine": "auto" 13 | } 14 | ] 15 | }, 16 | "overrides": [ 17 | // Configuration for TypeScript files 18 | { 19 | "files": ["**/*.ts", "**/__tests__/*.test.ts"], 20 | "plugins": [ 21 | "@typescript-eslint", 22 | "unused-imports", 23 | "simple-import-sort" 24 | ], 25 | "extends": [ 26 | "airbnb-typescript", 27 | "plugin:prettier/recommended" 28 | ], 29 | "parserOptions": { 30 | "project": "./tsconfig.json" 31 | }, 32 | "rules": { 33 | "prettier/prettier": [ 34 | "error", 35 | { 36 | "singleQuote": true, 37 | 
"endOfLine": "auto" 38 | } 39 | ], 40 | "@typescript-eslint/comma-dangle": "off", // Avoid conflict rule between Eslint and Prettier 41 | "@typescript-eslint/consistent-type-imports": "error", // Ensure `import type` is used when it's necessary 42 | "import/prefer-default-export": "off", // Named export is easier to refactor automatically 43 | "simple-import-sort/imports": "error", // Import configuration for `eslint-plugin-simple-import-sort` 44 | "simple-import-sort/exports": "error", // Export configuration for `eslint-plugin-simple-import-sort` 45 | "@typescript-eslint/no-unused-vars": "off", 46 | "react/jsx-filename-extension": "off", // Gives error 47 | "unused-imports/no-unused-imports": "error", 48 | "unused-imports/no-unused-vars": [ 49 | "error", 50 | { "argsIgnorePattern": "^_" } 51 | ] 52 | } 53 | } 54 | ] 55 | } 56 | -------------------------------------------------------------------------------- /.github/workflows/npm-publish.yml: -------------------------------------------------------------------------------- 1 | name: Node.js Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - uses: actions/setup-node@v3 13 | with: 14 | node-version: 16 15 | - run: npm ci 16 | - run: npm test 17 | - run: npm run build 18 | - uses: actions/upload-artifact@v3 19 | with: 20 | name: dist 21 | path: dist 22 | - uses: actions/upload-artifact@v3 23 | with: 24 | name: types 25 | path: types 26 | 27 | publish-npm: 28 | needs: build 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v3 32 | - uses: actions/setup-node@v3 33 | with: 34 | node-version: 16 35 | registry-url: https://registry.npmjs.org/ 36 | - uses: actions/download-artifact@v3 37 | with: 38 | name: dist 39 | path: dist 40 | - uses: actions/download-artifact@v3 41 | with: 42 | name: types 43 | path: types 44 | - run: npm ci 45 | - run: npm publish 46 | env: 47 | NODE_AUTH_TOKEN: 
${{secrets.npm_token}} 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | .pnpm-debug.log* 9 | 10 | # Diagnostic reports (https://nodejs.org/api/report.html) 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 12 | 13 | # Runtime data 14 | pids 15 | *.pid 16 | *.seed 17 | *.pid.lock 18 | 19 | # Directory for instrumented libs generated by jscoverage/JSCover 20 | lib-cov 21 | 22 | # Coverage directory used by tools like istanbul 23 | coverage 24 | *.lcov 25 | 26 | # nyc test coverage 27 | .nyc_output 28 | 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 30 | .grunt 31 | 32 | # Bower dependency directory (https://bower.io/) 33 | bower_components 34 | 35 | # node-waf configuration 36 | .lock-wscript 37 | 38 | # Compiled binary addons (https://nodejs.org/api/addons.html) 39 | build/Release 40 | 41 | # Dependency directories 42 | node_modules/ 43 | jspm_packages/ 44 | 45 | # Snowpack dependency directory (https://snowpack.dev/) 46 | web_modules/ 47 | 48 | # TypeScript cache 49 | *.tsbuildinfo 50 | 51 | # Optional npm cache directory 52 | .npm 53 | 54 | # Optional eslint cache 55 | .eslintcache 56 | 57 | # Optional stylelint cache 58 | .stylelintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | # Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variable files 76 | .env 77 | .env.development.local 78 | .env.test.local 79 | .env.production.local 80 | .env.local 81 | 82 | # parcel-bundler cache (https://parceljs.org/) 83 | .cache 84 | .parcel-cache 85 | 86 | # Next.js build output 87 | .next 88 | out 
89 | 90 | # Nuxt.js build / generate output 91 | .nuxt 92 | dist 93 | 94 | # Gatsby files 95 | .cache/ 96 | # Comment in the public line in if your project uses Gatsby and not Next.js 97 | # https://nextjs.org/blog/next-9-1#public-directory-support 98 | # public 99 | 100 | # vuepress build output 101 | .vuepress/dist 102 | 103 | # vuepress v2.x temp and cache directory 104 | .temp 105 | .cache 106 | 107 | # Docusaurus cache and generated files 108 | .docusaurus 109 | 110 | # Serverless directories 111 | .serverless/ 112 | 113 | # FuseBox cache 114 | .fusebox/ 115 | 116 | # DynamoDB Local files 117 | .dynamodb/ 118 | 119 | # TernJS port file 120 | .tern-port 121 | 122 | # Stores VSCode versions used for testing VSCode extensions 123 | .vscode-test 124 | 125 | # yarn v2 126 | .yarn/cache 127 | .yarn/unplugged 128 | .yarn/build-state.yml 129 | .yarn/install-state.gz 130 | .pnp.* 131 | 132 | .ideas.md 133 | .todos.md 134 | 135 | # Custom 136 | dist 137 | types 138 | build -------------------------------------------------------------------------------- /.husky/commit-msg: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | . "$(dirname "$0")/_/husky.sh" 3 | 4 | npx --no -- commitlint --edit $1 5 | -------------------------------------------------------------------------------- /.husky/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | . 
"$(dirname "$0")/_/husky.sh" 3 | 4 | # Disable concurent to run `check-types` after ESLint in lint-staged 5 | npx lint-staged --concurrent false 6 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "dbaeumer.vscode-eslint", 4 | "esbenp.prettier-vscode", 5 | "mikestead.dotenv", 6 | "csstools.postcss", 7 | "bradlc.vscode-tailwindcss", 8 | "Orta.vscode-jest" 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "typescript.tsdk": "node_modules/typescript/lib", 3 | "typescript.enablePromptUseWorkspaceTsdk": true, 4 | "[javascript]": { 5 | "editor.tabSize": 2 6 | }, 7 | "[typescript]": { 8 | "editor.tabSize": 2 9 | }, 10 | "[typescriptreact]": { 11 | "editor.tabSize": 2 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "label": "Project wide type checking with TypeScript", 8 | "type": "npm", 9 | "script": "check-types", 10 | "problemMatcher": ["$tsc"], 11 | "group": { 12 | "kind": "build", 13 | "isDefault": true 14 | }, 15 | "presentation": { 16 | "clear": true, 17 | "reveal": "never" 18 | } 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Singh" 5 | given-names: "Taranjeet" 6 | title: "Embedchain" 7 | date-released: 2023-06-25 8 | url: "https://github.com/embedchain/embedchainjs" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # embedchainjs 2 | 3 | [![Discord](https://dcbadge.vercel.app/api/server/CUU9FPhRNt?style=flat)](https://discord.gg/CUU9FPhRNt) 4 | [![Twitter](https://img.shields.io/twitter/follow/embedchain)](https://twitter.com/embedchain) 5 | [![Substack](https://img.shields.io/badge/Substack-%23006f5c.svg?logo=substack)](https://embedchain.substack.com/) 6 | 7 | embedchain is a framework to easily create LLM powered bots over any dataset. embedchainjs is Javascript version of embedchain. If you want a python version, check out [embedchain-python](https://github.com/embedchain/embedchain) 8 | 9 | # 🤝 Let's Talk Embedchain! 10 | 11 | Schedule a [Feedback Session](https://cal.com/taranjeetio/ec) with Taranjeet, the founder, to discuss any issues, provide feedback, or explore improvements. 12 | 13 | # How it works 14 | 15 | It abstracts the entire process of loading dataset, chunking it, creating embeddings and then storing in vector database. 
You can add a single dataset or multiple datasets using the `.add` and `.addLocal` functions and then use the `.query` function to find an answer from the added datasets.
Please make sure to use the right version, otherwise you will see the `ChromaDB` error `TypeError: OpenAIApi.Configuration is not a constructor` 65 | 66 | - Make sure that dotenv package is installed and your `OPENAI_API_KEY` in a file called `.env` in the root folder. You can install dotenv by 67 | 68 | ```js 69 | npm install dotenv 70 | ``` 71 | 72 | - Download and install Docker on your device by visiting [this link](https://www.docker.com/). You will need this to run Chroma vector database on your machine. 73 | 74 | - Run the following commands to setup Chroma container in Docker 75 | 76 | ```bash 77 | git clone https://github.com/chroma-core/chroma.git 78 | cd chroma 79 | docker-compose up -d --build 80 | ``` 81 | 82 | - Once Chroma container has been set up, run it inside Docker 83 | 84 | ## Usage 85 | 86 | - We use OpenAI's embedding model to create embeddings for chunks and ChatGPT API as LLM to get answer given the relevant docs. Make sure that you have an OpenAI account and an API key. If you have dont have an API key, you can create one by visiting [this link](https://platform.openai.com/account/api-keys). 87 | 88 | - Once you have the API key, set it in an environment variable called `OPENAI_API_KEY` 89 | 90 | ```js 91 | // Set this inside your .env file 92 | OPENAI_API_KEY = "sk-xxxx"; 93 | ``` 94 | 95 | - Load the environment variables inside your .js file using the following commands 96 | 97 | ```js 98 | const dotenv = require("dotenv"); 99 | dotenv.config(); 100 | ``` 101 | 102 | - Next import the `App` class from embedchain and use `.add` function to add any dataset. 103 | - Now your app is created. You can use `.query` function to get the answer for any query. 
104 | 105 | ```js 106 | const dotenv = require("dotenv"); 107 | dotenv.config(); 108 | const { App } = require("embedchain"); 109 | 110 | async function testApp() { 111 | const navalChatBot = await App(); 112 | 113 | // Embed Online Resources 114 | await navalChatBot.add("web_page", "https://nav.al/feedback"); 115 | await navalChatBot.add("web_page", "https://nav.al/agi"); 116 | await navalChatBot.add( 117 | "pdf_file", 118 | "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf" 119 | ); 120 | 121 | // Embed Local Resources 122 | await navalChatBot.addLocal("qna_pair", [ 123 | "Who is Naval Ravikant?", 124 | "Naval Ravikant is an Indian-American entrepreneur and investor.", 125 | ]); 126 | 127 | const result = await navalChatBot.query( 128 | "What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?" 129 | ); 130 | console.log(result); 131 | // answer: Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality. 132 | } 133 | 134 | testApp(); 135 | ``` 136 | 137 | - If there is any other app instance in your script or app, you can change the import as 138 | 139 | ```javascript 140 | const { App: EmbedChainApp } = require("embedchain"); 141 | 142 | // or 143 | 144 | const { App: ECApp } = require("embedchain"); 145 | ``` 146 | 147 | ## Format supported 148 | 149 | We support the following formats: 150 | 151 | ### PDF File 152 | 153 | To add any pdf file, use the data_type as `pdf_file`. Eg: 154 | 155 | ```javascript 156 | await app.add("pdf_file", "a_valid_url_where_pdf_file_can_be_accessed"); 157 | ``` 158 | 159 | ### Web Page 160 | 161 | To add any web page, use the data_type as `web_page`. 
Eg: 162 | 163 | ```javascript 164 | await app.add("web_page", "a_valid_web_page_url"); 165 | ``` 166 | 167 | ### QnA Pair 168 | 169 | To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg: 170 | 171 | ```javascript 172 | await app.addLocal("qna_pair", ["Question", "Answer"]); 173 | ``` 174 | 175 | ### More Formats coming soon 176 | 177 | - If you want to add any other format, please create an [issue](https://github.com/embedchain/embedchainjs/issues) and we will add it to the list of supported formats. 178 | 179 | ## Testing 180 | 181 | Before you consume valueable tokens, you should make sure that the embedding you have done works and that it's receiving the correct document from the database. 182 | 183 | For this you can use the `dryRun` method. 184 | 185 | Following the example above, add this to your script: 186 | 187 | ```js 188 | let result = await naval_chat_bot.dryRun("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?");console.log(result); 189 | 190 | ''' 191 | Use the following pieces of context to answer the query at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. 192 | terms of the unseen. And I think that’s critical. That is what humans do uniquely that no other creature, no other computer, no other intelligence—biological or artificial—that we have ever encountered does. And not only do we do it uniquely, but if we were to meet an alien species that also had the power to generate these good explanations, there is no explanation that they could generate that we could not understand. We are maximally capable of understanding. There is no concept out there that is possible in this physical reality that a human being, given sufficient time and resources and 193 | Query: What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts? 
194 | Helpful Answer: 195 | ''' 196 | ``` 197 | 198 | _The embedding is confirmed to work as expected. It returns the right document, even if the question is asked slightly different. No prompt tokens have been consumed._ 199 | 200 | **The dry run will still consume tokens to embed your query, but it is only ~1/15 of the prompt.** 201 | 202 | # How does it work? 203 | 204 | Creating a chat bot over any dataset needs the following steps to happen 205 | 206 | - load the data 207 | - create meaningful chunks 208 | - create embeddings for each chunk 209 | - store the chunks in vector database 210 | 211 | Whenever a user asks any query, following process happens to find the answer for the query 212 | 213 | - create the embedding for query 214 | - find similar documents for this query from vector database 215 | - pass similar documents as context to LLM to get the final answer. 216 | 217 | The process of loading the dataset and then querying involves multiple steps and each steps has nuances of it is own. 218 | 219 | - How should I chunk the data? What is a meaningful chunk size? 220 | - How should I create embeddings for each chunk? Which embedding model should I use? 221 | - How should I store the chunks in vector database? Which vector database should I use? 222 | - Should I store meta data along with the embeddings? 223 | - How should I find similar documents for a query? Which ranking model should I use? 224 | 225 | These questions may be trivial for some but for a lot of us, it needs research, experimentation and time to find out the accurate answers. 226 | 227 | embedchain is a framework which takes care of all these nuances and provides a simple interface to create bots over any dataset. 228 | 229 | In the first release, we are making it easier for anyone to get a chatbot over any dataset up and running in less than a minute. 
All you need to do is create an app instance, add the data sets using `.add` function and then use `.query` function to get the relevant answer. 230 | 231 | # Tech Stack 232 | 233 | embedchain is built on the following stack: 234 | 235 | - [Langchain](https://github.com/hwchase17/langchain) as an LLM framework to load, chunk and index data 236 | - [OpenAI's Ada embedding model](https://platform.openai.com/docs/guides/embeddings) to create embeddings 237 | - [OpenAI's ChatGPT API](https://platform.openai.com/docs/guides/gpt/chat-completions-api) as LLM to get answers given the context 238 | - [Chroma](https://github.com/chroma-core/chroma) as the vector database to store embeddings 239 | 240 | # Team 241 | 242 | ## Author 243 | 244 | - Taranjeet Singh ([@taranjeetio](https://twitter.com/taranjeetio)) 245 | 246 | ## Maintainer 247 | 248 | - [cachho](https://github.com/cachho) 249 | - [sahilyadav902](https://github.com/sahilyadav902) 250 | 251 | ## Citation 252 | 253 | If you utilize this repository, please consider citing it with: 254 | ``` 255 | @misc{embedchain, 256 | author = {Taranjeet Singh}, 257 | title = {Embechain: Framework to easily create LLM powered bots over any dataset}, 258 | year = {2023}, 259 | publisher = {GitHub}, 260 | journal = {GitHub repository}, 261 | howpublished = {\url{https://github.com/embedchain/embedchainjs}}, 262 | } 263 | ``` 264 | -------------------------------------------------------------------------------- /commitlint.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { extends: ['@commitlint/config-conventional'] }; 2 | -------------------------------------------------------------------------------- /embedchain/__tests__/readme.test.ts: -------------------------------------------------------------------------------- 1 | import { EmbedChainApp } from '../embedchain'; 2 | 3 | const mockAdd = jest.fn(); 4 | const mockAddLocal = jest.fn(); 5 | const mockQuery = jest.fn(); 
6 | 7 | jest.mock('../embedchain', () => { 8 | return { 9 | EmbedChainApp: jest.fn().mockImplementation(() => { 10 | return { 11 | add: mockAdd, 12 | addLocal: mockAddLocal, 13 | query: mockQuery, 14 | }; 15 | }), 16 | }; 17 | }); 18 | 19 | describe('Test App', () => { 20 | beforeEach(() => { 21 | jest.clearAllMocks(); 22 | }); 23 | 24 | it('tests the App', async () => { 25 | mockQuery.mockResolvedValue( 26 | 'Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality.' 27 | ); 28 | 29 | const navalChatBot = await new EmbedChainApp(undefined, false); 30 | 31 | // Embed Online Resources 32 | await navalChatBot.add('web_page', 'https://nav.al/feedback'); 33 | await navalChatBot.add('web_page', 'https://nav.al/agi'); 34 | await navalChatBot.add( 35 | 'pdf_file', 36 | 'https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf' 37 | ); 38 | 39 | // Embed Local Resources 40 | await navalChatBot.addLocal('qna_pair', [ 41 | 'Who is Naval Ravikant?', 42 | 'Naval Ravikant is an Indian-American entrepreneur and investor.', 43 | ]); 44 | 45 | const result = await navalChatBot.query( 46 | 'What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?' 
47 | ); 48 | 49 | expect(mockAdd).toHaveBeenCalledWith('web_page', 'https://nav.al/feedback'); 50 | expect(mockAdd).toHaveBeenCalledWith('web_page', 'https://nav.al/agi'); 51 | expect(mockAdd).toHaveBeenCalledWith( 52 | 'pdf_file', 53 | 'https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf' 54 | ); 55 | expect(mockAddLocal).toHaveBeenCalledWith('qna_pair', [ 56 | 'Who is Naval Ravikant?', 57 | 'Naval Ravikant is an Indian-American entrepreneur and investor.', 58 | ]); 59 | expect(mockQuery).toHaveBeenCalledWith( 60 | 'What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?' 61 | ); 62 | expect(result).toBe( 63 | 'Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality.' 64 | ); 65 | }); 66 | }); 67 | -------------------------------------------------------------------------------- /embedchain/chunkers/BaseChunker.ts: -------------------------------------------------------------------------------- 1 | import { createHash } from 'crypto'; 2 | import type { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; 3 | 4 | import type { BaseLoader } from '../loaders'; 5 | import type { Input, LoaderResult } from '../models'; 6 | import type { ChunkResult } from '../models/ChunkResult'; 7 | 8 | class BaseChunker { 9 | textSplitter: RecursiveCharacterTextSplitter; 10 | 11 | constructor(textSplitter: RecursiveCharacterTextSplitter) { 12 | this.textSplitter = textSplitter; 13 | } 14 | 15 | async createChunks(loader: BaseLoader, url: Input): Promise { 16 | const documents: ChunkResult['documents'] = []; 17 | const ids: ChunkResult['ids'] = []; 18 | const datas: LoaderResult = await loader.loadData(url); 19 | const metadatas: ChunkResult['metadatas'] = []; 20 | 21 | const dataPromises = datas.map(async (data) => { 22 | const { content, metaData } = data; 23 | const chunks: string[] = 
await this.textSplitter.splitText(content); 24 | chunks.forEach((chunk) => { 25 | const chunkId = createHash('sha256') 26 | .update(chunk + metaData.url) 27 | .digest('hex'); 28 | ids.push(chunkId); 29 | documents.push(chunk); 30 | metadatas.push(metaData); 31 | }); 32 | }); 33 | 34 | await Promise.all(dataPromises); 35 | 36 | return { 37 | documents, 38 | ids, 39 | metadatas, 40 | }; 41 | } 42 | } 43 | 44 | export { BaseChunker }; 45 | -------------------------------------------------------------------------------- /embedchain/chunkers/PdfFile.ts: -------------------------------------------------------------------------------- 1 | import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; 2 | 3 | import { BaseChunker } from './BaseChunker'; 4 | 5 | interface TextSplitterChunkParams { 6 | chunkSize: number; 7 | chunkOverlap: number; 8 | keepSeparator: boolean; 9 | } 10 | 11 | const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = { 12 | chunkSize: 1000, 13 | chunkOverlap: 0, 14 | keepSeparator: false, 15 | }; 16 | 17 | class PdfFileChunker extends BaseChunker { 18 | constructor() { 19 | const textSplitter = new RecursiveCharacterTextSplitter( 20 | TEXT_SPLITTER_CHUNK_PARAMS 21 | ); 22 | super(textSplitter); 23 | } 24 | } 25 | 26 | export { PdfFileChunker }; 27 | -------------------------------------------------------------------------------- /embedchain/chunkers/QnaPair.ts: -------------------------------------------------------------------------------- 1 | import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; 2 | 3 | import { BaseChunker } from './BaseChunker'; 4 | 5 | interface TextSplitterChunkParams { 6 | chunkSize: number; 7 | chunkOverlap: number; 8 | keepSeparator: boolean; 9 | } 10 | 11 | const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = { 12 | chunkSize: 300, 13 | chunkOverlap: 0, 14 | keepSeparator: false, 15 | }; 16 | 17 | class QnaPairChunker extends BaseChunker { 18 | constructor() { 19 | const 
textSplitter = new RecursiveCharacterTextSplitter( 20 | TEXT_SPLITTER_CHUNK_PARAMS 21 | ); 22 | super(textSplitter); 23 | } 24 | } 25 | 26 | export { QnaPairChunker }; 27 | -------------------------------------------------------------------------------- /embedchain/chunkers/WebPage.ts: -------------------------------------------------------------------------------- 1 | import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; 2 | 3 | import { BaseChunker } from './BaseChunker'; 4 | 5 | interface TextSplitterChunkParams { 6 | chunkSize: number; 7 | chunkOverlap: number; 8 | keepSeparator: boolean; 9 | } 10 | 11 | const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = { 12 | chunkSize: 500, 13 | chunkOverlap: 0, 14 | keepSeparator: false, 15 | }; 16 | 17 | class WebPageChunker extends BaseChunker { 18 | constructor() { 19 | const textSplitter = new RecursiveCharacterTextSplitter( 20 | TEXT_SPLITTER_CHUNK_PARAMS 21 | ); 22 | super(textSplitter); 23 | } 24 | } 25 | 26 | export { WebPageChunker }; 27 | -------------------------------------------------------------------------------- /embedchain/chunkers/index.ts: -------------------------------------------------------------------------------- 1 | import { BaseChunker } from './BaseChunker'; 2 | import { PdfFileChunker } from './PdfFile'; 3 | import { QnaPairChunker } from './QnaPair'; 4 | import { WebPageChunker } from './WebPage'; 5 | 6 | export { BaseChunker, PdfFileChunker, QnaPairChunker, WebPageChunker }; 7 | -------------------------------------------------------------------------------- /embedchain/embedchain.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable max-classes-per-file */ 2 | import type { Collection } from 'chromadb'; 3 | import type { QueryResponse } from 'chromadb/dist/main/types'; 4 | import * as fs from 'fs'; 5 | import { Document } from 'langchain/document'; 6 | import OpenAI from 'openai'; 7 | import * as path from 
'path'; 8 | import { v4 as uuidv4 } from 'uuid'; 9 | 10 | import type { BaseChunker } from './chunkers'; 11 | import { PdfFileChunker, QnaPairChunker, WebPageChunker } from './chunkers'; 12 | import type { BaseLoader } from './loaders'; 13 | import { LocalQnaPairLoader, PdfFileLoader, WebPageLoader } from './loaders'; 14 | import type { 15 | DataDict, 16 | DataType, 17 | FormattedResult, 18 | Input, 19 | LocalInput, 20 | Metadata, 21 | Method, 22 | RemoteInput, 23 | } from './models'; 24 | import { ChromaDB } from './vectordb'; 25 | import type { BaseVectorDB } from './vectordb/BaseVectorDb'; 26 | 27 | const openai = new OpenAI({ 28 | apiKey: process.env.OPENAI_API_KEY, 29 | }); 30 | 31 | class EmbedChain { 32 | dbClient: any; 33 | 34 | // TODO: Definitely assign 35 | collection!: Collection; 36 | 37 | userAsks: [DataType, Input][] = []; 38 | 39 | initApp: Promise; 40 | 41 | collectMetrics: boolean; 42 | 43 | sId: string; // sessionId 44 | 45 | constructor(db?: BaseVectorDB, collectMetrics: boolean = true) { 46 | if (!db) { 47 | this.initApp = this.setupChroma(); 48 | } else { 49 | this.initApp = this.setupOther(db); 50 | } 51 | 52 | this.collectMetrics = collectMetrics; 53 | 54 | // Send anonymous telemetry 55 | this.sId = uuidv4(); 56 | this.sendTelemetryEvent('init'); 57 | } 58 | 59 | async setupChroma(): Promise { 60 | const db = new ChromaDB(); 61 | await db.initDb; 62 | this.dbClient = db.client; 63 | if (db.collection) { 64 | this.collection = db.collection; 65 | } else { 66 | // TODO: Add proper error handling 67 | console.error('No collection'); 68 | } 69 | } 70 | 71 | async setupOther(db: BaseVectorDB): Promise { 72 | await db.initDb; 73 | // TODO: Figure out how we can initialize an unknown database. 
74 | // this.dbClient = db.client; 75 | // this.collection = db.collection; 76 | this.userAsks = []; 77 | } 78 | 79 | static getLoader(dataType: DataType) { 80 | const loaders: { [t in DataType]: BaseLoader } = { 81 | pdf_file: new PdfFileLoader(), 82 | web_page: new WebPageLoader(), 83 | qna_pair: new LocalQnaPairLoader(), 84 | }; 85 | return loaders[dataType]; 86 | } 87 | 88 | static getChunker(dataType: DataType) { 89 | const chunkers: { [t in DataType]: BaseChunker } = { 90 | pdf_file: new PdfFileChunker(), 91 | web_page: new WebPageChunker(), 92 | qna_pair: new QnaPairChunker(), 93 | }; 94 | return chunkers[dataType]; 95 | } 96 | 97 | public async add(dataType: DataType, url: RemoteInput) { 98 | const loader = EmbedChain.getLoader(dataType); 99 | const chunker = EmbedChain.getChunker(dataType); 100 | this.userAsks.push([dataType, url]); 101 | const { documents, countNewChunks } = await this.loadAndEmbed( 102 | loader, 103 | chunker, 104 | url 105 | ); 106 | 107 | if (this.collectMetrics) { 108 | const wordCount = documents.reduce( 109 | (sum, document) => sum + document.split(' ').length, 110 | 0 111 | ); 112 | 113 | this.sendTelemetryEvent('add', { 114 | data_type: dataType, 115 | word_count: wordCount, 116 | chunks_count: countNewChunks, 117 | }); 118 | } 119 | } 120 | 121 | public async addLocal(dataType: DataType, content: LocalInput) { 122 | const loader = EmbedChain.getLoader(dataType); 123 | const chunker = EmbedChain.getChunker(dataType); 124 | this.userAsks.push([dataType, content]); 125 | const { documents, countNewChunks } = await this.loadAndEmbed( 126 | loader, 127 | chunker, 128 | content 129 | ); 130 | 131 | if (this.collectMetrics) { 132 | const wordCount = documents.reduce( 133 | (sum, document) => sum + document.split(' ').length, 134 | 0 135 | ); 136 | 137 | this.sendTelemetryEvent('add_local', { 138 | data_type: dataType, 139 | word_count: wordCount, 140 | chunks_count: countNewChunks, 141 | }); 142 | } 143 | } 144 | 145 | protected async 
loadAndEmbed( 146 | loader: any, 147 | chunker: BaseChunker, 148 | src: Input 149 | ): Promise<{ 150 | documents: string[]; 151 | metadatas: Metadata[]; 152 | ids: string[]; 153 | countNewChunks: number; 154 | }> { 155 | const embeddingsData = await chunker.createChunks(loader, src); 156 | let { documents, ids, metadatas } = embeddingsData; 157 | 158 | const existingDocs = await this.collection.get({ ids }); 159 | const existingIds = new Set(existingDocs.ids); 160 | 161 | if (existingIds.size > 0) { 162 | const dataDict: DataDict = {}; 163 | for (let i = 0; i < ids.length; i += 1) { 164 | const id = ids[i]; 165 | if (!existingIds.has(id)) { 166 | dataDict[id] = { doc: documents[i], meta: metadatas[i] }; 167 | } 168 | } 169 | 170 | if (Object.keys(dataDict).length === 0) { 171 | console.log(`All data from ${src} already exists in the database.`); 172 | return { documents: [], metadatas: [], ids: [], countNewChunks: 0 }; 173 | } 174 | ids = Object.keys(dataDict); 175 | const dataValues = Object.values(dataDict); 176 | documents = dataValues.map(({ doc }) => doc); 177 | metadatas = dataValues.map(({ meta }) => meta); 178 | } 179 | 180 | const countBeforeAddition = await this.count(); 181 | await this.collection.add({ documents, metadatas, ids }); 182 | const countNewChunks = (await this.count()) - countBeforeAddition; 183 | console.log( 184 | `Successfully saved ${src}. New chunks count: ${countNewChunks}` 185 | ); 186 | return { documents, metadatas, ids, countNewChunks }; 187 | } 188 | 189 | static async formatResult( 190 | results: QueryResponse 191 | ): Promise { 192 | return results.documents[0].map((document: any, index: number) => { 193 | const metadata = results.metadatas[0][index] || {}; 194 | // TODO: Add proper error handling 195 | const distance = results.distances ? 
results.distances[0][index] : null; 196 | return [new Document({ pageContent: document, metadata }), distance]; 197 | }); 198 | } 199 | 200 | static async getOpenAiAnswer(prompt: string) { 201 | const messages: OpenAI.Chat.CreateChatCompletionRequestMessage[] = [ 202 | { role: 'user', content: prompt }, 203 | ]; 204 | const response = await openai.chat.completions.create({ 205 | model: 'gpt-3.5-turbo', 206 | messages, 207 | temperature: 0, 208 | max_tokens: 1000, 209 | top_p: 1, 210 | }); 211 | return ( 212 | response.choices[0].message?.content ?? 'Response could not be processed.' 213 | ); 214 | } 215 | 216 | protected async retrieveFromDatabase(inputQuery: string) { 217 | const result = await this.collection.query({ 218 | nResults: 1, 219 | queryTexts: [inputQuery], 220 | }); 221 | const resultFormatted = await EmbedChain.formatResult(result); 222 | const content = resultFormatted[0][0].pageContent; 223 | return content; 224 | } 225 | 226 | static generatePrompt(inputQuery: string, context: any) { 227 | const prompt = `Use the following pieces of context to answer the query at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.\n${context}\nQuery: ${inputQuery}\nHelpful Answer:`; 228 | return prompt; 229 | } 230 | 231 | static async getAnswerFromLlm(prompt: string) { 232 | const answer = await EmbedChain.getOpenAiAnswer(prompt); 233 | return answer; 234 | } 235 | 236 | public async query(inputQuery: string) { 237 | const context = await this.retrieveFromDatabase(inputQuery); 238 | const prompt = EmbedChain.generatePrompt(inputQuery, context); 239 | const answer = await EmbedChain.getAnswerFromLlm(prompt); 240 | this.sendTelemetryEvent('query'); 241 | return answer; 242 | } 243 | 244 | public async dryRun(input_query: string) { 245 | const context = await this.retrieveFromDatabase(input_query); 246 | const prompt = EmbedChain.generatePrompt(input_query, context); 247 | return prompt; 248 | } 249 | 250 | /** 251 | * Count the number of embeddings. 252 | * @returns {Promise}: The number of embeddings. 253 | */ 254 | public count(): Promise { 255 | return this.collection.count(); 256 | } 257 | 258 | protected async sendTelemetryEvent(method: Method, extraMetadata?: object) { 259 | if (!this.collectMetrics) { 260 | return; 261 | } 262 | const url = 'https://api.embedchain.ai/api/v1/telemetry/'; 263 | 264 | // Read package version from filesystem (because it's not in the ts root dir) 265 | const packageJsonPath = path.join(__dirname, '..', 'package.json'); 266 | const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf8')); 267 | 268 | const metadata = { 269 | s_id: this.sId, 270 | version: packageJson.version, 271 | method, 272 | language: 'js', 273 | ...extraMetadata, 274 | }; 275 | 276 | const maxRetries = 3; 277 | 278 | // Retry the fetch 279 | for (let i = 0; i < maxRetries; i += 1) { 280 | try { 281 | // eslint-disable-next-line no-await-in-loop 282 | const response = await fetch(url, { 283 | method: 'POST', 284 | body: JSON.stringify({ metadata }), 285 | }); 286 | 287 | if (response.ok) 
{ 288 | // Break out of the loop if the request was successful 289 | break; 290 | } else { 291 | // Log the unsuccessful response (optional) 292 | console.error( 293 | `Telemetry: Attempt ${i + 1} failed with status:`, 294 | response.status 295 | ); 296 | } 297 | } catch (error) { 298 | // Log the error (optional) 299 | console.error(`Telemetry: Attempt ${i + 1} failed with error:`, error); 300 | } 301 | 302 | // If this was the last attempt, throw an error or handle the failure 303 | if (i === maxRetries - 1) { 304 | console.error('Telemetry: Max retries reached'); 305 | } 306 | } 307 | } 308 | } 309 | 310 | class EmbedChainApp extends EmbedChain { 311 | // The EmbedChain app. 312 | // Has two functions: add and query. 313 | // adds(dataType, url): adds the data from the given URL to the vector db. 314 | // query(query): finds answer to the given query using vector database and LLM. 315 | } 316 | 317 | export { EmbedChainApp }; 318 | -------------------------------------------------------------------------------- /embedchain/index.ts: -------------------------------------------------------------------------------- 1 | import { EmbedChainApp } from './embedchain'; 2 | 3 | export const App = async () => { 4 | const app = new EmbedChainApp(); 5 | await app.initApp; 6 | return app; 7 | }; 8 | -------------------------------------------------------------------------------- /embedchain/loaders/BaseLoader.ts: -------------------------------------------------------------------------------- 1 | import type { Input, LoaderResult } from '../models'; 2 | 3 | export abstract class BaseLoader { 4 | abstract loadData(src: Input): Promise; 5 | } 6 | -------------------------------------------------------------------------------- /embedchain/loaders/LocalQnaPair.ts: -------------------------------------------------------------------------------- 1 | import type { LoaderResult, QnaPair } from '../models'; 2 | import { BaseLoader } from './BaseLoader'; 3 | 4 | class 
LocalQnaPairLoader extends BaseLoader { 5 | // eslint-disable-next-line class-methods-use-this 6 | async loadData(content: QnaPair): Promise { 7 | const [question, answer] = content; 8 | const contentText = `Q: ${question}\nA: ${answer}`; 9 | const metaData = { 10 | url: 'local', 11 | }; 12 | return [ 13 | { 14 | content: contentText, 15 | metaData, 16 | }, 17 | ]; 18 | } 19 | } 20 | 21 | export { LocalQnaPairLoader }; 22 | -------------------------------------------------------------------------------- /embedchain/loaders/PdfFile.ts: -------------------------------------------------------------------------------- 1 | import type { TextContent } from 'pdfjs-dist/types/src/display/api'; 2 | 3 | import type { LoaderResult, Metadata } from '../models'; 4 | import { cleanString } from '../utils'; 5 | import { BaseLoader } from './BaseLoader'; 6 | 7 | const pdfjsLib = require('pdfjs-dist'); 8 | 9 | interface Page { 10 | page_content: string; 11 | } 12 | 13 | class PdfFileLoader extends BaseLoader { 14 | static async getPagesFromPdf(url: string): Promise { 15 | const loadingTask = pdfjsLib.getDocument(url); 16 | const pdf = await loadingTask.promise; 17 | const { numPages } = pdf; 18 | 19 | const promises = Array.from({ length: numPages }, async (_, i) => { 20 | const page = await pdf.getPage(i + 1); 21 | const pageText: TextContent = await page.getTextContent(); 22 | const pageContent: string = pageText.items 23 | .map((item) => ('str' in item ? 
item.str : '')) 24 | .join(' '); 25 | 26 | return { 27 | page_content: pageContent, 28 | }; 29 | }); 30 | 31 | return Promise.all(promises); 32 | } 33 | 34 | // eslint-disable-next-line class-methods-use-this 35 | async loadData(url: string): Promise { 36 | const pages: Page[] = await PdfFileLoader.getPagesFromPdf(url); 37 | const output: LoaderResult = []; 38 | 39 | if (!pages.length) { 40 | throw new Error('No data found'); 41 | } 42 | 43 | pages.forEach((page) => { 44 | let content: string = page.page_content; 45 | content = cleanString(content); 46 | const metaData: Metadata = { 47 | url, 48 | }; 49 | output.push({ 50 | content, 51 | metaData, 52 | }); 53 | }); 54 | return output; 55 | } 56 | } 57 | 58 | export { PdfFileLoader }; 59 | -------------------------------------------------------------------------------- /embedchain/loaders/WebPage.ts: -------------------------------------------------------------------------------- 1 | import axios from 'axios'; 2 | import { JSDOM } from 'jsdom'; 3 | 4 | import { cleanString } from '../utils'; 5 | import { BaseLoader } from './BaseLoader'; 6 | 7 | class WebPageLoader extends BaseLoader { 8 | // eslint-disable-next-line class-methods-use-this 9 | async loadData(url: string) { 10 | const response = await axios.get(url); 11 | const html = response.data; 12 | const dom = new JSDOM(html); 13 | const { document } = dom.window; 14 | const unwantedTags = [ 15 | 'nav', 16 | 'aside', 17 | 'form', 18 | 'header', 19 | 'noscript', 20 | 'svg', 21 | 'canvas', 22 | 'footer', 23 | 'script', 24 | 'style', 25 | ]; 26 | unwantedTags.forEach((tagName) => { 27 | const elements = document.getElementsByTagName(tagName); 28 | Array.from(elements).forEach((element) => { 29 | // eslint-disable-next-line no-param-reassign 30 | (element as HTMLElement).textContent = ' '; 31 | }); 32 | }); 33 | 34 | const output = []; 35 | let content = document.body.textContent; 36 | if (!content) { 37 | throw new Error('Web page content is empty.'); 38 | } 39 | 
content = cleanString(content); 40 | const metaData = { 41 | url, 42 | }; 43 | output.push({ 44 | content, 45 | metaData, 46 | }); 47 | return output; 48 | } 49 | } 50 | 51 | export { WebPageLoader }; 52 | -------------------------------------------------------------------------------- /embedchain/loaders/index.ts: -------------------------------------------------------------------------------- 1 | import { BaseLoader } from './BaseLoader'; 2 | import { LocalQnaPairLoader } from './LocalQnaPair'; 3 | import { PdfFileLoader } from './PdfFile'; 4 | import { WebPageLoader } from './WebPage'; 5 | 6 | export { BaseLoader, LocalQnaPairLoader, PdfFileLoader, WebPageLoader }; 7 | -------------------------------------------------------------------------------- /embedchain/models/ChunkResult.ts: -------------------------------------------------------------------------------- 1 | import type { Metadata } from './Metadata'; 2 | 3 | export type ChunkResult = { 4 | documents: string[]; 5 | ids: string[]; 6 | metadatas: Metadata[]; 7 | }; 8 | -------------------------------------------------------------------------------- /embedchain/models/DataDict.ts: -------------------------------------------------------------------------------- 1 | import type { ChunkResult } from './ChunkResult'; 2 | 3 | type Data = { 4 | doc: ChunkResult['documents'][0]; 5 | meta: ChunkResult['metadatas'][0]; 6 | }; 7 | 8 | export type DataDict = { 9 | [id: string]: Data; 10 | }; 11 | -------------------------------------------------------------------------------- /embedchain/models/DataType.ts: -------------------------------------------------------------------------------- 1 | export type DataType = 'pdf_file' | 'web_page' | 'qna_pair'; 2 | -------------------------------------------------------------------------------- /embedchain/models/FormattedResult.ts: -------------------------------------------------------------------------------- 1 | import type { Document } from 'langchain/document'; 2 | 3 | 
export type FormattedResult = [Document, number | null]; 4 | -------------------------------------------------------------------------------- /embedchain/models/Input.ts: -------------------------------------------------------------------------------- 1 | import type { QnaPair } from './QnAPair'; 2 | 3 | export type RemoteInput = string; 4 | 5 | export type LocalInput = QnaPair; 6 | 7 | export type Input = RemoteInput | LocalInput; 8 | -------------------------------------------------------------------------------- /embedchain/models/LoaderResult.ts: -------------------------------------------------------------------------------- 1 | import type { Metadata } from './Metadata'; 2 | 3 | export type LoaderResult = { content: any; metaData: Metadata }[]; 4 | -------------------------------------------------------------------------------- /embedchain/models/Metadata.ts: -------------------------------------------------------------------------------- 1 | export type Metadata = { 2 | url: string; 3 | }; 4 | -------------------------------------------------------------------------------- /embedchain/models/Method.ts: -------------------------------------------------------------------------------- 1 | export type Method = 'init' | 'query' | 'add' | 'add_local'; 2 | -------------------------------------------------------------------------------- /embedchain/models/QnAPair.ts: -------------------------------------------------------------------------------- 1 | type Question = string; 2 | type Answer = string; 3 | 4 | export type QnaPair = [Question, Answer]; 5 | -------------------------------------------------------------------------------- /embedchain/models/index.ts: -------------------------------------------------------------------------------- 1 | import { DataDict } from './DataDict'; 2 | import { DataType } from './DataType'; 3 | import { FormattedResult } from './FormattedResult'; 4 | import { Input, LocalInput, RemoteInput } from './Input'; 5 | import { LoaderResult 
} from './LoaderResult'; 6 | import { Metadata } from './Metadata'; 7 | import { Method } from './Method'; 8 | import { QnaPair } from './QnAPair'; 9 | 10 | export { 11 | DataDict, 12 | DataType, 13 | FormattedResult, 14 | Input, 15 | LoaderResult, 16 | LocalInput, 17 | Metadata, 18 | Method, 19 | QnaPair, 20 | RemoteInput, 21 | }; 22 | -------------------------------------------------------------------------------- /embedchain/utils.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * This function takes in a string and performs a series of text cleaning operations. 3 | * @param {str} text: The text to be cleaned. This is expected to be a string. 4 | * @returns {str}: The cleaned text after all the cleaning operations have been performed. 5 | */ 6 | export function cleanString(text: string): string { 7 | // Replacement of newline characters: 8 | let cleanedText = text.replace(/\n/g, ' '); 9 | 10 | // Stripping and reducing multiple spaces to single: 11 | cleanedText = cleanedText.trim().replace(/\s+/g, ' '); 12 | 13 | // Removing backslashes: 14 | cleanedText = cleanedText.replace(/\\/g, ''); 15 | 16 | // Replacing hash characters: 17 | cleanedText = cleanedText.replace(/#/g, ' '); 18 | 19 | // Eliminating consecutive non-alphanumeric characters: 20 | // This regex identifies consecutive non-alphanumeric characters (i.e., not a word character [a-zA-Z0-9_] and not a whitespace) in the string 21 | // and replaces each group of such characters with a single occurrence of that character. 22 | // For example, "!!! hello !!!" would become "! hello !". 
23 | cleanedText = cleanedText.replace(/([^\w\s])\1*/g, '$1'); 24 | 25 | return cleanedText; 26 | } 27 | -------------------------------------------------------------------------------- /embedchain/vectordb/BaseVectorDb.ts: -------------------------------------------------------------------------------- 1 | class BaseVectorDB { 2 | initDb: Promise; 3 | 4 | constructor() { 5 | this.initDb = this.getClientAndCollection(); 6 | } 7 | 8 | // eslint-disable-next-line class-methods-use-this 9 | protected async getClientAndCollection(): Promise { 10 | throw new Error('getClientAndCollection() method is not implemented'); 11 | } 12 | } 13 | 14 | export { BaseVectorDB }; 15 | -------------------------------------------------------------------------------- /embedchain/vectordb/ChromaDb.ts: -------------------------------------------------------------------------------- 1 | import type { Collection } from 'chromadb'; 2 | import { ChromaClient, OpenAIEmbeddingFunction } from 'chromadb'; 3 | 4 | import { BaseVectorDB } from './BaseVectorDb'; 5 | 6 | const embedder = new OpenAIEmbeddingFunction({ 7 | openai_api_key: process.env.OPENAI_API_KEY ?? 
const { EmbedChainApp } = require('./embedchain/embedchain');

/**
 * Create and fully initialize an EmbedChain app.
 * @returns {Promise<EmbedChainApp>} an app ready for .add()/.query()
 */
async function App() {
  const app = new EmbedChainApp();
  // Bug fix: the class exposes `initApp` (camelCase, see embedchain.ts and
  // embedchain/index.ts). The old `app.init_app` was always undefined, so
  // callers could use the app before its vector DB setup completed.
  await app.initApp;
  return app;
}

module.exports = { App };
'eslint'], 3 | '**/*.ts?(x)': () => 'npm run check-types', 4 | '*.json': ['prettier --write'], 5 | }; 6 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "embedchain", 3 | "version": "0.0.8", 4 | "description": "embedchain is a framework to easily create LLM powered bots over any dataset", 5 | "main": "dist/index.js", 6 | "types": "types/index.d.ts", 7 | "files": [ 8 | "dist", 9 | "types" 10 | ], 11 | "scripts": { 12 | "build": "tsc -p tsconfig.build.json --listFiles", 13 | "prepare": "husky install", 14 | "test": "jest", 15 | "check-types": "tsc --noEmit --pretty" 16 | }, 17 | "author": "Taranjeet Singh", 18 | "license": "Apache-2.0", 19 | "dependencies": { 20 | "axios": "^1.4.0", 21 | "chromadb": "^1.5.6", 22 | "jsdom": "^22.1.0", 23 | "langchain": "^0.0.136", 24 | "openai": "^4.3.1", 25 | "pdfjs-dist": "^3.8.162", 26 | "uuid": "^9.0.0" 27 | }, 28 | "devDependencies": { 29 | "@commitlint/cli": "^17.1.2", 30 | "@commitlint/config-conventional": "^17.1.0", 31 | "@commitlint/cz-commitlint": "^17.1.2", 32 | "@types/jest": "^29.5.1", 33 | "@types/jsdom": "^21.1.1", 34 | "@typescript-eslint/eslint-plugin": "^5.41.0", 35 | "@typescript-eslint/parser": "^5.41.0", 36 | "eslint": "^8.34.0", 37 | "eslint-config-airbnb-base": "^15.0.0", 38 | "eslint-config-airbnb-typescript": "^17.0.0", 39 | "eslint-config-prettier": "^8.5.0", 40 | "eslint-plugin-import": "^2.27.5", 41 | "eslint-plugin-prettier": "^4.2.1", 42 | "eslint-plugin-simple-import-sort": "^8.0.0", 43 | "eslint-plugin-testing-library": "^5.9.1", 44 | "eslint-plugin-unused-imports": "^2.0.0", 45 | "husky": "^8.0.1", 46 | "jest": "^29.5.0", 47 | "lint-staged": "^13.0.3", 48 | "prettier": "^2.7.1", 49 | "ts-jest": "^29.1.0", 50 | "ts-loader": "^9.4.2", 51 | "typescript": "^5.2.2" 52 | } 53 | } 54 | 
-------------------------------------------------------------------------------- /tsconfig.build.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "exclude": ["embedchain/__tests__"] 4 | } 5 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es6", 4 | "module": "CommonJS", 5 | "strict": true, 6 | "outDir": "dist", 7 | "rootDir": "embedchain", 8 | "sourceMap": true, 9 | "declaration": true, 10 | "declarationDir": "types", 11 | "esModuleInterop": true 12 | }, 13 | "include": ["embedchain/**/*.ts"], 14 | "exclude": ["node_modules", "dist"] 15 | } 16 | --------------------------------------------------------------------------------