├── .cargo
└── audit.toml
├── .github
├── PULL_REQUEST_TEMPLATE.md
├── actions
│ ├── check
│ │ └── action.yml
│ └── setup
│ │ └── action.yml
└── workflows
│ └── ci.yaml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── Makefile
├── README.md
├── examples
├── Cargo.toml
├── src
│ ├── example_document_qa.rs
│ ├── example_embedding.rs
│ ├── example_github_pr_summary.rs
│ ├── example_github_repo_loader.rs
│ ├── example_markdown_loader.rs
│ ├── example_prompt_template.rs
│ ├── example_vector_store.rs
│ ├── kit
│ │ ├── mod.rs
│ │ └── repl.rs
│ └── lib.rs
└── testdata
│ └── markdowns
│ ├── copy.md
│ └── subdir
│ └── 42-data-type-map.md
├── llmchain
├── Cargo.toml
├── README.md
├── src
│ ├── common
│ │ ├── mod.rs
│ │ ├── string.rs
│ │ └── token.rs
│ ├── embeddings
│ │ ├── databend
│ │ │ ├── databend.rs
│ │ │ └── mod.rs
│ │ ├── embedding.rs
│ │ ├── mod.rs
│ │ └── openai
│ │ │ ├── mod.rs
│ │ │ └── openai.rs
│ ├── lib.rs
│ ├── llms
│ │ ├── azure_openai
│ │ │ ├── azure_openai.rs
│ │ │ └── mod.rs
│ │ ├── databend
│ │ │ ├── databend.rs
│ │ │ └── mod.rs
│ │ ├── llm.rs
│ │ ├── mod.rs
│ │ └── openai
│ │ │ ├── mod.rs
│ │ │ └── openai.rs
│ ├── loaders
│ │ ├── directory
│ │ │ ├── directory_loader.rs
│ │ │ └── mod.rs
│ │ ├── disk
│ │ │ ├── disk.rs
│ │ │ ├── local_disk.rs
│ │ │ ├── mod.rs
│ │ │ └── remote_disk.rs
│ │ ├── document.rs
│ │ ├── document_loader.rs
│ │ ├── document_path.rs
│ │ ├── document_splitter.rs
│ │ ├── documents.rs
│ │ ├── github
│ │ │ ├── github_pr_loader.rs
│ │ │ ├── github_pr_splitter.rs
│ │ │ ├── github_repo_loader.rs
│ │ │ └── mod.rs
│ │ ├── markdown
│ │ │ ├── markdown_loader.rs
│ │ │ ├── markdown_splitter.rs
│ │ │ └── mod.rs
│ │ ├── mod.rs
│ │ └── text
│ │ │ ├── mod.rs
│ │ │ ├── text_loader.rs
│ │ │ └── text_splitter.rs
│ ├── memory
│ │ ├── github_pr_summary.rs
│ │ ├── mod.rs
│ │ └── summary.rs
│ ├── prompts
│ │ ├── document_retrieval_prompt.rs
│ │ ├── github_pr_summary_prompt.rs
│ │ ├── mod.rs
│ │ ├── prompt.rs
│ │ └── text_to_sql_prompt.rs
│ └── vector_stores
│ │ ├── databend
│ │ ├── databend.rs
│ │ └── mod.rs
│ │ ├── mod.rs
│ │ └── vector_store.rs
└── tests
│ ├── it
│ ├── common
│ │ ├── mod.rs
│ │ ├── string.rs
│ │ └── token.rs
│ ├── embeddings
│ │ ├── databend
│ │ │ ├── databend.rs
│ │ │ └── mod.rs
│ │ ├── mod.rs
│ │ └── openai
│ │ │ ├── mod.rs
│ │ │ └── openai.rs
│ ├── llms
│ │ ├── azure_openai
│ │ │ ├── azure_openai.rs
│ │ │ └── mod.rs
│ │ ├── databend
│ │ │ ├── databend.rs
│ │ │ └── mod.rs
│ │ ├── mod.rs
│ │ └── openai
│ │ │ ├── mod.rs
│ │ │ └── openai.rs
│ ├── loaders
│ │ ├── directory
│ │ │ ├── directory_loader.rs
│ │ │ ├── directory_splitter.rs
│ │ │ └── mod.rs
│ │ ├── github
│ │ │ ├── github_pr_loader.rs
│ │ │ ├── github_pr_splitter.rs
│ │ │ ├── github_repo_loader.rs
│ │ │ └── mod.rs
│ │ ├── markdown
│ │ │ ├── markdown_loader.rs
│ │ │ ├── markdown_splitter.rs
│ │ │ └── mod.rs
│ │ ├── mod.rs
│ │ └── text
│ │ │ ├── mod.rs
│ │ │ ├── text_loader.rs
│ │ │ └── text_splitter.rs
│ ├── main.rs
│ ├── prompts
│ │ ├── document_retrieval_prompt.rs
│ │ ├── mod.rs
│ │ ├── prompt_template.rs
│ │ └── text_to_sql_prompt.rs
│ └── vector_stores
│ │ ├── databend
│ │ ├── databend.rs
│ │ └── mod.rs
│ │ └── mod.rs
│ └── testdata
│ ├── loaders
│ ├── directory
│ │ ├── copy.md
│ │ ├── directory_loader.golden
│ │ ├── directory_splitter_chunk_100.golden
│ │ └── subdir
│ │ │ └── 42-data-type-map.md
│ ├── github
│ │ ├── github_pr_loader.golden
│ │ ├── github_pr_splitter_default.golden
│ │ └── github_repo_loader.golden
│ ├── markdown
│ │ ├── copy-hyphen.md
│ │ ├── copy.md
│ │ ├── copy_md_loader.golden
│ │ ├── copy_md_splitter_chunk_100.golden
│ │ ├── copy_md_splitter_custom_separator.golden
│ │ └── copy_md_splitter_default.golden
│ └── text
│ │ ├── example.txt
│ │ ├── example_txt_loader.golden
│ │ ├── example_txt_splitter_chunk_10.golden
│ │ └── example_txt_splitter_default.golden
│ └── prompts
│ ├── document_retrieval_prompt.golden
│ ├── prompt_template.golden
│ └── text_to_sql_prompt.golden
├── rust-toolchain.toml
└── rustfmt.toml
/.cargo/audit.toml:
--------------------------------------------------------------------------------
1 | [advisories]
2 | ignore = [
3 | #Crate: time
4 | #Version: 0.1.45
5 | #Title: Potential segfault in the time crate
6 | #Date: 2020-11-18
7 | #ID: RUSTSEC-2020-0071
8 | #URL: https://rustsec.org/advisories/RUSTSEC-2020-0071
9 | "RUSTSEC-2020-0071"
10 | ]
11 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Summary
2 |
3 | Summary about this PR
4 |
5 | Closes #issue
--------------------------------------------------------------------------------
/.github/actions/check/action.yml:
--------------------------------------------------------------------------------
1 | name: 'Check'
2 | description: 'Check will do all essential checks'
3 | inputs:
4 | github_token:
5 | description: "Github Token"
6 | required: true
7 | runs:
8 | using: "composite"
9 | steps:
10 | - name: Format
11 | shell: bash
12 | run: |
13 | cargo fmt --all -- --check
14 |
15 | - name: Clippy
16 | shell: bash
17 | run: |
18 | cargo clippy --all-targets -- -D warnings
19 |
--------------------------------------------------------------------------------
/.github/actions/setup/action.yml:
--------------------------------------------------------------------------------
1 | name: Setup Rust Builder
2 | description: 'Prepare Rust Build Environment'
3 | inputs:
4 | cache-key:
5 | description: 'the rust cache key suffix'
6 | required: false
7 | default: ''
8 |
9 | runs:
10 | using: "composite"
11 | steps:
12 | - name: Setup sccache
13 | uses: mozilla-actions/sccache-action@v0.0.3
14 | - name: Cache Cargo
15 | uses: actions/cache@v3
16 | with:
17 | path: |
18 | ~/.cargo/registry/
19 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.toml') }}-${{ inputs.cache-key }}
20 | restore-keys: |
21 | ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.toml') }}
22 | ${{ runner.os }}-cargo
23 | - name: Setup rust related environment variables
24 | shell: bash
25 | run: |
26 | # Update rust to latest stable
27 | # rustup update stable
28 | # Disable full debug symbol generation to speed up CI build and keep memory down
29 | # "1" means line tables only, which is useful for panic tracebacks.
30 | echo "RUSTFLAGS=-C debuginfo=1" >> $GITHUB_ENV
31 | # Enable backtraces
32 | echo "RUST_BACKTRACE=1" >> $GITHUB_ENV
33 | # Enable logging
34 | echo "RUST_LOG=debug" >> $GITHUB_ENV
35 | # Enable sparse index
36 | echo "CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse" >> $GITHUB_ENV
37 | # Enable sccache
38 | echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV
39 | echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV
40 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: ci
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 |
9 | concurrency:
10 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
11 | cancel-in-progress: true
12 |
13 | jobs:
14 | check:
15 | runs-on: ubuntu-latest
16 | steps:
17 | - uses: actions/checkout@v4
18 | - uses: ./.github/actions/setup
19 | with:
20 | cache-key: check
21 | - uses: ./.github/actions/check
22 | with:
23 | github_token: ${{ secrets.GITHUB_TOKEN }}
24 |
build:
  runs-on: ${{ matrix.os }}
  strategy:
    matrix:
      os:
        - ubuntu-latest
        # The macos-11 hosted runner image has been retired by GitHub;
        # track the maintained macOS image instead so this leg keeps running.
        - macos-latest
  steps:
    - uses: actions/checkout@v4
    - uses: ./.github/actions/setup
      with:
        cache-key: build
    - run: cargo build
38 |
39 | unit:
40 | runs-on: ubuntu-latest
41 | if: github.event_name == 'push'
42 | env:
43 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
44 | DATABEND_DSN: ${{ secrets.DATABEND_DSN }}
45 | L_GITHUB_TOKEN: ${{ secrets.L_GITHUB_TOKEN }}
46 | steps:
47 | - uses: actions/checkout@v4
48 | - uses: ./.github/actions/setup
49 | with:
50 | cache-key: unit
51 | - run: cargo test --all-features -- --show-output
52 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | debug/
4 | target/
5 |
6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
8 | Cargo.lock
9 |
10 | # These are backup files generated by rustfmt
11 | **/*.rs.bk
12 |
13 | # MSVC Windows builds of rustc generate these, which store debugging information
14 | *.pdb
15 | .idea/
16 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [workspace]
2 | resolver = "2"
3 |
4 | members = [
5 | "llmchain",
6 | "examples"
7 | ]
8 |
9 |
10 | [workspace.dependencies]
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Every target in this file is a command, not a file to be built, so all of
# them are declared phony. The previous list named `check` and `integration`
# (which do not exist here) and omitted `default` and `lint`.
.PHONY: default lint build test

# `make` with no arguments builds the workspace.
default: build

# Format, lint with warnings denied, then report unused dependencies.
lint:
	cargo fmt --all
	cargo clippy --all-targets --all-features -- -D warnings
# Unused deps
	cargo machete

build:
	cargo build

test:
	cargo test
16 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # llmchain: Modern Data Transformations with LLM
2 | 🦀 + Large Language Models, inspired by LangChain.
3 |
8 |
9 | ## Features
10 |
11 | - **Models**: LLMs & Chat Models & Embedding Models
12 | - **LLMS**: OpenAI/AzureOpenAI/[DatabendCloud](https://app.databend.com)
13 |
14 | - **Prompts**: LLMs & Chat Prompt Templates
15 |
16 | - **Indexes**: Documents Loaders & Text Splitters & Vector Store & Retrievers
17 | - **Documents Loaders**: MarkdownLoader/DirectoryLoader/TextLoader/GithubPullRequestLoader
18 | - **Documents Splitters**: MarkdownSplitter, TextSplitter
19 | - **Vector Store**: [DatabendCloud](https://app.databend.com)
20 |
21 | - **Chains**: Seamlessly combines multiple actions to create unified, coherent AI services
22 |
23 | ## Examples
24 |
25 | Please see [examples](https://github.com/shafishlabs/llmchain.rs/tree/main/examples).
26 |
27 | ## Who is using llmchain?
28 |
29 | - [AskBend](https://github.com/datafuselabs/askbend): Leveraging Databend Cloud for Advanced AI Services
30 |
--------------------------------------------------------------------------------
/examples/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "llmchain-examples"
3 | version = "0.1.0"
4 | edition = "2021"
5 | license = "Apache-2.0"
6 | description = "A Rust framework for large language models, inspired by LangChain"
7 | categories = ["LLMs"]
8 | keywords = ["LLMs"]
9 |
10 | [[bin]]
11 | name = "embedding_example"
12 | path = "src/example_embedding.rs"
13 | doctest = false
14 | test = false
15 |
16 | [[bin]]
17 | name = "example_markdown_loader"
18 | path = "src/example_markdown_loader.rs"
19 | doctest = false
20 | test = false
21 |
22 | [[bin]]
23 | name = "example_prompt_template"
24 | path = "src/example_prompt_template.rs"
25 | doctest = false
26 | test = false
27 |
28 | [[bin]]
29 | name = "example_vector_store"
30 | path = "src/example_vector_store.rs"
31 | doctest = false
32 | test = false
33 |
34 | [[bin]]
35 | name = "example_document_qa"
36 | path = "src/example_document_qa.rs"
37 | doctest = false
38 | test = false
39 |
40 | [[bin]]
41 | name = "example_github_repo_loader"
42 | path = "src/example_github_repo_loader.rs"
43 | doctest = false
44 | test = false
45 |
46 | [[bin]]
47 | name = "example_github_pr_summary"
48 | path = "src/example_github_pr_summary.rs"
49 | doctest = false
50 | test = false
51 |
52 | [dependencies]
53 | anyhow = "1.0.*"
54 | colored = "2.0.0"
55 | env_logger = "0.10.0"
56 | llmchain = { path = "../llmchain" }
57 | log = "0.4.17"
58 | rustyline = "12.0.0"
59 | tokio = { version = "1.28.0", features = ["full"] }
60 | url = "2.4.0"
61 |
62 | [dev-dependencies]
63 |
--------------------------------------------------------------------------------
/examples/src/example_document_qa.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::collections::HashMap;
16 | use std::env;
17 | use std::sync::Arc;
18 | use std::time::Instant;
19 |
20 | use anyhow::Result;
21 | use env_logger::Env;
22 | use llmchain::DatabendEmbedding;
23 | use llmchain::DatabendLLM;
24 | use llmchain::DatabendVectorStore;
25 | use llmchain::DirectoryLoader;
26 | use llmchain::DocumentLoader;
27 | use llmchain::DocumentPath;
28 | use llmchain::DocumentRetrievalPrompt;
29 | use llmchain::DocumentSplitter;
30 | use llmchain::LocalDisk;
31 | use llmchain::MarkdownLoader;
32 | use llmchain::MarkdownSplitter;
33 | use llmchain::Prompt;
34 | use llmchain::VectorStore;
35 | use llmchain::LLM;
36 | use log::info;
37 |
38 | /// EXPORT DATABEND_DSN=
39 | /// cargo run --bin example_document_qa
40 | #[tokio::main]
41 | async fn main() -> Result<()> {
42 | env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
43 |
44 | let dsn = std::env::var("DATABEND_DSN")
45 | .map_err(|_| {
46 | "DATABEND_DSN is empty, please EXPORT DATABEND_DSN=".to_string()
47 | })
48 | .unwrap();
49 |
50 | let args: Vec = env::args().collect();
51 | if !args.is_empty() {
52 | let arg = args.get(1).unwrap();
53 | match arg.as_str() {
54 | "embedding" => embeddings(&dsn).await?,
55 | "query" => query(&dsn).await?,
56 | _ => {
57 | info!("cargo run --bin example_document_qa [embedding|query]")
58 | }
59 | }
60 | }
61 |
62 | Ok(())
63 | }
64 |
65 | async fn embeddings(databend_dsn: &str) -> Result<()> {
66 | // dir.
67 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
68 | let testdata_dir = format!("{}/examples/testdata", curdir);
69 | let directory_dir = format!("{}/markdowns/", testdata_dir);
70 |
71 | // Embedding.
72 | {
73 | let start = Instant::now();
74 | // Loader.
75 | info!("Prepare to load all the documents {}", directory_dir);
76 | let directory_loader = DirectoryLoader::create(LocalDisk::create()?)
77 | .with_loader("**/*.md", MarkdownLoader::create(LocalDisk::create()?));
78 | let documents = directory_loader
79 | .load(DocumentPath::from_string(&directory_dir))
80 | .await?;
81 | info!(
82 | "Load all the documents {} done, cost: {}",
83 | directory_dir,
84 | start.elapsed().as_secs()
85 | );
86 |
87 | // Splitter.
88 | info!(
89 | "Prepare to split all the documents, count: {}",
90 | documents.len()
91 | );
92 | let start = Instant::now();
93 | let documents = MarkdownSplitter::create().split_documents(&documents)?;
94 | info!(
95 | "Split all to documents, count: {}, cost: {}",
96 | documents.len(),
97 | start.elapsed().as_secs()
98 | );
99 |
100 | // embedding.
101 | info!(
102 | "Prepare to indexing the documents, count: {}",
103 | documents.len()
104 | );
105 | let start = Instant::now();
106 | let databend_embedding = Arc::new(DatabendEmbedding::create(databend_dsn));
107 | let databend = DatabendVectorStore::create(databend_dsn, databend_embedding);
108 | databend.init().await?;
109 |
110 | // indexing.
111 | let uuids = databend.add_documents(&documents).await?;
112 | info!(
113 | "Indexing the documents done, count: {}, cost: {}",
114 | uuids.len(),
115 | start.elapsed().as_secs()
116 | );
117 |
118 | Ok(())
119 | }
120 | }
121 |
122 | async fn query(databend_dsn: &str) -> Result<()> {
123 | let start = Instant::now();
124 | let question = "how to do COPY in databend";
125 |
126 | let databend_embedding = Arc::new(DatabendEmbedding::create(databend_dsn));
127 | let databend = DatabendVectorStore::create(databend_dsn, databend_embedding);
128 | databend.init().await?;
129 | let similarities = databend.similarity_search(question, 3).await?;
130 | info!(
131 | "query: {}, similarity documents: {:?}, cost: {}",
132 | question,
133 | similarities.len(),
134 | start.elapsed().as_secs()
135 | );
136 |
137 | let contexts = similarities
138 | .iter()
139 | .map(|x| format!("context:{}\nsource:{}", x.content, x.path))
140 | .collect::>()
141 | .join("");
142 | let prompt_template = DocumentRetrievalPrompt::create().with_instructions(vec!["Present your answer in markdown format, including code snippets if have, format the code snippets with SQL type if necessary.",
143 | "Do not include any links or external references in your response.\n",
144 | "Do not change the code snippets.\n",
145 | "Do not change the SQL syntax, please don't make up the function.\n",
146 | "Do not change explain any code snippets.\n",
147 | "Make the whole answer as short as possible to keep the code snippets.\n"
148 | ]);
149 | let mut input_variables = HashMap::new();
150 | input_variables.insert("question", question);
151 | input_variables.insert("contexts", &contexts);
152 | let prompt = prompt_template.format(input_variables)?;
153 |
154 | //
155 | let databend_llm = DatabendLLM::create(databend_dsn);
156 | let answer = databend_llm.generate(&prompt).await?;
157 | info!("question: {}", question);
158 | info!("answer: {:?}", answer);
159 | Ok(())
160 | }
161 |
--------------------------------------------------------------------------------
/examples/src/example_embedding.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use env_logger::Env;
17 | use llmchain::DatabendEmbedding;
18 | use llmchain::Document;
19 | use llmchain::Documents;
20 | use llmchain::Embedding;
21 | use log::info;
22 |
23 | /// EXPORT DATABEND_DSN=
24 | /// cargo run --bin example_embedding
25 | #[tokio::main]
26 | async fn main() -> Result<()> {
27 | env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
28 |
29 | let dsn = std::env::var("DATABEND_DSN")
30 | .map_err(|_| {
31 | "DATABEND_DSN is empty, please EXPORT DATABEND_DSN=".to_string()
32 | })
33 | .unwrap();
34 |
35 | // Sample documents.
36 | let documents = Documents::from(vec![
37 | Document::create("", "hello"),
38 | Document::create("", "llmchain.rs"),
39 | ]);
40 |
41 | // create embedding.
42 | let embeddings = DatabendEmbedding::create(&dsn);
43 |
44 | // embedding documents.
45 | let document_result = embeddings.embed_documents(&documents).await?;
46 | info!("{:?}", document_result);
47 |
48 | Ok(())
49 | }
50 |
--------------------------------------------------------------------------------
/examples/src/example_github_pr_summary.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use env_logger::Env;
17 | use llmchain::DatabendLLM;
18 | use llmchain::DocumentLoader;
19 | use llmchain::DocumentPath;
20 | use llmchain::DocumentSplitter;
21 | use llmchain::GithubPRDiffSplitter;
22 | use llmchain::GithubPRLoader;
23 | use llmchain::GithubPRSummary;
24 | use llmchain::Summarize;
25 | use llmchain_examples::kit::handle_repl;
26 | use llmchain_examples::kit::ReplAsyncCallback;
27 | use log::info;
28 | use url::Url;
29 |
30 | /// Interactive prompt that summarizes the GitHub pull request URL typed by the user.
31 | /// Usage: export DATABEND_DSN=<dsn> && cargo run --bin example_github_pr_summary
32 | #[tokio::main]
33 | async fn main() -> Result<(), Box<dyn std::error::Error>> {
34 |     env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
35 |     let callback: Box<ReplAsyncCallback> = Box::new(|input| Box::pin(github_pr_summary(input)));
36 |     handle_repl("pr> ", callback).await?;
37 |
38 |     Ok(())
39 | }
40 |
41 | /// Summarizes the pull request behind `pr` (a GitHub PR URL) with `DatabendLLM`.
42 | /// Reads DATABEND_DSN (required) and GITHUB_TOKEN (empty when unset) from the environment.
43 | async fn github_pr_summary(pr: String) -> Result<String> {
44 |     if pr.is_empty() {
45 |         return Ok("Input Github PR URL which you want to summary".to_string());
46 |     }
47 |
48 |     let (owner, repo, pull_id) = parse_github_pr(&pr)?;
49 |     // A missing DSN becomes an error for the caller rather than a panic.
50 |     let databend_dsn = std::env::var("DATABEND_DSN").map_err(|_| {
51 |         anyhow::anyhow!("DATABEND_DSN is empty, please EXPORT DATABEND_DSN=")
52 |     })?;
53 |
54 |     let github_token = std::env::var("GITHUB_TOKEN").unwrap_or_default();
55 |
56 |     let documents = GithubPRLoader::create(&owner, &repo, &github_token)
57 |         .load(DocumentPath::from_list(vec![pull_id]))
58 |         .await?;
59 |
60 |     let documents = GithubPRDiffSplitter::create()
61 |         .with_chunk_size(8000)
62 |         .split_documents(&documents)
63 |         .unwrap();
64 |
65 |     let databend_llm = DatabendLLM::create(&databend_dsn);
66 |     let summary = GithubPRSummary::create(databend_llm);
67 |     summary.add_documents(&documents).await?;
68 |     let pr_summary = summary.final_summary().await?;
69 |
70 |     Ok(format!(
71 |         "{}\nTokens:{}\n## Summary(By llmchain.rs)\n{}",
72 |         pr,
73 |         summary.tokens(),
74 |         pr_summary
75 |     ))
76 | }
77 |
78 | /// Parses `https://<host>/<owner>/<repo>/pull/<id>` into `(owner, repo, id)`.
79 | /// A URL with missing parts or a non-numeric id yields an error instead of a panic.
80 | fn parse_github_pr(url: &str) -> Result<(String, String, usize)> {
81 |     let parsed_url = Url::parse(url)?;
82 |     let bad_url = |part: &str| anyhow::anyhow!("invalid PR url {}: missing {}", url, part);
83 |
84 |     let mut segments = parsed_url.path_segments().ok_or_else(|| bad_url("path"))?;
85 |     let owner = segments.next().ok_or_else(|| bad_url("owner"))?.to_string();
86 |     info!("owner: {}", owner);
87 |     let repo = segments.next().ok_or_else(|| bad_url("repo"))?.to_string();
88 |     info!("repo: {}", repo);
89 |
90 |     // Skip the next segment because it's "pull" or "pulls".
91 |     let _ = segments.next();
92 |
93 |     let pull_id: usize = segments.next().ok_or_else(|| bad_url("pr id"))?.parse()?;
94 |     Ok((owner, repo, pull_id))
95 | }
96 |
--------------------------------------------------------------------------------
/examples/src/example_github_repo_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use env_logger::Env;
17 | use llmchain::DocumentLoader;
18 | use llmchain::DocumentPath;
19 | use llmchain::GithubRepoLoader;
20 | use log::info;
21 |
22 | /// Loads the llmchain.rs GitHub repository as documents and logs them.
23 | /// Usage: cargo run --bin example_github_repo_loader
24 | #[tokio::main]
25 | async fn main() -> Result<()> {
26 |     let log_env = Env::default().default_filter_or("info");
27 |     env_logger::Builder::from_env(log_env).init();
28 |
29 |     // Loader first, then the repository path it is asked to load.
30 |     let loader = GithubRepoLoader::create();
31 |     let repo_path = DocumentPath::from_string("https://github.com/shafishlabs/llmchain.rs");
32 |     let loaded = loader.load(repo_path).await?;
33 |
34 |     info!("{:?}", loaded);
35 |
36 |     Ok(())
37 | }
38 |
--------------------------------------------------------------------------------
/examples/src/example_markdown_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use env_logger::Env;
17 | use llmchain::DirectoryLoader;
18 | use llmchain::DocumentLoader;
19 | use llmchain::DocumentPath;
20 | use llmchain::LocalDisk;
21 | use llmchain::MarkdownLoader;
22 | use log::info;
23 |
24 | /// Loads every `*.md` file below `examples/testdata/markdowns/` and logs the documents.
25 | /// The directory is resolved against the current working directory.
26 | /// Usage: cargo run --bin example_markdown_loader
27 | #[tokio::main]
28 | async fn main() -> Result<()> {
29 |     env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
30 |
31 |     // Build the markdown directory path from the working directory.
32 |     // NOTE: `unwrap` panics when that path is not valid UTF-8.
33 |     let cwd = std::env::current_dir()?;
34 |     let markdown_dir = format!("{}/examples/testdata/markdowns/", cwd.to_str().unwrap());
35 |     info!("{}", markdown_dir);
36 |
37 |     // Markdown loader backed by its own local disk.
38 |     let markdown_disk = LocalDisk::create()?;
39 |     let markdown_loader = MarkdownLoader::create(markdown_disk);
40 |
41 |     // Directory loader that hands every `**/*.md` match to the markdown loader.
42 |     let dir_disk = LocalDisk::create()?;
43 |     let directory_loader = DirectoryLoader::create(dir_disk)
44 |         .with_loader("**/*.md", markdown_loader);
45 |
46 |     // Load all documents below the directory.
47 |     let target = DocumentPath::from_string(&markdown_dir);
48 |     let documents = directory_loader.load(target).await?;
49 |     info!("{:?}", documents);
50 |
51 |     Ok(())
52 | }
53 |
--------------------------------------------------------------------------------
/examples/src/example_prompt_template.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::collections::HashMap;
16 |
17 | use anyhow::Result;
18 | use env_logger::Env;
19 | use llmchain::Prompt;
20 | use llmchain::PromptTemplate;
21 | use log::info;
22 |
23 | /// Formats the "Hello {{name}}" template and logs it: cargo run --bin example_prompt_template
24 | #[tokio::main]
25 | async fn main() -> Result<()> {
26 |     let log_env = Env::default().default_filter_or("info");
27 |     env_logger::Builder::from_env(log_env).init();
28 |
29 |     // Template with one declared variable, `name`.
30 |     let variables = vec![String::from("name")];
31 |     let template = PromptTemplate::create("Hello {{name}}", variables);
32 |
33 |     // Value supplied for each variable.
34 |     let values = HashMap::from([("name", "llmchain.rs")]);
35 |
36 |     // Format the template and log the resulting prompt.
37 |     let rendered = template.format(values)?;
38 |     info!("prompt: {}", rendered);
39 |
40 |     Ok(())
41 | }
42 |
--------------------------------------------------------------------------------
/examples/src/example_vector_store.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::sync::Arc;
16 |
17 | use anyhow::Result;
18 | use env_logger::Env;
19 | use llmchain::DatabendEmbedding;
20 | use llmchain::DatabendVectorStore;
21 | use llmchain::Document;
22 | use llmchain::Documents;
23 | use llmchain::VectorStore;
24 | use log::info;
25 |
26 | /// Stores two sample documents in a Databend vector store and runs a similarity search.
27 | /// Usage: export DATABEND_DSN=<dsn> && cargo run --bin example_vector_store
28 | #[tokio::main]
29 | async fn main() -> Result<()> {
30 |     env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
31 |
32 |     // A missing DSN is returned as an error instead of panicking through `unwrap`.
33 |     let dsn = std::env::var("DATABEND_DSN")
34 |         .map_err(|_| {
35 |             anyhow::anyhow!("DATABEND_DSN is empty, please EXPORT DATABEND_DSN=")
36 |         })?;
37 |
38 |     // Sample documents.
39 |     let documents = Documents::from(vec![
40 |         Document::create("1.md", "hello"),
41 |         Document::create("2.md", "llmchain.rs"),
42 |     ]);
43 |
44 |     // Create the embedding shared with the vector store.
45 |     let databend_embedding = Arc::new(DatabendEmbedding::create(&dsn));
46 |
47 |     // Create and initialize the Databend vector store.
48 |     let databend = DatabendVectorStore::create(&dsn, databend_embedding);
49 |     databend.init().await?;
50 |
51 |     // Add the documents to the vector store.
52 |     let uuids = databend.add_documents(&documents).await?;
53 |     info!("embedding uuids:{:?}", uuids);
54 |
55 |     // Search with the same `query` value that is logged below.
56 |     let query = "llmchain";
57 |     let similarities = databend.similarity_search(query, 1).await?;
58 |     info!("query:{}, similarity documents:{:?}", query, similarities);
59 |
60 |     Ok(())
61 | }
62 |
--------------------------------------------------------------------------------
/examples/src/kit/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod repl;
16 |
17 | pub use repl::*;
18 |
--------------------------------------------------------------------------------
/examples/src/kit/repl.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::future::Future;
16 | use std::io;
17 | use std::io::Write;
18 | use std::pin::Pin;
19 | use std::thread;
20 | use std::time::Duration;
21 |
22 | use anyhow::Result;
23 | use colored::*;
24 | use rustyline::config::Builder;
25 | use rustyline::error::ReadlineError;
26 | use rustyline::CompletionType;
27 | use rustyline::DefaultEditor;
28 |
29 | /// Boxed async callback: receives the line typed by the user and resolves to the text to print.
30 | pub type ReplAsyncCallback =
31 |     dyn Fn(String) -> Pin<Box<dyn Future<Output = Result<String>> + Send>> + Send + Sync;
32 |
33 | /// Runs a read-eval-print loop: prompts with `hint`, passes each line to `callback` and
34 | /// prints the reply in green, one line at a time. `ReadlineError::Eof` ends the loop;
35 | /// a callback error or a readline I/O error is returned to the caller.
36 | pub async fn handle_repl(
37 |     hint: &str,
38 |     callback: Box<ReplAsyncCallback>,
39 | ) -> Result<(), Box<dyn std::error::Error>> {
40 |     let config = Builder::new()
41 |         .completion_prompt_limit(5)
42 |         .completion_type(CompletionType::Circular)
43 |         .build();
44 |
45 |     let mut rl = DefaultEditor::with_config(config)?;
46 |
47 |     loop {
48 |         match rl.readline(hint) {
49 |             Ok(line) => {
50 |                 let result = (callback)(line).await?;
51 |                 let stdout = io::stdout();
52 |                 let mut handle = stdout.lock();
53 |                 for text_line in result.split_terminator('\n') {
54 |                     // Emit one green line, then pause 80ms before the next one.
55 |                     writeln!(handle, "{} ", text_line.green())?;
56 |                     handle.flush()?;
57 |                     // NOTE: `thread::sleep` blocks the calling thread while pausing.
58 |                     thread::sleep(Duration::from_millis(80));
59 |                 }
60 |             }
61 |             Err(ReadlineError::Io(err)) => {
62 |                 eprintln!("io err: {err}");
63 |                 return Err(Box::new(err));
64 |             }
65 |             Err(ReadlineError::Interrupted) => println!("^C"),
66 |             Err(ReadlineError::Eof) => break,
67 |             // Report any other readline error instead of dropping it silently.
68 |             Err(other) => eprintln!("readline err: {other}"),
69 |         }
70 |     }
71 |     println!("Bye~");
72 |
73 |     Ok(())
74 | }
75 |
--------------------------------------------------------------------------------
/examples/src/lib.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | pub mod kit;
16 |
--------------------------------------------------------------------------------
/examples/testdata/markdowns/subdir/42-data-type-map.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Map
3 | ---
4 |
5 | The MAP data structure is utilized for holding a set of `Key:Value` pairs, and stores data using a nested data structure of Array(Tuple(key, value)). It is appropriate in situations where the data type is constant, but the `Key`'s value cannot be entirely ascertained.
6 |
7 | ## Understanding Key:Value
8 |
9 | The `Key` is of a specified basic data type, including Boolean, Number, Decimal, String, Date, or Timestamp. A `Key`'s value cannot be Null, and duplicates are not allowed. The `Value` can be any data type, including nested arrays, tuples, and so on.
10 |
11 | Map data can be generated through `Key:Value` pairs enclosed in curly braces or by using the Map function to convert two arrays into a Map. The Map function takes two arrays as input, where the elements in the first array serve as the keys and the elements in the second array serve as the values. See an example below:
12 |
13 | ```sql
14 | -- Input arrays: [1, 2] and ['v1', 'v2']
15 | -- Resulting Map: {1: 'v1', 2: 'v2'}
16 |
17 | SELECT {'k1': 1, 'k2': 2}, map([1, 2], ['v1', 'v2']);
18 | +-----------------+---------------------------+
19 | | {'k1':1,'k2':2} | map([1, 2], ['v1', 'v2']) |
20 | +-----------------+---------------------------+
21 | | {'k1':1,'k2':2} | {1:'v1',2:'v2'} |
22 | +-----------------+---------------------------+
23 | ```
24 |
25 | ## Map and Bloom Filter Index
26 |
27 | In Databend Map, a bloom filter index is created for the value with certain data types: `Numeric`, `String`, `Timestamp`, and `Date`.
28 |
29 | This makes it easier and faster to search for values in the MAP data structure.
30 |
31 | The implementation of the bloom filter index in Databend Map is in [PR#10457](https://github.com/datafuselabs/databend/pull/10457).
32 |
33 | The bloom filter is particularly effective in reducing query time when the queried value does not exist.
34 |
35 | For example:
36 | ```sql
37 | select * from nginx_log where log['ip'] = '205.91.162.148';
38 | +----+----------------------------------------+
39 | | id | log |
40 | +----+----------------------------------------+
41 | | 1 | {'ip':'205.91.162.148','url':'test-1'} |
42 | +----+----------------------------------------+
43 | 1 row in set
44 | Time: 1.733s
45 |
46 | select * from nginx_log where log['ip'] = '205.91.162.141';
47 | +----+-----+
48 | | id | log |
49 | +----+-----+
50 | +----+-----+
51 | 0 rows in set
52 | Time: 0.129s
53 | ```
54 |
55 | ## Examples
56 |
57 | **Create a table with a Map column for storing web traffic data**
58 |
59 | ```sql
60 | CREATE TABLE web_traffic_data(id INT64, traffic_info MAP(STRING, STRING));
61 |
62 | DESC web_traffic_data;
63 | +-------------+--------------------+------+---------+-------+
64 | | Field | Type | Null | Default | Extra |
65 | +-------------+--------------------+------+---------+-------+
66 | | id | INT64 | NO | | |
67 | | traffic_info| MAP(STRING, STRING)| NO | {} | |
68 | +-------------+--------------------+------+---------+-------+
69 | ```
70 |
71 | **Insert Map data containing IP addresses and URLs visited**
72 |
73 | ```sql
74 | INSERT INTO web_traffic_data VALUES(1, {'ip': '192.168.1.1', 'url': 'example.com/home'}),
75 | (2, {'ip': '192.168.1.2', 'url': 'example.com/about'}),
76 | (3, {'ip': '192.168.1.1', 'url': 'example.com/contact'});
77 | ```
78 |
79 | **Query**
80 |
81 | ```sql
82 | SELECT * FROM web_traffic_data;
83 |
84 | +----+-----------------------------------+
85 | | id | traffic_info |
86 | +----+-----------------------------------+
87 | | 1 | {'ip':'192.168.1.1','url':'example.com/home'} |
88 | | 2 | {'ip':'192.168.1.2','url':'example.com/about'} |
89 | | 3 | {'ip':'192.168.1.1','url':'example.com/contact'} |
90 | +----+-----------------------------------+
91 | ```
92 |
93 | **Query the number of visits per IP address**
94 |
95 | ```sql
96 | SELECT traffic_info['ip'] as ip_address, COUNT(*) as visits
97 | FROM web_traffic_data
98 | GROUP BY traffic_info['ip'];
99 |
100 | +-------------+--------+
101 | | ip_address | visits |
102 | +-------------+--------+
103 | | 192.168.1.1 | 2 |
104 | | 192.168.1.2 | 1 |
105 | +-------------+--------+
106 | ```
107 |
108 | **Query the most visited URLs**
109 | ```sql
110 | SELECT traffic_info['url'] as url, COUNT(*) as visits
111 | FROM web_traffic_data
112 | GROUP BY traffic_info['url']
113 | ORDER BY visits DESC
114 | LIMIT 3;
115 |
116 | +---------------------+--------+
117 | | url | visits |
118 | +---------------------+--------+
119 | | example.com/home | 1 |
120 | | example.com/about | 1 |
121 | | example.com/contact | 1 |
122 | +---------------------+--------+
123 | ```
124 |
--------------------------------------------------------------------------------
/llmchain/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "llmchain"
3 | version = "0.1.3"
4 | authors = ["BohuTANG "]
5 | edition = "2021"
6 | description = "A Rust framework for large language models, inspired by LangChain"
7 | repository = "https://github.com/shafishlabs/llmchain.rs"
8 | license = "Apache-2.0"
9 | keywords = ["openai", "ai", "llm", "gpt", "library"]
10 |
11 | [dependencies]
12 | anyhow = "1.0.*"
13 | async-openai = "0.14.1"
14 | async-recursion = "1.0.5"
15 | async-trait = "0.1.68"
16 | databend-driver = "0.12.5"
17 | derive_builder = "0.20.0"
18 | env_logger = "0.11.1"
19 | futures = "0.3.*"
20 | git2 = "0.18.0"
21 | glob = "0.3.1"
22 | goldenfile = "1.4"
23 | log = "0.4.17"
24 | md5 = "0.7.0"
25 | octocrab = "0.33.3"
26 | opendal = "0.44.2"
27 | parking_lot = "0.12.1"
28 | patch = "0.7.0"
29 | rayon = "1.7.0"
30 | regex = "1.8.1"
31 | reqwest = "0.11.24"
32 | serde_json = "1.0.95"
33 | tiktoken-rs = "0.5.0"
34 | tokio = { version = "1.28.0", features = ["full"] }
35 | tokio-stream = "0.1.12"
36 | uuid = "1.3.3"
37 |
38 | [dev-dependencies]
39 |
--------------------------------------------------------------------------------
/llmchain/README.md:
--------------------------------------------------------------------------------
1 | # llmchain: Modern Data Transformations with LLM
2 | 🦀 + Large Language Models, inspired by LangChain.
3 |
8 |
9 | ## Features
10 |
11 | - **Models**: LLMs & Chat Models & Embedding Models
12 | - **LLMs**: OpenAI/AzureOpenAI/[DatabendCloud](https://app.databend.com)
13 |
14 | - **Prompts**: LLMs & Chat Prompt Templates
15 |
16 | - **Indexes**: Documents Loaders & Text Splitters & Vector Store & Retrievers
17 | - **Documents Loaders**: MarkdownLoader/DirectoryLoader/TextLoader/GithubPullRequestLoader
18 | - **Documents Splitters**: MarkdownSplitter, TextSplitter
19 | - **Vector Store**: [DatabendCloud](https://app.databend.com)
20 |
21 | - **Chains**: Seamlessly combines multiple actions to create unified, coherent AI services
22 |
23 | ## Examples
24 |
25 | Please see [examples](https://github.com/shafishlabs/llmchain.rs/tree/main/examples).
26 |
27 | ## Who is using llmchain?
28 |
29 | - [AskBend](https://github.com/datafuselabs/askbend): Leveraging Databend Cloud for Advanced AI Services
30 |
--------------------------------------------------------------------------------
/llmchain/src/common/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | mod string;
15 | mod token;
16 |
17 | pub use string::escape_sql_string;
18 | pub use token::chat_tokens;
19 |
--------------------------------------------------------------------------------
/llmchain/src/common/string.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | /// Returns a copy of `input` with SQL-sensitive characters rewritten.
16 | ///
17 | /// Backslashes and single quotes are doubled, each newline becomes a space and each
18 | /// carriage return becomes the two characters `\r`.
19 | pub fn escape_sql_string(input: &str) -> String {
20 |     let mut escaped = String::with_capacity(input.len());
21 |     for ch in input.chars() {
22 |         match ch {
23 |             '\\' => escaped.push_str("\\\\"),
24 |             '\'' => escaped.push_str("''"),
25 |             '\n' => escaped.push(' '),
26 |             '\r' => escaped.push_str("\\r"),
27 |             other => escaped.push(other),
28 |         }
29 |     }
30 |     escaped
31 | }
22 |
--------------------------------------------------------------------------------
/llmchain/src/common/token.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use tiktoken_rs::r50k_base;
17 |
18 | /// Splits `input` into token strings with tiktoken's `r50k_base` encoding.
19 | /// The encoder is rebuilt on every call; `true` is passed as `split_by_token`'s second argument.
20 | pub fn chat_tokens(input: &str) -> Result<Vec<String>> {
21 |     let rke = r50k_base()?;
22 |     rke.split_by_token(input, true)
23 | }
22 |
--------------------------------------------------------------------------------
/llmchain/src/embeddings/databend/databend.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::sync::Arc;
16 |
17 | use anyhow::Result;
18 |
19 | use crate::DatabendLLM;
20 | use crate::Documents;
21 | use crate::Embedding;
22 | use crate::LLM;
23 |
24 | /// Embedding backend that delegates to a shared `DatabendLLM`.
25 | pub struct DatabendEmbedding {
26 |     llm: Arc<DatabendLLM>,
27 | }
28 |
29 | impl DatabendEmbedding {
30 |     /// Builds the embedding on top of a `DatabendLLM` created from `dsn`.
31 |     pub fn create(dsn: &str) -> Self {
32 |         DatabendEmbedding {
33 |             llm: DatabendLLM::create(dsn),
34 |         }
35 |     }
36 | }
35 |
36 | // NOTE(review): the vector element type was lost in this listing; `f32` is assumed here.
37 | // Confirm against the result type of `LLM::embedding`.
38 | #[async_trait::async_trait]
39 | impl Embedding for DatabendEmbedding {
40 |     /// Embeds a single query string; yields an empty vector when no embedding comes back.
41 |     async fn embed_query(&self, input: &str) -> Result<Vec<f32>> {
42 |         let inputs = vec![input.to_string()];
43 |         let result = self.llm.embedding(inputs).await?;
44 |
45 |         // Take the first embedding by value; an empty result maps to an empty vector.
46 |         Ok(result.embeddings.into_iter().next().unwrap_or_default())
47 |     }
48 |
49 |     /// Embeds the content of every document and returns the embeddings as received.
50 |     async fn embed_documents(&self, inputs: &Documents) -> Result<Vec<Vec<f32>>> {
51 |         let inputs = inputs.iter().map(|x| x.content).collect::<Vec<_>>();
52 |         let result = self.llm.embedding(inputs).await?;
53 |
54 |         Ok(result.embeddings)
55 |     }
56 | }
56 |
--------------------------------------------------------------------------------
/llmchain/src/embeddings/databend/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod databend;
17 |
18 | pub use databend::DatabendEmbedding;
19 |
--------------------------------------------------------------------------------
/llmchain/src/embeddings/embedding.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 |
17 | use crate::Documents;
18 |
19 | /// Common interface implemented by the embedding backends.
20 | // NOTE(review): the vector element type was lost in this listing; `f32` is assumed here.
21 | #[async_trait::async_trait]
22 | pub trait Embedding: Send + Sync {
23 |     /// Embeds a single query string.
24 |     async fn embed_query(&self, input: &str) -> Result<Vec<f32>>;
25 |     /// Embeds every document in `inputs`.
26 |     async fn embed_documents(&self, inputs: &Documents) -> Result<Vec<Vec<f32>>>;
27 | }
24 |
--------------------------------------------------------------------------------
/llmchain/src/embeddings/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod databend;
16 | mod embedding;
17 | mod openai;
18 |
19 | pub use databend::DatabendEmbedding;
20 | pub use embedding::Embedding;
21 | pub use openai::OpenAIEmbedding;
22 |
--------------------------------------------------------------------------------
/llmchain/src/embeddings/openai/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod openai;
17 |
18 | pub use openai::OpenAIEmbedding;
19 |
--------------------------------------------------------------------------------
/llmchain/src/embeddings/openai/openai.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::sync::Arc;
16 |
17 | use anyhow::Result;
18 |
19 | use crate::Documents;
20 | use crate::Embedding;
21 | use crate::OpenAI;
22 | use crate::LLM;
23 |
24 | /// Embedding backend that delegates to a shared `OpenAI` client.
25 | pub struct OpenAIEmbedding {
26 |     llm: Arc<OpenAI>,
27 | }
28 |
29 | impl OpenAIEmbedding {
30 |     /// Wraps anything convertible into `Arc<OpenAI>`.
31 |     pub fn create<T: Into<Arc<OpenAI>>>(open_ai: T) -> Self {
32 |         OpenAIEmbedding {
33 |             llm: open_ai.into(),
34 |         }
35 |     }
36 | }
35 |
36 | // NOTE(review): the vector element type was lost in this listing; `f32` is assumed here.
37 | // Confirm against the result type of `LLM::embedding`.
38 | #[async_trait::async_trait]
39 | impl Embedding for OpenAIEmbedding {
40 |     /// Embeds a single query string; yields an empty vector when no embedding comes back.
41 |     async fn embed_query(&self, input: &str) -> Result<Vec<f32>> {
42 |         let inputs = vec![input.to_string()];
43 |         let result = self.llm.embedding(inputs).await?;
44 |
45 |         // Take the first embedding by value; an empty result maps to an empty vector.
46 |         Ok(result.embeddings.into_iter().next().unwrap_or_default())
47 |     }
48 |
49 |     /// Embeds the content of every document and returns the embeddings as received.
50 |     async fn embed_documents(&self, inputs: &Documents) -> Result<Vec<Vec<f32>>> {
51 |         let inputs = inputs.iter().map(|x| x.content).collect::<Vec<_>>();
52 |         let result = self.llm.embedding(inputs).await?;
53 |
54 |         Ok(result.embeddings)
55 |     }
56 | }
56 |
--------------------------------------------------------------------------------
/llmchain/src/lib.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod common;
16 | mod embeddings;
17 | mod llms;
18 | mod loaders;
19 | mod memory;
20 | mod prompts;
21 | mod vector_stores;
22 |
23 | pub use common::*;
24 | pub use embeddings::*;
25 | pub use llms::*;
26 | pub use loaders::*;
27 | pub use memory::*;
28 | pub use prompts::*;
29 | pub use vector_stores::*;
30 |
--------------------------------------------------------------------------------
/llmchain/src/llms/azure_openai/azure_openai.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::sync::Arc;
16 |
17 | use anyhow::Result;
18 | use async_openai::config::AzureConfig;
19 | use async_openai::types::ChatCompletionRequestMessageArgs;
20 | use async_openai::types::CreateChatCompletionRequestArgs;
21 | use async_openai::types::CreateEmbeddingRequestArgs;
22 | use async_openai::types::Role;
23 | use async_openai::Client;
24 | use parking_lot::RwLock;
25 |
26 | use crate::EmbeddingResult;
27 | use crate::GenerateResult;
28 | use crate::OpenAIEmbeddingModel;
29 | use crate::OpenAIGenerateModel;
30 | use crate::LLM;
31 |
32 | pub struct AzureOpenAI {
33 | api_base: String,
34 | api_key: String,
35 | api_version: String,
36 | deployment_id: String,
37 |
38 | // The maximum number of tokens allowed for the generated answer.
39 | // By default, the number of tokens the model can return will be (4095 - prompt tokens).
40 | max_tokens: RwLock,
41 |
42 | // What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
43 | // We generally recommend altering this or top_p but not both.
44 | temperature: RwLock,
45 |
46 | embedding_model: RwLock,
47 | generate_model: RwLock,
48 | }
49 |
50 | impl AzureOpenAI {
51 | pub fn create(api_base: &str, api_key: &str, deployment_id: &str) -> Arc {
52 | Arc::new(AzureOpenAI {
53 | api_base: api_base.to_string(),
54 | api_key: api_key.to_string(),
55 | api_version: "2023-03-15-preview".to_string(),
56 | deployment_id: deployment_id.to_string(),
57 | max_tokens: RwLock::new(4095),
58 | temperature: RwLock::new(1.0),
59 | embedding_model: RwLock::new(OpenAIEmbeddingModel::TextEmbeddingAda002),
60 | generate_model: RwLock::new(OpenAIGenerateModel::Gpt35),
61 | })
62 | }
63 |
64 | pub fn with_max_tokens(self: &Arc, max_tokens: u16) -> Arc {
65 | *self.max_tokens.write() = max_tokens;
66 | self.clone()
67 | }
68 |
69 | pub fn with_embedding_model(self: &Arc, model: OpenAIEmbeddingModel) -> Arc {
70 | *self.embedding_model.write() = model;
71 | self.clone()
72 | }
73 |
74 | pub fn with_generate_model(self: &Arc, model: OpenAIGenerateModel) -> Arc {
75 | *self.generate_model.write() = model;
76 | self.clone()
77 | }
78 |
79 | pub fn with_temperature(self: &Arc, temperature: f32) -> Arc {
80 | *self.temperature.write() = temperature;
81 | self.clone()
82 | }
83 |
84 | pub fn get_client(&self) -> Client {
85 | let conf = AzureConfig::new()
86 | .with_api_key(&self.api_key)
87 | .with_api_base(&self.api_base)
88 | .with_deployment_id(&self.deployment_id)
89 | .with_api_version(&self.api_version);
90 | Client::with_config(conf)
91 | }
92 | }
93 |
94 | #[async_trait::async_trait]
95 | impl LLM for AzureOpenAI {
96 | async fn embedding(&self, inputs: Vec) -> Result {
97 | let request = CreateEmbeddingRequestArgs::default()
98 | .model(&self.embedding_model.read().to_string())
99 | .input(inputs)
100 | .build()?;
101 |
102 | let client = self.get_client();
103 | let response = client.embeddings().create(request).await?;
104 | let mut embeddings = Vec::with_capacity(response.data.len());
105 | for embedding in &response.data {
106 | embeddings.push(embedding.embedding.clone());
107 | }
108 |
109 | let embedding_result = EmbeddingResult {
110 | prompt_tokens: response.usage.prompt_tokens,
111 | total_tokens: response.usage.total_tokens,
112 | embeddings,
113 | };
114 | Ok(embedding_result)
115 | }
116 |
117 | async fn generate(&self, input: &str) -> Result {
118 | let request = CreateChatCompletionRequestArgs::default()
119 | .max_tokens(*self.max_tokens.read() - input.len() as u16)
120 | .model(&self.generate_model.read().to_string())
121 | .temperature(*self.temperature.read())
122 | .messages([ChatCompletionRequestMessageArgs::default()
123 | .role(Role::Assistant)
124 | .content(input)
125 | .build()?])
126 | .build()?;
127 |
128 | let client = self.get_client();
129 | let response = client.chat().create(request).await?;
130 |
131 | let mut generate_result = GenerateResult::default();
132 |
133 | // Usage.
134 | if let Some(usage) = response.usage {
135 | generate_result.prompt_tokens = usage.prompt_tokens;
136 | generate_result.total_tokens = usage.total_tokens;
137 | generate_result.completion_tokens = usage.completion_tokens;
138 | }
139 |
140 | if let Some(choice) = response.choices.first() {
141 | generate_result.generation = choice.message.content.clone().unwrap_or_default();
142 | }
143 |
144 | Ok(generate_result)
145 | }
146 | }
147 |
--------------------------------------------------------------------------------
/llmchain/src/llms/azure_openai/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod azure_openai;
17 |
18 | pub use azure_openai::AzureOpenAI;
19 |
--------------------------------------------------------------------------------
/llmchain/src/llms/databend/databend.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::sync::Arc;
16 |
17 | use anyhow::anyhow;
18 | use anyhow::Result;
19 | use databend_driver::Client;
20 | use log::info;
21 | use tokio_stream::StreamExt;
22 |
23 | use crate::escape_sql_string;
24 | use crate::EmbeddingResult;
25 | use crate::GenerateResult;
26 | use crate::LLM;
27 |
28 | pub struct DatabendLLM {
29 | client: Client,
30 | }
31 |
32 | impl DatabendLLM {
33 | pub fn create(dsn: &str) -> Arc {
34 | Arc::new(DatabendLLM {
35 | client: Client::new(dsn.to_string()),
36 | })
37 | }
38 | }
39 |
40 | #[async_trait::async_trait]
41 | impl LLM for DatabendLLM {
42 | async fn embedding(&self, inputs: Vec) -> Result {
43 | let conn = self.client.get_conn().await?;
44 | let mut embeddings = vec![];
45 | for (i, input) in inputs.iter().enumerate() {
46 | let now = std::time::Instant::now();
47 | type RowResult = (String,);
48 | let mut rows = conn
49 | .query_iter(&format!(
50 | "SELECT ai_embedding_vector('{}')",
51 | escape_sql_string(input)
52 | ))
53 | .await?;
54 | while let Some(row) = rows.next().await {
55 | let row: RowResult = row?.try_into().map_err(|e: String| anyhow!(e))?;
56 | let array_vec: Vec = serde_json::from_str(&row.0)?;
57 | info!(
58 | "embedding {}/{}, time: {:?}",
59 | i + 1,
60 | inputs.len(),
61 | now.elapsed()
62 | );
63 | embeddings.push(array_vec);
64 | }
65 | }
66 |
67 | Ok(EmbeddingResult {
68 | prompt_tokens: 0,
69 | total_tokens: 0,
70 | embeddings,
71 | })
72 | }
73 |
74 | async fn generate(&self, input: &str) -> Result {
75 | let conn = self.client.get_conn().await?;
76 | let row = conn
77 | .query_row(&format!(
78 | "SELECT ai_text_completion('{}')",
79 | escape_sql_string(input)
80 | ))
81 | .await?;
82 |
83 | let generation = match row {
84 | Some(row) => {
85 | let (gen,): (String,) = row.try_into().map_err(|e: String| anyhow!(e))?;
86 | gen
87 | }
88 | None => "".to_string(),
89 | };
90 |
91 | Ok(GenerateResult {
92 | prompt_tokens: 0,
93 | completion_tokens: 0,
94 | total_tokens: 0,
95 | generation,
96 | })
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/llmchain/src/llms/databend/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod databend;
17 |
18 | pub use databend::DatabendLLM;
19 |
--------------------------------------------------------------------------------
/llmchain/src/llms/llm.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 |
/// Outcome of an embedding request.
pub struct EmbeddingResult {
    // Usage
    pub prompt_tokens: u32,
    pub total_tokens: u32,

    // One vector per embedded input, as returned by the backend.
    // NOTE(review): the element type was missing in this listing; `f32` is
    // reconstructed from how the backends fill this field — confirm.
    pub embeddings: Vec<Vec<f32>>,
}
24 |
/// Outcome of a text-generation request.
///
/// `Default` produces zeroed counters and an empty `generation`.
#[derive(Default, Debug)]
pub struct GenerateResult {
    // Token usage reported for the request.
    pub prompt_tokens: u32,
    pub completion_tokens: u32,
    pub total_tokens: u32,

    // The generated text.
    pub generation: String,
}
34 |
/// A single chat message, made of a role name and its text.
pub struct ChatResult {
    pub role: String,
    pub content: String,
}
39 |
40 | #[async_trait::async_trait]
41 | pub trait LLM: Send + Sync { // Interface shared by the model backends in this crate (OpenAI, AzureOpenAI, DatabendLLM implement it).
42 | async fn embedding(&self, inputs: Vec) -> Result; // Embeds a batch of inputs. NOTE(review): generic arguments look stripped from this listing — confirm the exact types.
43 | async fn generate(&self, input: &str) -> Result; // Generates text for a single prompt string.
44 | async fn chat(&self, _input: Vec) -> Result> { // Optional chat entry point with a placeholder default body.
45 | unimplemented!("") // Panics when called on a backend that does not override `chat`.
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/llmchain/src/llms/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod azure_openai;
16 | mod databend;
17 | mod llm;
18 | mod openai;
19 |
20 | pub use azure_openai::AzureOpenAI;
21 | pub use databend::DatabendLLM;
22 | pub use llm::*;
23 | pub use openai::OpenAI;
24 | pub use openai::OpenAIBuilder;
25 | pub use openai::OpenAIBuilderError;
26 | pub use openai::OpenAIEmbeddingModel;
27 | pub use openai::OpenAIGenerateModel;
28 |
--------------------------------------------------------------------------------
/llmchain/src/llms/openai/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod openai;
17 |
18 | pub use openai::OpenAI;
19 | pub use openai::OpenAIBuilder;
20 | pub use openai::OpenAIBuilderError;
21 | pub use openai::OpenAIEmbeddingModel;
22 | pub use openai::OpenAIGenerateModel;
23 |
--------------------------------------------------------------------------------
/llmchain/src/llms/openai/openai.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use async_openai::config::OpenAIConfig;
17 | use async_openai::types::ChatCompletionRequestMessageArgs;
18 | use async_openai::types::CreateChatCompletionRequestArgs;
19 | use async_openai::types::CreateEmbeddingRequestArgs;
20 | use async_openai::types::Role;
21 | use async_openai::Client;
22 | use derive_builder::Builder;
23 |
24 | use crate::EmbeddingResult;
25 | use crate::GenerateResult;
26 | use crate::LLM;
27 |
/// Embedding models that can be selected for OpenAI-compatible backends.
pub enum OpenAIEmbeddingModel {
    TextEmbeddingAda002,
}

/// Renders the model as the identifier sent in API requests.
///
/// `Display` is implemented instead of `ToString`; `to_string()` remains
/// available through the standard blanket implementation.
impl std::fmt::Display for OpenAIEmbeddingModel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            OpenAIEmbeddingModel::TextEmbeddingAda002 => "text-embedding-ada-002",
        };
        f.write_str(name)
    }
}
37 |
/// Chat-completion models that can be selected for OpenAI-compatible backends.
pub enum OpenAIGenerateModel {
    Gpt35,
    Gpt4,
}

/// Renders the model as the identifier sent in API requests.
///
/// `Display` is implemented instead of `ToString`; `to_string()` remains
/// available through the standard blanket implementation.
impl std::fmt::Display for OpenAIGenerateModel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            OpenAIGenerateModel::Gpt35 => "gpt-3.5-turbo",
            OpenAIGenerateModel::Gpt4 => "gpt-4",
        };
        f.write_str(name)
    }
}
51 |
52 | #[derive(Builder)]
53 | #[builder(name = "OpenAIBuilder")]
54 | #[builder(derive(Debug))]
55 | pub struct OpenAI {
56 | api_base: String,
57 | api_key: String,
58 | org_id: Option,
59 |
60 | // The maximum number of tokens allowed for the generated answer.
61 | // By default, the number of tokens the model can return will be (4095 - prompt tokens).
62 | max_tokens: u16,
63 |
64 | // What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
65 | // We generally recommend altering this or top_p but not both.
66 | temperature: f32,
67 |
68 | embedding_model: String,
69 | generate_model: String,
70 |
71 | http_client: reqwest::Client,
72 | }
73 |
74 | impl OpenAI {
75 | pub fn create>(api_key: S) -> OpenAI {
76 | OpenAIBuilder::default()
77 | .api_key(api_key.into())
78 | .build()
79 | .unwrap()
80 | }
81 |
82 | fn get_client(&self) -> Client {
83 | let mut conf = OpenAIConfig::new()
84 | .with_api_key(&self.api_key)
85 | .with_api_base(&self.api_base);
86 |
87 | if let Some(org_id) = &self.org_id {
88 | conf = conf.with_org_id(org_id);
89 | }
90 |
91 | Client::with_config(conf).with_http_client(self.http_client.clone())
92 | }
93 | }
94 |
95 | #[async_trait::async_trait]
96 | impl LLM for OpenAI {
97 | async fn embedding(&self, inputs: Vec) -> Result {
98 | let request = CreateEmbeddingRequestArgs::default()
99 | .model(&self.embedding_model.to_string())
100 | .input(inputs)
101 | .build()?;
102 |
103 | let client = self.get_client();
104 | let response = client.embeddings().create(request).await?;
105 | let mut embeddings = Vec::with_capacity(response.data.len());
106 | for embedding in &response.data {
107 | embeddings.push(embedding.embedding.clone());
108 | }
109 |
110 | let embedding_result = EmbeddingResult {
111 | prompt_tokens: response.usage.prompt_tokens,
112 | total_tokens: response.usage.total_tokens,
113 | embeddings,
114 | };
115 | Ok(embedding_result)
116 | }
117 |
118 | async fn generate(&self, input: &str) -> Result {
119 | let request = CreateChatCompletionRequestArgs::default()
120 | .max_tokens(self.max_tokens - input.len() as u16)
121 | .model(self.generate_model.to_string())
122 | .temperature(self.temperature)
123 | .messages([ChatCompletionRequestMessageArgs::default()
124 | .role(Role::Assistant)
125 | .content(input)
126 | .build()?])
127 | .build()?;
128 |
129 | let client = self.get_client();
130 | let response = client.chat().create(request).await?;
131 |
132 | let mut generate_result = GenerateResult::default();
133 |
134 | // Usage.
135 | if let Some(usage) = response.usage {
136 | generate_result.prompt_tokens = usage.prompt_tokens;
137 | generate_result.total_tokens = usage.total_tokens;
138 | generate_result.completion_tokens = usage.completion_tokens;
139 | }
140 |
141 | if let Some(choice) = response.choices.first() {
142 | generate_result.generation = choice.message.content.clone().unwrap_or_default();
143 | }
144 |
145 | Ok(generate_result)
146 | }
147 | }
148 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/directory/directory_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::collections::HashMap;
16 | use std::sync::Arc;
17 |
18 | use anyhow::Result;
19 | use async_recursion::async_recursion;
20 | use glob::Pattern;
21 | use opendal::EntryMode;
22 | use rayon::iter::ParallelIterator;
23 | use rayon::prelude::IntoParallelRefIterator;
24 | use rayon::ThreadPoolBuilder;
25 |
26 | use crate::Disk;
27 | use crate::DocumentLoader;
28 | use crate::DocumentPath;
29 | use crate::Documents;
30 |
31 | pub struct DirectoryLoader {
32 | disk: Arc,
33 | loaders: HashMap>,
34 | max_threads: usize,
35 | }
36 |
37 | impl DirectoryLoader {
38 | pub fn create(disk: Arc) -> Self {
39 | DirectoryLoader {
40 | disk,
41 | loaders: HashMap::default(),
42 | max_threads: 8,
43 | }
44 | }
45 |
46 | pub fn with_loader(mut self, glob: &str, loader: Arc) -> Self {
47 | self.loaders.insert(glob.to_string(), loader);
48 | self
49 | }
50 |
51 | pub fn with_max_threads(mut self, max_threads: usize) -> Self {
52 | self.max_threads = max_threads;
53 | self
54 | }
55 |
56 | #[async_recursion]
57 | async fn process_directory(
58 | &self,
59 | path: &str,
60 | tasks: &mut Vec<(String, Arc)>,
61 | ) -> Result<()> {
62 | let op = self.disk.get_operator()?;
63 | let entries = op.list(path).await?;
64 | for entry in entries {
65 | match entry.metadata().mode() {
66 | EntryMode::FILE => {
67 | for loader in &self.loaders {
68 | let path_str = format!("{}{}", op.info().root(), entry.path());
69 | let pattern = Pattern::new(loader.0)?;
70 | if pattern.matches(&path_str) {
71 | tasks.push((path_str, loader.1.clone()));
72 | break;
73 | }
74 | }
75 | }
76 | EntryMode::DIR => {
77 | self.process_directory(entry.path(), tasks).await?;
78 | }
79 | _ => continue,
80 | }
81 | }
82 |
83 | Ok(())
84 | }
85 | }
86 |
87 | #[async_trait::async_trait]
88 | impl DocumentLoader for DirectoryLoader {
89 | async fn load(&self, path: DocumentPath) -> Result {
90 | let mut tasks: Vec<(String, Arc)> = Vec::new();
91 | self.process_directory(path.as_str()?, &mut tasks).await?;
92 |
93 | let worker_pool = ThreadPoolBuilder::new()
94 | .num_threads(self.max_threads)
95 | .build()?;
96 | let results: Vec<_> = worker_pool.install(|| {
97 | tasks
98 | .par_iter()
99 | .map(|(path, loader)| loader.load(DocumentPath::from_string(path)))
100 | .collect()
101 | });
102 |
103 | let documents = Documents::create();
104 | for result in results {
105 | let result = result.await?;
106 | documents.extend(&result);
107 | }
108 |
109 | Ok(documents)
110 | }
111 | }
112 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/directory/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod directory_loader;
16 |
17 | pub use directory_loader::DirectoryLoader;
18 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/disk/disk.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use opendal::Operator;
17 |
18 | pub trait Disk: Send + Sync {
19 | fn get_operator(&self) -> Result;
20 | }
21 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/disk/local_disk.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | //! Local file system disk.
16 | use std::sync::Arc;
17 |
18 | use anyhow::Result;
19 | use opendal::services::Fs;
20 | use opendal::Operator;
21 |
22 | use crate::Disk;
23 |
24 | pub struct LocalDisk {
25 | op: Operator,
26 | }
27 |
28 | impl LocalDisk {
29 | pub fn create() -> Result> {
30 | let mut builder = Fs::default();
31 | builder.root("/");
32 | let op = Operator::new(builder)?.finish();
33 |
34 | Ok(Arc::new(LocalDisk { op }))
35 | }
36 | }
37 |
38 | impl Disk for LocalDisk {
39 | fn get_operator(&self) -> Result {
40 | Ok(self.op.clone())
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/disk/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod local_disk;
16 |
17 | #[allow(clippy::module_inception)]
18 | mod disk;
19 | mod remote_disk;
20 |
21 | pub use disk::Disk;
22 | pub use local_disk::LocalDisk;
23 | pub use remote_disk::RemoteDisk;
24 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/disk/remote_disk.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | //! Cloud object storage as a disk.
16 | //!
17 | //! - AWS S3
18 | //! - Azure Blob
19 | //! - Google Cloud Storage
20 | //! - Cloudflare R2
21 | //! - Wasabi
22 | //! - MinIO
23 | //! - Alibaba Cloud OSS
24 | //! - Tencent Cloud COS
25 | //! - Huawei Cloud OBS
26 | use std::sync::Arc;
27 |
28 | use anyhow::Result;
29 | use opendal::Operator;
30 |
31 | use crate::Disk;
32 |
/// Placeholder disk type for remote storage; it holds no state yet.
pub struct RemoteDisk {}
34 |
35 | impl RemoteDisk {
36 | pub fn create() -> Result> {
37 | Ok(Arc::new(RemoteDisk {}))
38 | }
39 | }
40 |
41 | impl Disk for RemoteDisk {
42 | fn get_operator(&self) -> Result {
43 | todo!()
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/document.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use crate::chat_tokens;
16 |
/// A loaded document: where it came from, its text, and a digest of that text.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Document {
    // Location the document was loaded from.
    pub path: String,
    // Full text of the document.
    pub content: String,
    // Hex MD5 digest of `content`, filled in by `Document::create`.
    pub content_md5: String,
}
23 |
24 | impl Document {
25 | pub fn create(path: &str, content: &str) -> Self {
26 | Document {
27 | path: path.to_string(),
28 | content: content.to_string(),
29 | content_md5: format!("{:x}", md5::compute(content)),
30 | }
31 | }
32 |
33 | pub fn tokens(&self) -> usize {
34 | chat_tokens(&self.content).unwrap().len()
35 | }
36 |
37 | pub fn size(&self) -> usize {
38 | self.content.len()
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/document_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 |
17 | use crate::DocumentPath;
18 | use crate::Documents;
19 |
20 | #[async_trait::async_trait]
21 | pub trait DocumentLoader: Send + Sync {
22 | async fn load(&self, path: DocumentPath) -> Result;
23 | }
24 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/document_path.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 |
17 | #[derive(Debug)]
18 | pub enum DocumentPath {
19 | Str(String),
20 | List(Vec),
21 | }
22 |
23 | impl DocumentPath {
24 | pub fn as_str(&self) -> Result<&str> {
25 | match self {
26 | DocumentPath::Str(s) => Ok(s),
27 | _ => {
28 | anyhow::bail!("DocumentPath is not a string, {:?}", self)
29 | }
30 | }
31 | }
32 |
33 | pub fn from_string(s: &str) -> Self {
34 | DocumentPath::Str(s.to_string())
35 | }
36 |
37 | pub fn as_list(&self) -> Result> {
38 | match self {
39 | DocumentPath::List(list) => Ok(list.clone()),
40 | _ => {
41 | anyhow::bail!("DocumentPath is not a list, {:?}", self)
42 | }
43 | }
44 | }
45 |
46 | pub fn from_list(list: Vec) -> Self {
47 | DocumentPath::List(list)
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/document_splitter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 |
17 | use crate::Documents;
18 |
19 | pub trait DocumentSplitter {
20 | fn separators(&self) -> Vec;
21 | fn split_documents(&self, documents: &Documents) -> Result;
22 | }
23 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/documents.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use parking_lot::RwLock;
16 |
17 | use crate::Document;
18 |
19 | #[derive(Debug)]
20 | pub struct Documents {
21 | documents: RwLock>,
22 | }
23 |
24 | impl Documents {
25 | pub fn create() -> Self {
26 | Documents {
27 | documents: RwLock::new(vec![]),
28 | }
29 | }
30 |
31 | pub fn push(&self, document: Document) {
32 | self.documents.write().push(document);
33 | }
34 |
35 | pub fn extend(&self, other_docs: &Documents) {
36 | self.documents
37 | .write()
38 | .extend_from_slice(&other_docs.documents.read());
39 | }
40 |
41 | pub fn tokens(&self) -> usize {
42 | self.documents.read().iter().map(|d| d.tokens()).sum()
43 | }
44 |
45 | pub fn size(&self) -> usize {
46 | self.documents.read().iter().map(|d| d.size()).sum()
47 | }
48 |
49 | pub fn len(&self) -> usize {
50 | self.documents.read().len()
51 | }
52 |
53 | pub fn is_empty(&self) -> bool {
54 | self.documents.read().is_empty()
55 | }
56 |
57 | pub fn iter(&self) -> DocumentsIter {
58 | let guard = self.documents.read().clone();
59 | DocumentsIter {
60 | documents: guard,
61 | index: 0,
62 | }
63 | }
64 |
65 | pub fn first(&self) -> Option {
66 | let guard = self.documents.read();
67 | guard.first().cloned()
68 | }
69 | }
70 |
71 | impl FromIterator for Documents {
72 | fn from_iter>(iter: I) -> Self {
73 | Documents {
74 | documents: RwLock::new(iter.into_iter().collect()),
75 | }
76 | }
77 | }
78 |
79 | impl<'a> IntoIterator for &'a Documents {
80 | type Item = Document;
81 | type IntoIter = DocumentsIter;
82 |
83 | fn into_iter(self) -> Self::IntoIter {
84 | DocumentsIter {
85 | documents: self.documents.read().clone(),
86 | index: 0,
87 | }
88 | }
89 | }
90 |
91 | pub struct DocumentsIter {
92 | documents: Vec,
93 | index: usize,
94 | }
95 |
96 | impl Iterator for DocumentsIter {
97 | type Item = Document;
98 |
99 | fn next(&mut self) -> Option {
100 | if self.index < self.documents.len() {
101 | let result = self.documents[self.index].clone();
102 | self.index += 1;
103 | Some(result)
104 | } else {
105 | None
106 | }
107 | }
108 | }
109 |
110 | impl From> for Documents {
111 | fn from(documents: Vec) -> Self {
112 | Documents {
113 | documents: RwLock::new(documents),
114 | }
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/github/github_pr_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::sync::Arc;
16 |
17 | use anyhow::Result;
18 | use log::info;
19 | use octocrab::Octocrab;
20 |
21 | use crate::chat_tokens;
22 | use crate::Document;
23 | use crate::DocumentLoader;
24 | use crate::DocumentPath;
25 | use crate::Documents;
/// Loads pull-request diffs of one GitHub repository.
pub struct GithubPRLoader {
    /// Repository owner.
    owner: String,
    /// Repository name.
    repo: String,
    /// Personal access token handed to the GitHub client.
    person_token: String,
}

impl GithubPRLoader {
    /// Creates a shared loader for `owner/repo` that authenticates with
    /// `person_token`.
    pub fn create(owner: &str, repo: &str, person_token: &str) -> Arc<Self> {
        Arc::new(GithubPRLoader {
            owner: owner.to_string(),
            repo: repo.to_string(),
            person_token: person_token.to_string(),
        })
    }
}
41 |
42 | #[async_trait::async_trait]
43 | impl DocumentLoader for GithubPRLoader {
44 | async fn load(&self, path: DocumentPath) -> Result {
45 | let documents = Documents::create();
46 | let list = path.as_list()?;
47 | info!("Loading PRs from {:?}", list);
48 |
49 | for id in list {
50 | let now = std::time::Instant::now();
51 | let octocrab = octocrab::initialise(
52 | Octocrab::builder()
53 | .personal_token(self.person_token.clone())
54 | .build()?,
55 | );
56 | let diff = octocrab
57 | .pulls(&self.owner, &self.repo)
58 | .get_diff(id as u64)
59 | .await;
60 |
61 | let path = format!(
62 | "https://github.com/{}/{}/pull/{}",
63 | self.owner, self.repo, id
64 | );
65 | if diff.is_err() {
66 | info!("PR {} not found, error:{:?}", path, diff.err());
67 | continue;
68 | }
69 |
70 | let diff = diff?;
71 | documents.push(Document::create(&path, &diff));
72 | info!(
73 | "Loaded PR {}, diff_len {}, tokens {} in {:?}",
74 | path,
75 | diff.len(),
76 | chat_tokens(&diff).unwrap().len(),
77 | now.elapsed()
78 | );
79 | }
80 |
81 | Ok(documents)
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/github/github_pr_splitter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use glob::Pattern;
17 | use log::info;
18 | use patch::Patch;
19 |
20 | use crate::Document;
21 | use crate::DocumentSplitter;
22 | use crate::Documents;
23 |
/// Splits pull-request diffs into chunks of whole per-file patches.
pub struct GithubPRDiffSplitter {
    /// Upper bound, in bytes, used when grouping patches into one chunk.
    pub splitter_chunk_size: usize,
    /// Glob patterns; a patch whose old or new path matches one is dropped.
    skips: Vec<String>,
}

impl GithubPRDiffSplitter {
    /// Creates a splitter with a chunk size of 2000 and no skip patterns.
    pub fn create() -> Self {
        GithubPRDiffSplitter {
            splitter_chunk_size: 2000,
            skips: vec![],
        }
    }

    /// Replaces the chunk size.
    pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
        self.splitter_chunk_size = chunk_size;
        self
    }

    /// Replaces the list of glob patterns for files to skip.
    pub fn with_skips(mut self, skips: Vec<String>) -> Self {
        self.skips = skips;
        self
    }
}
47 |
48 | impl DocumentSplitter for GithubPRDiffSplitter {
49 | fn separators(&self) -> Vec {
50 | vec![]
51 | }
52 |
53 | fn split_documents(&self, documents: &Documents) -> Result {
54 | let diff_documents = Documents::create();
55 | let mut acc_patch_str = String::new();
56 | let mut last_document_path = String::new();
57 |
58 | for document in documents {
59 | let content = Box::leak(document.content.clone().into_boxed_str());
60 | let patches = Patch::from_multiple(content)?;
61 | last_document_path = document.path.clone();
62 |
63 | for patch in patches {
64 | let mut need_skip = false;
65 | for skip in &self.skips {
66 | let pattern = Pattern::new(skip)?;
67 | if pattern.matches(&patch.old.path) || pattern.matches(&patch.new.path) {
68 | info!("Skip diff file: old:{}, new:{}", patch.old, patch.new);
69 | need_skip = true;
70 | break;
71 | }
72 | }
73 |
74 | // Skip deleted files.
75 | if patch.new.path == "/dev/null" {
76 | continue;
77 | }
78 |
79 | if !need_skip {
80 | let patch_str = format!("{}", patch);
81 |
82 | if acc_patch_str.len() + patch_str.len() <= self.splitter_chunk_size {
83 | acc_patch_str.push('\n');
84 | acc_patch_str.push_str(&patch_str);
85 | } else {
86 | if !acc_patch_str.is_empty() {
87 | diff_documents.push(Document::create(&document.path, &acc_patch_str));
88 | }
89 | acc_patch_str = patch_str;
90 | }
91 | }
92 | }
93 | }
94 |
95 | if !acc_patch_str.is_empty() {
96 | diff_documents.push(Document::create(&last_document_path, &acc_patch_str));
97 | }
98 | info!(
99 | "Split {} documents into {} diff documents",
100 | documents.len(),
101 | diff_documents.len()
102 | );
103 |
104 | Ok(diff_documents)
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/github/github_repo_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::sync::Arc;
16 |
17 | use anyhow::Result;
18 | use git2::Repository;
19 | use log::info;
20 |
21 | use crate::DirectoryLoader;
22 | use crate::Disk;
23 | use crate::DocumentLoader;
24 | use crate::DocumentPath;
25 | use crate::Documents;
26 | use crate::LocalDisk;
27 | use crate::MarkdownLoader;
28 | use crate::TextLoader;
29 |
/// Loads the `.rs` and `.md` files of a git repository as documents.
pub struct GithubRepoLoader {}

impl GithubRepoLoader {
    /// Creates a shared loader; it carries no configuration.
    pub fn create() -> Arc<Self> {
        Arc::new(GithubRepoLoader {})
    }
}
37 |
38 | #[async_trait::async_trait]
39 | impl DocumentLoader for GithubRepoLoader {
40 | async fn load(&self, path: DocumentPath) -> Result {
41 | let repo_url = path.as_str()?;
42 | let local_path = format!("/tmp/{}/", uuid::Uuid::new_v4());
43 | let local_disk = LocalDisk::create()?;
44 |
45 | {
46 | local_disk.get_operator()?.remove_all(&local_path).await?;
47 | info!("remove {}", local_path);
48 | }
49 |
50 | {
51 | info!("Cloning {} to {}", repo_url, local_path);
52 | let _ = Repository::clone(repo_url, &local_path)?;
53 | }
54 |
55 | let directory = DirectoryLoader::create(local_disk.clone())
56 | .with_loader("**/*.rs", TextLoader::create(local_disk.clone()))
57 | .with_loader("**/*.md", MarkdownLoader::create(local_disk.clone()));
58 |
59 | let result = directory
60 | .load(DocumentPath::Str(local_path.clone()))
61 | .await?;
62 | info!("DirectoryLoader result: {:?}", result.len());
63 |
64 | let result = result
65 | .iter()
66 | .map(|x| {
67 | let mut x = x;
68 | x.path = x.path.replace(&local_path, repo_url);
69 | x
70 | })
71 | .collect::>();
72 |
73 | {
74 | local_disk.get_operator()?.remove_all(&local_path).await?;
75 | info!("remove {}", local_path);
76 | }
77 |
78 | Ok(Documents::from(result))
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/github/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod github_pr_loader;
16 | mod github_pr_splitter;
17 | mod github_repo_loader;
18 |
19 | pub use github_pr_loader::GithubPRLoader;
20 | pub use github_pr_splitter::GithubPRDiffSplitter;
21 | pub use github_repo_loader::GithubRepoLoader;
22 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/markdown/markdown_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::sync::Arc;
16 |
17 | use anyhow::Result;
18 |
19 | use crate::Disk;
20 | use crate::DocumentLoader;
21 | use crate::DocumentPath;
22 | use crate::Documents;
23 | use crate::TextLoader;
24 |
25 | pub struct MarkdownLoader {
26 | disk: Arc,
27 | }
28 |
29 | impl MarkdownLoader {
30 | pub fn create(disk: Arc) -> Arc {
31 | Arc::new(MarkdownLoader { disk })
32 | }
33 | }
34 |
35 | #[async_trait::async_trait]
36 | impl DocumentLoader for MarkdownLoader {
37 | async fn load(&self, path: DocumentPath) -> Result {
38 | let text_loader = TextLoader::create(self.disk.clone());
39 | text_loader.load(path).await
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/markdown/markdown_splitter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 |
17 | use crate::DocumentSplitter;
18 | use crate::Documents;
19 | use crate::TextSplitter;
20 |
/// Splits markdown documents at heading boundaries.
pub struct MarkdownSplitter {
    /// Chunk size handed to the underlying text splitter.
    pub splitter_chunk_size: usize,
    /// Strings the text is split on.
    pub separators: Vec<String>,
}

impl MarkdownSplitter {
    /// Creates a splitter with chunk size 400 that splits on level-2 to
    /// level-6 headings (`\n## ` through `\n###### `).
    pub fn create() -> Self {
        MarkdownSplitter {
            splitter_chunk_size: 400,
            separators: vec![
                "\n## ".to_string(),
                "\n### ".to_string(),
                "\n#### ".to_string(),
                "\n##### ".to_string(),
                "\n###### ".to_string(),
            ],
        }
    }

    /// Replaces the chunk size.
    pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
        self.splitter_chunk_size = chunk_size;
        self
    }

    /// Replaces the separators.
    pub fn with_separators(mut self, separators: Vec<String>) -> Self {
        self.separators = separators;
        self
    }
}
50 |
51 | impl DocumentSplitter for MarkdownSplitter {
52 | fn separators(&self) -> Vec {
53 | self.separators.clone()
54 | }
55 |
56 | fn split_documents(&self, documents: &Documents) -> Result {
57 | let text_splitter = TextSplitter::create()
58 | .with_chunk_size(self.splitter_chunk_size)
59 | .with_separators(self.separators());
60 | text_splitter.split_documents(documents)
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/markdown/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod markdown_loader;
16 | mod markdown_splitter;
17 |
18 | pub use markdown_loader::MarkdownLoader;
19 | pub use markdown_splitter::MarkdownSplitter;
20 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod directory;
16 | mod disk;
17 | mod document;
18 | mod document_loader;
19 | mod document_path;
20 | mod document_splitter;
21 | mod documents;
22 | mod github;
23 | mod markdown;
24 | mod text;
25 |
26 | pub use directory::DirectoryLoader;
27 | pub use disk::Disk;
28 | pub use disk::LocalDisk;
29 | pub use disk::RemoteDisk;
30 | pub use document::Document;
31 | pub use document_loader::DocumentLoader;
32 | pub use document_path::DocumentPath;
33 | pub use document_splitter::DocumentSplitter;
34 | pub use documents::Documents;
35 | pub use github::GithubPRDiffSplitter;
36 | pub use github::GithubPRLoader;
37 | pub use github::GithubRepoLoader;
38 | pub use markdown::MarkdownLoader;
39 | pub use markdown::MarkdownSplitter;
40 | pub use text::TextLoader;
41 | pub use text::TextSplitter;
42 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/text/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod text_loader;
16 | mod text_splitter;
17 |
18 | pub use text_loader::TextLoader;
19 | pub use text_splitter::TextSplitter;
20 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/text/text_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::sync::Arc;
16 |
17 | use anyhow::Result;
18 |
19 | use crate::Disk;
20 | use crate::Document;
21 | use crate::DocumentLoader;
22 | use crate::DocumentPath;
23 | use crate::Documents;
24 |
25 | pub struct TextLoader {
26 | disk: Arc,
27 | }
28 |
29 | impl TextLoader {
30 | pub fn create(disk: Arc) -> Arc {
31 | Arc::new(TextLoader { disk })
32 | }
33 | }
34 |
35 | #[async_trait::async_trait]
36 | impl DocumentLoader for TextLoader {
37 | async fn load(&self, path: DocumentPath) -> Result {
38 | let bs = self.disk.get_operator()?.read(path.as_str()?).await?;
39 | let content = String::from_utf8_lossy(&bs).to_string();
40 | let documents = Documents::create();
41 | documents.push(Document::create(path.as_str()?, &content));
42 |
43 | Ok(documents)
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/llmchain/src/loaders/text/text_splitter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use regex::Regex;
17 |
18 | use crate::Document;
19 | use crate::DocumentSplitter;
20 | use crate::Documents;
21 |
22 | pub struct TextSplitter {
23 | pub splitter_chunk_size: usize,
24 | separators: Vec,
25 | }
26 |
27 | impl TextSplitter {
28 | pub fn create() -> Self {
29 | TextSplitter {
30 | splitter_chunk_size: 400,
31 | separators: vec!["\n".to_string()],
32 | }
33 | }
34 |
35 | pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
36 | self.splitter_chunk_size = chunk_size;
37 | self
38 | }
39 |
40 | pub fn with_separators(mut self, separators: Vec) -> Self {
41 | self.separators = separators;
42 | self
43 | }
44 |
45 | fn split_text(&self, text: &str) -> Result> {
46 | // Splits.
47 | let separators = self.separators();
48 | let separator_pattern = separators
49 | .iter()
50 | .map(|separator| regex::escape(separator))
51 | .collect::>()
52 | .join("|");
53 | let separator_regex = Regex::new(&separator_pattern)?;
54 |
55 | let mut parts = Vec::new();
56 | let mut last_end = 0;
57 | for cap in separator_regex.find_iter(text) {
58 | let part = &text[last_end..cap.start()];
59 | last_end = cap.end();
60 | parts.push(part.to_string());
61 | }
62 | parts.push(text[last_end..].to_string());
63 |
64 | // Merge.
65 | let mut docs = Vec::new();
66 | let mut current_chunk = String::new();
67 | for part in &parts {
68 | if current_chunk.len() > self.splitter_chunk_size {
69 | docs.push(current_chunk.clone());
70 | current_chunk.clear();
71 | } else if current_chunk.len() + part.len() >= self.splitter_chunk_size {
72 | current_chunk.push(' ');
73 | current_chunk.push_str(part);
74 | docs.push(current_chunk.clone());
75 | current_chunk.clear();
76 | } else {
77 | if !current_chunk.is_empty() {
78 | current_chunk.push(' ');
79 | }
80 | current_chunk.push_str(part);
81 | }
82 | }
83 |
84 | if !current_chunk.is_empty() {
85 | docs.push(current_chunk);
86 | }
87 |
88 | Ok(docs)
89 | }
90 | }
91 |
92 | impl DocumentSplitter for TextSplitter {
93 | fn separators(&self) -> Vec {
94 | self.separators.clone()
95 | }
96 |
97 | fn split_documents(&self, documents: &Documents) -> Result {
98 | let result = Documents::create();
99 |
100 | for document in documents {
101 | let chunks = self.split_text(&document.content)?;
102 |
103 | for chunk in chunks {
104 | result.push(Document::create(&document.path, &chunk))
105 | }
106 | }
107 | Ok(result)
108 | }
109 | }
110 |
--------------------------------------------------------------------------------
/llmchain/src/memory/github_pr_summary.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::collections::HashMap;
16 | use std::sync::Arc;
17 |
18 | use anyhow::Result;
19 | use log::info;
20 | use parking_lot::RwLock;
21 |
22 | use crate::chat_tokens;
23 | use crate::Documents;
24 | use crate::GithubPRSummaryPrompt;
25 | use crate::Prompt;
26 | use crate::PromptTemplate;
27 | use crate::Summarize;
28 | use crate::LLM;
29 |
30 | pub struct GithubPRSummary {
31 | tokens: RwLock,
32 | llm: Arc,
33 | summaries: RwLock>,
34 | }
35 | impl GithubPRSummary {
36 | pub fn create(llm: Arc) -> Arc {
37 | Arc::new(Self {
38 | tokens: Default::default(),
39 | llm,
40 | summaries: RwLock::new(Vec::new()),
41 | })
42 | }
43 | }
44 |
45 | #[async_trait::async_trait]
46 | impl Summarize for GithubPRSummary {
47 | async fn add_documents(&self, documents: &Documents) -> Result<()> {
48 | for (i, document) in documents.iter().enumerate() {
49 | let template =
50 |
51 | "
52 | Please explain the code diff group by the file name in bullet points.
53 | If the file is added, prefix `ADD`, if the file is deleted, prefix `DELETE`, if the file is changed, prefix `CHANGE`.
54 | Please use the following format:
55 | [ADD/DELETE/CHANGE] file-name
56 | - bullet point 1
57 | - bullet point 2
58 | ... ...
59 | --------
60 |
61 | ```diff
62 | {text}
63 | ```
64 | ";
65 | let prompt_template = PromptTemplate::create(template, vec!["text".to_string()]);
66 | let mut input_variables = HashMap::new();
67 | input_variables.insert("text", document.content.as_str());
68 | let prompt = prompt_template.format(input_variables)?;
69 |
70 | let tokens = chat_tokens(&prompt)?;
71 | *self.tokens.write() += tokens.len();
72 |
73 | let summary = self.llm.generate(&prompt).await?;
74 | info!(
75 | "summary [{}/{}, tokens {}]: \n{}",
76 | i + 1,
77 | documents.len(),
78 | tokens.len(),
79 | summary.generation
80 | );
81 | self.summaries.write().push(summary.generation);
82 | }
83 |
84 | Ok(())
85 | }
86 |
87 | async fn final_summary(&self) -> Result {
88 | if self.summaries.read().is_empty() {
89 | return Ok("".to_string());
90 | }
91 |
92 | let mut input_variables = HashMap::new();
93 | let text = self.summaries.read().join("\n");
94 | input_variables.insert("text", text.as_str());
95 |
96 | let prompt_template = GithubPRSummaryPrompt::create();
97 | let prompt = prompt_template.format(input_variables)?;
98 |
99 | let tokens = chat_tokens(&prompt)?;
100 | *self.tokens.write() += tokens.len();
101 | info!("prompt: tokens {}, result\n{}", tokens.len(), prompt);
102 |
103 | let summary = self.llm.generate(&prompt).await?;
104 | info!("final summary: {}", summary.generation);
105 |
106 | Ok(summary.generation)
107 | }
108 |
109 | fn tokens(&self) -> usize {
110 | *self.tokens.read()
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
/llmchain/src/memory/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod github_pr_summary;
16 | mod summary;
17 |
18 | pub use github_pr_summary::GithubPRSummary;
19 | pub use summary::Summarize;
20 |
--------------------------------------------------------------------------------
/llmchain/src/memory/summary.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 |
17 | use crate::Documents;
18 |
19 | #[async_trait::async_trait]
20 | pub trait Summarize: Send + Sync { // Incremental summarizer: feed documents, then request one final summary.
21 | async fn add_documents(&self, documents: &Documents) -> Result<()>; // Take in a batch of documents to be summarized.
22 | async fn final_summary(&self) -> Result; // Produce the overall summary; NOTE(review): the success type is not visible in this listing.
23 | fn tokens(&self) -> usize; // Token count reported by the implementation.
24 | }
25 |
--------------------------------------------------------------------------------
/llmchain/src/prompts/document_retrieval_prompt.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::collections::HashMap;
16 |
17 | use anyhow::Result;
18 | use parking_lot::RwLock;
19 |
20 | use crate::Prompt;
21 | use crate::PromptTemplate;
22 |
23 | pub struct DocumentRetrievalPrompt { // Question-answering prompt over document contexts, with optional extra instructions.
24 | instructions: RwLock>, // Instruction lines that `format` splices into the `{instructions}` slot of the template.
25 | }
26 |
27 | impl DocumentRetrievalPrompt {
28 |     /// Build a prompt with no extra instructions.
29 |     pub fn create() -> Self {
30 |         Self { instructions: RwLock::new(vec![]) }
31 |     }
32 |     /// Append `instructions` to those already registered and hand the prompt back.
33 |     pub fn with_instructions(self, instructions: Vec<&str>) -> Self {
34 |         let mut registered = self.instructions.write();
35 |         registered.extend(instructions.into_iter().map(str::to_string));
36 |         drop(registered); // release the lock guard before moving `self` out
37 |         self
38 |     }
39 | }
40 |
41 | impl Prompt for DocumentRetrievalPrompt {
42 |     // https://github.com/jerryjliu/llama_index/blob/main/llama_index/prompts/default_prompts.py
43 |     // https://github.com/hwchase17/langchain/blob/master/langchain/chains/qa_with_sources/stuff_prompt.py
44 |     fn template(&self) -> String {
45 |         // Contexts format as repeated pairs:
46 |         //   Content: xx...
47 |         //   Source: 0-pl
48 |         // This is a raw string, so backslashes are literal text: the quotes around
49 |         // SOURCES must be written bare or the prompt would contain `\"`.
50 |         r#"Given the following contexts of a long document and a question, create a final answer with references ("SOURCES").
51 | If you don't know the answer, just say that you don't know. Don't try to make up an answer.
52 | please follow these instructions
53 | {instructions}
54 | =========
55 | {contexts}
56 | =========
57 | QUESTION: {question}
58 | FINAL ANSWER:"#.to_string()
59 |     }
60 |     /// Variables callers may pass to `format`; `{instructions}` is filled internally.
61 |     fn variables(&self) -> Vec {
62 |         vec!["contexts".to_string(), "question".to_string()]
63 |     }
64 |     /// Splice the stored instructions into the template, then substitute `input_variables`.
65 |     fn format(&self, input_variables: HashMap<&str, &str>) -> Result {
66 |         // Instructions go in first because they are not one of the declared variables.
67 |         let instructions = self.instructions.read().join(" \n");
68 |         let prompt_template = self.template().replace("{instructions}", &instructions);
69 |
70 |         let prompt_template = PromptTemplate::create(&prompt_template, self.variables());
71 |         prompt_template.format(input_variables)
72 |     }
73 | }
74 |
--------------------------------------------------------------------------------
/llmchain/src/prompts/github_pr_summary_prompt.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::collections::HashMap;
16 |
17 | use anyhow::Result;
18 |
19 | use crate::Prompt;
20 | use crate::PromptTemplate;
21 |
22 | /// Stateless prompt that asks for a summary of the changes in a pull request.
23 | pub struct GithubPRSummaryPrompt {}
24 |
25 | impl GithubPRSummaryPrompt {
26 |     /// Construct the prompt; it carries no state.
27 |     pub fn create() -> Self { Self {} }
28 | }
29 |
30 | impl Prompt for GithubPRSummaryPrompt {
31 |     fn template(&self) -> String { // Summarization guidelines followed by the `{text}` placeholder.
32 |         String::from(r#"You are an expert programmer summarizing code changes, please provide a clear and concise summary of the main changes made in a pull request. Focus on the motivation behind the changes and avoid describing specific file modifications. Follow these guidelines while summarizing:
33 | 1. Ignore changes that you think are not important.
34 | 2. Summarize and classify all changelogs into 1 to 5 points.
35 | 3. Remove the similar points.
36 | 4. Summarize a title for each point, format is `* **Title**`, describing what the point mainly did, as a new title for the pull request changelog, no more than 30 words.
37 | 5. Make an understandable summary for each point with in 50 words, mainly for the background of this change.
38 | --------
39 | {text}"#)
40 |     }
41 |     /// The template's only placeholder is `text`.
42 |     fn variables(&self) -> Vec {
43 |         vec![String::from("text")]
44 |     }
45 |     /// Delegate substitution to a `PromptTemplate` built from this template and its variables.
46 |     fn format(&self, input_variables: HashMap<&str, &str>) -> Result {
47 |         let template = self.template();
48 |         PromptTemplate::create(&template, self.variables()).format(input_variables)
49 |     }
50 | }
51 |
--------------------------------------------------------------------------------
/llmchain/src/prompts/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod document_retrieval_prompt;
16 | mod github_pr_summary_prompt;
17 | mod prompt;
18 | mod text_to_sql_prompt;
19 |
20 | pub use document_retrieval_prompt::DocumentRetrievalPrompt;
21 | pub use github_pr_summary_prompt::GithubPRSummaryPrompt;
22 | pub use prompt::Prompt;
23 | pub use prompt::PromptTemplate;
24 | pub use text_to_sql_prompt::TextToSQLPrompt;
25 |
--------------------------------------------------------------------------------
/llmchain/src/prompts/prompt.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::collections::HashMap;
16 | use std::sync::Arc;
17 |
18 | use anyhow::anyhow;
19 | use anyhow::Result;
20 |
21 | pub trait Prompt: Send + Sync { // A prompt: template text plus the variable names it accepts.
22 | fn template(&self) -> String; // Template text; the implementations in this crate mark placeholders as `{name}`.
23 | fn variables(&self) -> Vec; // Names accepted as keys by `format`.
24 | fn format(&self, input_variables: HashMap<&str, &str>) -> Result; // Render the template with the supplied name/value pairs.
25 | }
26 |
27 | /// Generic prompt: template text with `{name}` placeholders plus the accepted names.
28 | pub struct PromptTemplate {
29 |     template: String,
30 |     variables: Vec,
31 | }
32 | impl PromptTemplate {
33 |     /// Wrap an owned copy of `template` together with `variables` in an `Arc`.
34 |     pub fn create(template: &str, variables: Vec) -> Arc {
35 |         let template = template.to_owned();
36 |         let inner = Self { template, variables };
37 |         Arc::new(inner)
38 |     }
39 | }
40 |
41 | impl Prompt for PromptTemplate {
42 |     fn template(&self) -> String {
43 |         self.template.clone()
44 |     }
45 |
46 |     fn variables(&self) -> Vec {
47 |         self.variables.clone()
48 |     }
49 |     /// Replace every `{key}` in the template with its value, in a single pass.
50 |     /// Errors when a key is not a declared variable; placeholders without a
51 |     /// value are left as they are. Substituted values are never rescanned, so
52 |     /// the output does not depend on `HashMap` iteration order.
53 |     fn format(&self, input_variables: HashMap<&str, &str>) -> Result {
54 |         if let Some(key) = input_variables.keys().find(|k| !self.variables.iter().any(|v| v == *k)) {
55 |             return Err(anyhow!("input variable: '{}' is not in the variables: {:?}", key, self.variables));
56 |         }
57 |         let (mut prompt, mut rest) = (String::new(), self.template.as_str());
58 |         while let Some(open) = rest.find('{') {
59 |             let (head, tail) = rest.split_at(open);
60 |             prompt.push_str(head);
61 |             // `tail` starts at '{'; look for a key immediately followed by '}'.
62 |             let hit = input_variables.iter().find(|(k, _)| tail[1..].strip_prefix(**k).map_or(false, |t| t.starts_with('}')));
63 |             // No match: emit the brace itself and keep scanning right after it.
64 |             let (text, used) = hit.map_or(("{", 1), |(k, v)| (*v, k.len() + 2));
65 |             prompt.push_str(text);
66 |             rest = &tail[used..];
67 |         }
68 |         Ok(prompt + rest)
69 |     }
70 | }
70 |
--------------------------------------------------------------------------------
/llmchain/src/prompts/text_to_sql_prompt.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::collections::HashMap;
16 |
17 | use anyhow::Result;
18 |
19 | use crate::Prompt;
20 | use crate::PromptTemplate;
21 |
22 | /// Stateless prompt that asks the model to write a SQL query answering a question.
23 | pub struct TextToSQLPrompt {}
24 |
25 | impl TextToSQLPrompt {
26 |     /// Construct the prompt; it carries no state.
27 |     pub fn create() -> Self { Self {} }
28 | }
29 |
30 | impl Prompt for TextToSQLPrompt {
31 |     fn template(&self) -> String { // Fragments are concatenated with no separator between them.
32 |         let fragments = [
33 |             "Given an input question, first create a syntactically correct {dialect} ",
34 |             "query to run, then look at the results of the query and return the answer. ",
35 |             "You can order the results by a relevant column to return the most ",
36 |             "interesting examples in the database.\n",
37 |             "Never query for all the columns from a specific table, only ask for a the ",
38 |             "few relevant columns given the question.\n",
39 |             "Pay attention to use only the column names that you can see in the schema ",
40 |             "description. ",
41 |             "Be careful to not query for columns that do not exist. ",
42 |             "Pay attention to which column is in which table. ",
43 |             "Also, qualify column names with the table name when needed.\n",
44 |             "Use the following format:\n",
45 |             "Question: Question here\n",
46 |             "SQLQuery: SQL Query to run\n",
47 |             "SQLResult: Result of the SQLQuery\n",
48 |             "Answer: Final answer here\n",
49 |             "Only use the tables listed below.\n",
50 |             "{schema}\n",
51 |             "Question: {query_str}\n",
52 |             "SQLQuery: ",
53 |         ];
54 |         fragments.concat()
55 |     }
56 |     /// Placeholders used by the template: `dialect`, `schema` and `query_str`.
57 |     fn variables(&self) -> Vec {
58 |         ["dialect", "schema", "query_str"]
59 |             .iter()
60 |             .map(|name| name.to_string())
61 |             .collect()
62 |     }
63 |
64 |     /// Delegate substitution to a `PromptTemplate` built from this template and its variables.
65 |     fn format(&self, input_variables: HashMap<&str, &str>) -> Result {
66 |         let template = self.template();
67 |         PromptTemplate::create(&template, self.variables()).format(input_variables)
68 |     }
69 | }
70 |
--------------------------------------------------------------------------------
/llmchain/src/vector_stores/databend/databend.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::sync::Arc;
16 |
17 | use anyhow::anyhow;
18 | use anyhow::Result;
19 | use databend_driver::Client;
20 | use futures::StreamExt;
21 | use log::info;
22 | use uuid::Uuid;
23 |
24 | use crate::escape_sql_string;
25 | use crate::Document;
26 | use crate::Documents;
27 | use crate::Embedding;
28 | use crate::VectorStore;
29 |
30 | pub struct DatabendVectorStore { // Vector store that keeps documents and their embeddings in a Databend table.
31 | client: Client, // Databend client built from the DSN given to `create`.
32 | database: String, // Target database; `create` defaults it to "embedding_store".
33 | table: String, // Target table; `create` defaults it to "llmchain_collection".
34 | embedding: Arc, // Embedding backend used for both stored documents and search queries.
35 | min_similarity: f32, // Lower bound applied in `similarity_search`; `create` defaults it to 0.5.
36 | }
37 |
38 | impl DatabendVectorStore {
39 |     pub fn create(dsn: &str, embedding: Arc) -> Self { // Defaults below can be changed with the `with_*` builders.
40 |         Self {
41 |             client: Client::new(dsn.to_owned()),
42 |             database: String::from("embedding_store"),
43 |             table: String::from("llmchain_collection"),
44 |             embedding,
45 |             min_similarity: 0.5,
46 |         }
47 |     }
48 |     /// Override the database name.
49 |     pub fn with_database(self, database: &str) -> Self {
50 |         let database = database.to_owned();
51 |         Self { database, ..self }
52 |     }
53 |     /// Override the table name.
54 |     pub fn with_table(self, table: &str) -> Self {
55 |         let table = table.to_owned();
56 |         Self { table, ..self }
57 |     }
58 |     /// Override the minimum similarity a search hit must exceed.
59 |     pub fn with_min_similarity(self, similarity: f32) -> Self {
60 |         let min_similarity = similarity;
61 |         Self { min_similarity, ..self }
62 |     }
63 | }
64 |
65 | #[async_trait::async_trait]
66 | impl VectorStore for DatabendVectorStore {
67 |     async fn init(&self) -> Result<()> { // Create the database and the table if they are missing.
68 |         let conn = self.client.get_conn().await?;
69 |
70 |         let database_create_sql = format!("CREATE DATABASE IF NOT EXISTS {}", self.database);
71 |         conn.exec(&database_create_sql).await?;
72 |
73 |         let table_create_sql = format!(
74 |             "CREATE TABLE IF NOT EXISTS {}.{} \
75 |             (uuid VARCHAR, path VARCHAR, content VARCHAR, content_md5 VARCHAR, embedding ARRAY(float32))",
76 |             self.database, self.table
77 |         );
78 |         conn.exec(&table_create_sql).await?;
79 |
80 |         Ok(())
81 |     }
82 |     /// Embed `inputs`, insert one row per document and return the generated UUIDs.
83 |     async fn add_documents(&self, inputs: &Documents) -> Result> {
84 |         // One fresh UUID per input document, in input order.
85 |         let uuids: Vec<String> = inputs.iter().map(|_| Uuid::new_v4().to_string()).collect();
86 |         if uuids.is_empty() {
87 |             // `INSERT ... VALUES` with no rows is invalid SQL; there is nothing to store.
88 |             return Ok(uuids);
89 |         }
90 |
91 |         let embeddings = self.embedding.embed_documents(inputs).await?;
92 |         if embeddings.len() != uuids.len() {
93 |             // Guard the pairing below instead of panicking or silently truncating.
94 |             return Err(anyhow!("expected {} embeddings, got {}", uuids.len(), embeddings.len()));
95 |         }
96 |         let rows: Vec<String> = inputs.iter().zip(uuids.iter().zip(&embeddings))
97 |             .map(|(doc, (uuid, embedding))| format!(
98 |                 "('{}', '{}', '{}', '{}', {:?})",
99 |                 uuid,
100 |                 escape_sql_string(&doc.path),
101 |                 escape_sql_string(&doc.content),
102 |                 escape_sql_string(&doc.content_md5),
103 |                 embedding
104 |             ))
105 |             .collect();
106 |         let final_sql = format!(
107 |             "INSERT INTO {}.{} (uuid, path, content, content_md5, embedding) VALUES {}",
108 |             self.database, self.table, rows.join(",")
109 |         );
110 |         self.client.get_conn().await?.exec(&final_sql).await?;
111 |         Ok(uuids)
112 |     }
113 |     /// Return at most `k` documents whose similarity to `query` exceeds `min_similarity`.
114 |     async fn similarity_search(&self, query: &str, k: usize) -> Result> {
115 |         let query_embedding = self.embedding.embed_query(query).await?;
116 |
117 |         let sql = format!(
118 |             "SELECT path, content, content_md5, (1- cosine_distance({:?}, embedding)) AS similarity FROM {}.{} \
119 |             WHERE length(embedding) > 0 AND length(content) > 0 AND similarity > {} ORDER BY similarity DESC LIMIT {}",
120 |             query_embedding, self.database, self.table, self.min_similarity, k
121 |         );
122 |
123 |         info!("similarity_search from {}.{}", self.database, self.table);
124 |
125 |         let mut documents = vec![];
126 |         type RowResult = (String, String, String, f32);
127 |         let conn = self.client.get_conn().await?;
128 |         let mut rows = conn.query_iter(&sql).await?;
129 |         while let Some(row) = rows.next().await {
130 |             let row: RowResult = row?.try_into().map_err(|e: String| anyhow!(e))?;
131 |
132 |             info!("document: {:?}", row);
133 |             // The fourth column (similarity) is only logged; it is not part of `Document`.
134 |             documents.push(Document {
135 |                 path: row.0,
136 |                 content: row.1,
137 |                 content_md5: row.2,
138 |             });
139 |         }
140 |         info!("Found {} documents", documents.len());
141 |
142 |         Ok(documents)
143 |     }
144 | }
145 |
--------------------------------------------------------------------------------
/llmchain/src/vector_stores/databend/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod databend;
17 |
18 | pub use databend::DatabendVectorStore;
19 |
--------------------------------------------------------------------------------
/llmchain/src/vector_stores/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod databend;
16 | mod vector_store;
17 |
18 | pub use databend::DatabendVectorStore;
19 | pub use vector_store::VectorStore;
20 |
--------------------------------------------------------------------------------
/llmchain/src/vector_stores/vector_store.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 |
17 | use crate::Document;
18 | use crate::Documents;
19 |
20 | #[async_trait::async_trait]
21 | pub trait VectorStore: Send + Sync { // Storage backend for documents that can be searched by similarity.
22 | async fn init(&self) -> Result<()>; // Prepare the backing store before first use.
23 | async fn add_documents(&self, inputs: &Documents) -> Result>; // Store `inputs`; the Databend implementation returns one UUID per document.
24 | async fn similarity_search(&self, query: &str, k: usize) -> Result>; // Documents similar to `query`; the Databend implementation caps the result count at `k`.
25 | }
26 |
--------------------------------------------------------------------------------
/llmchain/tests/it/common/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod string;
16 | mod token;
17 |
--------------------------------------------------------------------------------
/llmchain/tests/it/common/string.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use llmchain::escape_sql_string;
16 |
17 | #[test]
18 | fn test_escape_sql_string() {
19 |     // Each case pairs an input with the exact escaped output expected for it.
20 |     let cases = [
21 |         // No special characters: unchanged.
22 |         ("Hello, World!", "Hello, World!"),
23 |         // Each single quote is doubled.
24 |         ("Hello, 'World'!", "Hello, ''World''!"),
25 |         // A line feed is removed.
26 |         ("Hello, 'World'! \n", "Hello, ''World''! "),
27 |         // A carriage return becomes a backslash followed by `r`.
28 |         ("Hello, 'World'! \r", "Hello, ''World''! \\r"),
29 |     ];
30 |
31 |     for &(input, expected) in cases.iter() {
32 |         assert_eq!(escape_sql_string(input), expected);
33 |     }
34 | }
35 |
--------------------------------------------------------------------------------
/llmchain/tests/it/common/token.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use llmchain::chat_tokens;
16 |
17 | #[test]
18 | fn test_token() {
19 |     // Mixed emoji + ASCII sentence; the tokenizer is expected to yield 17 tokens.
20 |     let token_count = chat_tokens("🍌This is a sentence with spaces, hahhahah haha ha").unwrap().len();
21 |     assert_eq!(token_count, 17);
22 | }
23 |
--------------------------------------------------------------------------------
/llmchain/tests/it/embeddings/databend/databend.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use llmchain::DatabendEmbedding;
17 | use llmchain::Document;
18 | use llmchain::Documents;
19 | use llmchain::Embedding;
20 |
21 | #[tokio::test]
22 | async fn test_embedding_databend() -> Result<()> {
23 |     let dsn = std::env::var("DATABEND_DSN").expect("DATABEND_DSN is not set");
24 |
25 |     // A single query string embeds to one 1536-dimensional vector.
26 |     {
27 |         let embedder = DatabendEmbedding::create(&dsn);
28 |         let vector = embedder.embed_query("hello").await?;
29 |         assert_eq!(vector.len(), 1536);
30 |     }
31 |
32 |     // Two documents embed to two vectors.
33 |     {
34 |         let embedder = DatabendEmbedding::create(&dsn);
35 |         let batch = Documents::from(vec![
36 |             Document::create("", "hello"),
37 |             Document::create("", "llmchain.rs"),
38 |         ]);
39 |         let vectors = embedder.embed_documents(&batch).await?;
40 |         // Only the first vector's width is checked.
41 |         assert_eq!(vectors.len(), 2);
42 |         assert_eq!(vectors[0].len(), 1536);
43 |     }
44 |
45 |     Ok(())
46 | }
47 |
--------------------------------------------------------------------------------
/llmchain/tests/it/embeddings/databend/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod databend;
17 |
--------------------------------------------------------------------------------
/llmchain/tests/it/embeddings/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod databend;
16 | mod openai;
17 |
--------------------------------------------------------------------------------
/llmchain/tests/it/embeddings/openai/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod openai;
17 |
--------------------------------------------------------------------------------
/llmchain/tests/it/embeddings/openai/openai.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use llmchain::Document;
17 | use llmchain::Documents;
18 | use llmchain::Embedding;
19 | use llmchain::OpenAIBuilder;
20 | use llmchain::OpenAIEmbedding;
21 |
22 | #[ignore]
23 | #[tokio::test]
24 | async fn test_embedding_openai() -> Result<()> {
25 |     let api_key = std::env::var("OPENAI_API_KEY").unwrap_or_default();
26 |
27 |     // A single query string embeds to one 1536-dimensional vector.
28 |     {
29 |         let llm = OpenAIBuilder::default().api_key(api_key.clone()).build()?;
30 |         let embedder = OpenAIEmbedding::create(llm);
31 |         let vector = embedder.embed_query("hello").await?;
32 |         assert_eq!(vector.len(), 1536);
33 |     }
34 |
35 |     // Two documents embed to two vectors; only the first vector's width is checked.
36 |     {
37 |         let llm = OpenAIBuilder::default().api_key(api_key.clone()).build()?;
38 |         let embedder = OpenAIEmbedding::create(llm);
39 |         let batch = Documents::from(vec![
40 |             Document::create("", "hello"),
41 |             Document::create("", "llmchain.rs"),
42 |         ]);
43 |         let vectors = embedder.embed_documents(&batch).await?;
44 |         assert_eq!(vectors.len(), 2);
45 |         assert_eq!(vectors[0].len(), 1536);
46 |     }
47 |
48 |     Ok(())
49 | }
50 |
--------------------------------------------------------------------------------
/llmchain/tests/it/llms/azure_openai/azure_openai.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use llmchain::AzureOpenAI;
17 | use llmchain::LLM;
18 |
19 | #[ignore]
20 | #[tokio::test]
21 | async fn test_llm_azure_openai_generate_gpt35() -> Result<()> {
22 |     // Live-service test: endpoint, key and deployment all come from the environment.
23 |     let base = std::env::var("AZURE_OPENAI_API_BASE").expect("AZURE_OPENAI_API_BASE is not set");
24 |     let key = std::env::var("AZURE_OPENAI_API_KEY").expect("AZURE_OPENAI_API_KEY is not set");
25 |     let deployment = std::env::var("AZURE_OPENAI_API_GEN_DEPLOYMENT")
26 |         .expect("AZURE_OPENAI_API_GEN_DEPLOYMENT is not set");
27 |
28 |     let reply = AzureOpenAI::create(&base, &key, &deployment)
29 |         .generate("say Hello")
30 |         .await?;
31 |     assert!(reply.generation.contains("Hello"));
32 |     assert_eq!(reply.prompt_tokens, 10);
33 |     assert_eq!(reply.completion_tokens, 9);
34 |     assert_eq!(reply.total_tokens, 19);
35 |
36 |     Ok(())
37 | }
38 |
39 | #[ignore]
40 | #[tokio::test]
41 | async fn test_llm_azure_openai_embedding() -> Result<()> {
42 | let api_base =
43 | std::env::var("AZURE_OPENAI_API_BASE").expect("AZURE_OPENAI_API_BASE is not set");
44 | let api_key = std::env::var("AZURE_OPENAI_API_KEY").expect("AZURE_OPENAI_API_KEY is not set");
45 | let api_deployment = std::env::var("AZURE_OPENAI_API_EMBED_DEPLOYMENT")
46 | .expect("AZURE_OPENAI_API_EMBED_DEPLOYMENT is not set");
47 |
48 | let llm = AzureOpenAI::create(&api_base, &api_key, &api_deployment);
49 |
50 | let inputs = vec!["llmchain".to_string()];
51 | let result = llm.embedding(inputs).await?;
52 | let embeddings = result.embeddings;
53 | assert_eq!(embeddings.len(), 1);
54 |
55 | assert_eq!(embeddings[0].len(), 1536);
56 | assert_eq!(result.prompt_tokens, 3);
57 | assert_eq!(result.total_tokens, 3);
58 |
59 | Ok(())
60 | }
61 |
--------------------------------------------------------------------------------
/llmchain/tests/it/llms/azure_openai/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod azure_openai;
17 |
--------------------------------------------------------------------------------
/llmchain/tests/it/llms/databend/databend.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use llmchain::DatabendLLM;
17 | use llmchain::LLM;
18 |
19 | #[tokio::test]
20 | async fn test_llm_databend_embedding() -> Result<()> {
21 | let dsn = std::env::var("DATABEND_DSN").unwrap();
22 |
23 | let llm = DatabendLLM::create(&dsn);
24 |
25 | let inputs = vec!["llmchain".to_string(), "rs".to_string()];
26 | let result = llm.embedding(inputs).await?;
27 | let embeddings = result.embeddings;
28 | assert_eq!(embeddings.len(), 2);
29 |
30 | assert_eq!(embeddings[0].len(), 1536);
31 | assert_eq!(embeddings[1].len(), 1536);
32 | assert_eq!(result.prompt_tokens, 0);
33 | assert_eq!(result.total_tokens, 0);
34 |
35 | Ok(())
36 | }
37 |
38 | #[tokio::test]
39 | async fn test_llm_databend_generate() -> Result<()> {
40 | let dsn = std::env::var("DATABEND_DSN").unwrap();
41 |
42 | let llm = DatabendLLM::create(&dsn);
43 | let result = llm.generate("say Hello").await?;
44 | let generation = result.generation;
45 | assert!(generation.contains("Hello"));
46 | assert_eq!(result.prompt_tokens, 0);
47 | assert_eq!(result.completion_tokens, 0);
48 | assert_eq!(result.total_tokens, 0);
49 |
50 | Ok(())
51 | }
52 |
--------------------------------------------------------------------------------
/llmchain/tests/it/llms/databend/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod databend;
17 |
--------------------------------------------------------------------------------
/llmchain/tests/it/llms/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod azure_openai;
16 | mod databend;
17 | mod openai;
18 |
--------------------------------------------------------------------------------
/llmchain/tests/it/llms/openai/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod openai;
17 |
--------------------------------------------------------------------------------
/llmchain/tests/it/llms/openai/openai.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use llmchain::OpenAIBuilder;
17 | use llmchain::OpenAIGenerateModel;
18 | use llmchain::LLM;
19 |
20 | #[ignore]
21 | #[tokio::test]
22 | async fn test_llm_openai_generate_gpt35() -> Result<()> {
23 | let api_key = std::env::var("OPENAI_API_KEY").unwrap_or("".to_string());
24 |
25 | let llm = OpenAIBuilder::default().api_key(api_key).build()?;
26 | let result = llm.generate("say Hello").await?;
27 | let generation = result.generation;
28 | assert!(generation.contains("Hello"));
29 | assert_eq!(result.prompt_tokens, 10);
30 | assert_eq!(result.completion_tokens, 9);
31 | assert_eq!(result.total_tokens, 19);
32 |
33 | Ok(())
34 | }
35 |
36 | #[ignore]
37 | #[tokio::test]
38 | async fn test_llm_openai_generate_gpt4() -> Result<()> {
39 | let api_key = std::env::var("OPENAI_API_KEY").unwrap_or("".to_string());
40 |
41 | let llm = OpenAIBuilder::default()
42 | .api_key(api_key)
43 | .generate_model(OpenAIGenerateModel::Gpt4.to_string())
44 | .build()?;
45 | let result = llm.generate("say Hello").await?;
46 | let generation = result.generation;
47 | assert!(generation.contains("Hello"));
48 | assert_eq!(result.prompt_tokens, 9);
49 | assert_eq!(result.completion_tokens, 2);
50 | assert_eq!(result.total_tokens, 11);
51 |
52 | Ok(())
53 | }
54 |
55 | #[ignore]
56 | #[tokio::test]
57 | async fn test_llm_openai_embedding() -> Result<()> {
58 | let api_key = std::env::var("OPENAI_API_KEY").unwrap_or("".to_string());
59 | let llm = OpenAIBuilder::default().api_key(api_key).build()?;
60 | let inputs = vec!["llmchain".to_string(), "rs".to_string()];
61 | let result = llm.embedding(inputs).await?;
62 | let embeddings = result.embeddings;
63 | assert_eq!(embeddings.len(), 2);
64 |
65 | assert_eq!(embeddings[0].len(), 1536);
66 | assert_eq!(embeddings[1].len(), 1536);
67 | assert_eq!(result.prompt_tokens, 4);
68 | assert_eq!(result.total_tokens, 4);
69 |
70 | Ok(())
71 | }
72 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/directory/directory_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::io::Write;
16 |
17 | use anyhow::Result;
18 | use goldenfile::Mint;
19 | use llmchain::DirectoryLoader;
20 | use llmchain::DocumentLoader;
21 | use llmchain::DocumentPath;
22 | use llmchain::LocalDisk;
23 | use llmchain::MarkdownLoader;
24 |
25 | #[tokio::test]
26 | async fn test_directory_loader() -> Result<()> {
27 | // testdata dir.
28 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
29 | let testdata_dir = format!("{}/tests/testdata/loaders", curdir);
30 | let directory_dir = format!("{}/directory/", testdata_dir);
31 |
32 | // Load
33 | let markdown_loader = MarkdownLoader::create(LocalDisk::create()?);
34 | let directory_loader =
35 | DirectoryLoader::create(LocalDisk::create()?).with_loader("**/*.md", markdown_loader);
36 | let documents = directory_loader
37 | .load(DocumentPath::from_string(&directory_dir))
38 | .await?;
39 | assert_eq!(documents.len(), 2);
40 |
41 | // Check.
42 | let mut mint = Mint::new(&testdata_dir);
43 | let golden_path = "directory/directory_loader.golden";
44 | let mut file = mint.new_goldenfile(golden_path)?;
45 | for (i, doc) in documents.iter().enumerate() {
46 | writeln!(
47 | file,
48 | "part={}, len={}, md5={}",
49 | i,
50 | doc.content.len(),
51 | doc.content_md5
52 | )?;
53 | writeln!(
54 | file,
55 | "------------------------------------------------------------"
56 | )?;
57 | writeln!(file, "{}", doc.content)?;
58 | }
59 |
60 | Ok(())
61 | }
62 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/directory/directory_splitter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::io::Write;
16 |
17 | use anyhow::Result;
18 | use goldenfile::Mint;
19 | use llmchain::DirectoryLoader;
20 | use llmchain::DocumentLoader;
21 | use llmchain::DocumentPath;
22 | use llmchain::DocumentSplitter;
23 | use llmchain::LocalDisk;
24 | use llmchain::MarkdownLoader;
25 | use llmchain::MarkdownSplitter;
26 |
27 | #[tokio::test]
28 | async fn test_directory_splitter_default() -> Result<()> {
29 | // testdata dir.
30 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
31 | let testdata_dir = format!("{}/tests/testdata/loaders", curdir);
32 | let directory_dir = format!("{}/directory/", testdata_dir);
33 |
34 | // Load
35 | let markdown_loader = MarkdownLoader::create(LocalDisk::create()?);
36 | let directory_loader =
37 | DirectoryLoader::create(LocalDisk::create()?).with_loader("**/*.md", markdown_loader);
38 | let documents = directory_loader
39 | .load(DocumentPath::from_string(&directory_dir))
40 | .await?;
41 | assert_eq!(documents.len(), 2);
42 |
43 | let markdown_splitter = MarkdownSplitter::create().with_chunk_size(100);
44 | let documents = markdown_splitter.split_documents(&documents)?;
45 | assert_eq!(documents.len(), 18);
46 |
47 | // Check.
48 | let mut mint = Mint::new(&testdata_dir);
49 | let golden_path = "directory/directory_splitter_chunk_100.golden";
50 | let mut file = mint.new_goldenfile(golden_path)?;
51 | for (i, doc) in documents.iter().enumerate() {
52 | writeln!(
53 | file,
54 | "part={}, len={}, chunk_size={}, md5={}",
55 | i,
56 | doc.content.len(),
57 | markdown_splitter.splitter_chunk_size,
58 | doc.content_md5
59 | )?;
60 | writeln!(
61 | file,
62 | "------------------------------------------------------------"
63 | )?;
64 | writeln!(file, "{}", doc.content)?;
65 | }
66 |
67 | Ok(())
68 | }
69 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/directory/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod directory_loader;
16 | mod directory_splitter;
17 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/github/github_pr_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::io::Write;
16 |
17 | use anyhow::Result;
18 | use goldenfile::Mint;
19 | use llmchain::DocumentLoader;
20 | use llmchain::DocumentPath;
21 | use llmchain::GithubPRLoader;
22 |
23 | #[tokio::test]
24 | async fn test_github_pr_loader() -> Result<()> {
25 | let token = std::env::var("L_GITHUB_TOKEN").expect("L_GITHUB_TOKEN is not set");
26 | // testdata dir.
27 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
28 | let testdata_dir = format!("{}/tests/testdata/loaders", curdir);
29 |
30 | // Load
31 | let github_pr_loader = GithubPRLoader::create("datafuselabs", "databend", &token);
32 | let documents = github_pr_loader
33 | .load(DocumentPath::from_list(vec![
34 | 11450, 11451, 11452, 11453, 11454, 11455, 11456, 11457, 11458, 11459, 11460,
35 | ]))
36 | .await?;
37 |
38 | // Check.
39 | let mut mint = Mint::new(&testdata_dir);
40 | let golden_path = "github/github_pr_loader.golden";
41 | let mut file = mint.new_goldenfile(golden_path)?;
42 | for (i, doc) in documents.iter().enumerate() {
43 | writeln!(
44 | file,
45 | "part={}, len={}, md5={}, path:{}",
46 | i,
47 | doc.content.len(),
48 | doc.content_md5,
49 | doc.path
50 | )?;
51 | writeln!(
52 | file,
53 | "------------------------------------------------------------"
54 | )?;
55 | writeln!(file, "{}", doc.content)?;
56 | }
57 |
58 | Ok(())
59 | }
60 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/github/github_pr_splitter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::io::Write;
16 |
17 | use anyhow::Result;
18 | use goldenfile::Mint;
19 | use llmchain::DocumentLoader;
20 | use llmchain::DocumentPath;
21 | use llmchain::DocumentSplitter;
22 | use llmchain::GithubPRDiffSplitter;
23 | use llmchain::GithubPRLoader;
24 |
25 | #[tokio::test]
26 | async fn test_github_pr_splitter_default() -> Result<()> {
27 | let token = std::env::var("L_GITHUB_TOKEN").expect("L_GITHUB_TOKEN is not set");
28 | // testdata dir.
29 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
30 | let testdata_dir = format!("{}/tests/testdata/loaders", curdir);
31 |
32 | // Load
33 | let github_pr_loader = GithubPRLoader::create("datafuselabs", "databend", &token);
34 | let documents = github_pr_loader
35 | .load(DocumentPath::from_list(vec![
36 | 11450, 11451, 11452, 11453, 11454, 11455, 11456, 11457, 11458, 11459,
37 | ]))
38 | .await?;
39 |
40 | let github_pr_splitter =
41 | GithubPRDiffSplitter::create().with_skips(vec!["**/*.txt".to_string()]);
42 | let documents = github_pr_splitter.split_documents(&documents)?;
43 |
44 | // Check.
45 | let mut mint = Mint::new(&testdata_dir);
46 | let golden_path = "github/github_pr_splitter_default.golden";
47 | let mut file = mint.new_goldenfile(golden_path)?;
48 | for (i, doc) in documents.iter().enumerate() {
49 | writeln!(
50 | file,
51 | "part={}, len={}, chunk_size={}, md5={}, path:{}",
52 | i,
53 | doc.content.len(),
54 | github_pr_splitter.splitter_chunk_size,
55 | doc.content_md5,
56 | doc.path
57 | )?;
58 | writeln!(
59 | file,
60 | "------------------------------------------------------------"
61 | )?;
62 | writeln!(file, "{}", doc.content)?;
63 | }
64 |
65 | Ok(())
66 | }
67 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/github/github_repo_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use anyhow::Result;
16 | use llmchain::DocumentLoader;
17 | use llmchain::DocumentPath;
18 | use llmchain::GithubRepoLoader;
19 |
20 | #[tokio::test]
21 | async fn test_github_repo_loader() -> Result<()> {
22 | env_logger::init();
23 |
24 | // Load
25 | let github_repo_loader = GithubRepoLoader::create();
26 | let documents = github_repo_loader
27 | .load(DocumentPath::from_string(
28 | "https://github.com/shafishlabs/llmchain.rs",
29 | ))
30 | .await?;
31 |
32 | assert!(documents.len() > 10);
33 | assert!(documents
34 | .first()
35 | .unwrap()
36 | .path
37 | .starts_with("https://github.com"));
38 |
39 | Ok(())
40 | }
41 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/github/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod github_pr_loader;
16 | mod github_pr_splitter;
17 | mod github_repo_loader;
18 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/markdown/markdown_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::io::Write;
16 |
17 | use anyhow::Result;
18 | use goldenfile::Mint;
19 | use llmchain::DocumentLoader;
20 | use llmchain::DocumentPath;
21 | use llmchain::LocalDisk;
22 | use llmchain::MarkdownLoader;
23 |
24 | #[tokio::test]
25 | async fn test_markdown_loader() -> Result<()> {
26 | // testdata dir.
27 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
28 | let testdata_dir = format!("{}/tests/testdata/loaders", curdir);
29 | let markdown_file = format!("{}/markdown/copy.md", testdata_dir);
30 |
31 | // Load
32 | let markdown_loader = MarkdownLoader::create(LocalDisk::create()?);
33 | let documents = markdown_loader
34 | .load(DocumentPath::from_string(&markdown_file))
35 | .await?;
36 |
37 | // Check.
38 | let mut mint = Mint::new(&testdata_dir);
39 | let golden_path = "markdown/copy_md_loader.golden";
40 | let mut file = mint.new_goldenfile(golden_path)?;
41 | for (i, doc) in documents.iter().enumerate() {
42 | writeln!(
43 | file,
44 | "part={}, len={}, md5={}",
45 | i,
46 | doc.content.len(),
47 | doc.content_md5
48 | )?;
49 | writeln!(
50 | file,
51 | "------------------------------------------------------------"
52 | )?;
53 | writeln!(file, "{}", doc.content)?;
54 | }
55 |
56 | Ok(())
57 | }
58 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/markdown/markdown_splitter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::io::Write;
16 |
17 | use anyhow::Result;
18 | use goldenfile::Mint;
19 | use llmchain::DocumentLoader;
20 | use llmchain::DocumentPath;
21 | use llmchain::DocumentSplitter;
22 | use llmchain::LocalDisk;
23 | use llmchain::MarkdownLoader;
24 | use llmchain::MarkdownSplitter;
25 |
26 | #[tokio::test]
27 | async fn test_markdown_splitter_default() -> Result<()> {
28 | // testdata dir.
29 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
30 | let testdata_dir = format!("{}/tests/testdata/loaders", curdir);
31 | let markdown_file = format!("{}/markdown/copy.md", testdata_dir);
32 |
33 | // Load
34 | let markdown_loader = MarkdownLoader::create(LocalDisk::create()?);
35 | let documents = markdown_loader
36 | .load(DocumentPath::from_string(&markdown_file))
37 | .await?;
38 |
39 | let markdown_splitter = MarkdownSplitter::create();
40 | let documents = markdown_splitter.split_documents(&documents)?;
41 |
42 | // Check.
43 | let mut mint = Mint::new(&testdata_dir);
44 | let golden_path = "markdown/copy_md_splitter_default.golden";
45 | let mut file = mint.new_goldenfile(golden_path)?;
46 | for (i, doc) in documents.iter().enumerate() {
47 | writeln!(
48 | file,
49 | "part={}, len={}, chunk_size={}, md5={}",
50 | i,
51 | doc.content.len(),
52 | markdown_splitter.splitter_chunk_size,
53 | doc.content_md5
54 | )?;
55 | writeln!(
56 | file,
57 | "------------------------------------------------------------"
58 | )?;
59 | writeln!(file, "{}", doc.content)?;
60 | }
61 |
62 | Ok(())
63 | }
64 |
65 | #[tokio::test]
66 | async fn test_markdown_splitter_100() -> Result<()> {
67 | // testdata dir.
68 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
69 | let testdata_dir = format!("{}/tests/testdata/loaders", curdir);
70 | let markdown_file = format!("{}/markdown/copy.md", testdata_dir);
71 |
72 | // Load
73 | let markdown_loader = MarkdownLoader::create(LocalDisk::create()?);
74 | let documents = markdown_loader
75 | .load(DocumentPath::from_string(&markdown_file))
76 | .await?;
77 |
78 | let markdown_splitter = MarkdownSplitter::create().with_chunk_size(100);
79 | let documents = markdown_splitter.split_documents(&documents)?;
80 |
81 | // Check.
82 | assert_eq!(documents.len(), 14);
83 |
84 | // Check.
85 | let mut mint = Mint::new(&testdata_dir);
86 | let golden_path = "markdown/copy_md_splitter_chunk_100.golden";
87 | let mut file = mint.new_goldenfile(golden_path)?;
88 | for (i, doc) in documents.iter().enumerate() {
89 | writeln!(
90 | file,
91 | "part={}, len={}, chunk_size={}, md5={}",
92 | i,
93 | doc.content.len(),
94 | markdown_splitter.splitter_chunk_size,
95 | doc.content_md5
96 | )?;
97 | writeln!(
98 | file,
99 | "------------------------------------------------------------"
100 | )?;
101 | writeln!(file, "{}", doc.content)?;
102 | }
103 |
104 | Ok(())
105 | }
106 |
107 | #[tokio::test]
108 | async fn test_markdown_splitter_custom_separator() -> Result<()> {
109 | // testdata dir.
110 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
111 | let testdata_dir = format!("{}/tests/testdata/loaders", curdir);
112 | let markdown_file = format!("{}/markdown/copy-hyphen.md", testdata_dir);
113 |
114 | // Load
115 | let markdown_loader = MarkdownLoader::create(LocalDisk::create()?);
116 | let documents = markdown_loader
117 | .load(DocumentPath::from_string(&markdown_file))
118 | .await?;
119 |
120 | let markdown_splitter = MarkdownSplitter::create().with_separators(vec![
121 | "\n- ## ".to_string(),
122 | "\n- ### ".to_string(),
123 | "\n- #### ".to_string(),
124 | "\n- ##### ".to_string(),
125 | "\n- ###### ".to_string(),
126 | ]);
127 | let documents = markdown_splitter.split_documents(&documents)?;
128 |
129 | // Check.
130 | let mut mint = Mint::new(&testdata_dir);
131 | let golden_path = "markdown/copy_md_splitter_custom_separator.golden";
132 | let mut file = mint.new_goldenfile(golden_path)?;
133 | for (i, doc) in documents.iter().enumerate() {
134 | writeln!(
135 | file,
136 | "part={}, len={}, chunk_size={}, md5={}",
137 | i,
138 | doc.content.len(),
139 | markdown_splitter.splitter_chunk_size,
140 | doc.content_md5
141 | )?;
142 | writeln!(
143 | file,
144 | "------------------------------------------------------------"
145 | )?;
146 | writeln!(file, "{}", doc.content)?;
147 | }
148 |
149 | Ok(())
150 | }
151 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/markdown/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod markdown_loader;
16 | mod markdown_splitter;
17 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod directory;
16 | mod github;
17 | mod markdown;
18 | mod text;
19 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/text/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod text_loader;
16 | mod text_splitter;
17 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/text/text_loader.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::io::Write;
16 |
17 | use anyhow::Result;
18 | use goldenfile::Mint;
19 | use llmchain::DocumentLoader;
20 | use llmchain::DocumentPath;
21 | use llmchain::LocalDisk;
22 | use llmchain::TextLoader;
23 |
24 | #[tokio::test]
25 | async fn test_text_loader() -> Result<()> {
26 | // testdata dir.
27 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
28 | let testdata_dir = format!("{}/tests/testdata/loaders", curdir);
29 | let text_file = format!("{}/text/example.txt", testdata_dir);
30 |
31 | // Load
32 | let text_loader = TextLoader::create(LocalDisk::create()?);
33 | let documents = text_loader
34 | .load(DocumentPath::from_string(&text_file))
35 | .await?;
36 |
37 | // Check.
38 | let mut mint = Mint::new(&testdata_dir);
39 | let golden_path = "text/example_txt_loader.golden";
40 | let mut file = mint.new_goldenfile(golden_path)?;
41 | for (i, doc) in documents.iter().enumerate() {
42 | writeln!(
43 | file,
44 | "part={}, len={}, md5={}",
45 | i,
46 | doc.content.len(),
47 | doc.content_md5
48 | )?;
49 | writeln!(
50 | file,
51 | "------------------------------------------------------------"
52 | )?;
53 | writeln!(file, "{}", doc.content)?;
54 | }
55 |
56 | Ok(())
57 | }
58 |
--------------------------------------------------------------------------------
/llmchain/tests/it/loaders/text/text_splitter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::io::Write;
16 |
17 | use anyhow::Result;
18 | use goldenfile::Mint;
19 | use llmchain::DocumentLoader;
20 | use llmchain::DocumentPath;
21 | use llmchain::DocumentSplitter;
22 | use llmchain::LocalDisk;
23 | use llmchain::TextLoader;
24 | use llmchain::TextSplitter;
25 |
26 | #[tokio::test]
27 | async fn test_text_splitter_default() -> Result<()> {
28 | // testdata dir.
29 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
30 | let testdata_dir = format!("{}/tests/testdata/loaders", curdir);
31 | let text_file = format!("{}/text/example.txt", testdata_dir);
32 |
33 | // Load
34 | let text_loader = TextLoader::create(LocalDisk::create()?);
35 | let documents = text_loader
36 | .load(DocumentPath::from_string(&text_file))
37 | .await?;
38 |
39 | let text_splitter = TextSplitter::create();
40 | let documents = text_splitter.split_documents(&documents)?;
41 |
42 | // Check.
43 | let mut mint = Mint::new(&testdata_dir);
44 | let golden_path = "text/example_txt_splitter_default.golden";
45 | let mut file = mint.new_goldenfile(golden_path)?;
46 | for (i, doc) in documents.iter().enumerate() {
47 | writeln!(
48 | file,
49 | "part={}, len={}, chunk_size={}, md5={}",
50 | i,
51 | doc.content.len(),
52 | text_splitter.splitter_chunk_size,
53 | doc.content_md5
54 | )?;
55 | writeln!(
56 | file,
57 | "------------------------------------------------------------"
58 | )?;
59 | writeln!(file, "{}", doc.content)?;
60 | }
61 |
62 | Ok(())
63 | }
64 |
65 | #[tokio::test]
66 | async fn test_text_splitter_10() -> Result<()> {
67 | // testdata dir.
68 | let curdir = std::env::current_dir()?.to_str().unwrap().to_string();
69 | let testdata_dir = format!("{}/tests/testdata/loaders", curdir);
70 | let text_file = format!("{}/text/example.txt", testdata_dir);
71 |
72 | // Load
73 | let text_loader = TextLoader::create(LocalDisk::create()?);
74 | let documents = text_loader
75 | .load(DocumentPath::from_string(&text_file))
76 | .await?;
77 |
78 | let text_splitter = TextSplitter::create().with_chunk_size(10);
79 | let documents = text_splitter.split_documents(&documents)?;
80 |
81 | // Check.
82 | assert_eq!(documents.len(), 2);
83 |
84 | // Check.
85 | let mut mint = Mint::new(&testdata_dir);
86 | let golden_path = "text/example_txt_splitter_chunk_10.golden";
87 | let mut file = mint.new_goldenfile(golden_path)?;
88 | for (i, doc) in documents.iter().enumerate() {
89 | writeln!(
90 | file,
91 | "part={}, len={}, chunk_size={}, md5={}",
92 | i,
93 | doc.content.len(),
94 | text_splitter.splitter_chunk_size,
95 | doc.content_md5,
96 | )?;
97 | writeln!(
98 | file,
99 | "------------------------------------------------------------"
100 | )?;
101 | writeln!(file, "{}", doc.content)?;
102 | }
103 |
104 | Ok(())
105 | }
106 |
--------------------------------------------------------------------------------
/llmchain/tests/it/main.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod common;
16 | mod embeddings;
17 | mod llms;
18 | mod loaders;
19 | mod prompts;
20 | mod vector_stores;
21 |
--------------------------------------------------------------------------------
/llmchain/tests/it/prompts/document_retrieval_prompt.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::collections::HashMap;
16 | use std::io::Write;
17 |
18 | use anyhow::Result;
19 | use goldenfile::Mint;
20 | use llmchain::DocumentRetrievalPrompt;
21 | use llmchain::Prompt;
22 |
/// Builds a `DocumentRetrievalPrompt` with a fixed list of instructions,
/// checks that formatting with an unexpected variable name is an error, and
/// writes the `Debug` rendering of a successful format (with `contexts` and
/// `question`) into `document_retrieval_prompt.golden`.
#[test]
fn test_prompt_document_retrieval() -> Result<()> {
    // Golden files live under the prompts testdata directory.
    let cwd = std::env::current_dir()?;
    let cwd = cwd.to_str().unwrap().to_string();
    let testdata_dir = format!("{}/tests/testdata/prompts", cwd);

    let mut mint = Mint::new(testdata_dir);
    let mut golden = mint.new_goldenfile("document_retrieval_prompt.golden")?;

    // NOTE: these strings feed the golden output and must stay byte-identical
    // (the first entry intentionally has no trailing newline escape).
    let instructions = vec![
        "Present your answer in markdown format, including code snippets if have, format the code snippets with SQL type if necessary.",
        "Do not include any links or external references in your response.\n",
        "Do not change the code snippets.\n",
        "Do not change the SQL syntax, please don't make up the function.\n",
        "Do not change explain any code snippets.\n",
        "Make the whole answer as short as possible to keep the code snippets.\n",
    ];
    let prompt = DocumentRetrievalPrompt::create().with_instructions(instructions);

    // A variable name the prompt does not declare must be rejected.
    let unknown_variables = HashMap::from([("1", "v")]);
    assert!(prompt.format(unknown_variables).is_err());

    // Formatting with the expected variables succeeds and is recorded.
    let contexts = "Content: Welcome to the Databend documentation! Databend is an open-source, elastic, and workload-aware modern cloud data warehouse designed to meet businesses' massive-scale analytics needs at low cost and with low complexity.\nSource:1.md\nConent: Databend is always searching for and incorporating the most advanced and innovative technologies to provide you with an exceptional user experience.\nSource:2.md";
    let known_variables = HashMap::from([
        ("contexts", contexts),
        ("question", "what is databend"),
    ]);
    let rendered = prompt.format(known_variables)?;

    writeln!(golden, "------------------")?;
    writeln!(golden, "{:?}", rendered)?;
    writeln!(golden, "------------------")?;

    Ok(())
}
66 |
--------------------------------------------------------------------------------
/llmchain/tests/it/prompts/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod document_retrieval_prompt;
16 | mod prompt_template;
17 | mod text_to_sql_prompt;
18 |
--------------------------------------------------------------------------------
/llmchain/tests/it/prompts/prompt_template.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::collections::HashMap;
16 | use std::io::Write;
17 |
18 | use anyhow::Result;
19 | use goldenfile::Mint;
20 | use llmchain::Prompt;
21 | use llmchain::PromptTemplate;
22 |
/// Creates a `PromptTemplate` for `"hello {name}"` with the single declared
/// variable `name`, checks that formatting with an unexpected variable name is
/// an error, and writes the `Debug` rendering of a successful format into
/// `prompt_template.golden`.
#[test]
fn test_prompt_template() -> Result<()> {
    // Golden files live under the prompts testdata directory.
    let cwd = std::env::current_dir()?;
    let cwd = cwd.to_str().unwrap().to_string();
    let testdata_dir = format!("{}/tests/testdata/prompts", cwd);

    let mut mint = Mint::new(testdata_dir);
    let mut golden = mint.new_goldenfile("prompt_template.golden")?;

    let declared_variables = vec!["name".to_string()];
    let template = PromptTemplate::create("hello {name}", declared_variables);

    // A variable name the template does not declare must be rejected.
    let unknown_variables = HashMap::from([("1", "v")]);
    assert!(template.format(unknown_variables).is_err());

    // Formatting with the declared variable succeeds and is recorded.
    let known_variables = HashMap::from([("name", "llmchain.rs")]);
    let rendered = template.format(known_variables)?;

    writeln!(golden, "------------------")?;
    writeln!(golden, "{:?}", rendered)?;
    writeln!(golden, "------------------")?;

    Ok(())
}
56 |
--------------------------------------------------------------------------------
/llmchain/tests/it/prompts/text_to_sql_prompt.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::collections::HashMap;
16 | use std::io::Write;
17 |
18 | use anyhow::Result;
19 | use goldenfile::Mint;
20 | use llmchain::Prompt;
21 | use llmchain::TextToSQLPrompt;
22 |
/// Creates a `TextToSQLPrompt`, checks that formatting with an unexpected
/// variable name is an error, and writes the `Debug` rendering of a successful
/// format (with `dialect`, `schema` and `query_str`) into
/// `text_to_sql_prompt.golden`.
#[test]
fn test_prompt_text_to_sql() -> Result<()> {
    // Golden files live under the prompts testdata directory.
    let cwd = std::env::current_dir()?;
    let cwd = cwd.to_str().unwrap().to_string();
    let testdata_dir = format!("{}/tests/testdata/prompts", cwd);

    let mut mint = Mint::new(testdata_dir);
    let mut golden = mint.new_goldenfile("text_to_sql_prompt.golden")?;

    let prompt = TextToSQLPrompt::create();

    // A variable name the prompt does not declare must be rejected.
    let unknown_variables = HashMap::from([("1", "v")]);
    assert!(prompt.format(unknown_variables).is_err());

    // Formatting with the expected variables succeeds and is recorded.
    let known_variables = HashMap::from([
        ("dialect", "mysql"),
        ("schema", "name string"),
        ("query_str", "how many names"),
    ]);
    let rendered = prompt.format(known_variables)?;

    writeln!(golden, "------------------")?;
    writeln!(golden, "{:?}", rendered)?;
    writeln!(golden, "------------------")?;

    Ok(())
}
58 |
--------------------------------------------------------------------------------
/llmchain/tests/it/vector_stores/databend/databend.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | use std::sync::Arc;
16 |
17 | use anyhow::Result;
18 | use llmchain::DatabendEmbedding;
19 | use llmchain::DatabendVectorStore;
20 | use llmchain::Document;
21 | use llmchain::Documents;
22 | use llmchain::VectorStore;
23 |
24 | #[tokio::test]
25 | async fn test_vector_stores_databend() -> Result<()> {
26 | let dsn = std::env::var("DATABEND_DSN").expect("DATABEND_DSN is not set");
27 |
28 | let databend_embedding = Arc::new(DatabendEmbedding::create(&dsn));
29 | let databend = DatabendVectorStore::create(&dsn, databend_embedding);
30 | databend.init().await?;
31 |
32 | let documents = Documents::from(vec![
33 | Document::create("1.md", "hello"),
34 | Document::create("2.md", "llmchain.rs"),
35 | ]);
36 | let result = databend.add_documents(&documents).await?;
37 | assert_eq!(result.len(), 2);
38 |
39 | let similarities = databend.similarity_search("llmchain", 1).await?;
40 | assert_eq!(similarities.len(), 1);
41 |
42 | let expect_document = Document {
43 | path: "2.md".to_string(),
44 | content: "llmchain.rs".to_string(),
45 | content_md5: "033d6bd60a5237d54fa8331dd2ca1325".to_string(),
46 | };
47 |
48 | let actual_document = similarities[0].clone();
49 |
50 | assert_eq!(expect_document, actual_document);
51 |
52 | Ok(())
53 | }
54 |
--------------------------------------------------------------------------------
/llmchain/tests/it/vector_stores/databend/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #[allow(clippy::module_inception)]
16 | mod databend;
17 |
--------------------------------------------------------------------------------
/llmchain/tests/it/vector_stores/mod.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Shafish Labs.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | mod databend;
16 |
--------------------------------------------------------------------------------
/llmchain/tests/testdata/loaders/directory/subdir/42-data-type-map.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Map
3 | ---
4 |
5 | The MAP data structure is utilized for holding a set of `Key:Value` pairs, and stores data using a nested data structure of Array(Tuple(key, value)). It is appropriate in situations where the data type is constant, but the `Key`'s value cannot be entirely ascertained.
6 |
7 | ## Understanding Key:Value
8 |
9 | The `Key` is of a specified basic data type, including Boolean, Number, Decimal, String, Date, or Timestamp. A `Key`'s value cannot be Null, and duplicates are not allowed. The `Value` can be any data type, including nested arrays, tuples, and so on.
10 |
11 | Map data can be generated through `Key:Value` pairs enclosed in curly braces or by using the Map function to convert two arrays into a Map. The Map function takes two arrays as input, where the elements in the first array serve as the keys and the elements in the second array serve as the values. See an example below:
12 |
13 | ```sql
14 | -- Input arrays: [1, 2] and ['v1', 'v2']
15 | -- Resulting Map: {1: 'v1', 2: 'v2'}
16 |
17 | SELECT {'k1': 1, 'k2': 2}, map([1, 2], ['v1', 'v2']);
18 | +-----------------+---------------------------+
19 | | {'k1':1,'k2':2} | map([1, 2], ['v1', 'v2']) |
20 | +-----------------+---------------------------+
21 | | {'k1':1,'k2':2} | {1:'v1',2:'v2'} |
22 | +-----------------+---------------------------+
23 | ```
24 |
25 | ## Map and Bloom Filter Index
26 |
27 | In Databend Map, a bloom filter index is created for the value with certain data types: `Numeric`, `String`, `Timestamp`, and `Date`.
28 |
29 | This makes it easier and faster to search for values in the MAP data structure.
30 |
31 | The implementation of the bloom filter index in Databend Map is in [PR#10457](https://github.com/datafuselabs/databend/pull/10457).
32 |
33 | The bloom filter is particularly effective in reducing query time when the queried value does not exist.
34 |
35 | For example:
36 | ```sql
37 | select * from nginx_log where log['ip'] = '205.91.162.148';
38 | +----+----------------------------------------+
39 | | id | log |
40 | +----+----------------------------------------+
41 | | 1 | {'ip':'205.91.162.148','url':'test-1'} |
42 | +----+----------------------------------------+
43 | 1 row in set
44 | Time: 1.733s
45 |
46 | select * from nginx_log where log['ip'] = '205.91.162.141';
47 | +----+-----+
48 | | id | log |
49 | +----+-----+
50 | +----+-----+
51 | 0 rows in set
52 | Time: 0.129s
53 | ```
54 |
55 | ## Examples
56 |
57 | **Create a table with a Map column for storing web traffic data**
58 |
59 | ```sql
60 | CREATE TABLE web_traffic_data(id INT64, traffic_info MAP(STRING, STRING));
61 |
62 | DESC web_traffic_data;
63 | +-------------+--------------------+------+---------+-------+
64 | | Field | Type | Null | Default | Extra |
65 | +-------------+--------------------+------+---------+-------+
66 | | id | INT64 | NO | | |
67 | | traffic_info| MAP(STRING, STRING)| NO | {} | |
68 | +-------------+--------------------+------+---------+-------+
69 | ```
70 |
71 | **Insert Map data containing IP addresses and URLs visited**
72 |
73 | ```sql
74 | INSERT INTO web_traffic_data VALUES(1, {'ip': '192.168.1.1', 'url': 'example.com/home'}),
75 | (2, {'ip': '192.168.1.2', 'url': 'example.com/about'}),
76 | (3, {'ip': '192.168.1.1', 'url': 'example.com/contact'});
77 | ```
78 |
79 | **Query**
80 |
81 | ```sql
82 | SELECT * FROM web_traffic_data;
83 |
84 | +----+-----------------------------------+
85 | | id | traffic_info |
86 | +----+-----------------------------------+
87 | | 1 | {'ip':'192.168.1.1','url':'example.com/home'} |
88 | | 2 | {'ip':'192.168.1.2','url':'example.com/about'} |
89 | | 3 | {'ip':'192.168.1.1','url':'example.com/contact'} |
90 | +----+-----------------------------------+
91 | ```
92 |
93 | **Query the number of visits per IP address**
94 |
95 | ```sql
96 | SELECT traffic_info['ip'] as ip_address, COUNT(*) as visits
97 | FROM web_traffic_data
98 | GROUP BY traffic_info['ip'];
99 |
100 | +-------------+--------+
101 | | ip_address | visits |
102 | +-------------+--------+
103 | | 192.168.1.1 | 2 |
104 | | 192.168.1.2 | 1 |
105 | +-------------+--------+
106 | ```
107 |
108 | **Query the most visited URLs**
109 | ```sql
110 | SELECT traffic_info['url'] as url, COUNT(*) as visits
111 | FROM web_traffic_data
112 | GROUP BY traffic_info['url']
113 | ORDER BY visits DESC
114 | LIMIT 3;
115 |
116 | +---------------------+--------+
117 | | url | visits |
118 | +---------------------+--------+
119 | | example.com/home | 1 |
120 | | example.com/about | 1 |
121 | | example.com/contact | 1 |
122 | +---------------------+--------+
123 | ```
124 |
--------------------------------------------------------------------------------
/llmchain/tests/testdata/loaders/github/github_repo_loader.golden:
--------------------------------------------------------------------------------
1 | part=0, len=455, md5=0206f956cb2997be0343b44a10f9a871, path:https://github.com/datafuselabs/databend/pull/11450
2 |
--------------------------------------------------------------------------------
/llmchain/tests/testdata/loaders/text/example.txt:
--------------------------------------------------------------------------------
1 | Databend is an open-source Elastic and Workload-Aware modern cloud data warehouse focusing on Low-Cost and Low-Complexity for your massive-scale analytics needs.
2 |
3 | Databend uses the latest techniques in vectorized query processing to allow you to do blazing-fast data analytics on object storage: (S3, Azure Blob, Google Cloud Storage, Alibaba Cloud OSS, Tencent Cloud COS, Huawei Cloud OBS, Cloudflare R2, Wasabi or MinIO).
--------------------------------------------------------------------------------
/llmchain/tests/testdata/loaders/text/example_txt_loader.golden:
--------------------------------------------------------------------------------
1 | part=0, len=423, md5=569988f71a1e3b9fa8c43103ed20d437
2 | ------------------------------------------------------------
3 | Databend is an open-source Elastic and Workload-Aware modern cloud data warehouse focusing on Low-Cost and Low-Complexity for your massive-scale analytics needs.
4 |
5 | Databend uses the latest techniques in vectorized query processing to allow you to do blazing-fast data analytics on object storage: (S3, Azure Blob, Google Cloud Storage, Alibaba Cloud OSS, Tencent Cloud COS, Huawei Cloud OBS, Cloudflare R2, Wasabi or MinIO).
6 |
--------------------------------------------------------------------------------
/llmchain/tests/testdata/loaders/text/example_txt_splitter_chunk_10.golden:
--------------------------------------------------------------------------------
1 | part=0, len=162, chunk_size=10, md5=dabab7450395d08e3b7f50e7424f93d0
2 | ------------------------------------------------------------
3 | Databend is an open-source Elastic and Workload-Aware modern cloud data warehouse focusing on Low-Cost and Low-Complexity for your massive-scale analytics needs.
4 | part=1, len=261, chunk_size=10, md5=5a1b963a388059d4daaa4d575676ff86
5 | ------------------------------------------------------------
6 | Databend uses the latest techniques in vectorized query processing to allow you to do blazing-fast data analytics on object storage: (S3, Azure Blob, Google Cloud Storage, Alibaba Cloud OSS, Tencent Cloud COS, Huawei Cloud OBS, Cloudflare R2, Wasabi or MinIO).
7 |
--------------------------------------------------------------------------------
/llmchain/tests/testdata/loaders/text/example_txt_splitter_default.golden:
--------------------------------------------------------------------------------
1 | part=0, len=423, chunk_size=400, md5=975cb61c2a8305dcfd512e4ffd444722
2 | ------------------------------------------------------------
3 | Databend is an open-source Elastic and Workload-Aware modern cloud data warehouse focusing on Low-Cost and Low-Complexity for your massive-scale analytics needs. Databend uses the latest techniques in vectorized query processing to allow you to do blazing-fast data analytics on object storage: (S3, Azure Blob, Google Cloud Storage, Alibaba Cloud OSS, Tencent Cloud COS, Huawei Cloud OBS, Cloudflare R2, Wasabi or MinIO).
4 |
--------------------------------------------------------------------------------
/llmchain/tests/testdata/prompts/document_retrieval_prompt.golden:
--------------------------------------------------------------------------------
1 | ------------------
2 | "Given the following contexts of a long document and a question, create a final answer with references (\"SOURCES\"). If you don't know the answer, just say that you don't know. Don't try to make up an answer. please follow these instructions:\nPresent your answer in markdown format, including code snippets if have, format the code snippets with SQL type if necessary. \nDo not include any links or external references in your response.\n \nDo not change the code snippets.\n \nDo not change the SQL syntax, please don't make up the function.\n \nDo not change explain any code snippets.\n \nMake the whole answer as short as possible to keep the code snippets.\n\n=========\nContent: Welcome to the Databend documentation! Databend is an open-source, elastic, and workload-aware modern cloud data warehouse designed to meet businesses' massive-scale analytics needs at low cost and with low complexity.\nSource:1.md\nConent: Databend is always searching for and incorporating the most advanced and innovative technologies to provide you with an exceptional user experience.\nSource:2.md\n=========\nQUESTION: what is databend\nFINAL ANSWER:"
3 | ------------------
4 |
--------------------------------------------------------------------------------
/llmchain/tests/testdata/prompts/prompt_template.golden:
--------------------------------------------------------------------------------
1 | ------------------
2 | "hello llmchain.rs"
3 | ------------------
4 |
--------------------------------------------------------------------------------
/llmchain/tests/testdata/prompts/text_to_sql_prompt.golden:
--------------------------------------------------------------------------------
1 | ------------------
2 | "Given an input question, first create a syntactically correct mysql query to run, then look at the results of the query and return the answer. You can order the results by a relevant column to return the most interesting examples in the database.\nNever query for all the columns from a specific table, only ask for a the few relevant columns given the question.\nPay attention to use only the column names that you can see in the schema description. Be careful to not query for columns that do not exist. Pay attention to which column is in which table. Also, qualify column names with the table name when needed.\nUse the following format:\nQuestion: Question here\nSQLQuery: SQL Query to run\nSQLResult: Result of the SQLQuery\nAnswer: Final answer here\nOnly use the tables listed below.\nname string\nQuestion: how many names\nSQLQuery: "
3 | ------------------
4 |
--------------------------------------------------------------------------------
/rust-toolchain.toml:
--------------------------------------------------------------------------------
1 | [toolchain]
2 | channel = "stable"
3 | components = ["rustfmt", "clippy"]
4 |
--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | edition = "2021"
2 | version = "Two"
3 | reorder_imports = true
4 | imports_granularity = "Item"
5 | group_imports = "StdExternalCrate"
6 | where_single_line = true
7 | trailing_comma = "Vertical"
8 | overflow_delimited_expr = true
9 | format_code_in_doc_comments = true
10 | normalize_comments = true
11 |
--------------------------------------------------------------------------------