├── .asf.yaml
├── .github
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows
│       └── build.yml
├── .gitignore
├── .ratignore
├── LICENSE
├── NOTICE
├── README.md
├── check-licenses.sh
├── pyproject.toml
└── src
    ├── __about__.py
    ├── __init__.py
    └── extensions
        ├── chunking
        │   ├── ChunkDocument.py
        │   ├── ParseDocument.py
        │   └── __init__.py
        ├── openai
        │   ├── PromptChatGPT.py
        │   └── __init__.py
        └── vectorstores
            ├── ChromaUtils.py
            ├── EmbeddingUtils.py
            ├── OpenSearchVectorUtils.py
            ├── PutChroma.py
            ├── PutOpenSearchVector.py
            ├── PutPinecone.py
            ├── PutQdrant.py
            ├── QdrantUtils.py
            ├── QueryChroma.py
            ├── QueryOpenSearchVector.py
            ├── QueryPinecone.py
            ├── QueryQdrant.py
            ├── QueryUtils.py
            ├── __init__.py
            └── requirements.txt
/.asf.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | github: 4 | description: "Apache NiFi Python Extensions" 5 | homepage: https://nifi.apache.org/ 6 | labels: 7 | - apache 8 | - nifi 9 | - python 10 | - hacktoberfest 11 | features: 12 | wiki: false 13 | issues: false 14 | projects: false 15 | enabled_merge_buttons: 16 | squash: true 17 | autolink_jira: 18 | - NIFI 19 | protected_branches: 20 | main: 21 | required_signatures: true 22 | required_linear_history: true 23 | notifications: 24 | commits: commits@nifi.apache.org 25 | issues: issues@nifi.apache.org 26 | pullrequests: issues@nifi.apache.org 27 | jira_options: link worklog 28 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Summary 4 | 5 | [NIFI-00000](https://issues.apache.org/jira/browse/NIFI-00000) 6 | 7 | # Tracking 8 | 9 | Please complete the following tracking steps prior to pull request creation.
10 | 11 | ### Issue Tracking 12 | 13 | - [ ] [Apache NiFi Jira](https://issues.apache.org/jira/browse/NIFI) issue created 14 | 15 | ### Pull Request Tracking 16 | 17 | - [ ] Pull Request title starts with Apache NiFi Jira issue number, such as `NIFI-00000` 18 | - [ ] Pull Request commit message starts with Apache NiFi Jira issue number, such as `NIFI-00000` 19 | 20 | ### Pull Request Formatting 21 | 22 | - [ ] Pull Request based on current revision of the `main` branch 23 | - [ ] Pull Request refers to a feature branch with one commit containing changes 24 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: build 4 | 5 | on: 6 | push: 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | permissions: 16 | security-events: write 17 | contents: read 18 | pull-requests: read 19 | 20 | jobs: 21 | build: 22 | name: Python ${{ matrix.python }} on ${{ matrix.os }} 23 | runs-on: ${{ matrix.os }} 24 | strategy: 25 | fail-fast: false 26 | matrix: 27 | os: 28 | - ubuntu-22.04 29 | - macos-14 30 | python: 31 | - '3.11' 32 | - '3.12' 33 | steps: 34 | - name: Checkout Sources 35 | uses: actions/checkout@v4 36 | - name: Check Licenses 37 | run: sh check-licenses.sh 38 | - name: Setup Python ${{ matrix.python }} 39 | uses: actions/setup-python@v5 40 | with: 41 | python-version: ${{ matrix.python }} 42 | - name: Install Hatch 43 | run: | 44 | python -m pip install --upgrade pip 45 | pip install hatch 46 | - name: Check Formatting 47 | run: hatch fmt --check 48 | - name: Build Distribution 49 | run: hatch build 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 57 | __pypackages__/ 58 | 59 | # Environments 60 | .env 61 | .venv 62 | env/ 63 | venv/ 64 | ENV/ 65 | env.bak/ 66 | venv.bak/ 67 | 68 | # mkdocs documentation 69 | /site 70 | 71 | # mypy 72 | .mypy_cache/ 73 | .dmypy.json 74 | dmypy.json 75 | 76 | # Pyre type checker 77 | .pyre/ 78 | 79 | # pytype static type analyzer 80 | .pytype/ 81 | 82 | # Cython debug symbols 83 | cython_debug/ 84 | 85 | .idea/ 86 | -------------------------------------------------------------------------------- /.ratignore: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | __pycache__/* 3 | build/* 4 | dist/* 5 | downloads/* 6 | eggs/* 7 | lib/* 8 | lib64/* 9 | parts/* 10 | sdist/* 11 | var/* 12 | wheels/* 13 | share/python-wheels/* 14 | .idea/* 15 | .git/* 16 | .cache/* 17 | .ruff_cache/* 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Apache NiFi Python Extensions 2 | Copyright 2024 The Apache Software Foundation 3 | 4 | This product includes software developed at 5 | The Apache Software Foundation (http://www.apache.org/). 
6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apache NiFi Python Extensions 2 | 3 | [![license](https://img.shields.io/github/license/apache/nifi-python-extensions)](https://github.com/apache/nifi-python-extensions/blob/main/LICENSE) 4 | [![build](https://github.com/apache/nifi-python-extensions/actions/workflows/build.yml/badge.svg)](https://github.com/apache/nifi-python-extensions/actions/workflows/build.yml) 5 | [![Hatch](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg)](https://github.com/pypa/hatch) 6 | [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) 7 | 8 | The [Apache NiFi](https://nifi.apache.org) Python Extensions repository contains Processors implemented in [Python](https://www.python.org/) 9 | for deployment in Apache NiFi 2. 10 | 11 | ## Building 12 | 13 | This project uses [Hatch](https://hatch.pypa.io) to build distribution packages. 14 | 15 | ``` 16 | hatch build 17 | ``` 18 | 19 | The build command creates a source distribution in the `dist` directory. 20 | 21 | The source distribution contains an `extensions` directory that can be copied into Apache NiFi to use the packaged Processors. 22 | 23 | ## Developing 24 | 25 | The Apache NiFi [Python Developer's Guide](https://nifi.apache.org/documentation/nifi-2.0.0-M3/html/python-developer-guide.html) 26 | provides the API and implementation guidelines for Python Processors. 27 | 28 | The Hatch fmt command evaluates Python Processors against configured formatting and linting rules. 29 | 30 | ``` 31 | hatch fmt --check 32 | ``` 33 | 34 | ## Documentation 35 | 36 | The Apache NiFi [Documentation](https://nifi.apache.org/documentation/) includes reference information for project capabilities. 37 | 38 | ## Contributing 39 | 40 | The Apache NiFi [Contributor Guide](https://cwiki.apache.org/confluence/display/NIFI/Contributor+Guide) 41 | describes the process for getting involved in the development of this project. 42 | 43 | ## Issues 44 | 45 | This project uses [Jira](https://issues.apache.org/jira/browse/NIFI) for tracking bugs and features. 46 | 47 | ## Licensing 48 | 49 | This project is released under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). 50 | -------------------------------------------------------------------------------- /check-licenses.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | APACHE_RAT_VERSION="0.16.1" 5 | APACHE_RAT_JAR="apache-rat-$APACHE_RAT_VERSION.jar" 6 | APACHE_RAT_JAR_URL="https://repo1.maven.org/maven2/org/apache/rat/apache-rat/$APACHE_RAT_VERSION/$APACHE_RAT_JAR" 7 | CACHE_DIRECTORY=".cache" 8 | APACHE_RAT_JAR_PATH="$CACHE_DIRECTORY/$APACHE_RAT_JAR" 9 | APACHE_RAT_EXCLUDE_FILE=".ratignore" 10 | 11 | # Set Java command 12 | if [ -n "${JAVA_HOME-}" ]; then 13 | JAVACMD="$JAVA_HOME/bin/java" 14 | if [ !
-x "$JAVACMD" ]; then 15 | die "Java command [$JAVACMD}] not found" 16 | fi 17 | elif command -v java > /dev/null; then 18 | JAVACMD=$(command -v java) 19 | else 20 | die "Environment variable [JAVA_HOME] and command [java] not found" 21 | fi 22 | 23 | # Set curl command 24 | if command -v curl > /dev/null; then 25 | CURLCMD=$(command -v curl) 26 | else 27 | die "Command [curl] not found" 28 | fi 29 | 30 | # Download Apache Rat JAR 31 | if [ ! -d $CACHE_DIRECTORY ]; then 32 | mkdir $CACHE_DIRECTORY 33 | fi 34 | if [ ! -f $APACHE_RAT_JAR_PATH ]; then 35 | echo "Downloading Apache Rat from [$APACHE_RAT_JAR_URL]" 36 | CURL_RESULTS=$(exec $CURLCMD -f --silent --show-error -o "$APACHE_RAT_JAR_PATH" "$APACHE_RAT_JAR_URL") 37 | if [ $? -ne 0 ]; then 38 | echo "Failed to download Apache Rat from [$APACHE_RAT_JAR_URL]" 39 | exit $? 40 | fi 41 | fi 42 | 43 | # Run Apache Rat 44 | REPORT_RESULTS=$(exec $JAVACMD -jar $APACHE_RAT_JAR_PATH --scan-hidden-directories --exclude-file $APACHE_RAT_EXCLUDE_FILE --dir . 2>&1) 45 | if [ $? -ne 0 ]; then 46 | echo "$REPORT_RESULTS" 47 | exit $? 48 | fi 49 | 50 | UNKNOWN_LICENSES_FOUND=$(echo "$REPORT_RESULTS" | grep --count "??") 51 | echo "Unknown Licenses Found: $UNKNOWN_LICENSES_FOUND" 52 | 53 | if [ $UNKNOWN_LICENSES_FOUND -eq 0 ]; then 54 | RESULT_CODE=0 55 | else 56 | RESULT_CODE=1 57 | echo "$REPORT_RESULTS" 58 | fi 59 | 60 | exit $RESULT_CODE 61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | [build-system] 4 | requires = ["hatchling"] 5 | build-backend = "hatchling.build" 6 | 7 | [project] 8 | name = "nifi-python-extensions" 9 | dynamic = ["version"] 10 | description = "Apache NiFi Processors implemented in Python" 11 | requires-python = ">=3.11" 12 | keywords = ["apache", "nifi", "extensions", "processors"] 13 | readme = "README.md" 14 | authors = [ 15 | { name = "Apache NiFi Developers", email = "dev@nifi.apache.org" }, 16 | ] 17 | maintainers = [ 18 | { name = "Apache NiFi Developers", email = "dev@nifi.apache.org" }, 19 | ] 20 | classifiers = [ 21 | "Development Status :: 5 - Production/Stable", 22 | "License :: OSI Approved :: Apache Software License", 23 | "Intended Audience :: Developers", 24 | "Programming Language :: Python", 25 | "Programming Language :: Python :: 3.11", 26 | "Programming Language :: Python :: 3.12", 27 | "Framework :: Hatch", 28 | ] 29 | 30 | [project.urls] 31 | Homepage = "https://nifi.apache.org" 32 | Issues = "https://issues.apache.org/jira/projects/NIFI/issues" 33 | Source = "https://github.com/apache/nifi-python-extensions" 34 | 35 | [tool.hatch.version] 36 | path = "src/__about__.py" 37 | 38 | [[tool.hatch.envs.all.matrix]] 39 | python = ["3.11", "3.12"] 40 | 41 | [tool.hatch.build.targets.wheel] 42 | packages = ["src/extensions"] 43 | 44 | [tool.hatch.build.targets.sdist] 45 | exclude = [ 46 | ".asf.yaml", 47 | ".github", 48 | ".ratignore", 49 | "check-licenses.sh", 50 | ] 51 | 52 | [tool.ruff] 53 | preview = true 54 | lint.pep8-naming.extend-ignore-names = [ 55 | "flowFile", 56 | "getPropertyDescriptors", 57 | "onScheduled", 58 | ] 59 | lint.flake8-self.extend-ignore-names = [ 60 | "_standard_validators" 61 | ] 62 | lint.extend-select = [ 63 | "CPY001" 64 | ] 65 | lint.ignore = [ 66 | "G004", # Allow f-string for logging 67 | "N999", # Allow Processor module names that do not follow pep8-naming 68 | "PERF401", # Allow manual list 
comprehension 69 | "RUF012", # Allow mutable class attributes without typing.ClassVar 70 | "S105", # Avoid checking for hardcoded-password-string values 71 | ] 72 | 73 | [tool.ruff.lint.flake8-copyright] 74 | notice-rgx = "# SPDX-License-Identifier: Apache-2.0\n" 75 | -------------------------------------------------------------------------------- /src/__about__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | __version__ = "2.0.0.dev0" 4 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | -------------------------------------------------------------------------------- /src/extensions/chunking/ChunkDocument.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | from langchain.text_splitter import Language 6 | from nifiapi.documentation import ProcessorConfiguration, multi_processor_use_case, use_case 7 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 8 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 9 | 10 | SPLIT_BY_CHARACTER = "Split by Character" 11 | SPLIT_CODE = "Split Code" 12 | RECURSIVELY_SPLIT_BY_CHARACTER = "Recursively Split by Character" 13 | 14 | TEXT_KEY = "text" 15 | METADATA_KEY = "metadata" 16 | 17 | 18 | @use_case( 19 | description="Create chunks of text from a single larger chunk.", 20 | notes="The input for this use case is expected to be a FlowFile whose content is a JSON Lines document, with each line having a 'text' and a 'metadata' element.", 21 | keywords=["embedding", "vector", "text", "rag", "retrieval augmented generation"], 22 | configuration=""" 23 | Set "Input Format" to "Plain Text" 24 | Set "Element Strategy" to "Single Document" 25 | """, 26 | ) 27 | @multi_processor_use_case( 28 | description=""" 29 | Chunk Plaintext data in order to prepare it for storage in a vector store. The output is in "json-lines" format, 30 | containing the chunked data as text, as well as metadata pertaining to the chunk.""", 31 | notes="The input for this use case is expected to be a FlowFile whose content is a plaintext document.", 32 | keywords=["embedding", "vector", "text", "rag", "retrieval augmented generation"], 33 | configurations=[ 34 | ProcessorConfiguration( 35 | processor_type="ParseDocument", 36 | configuration=""" 37 | Set "Input Format" to "Plain Text" 38 | Set "Element Strategy" to "Single Document" 39 | 40 | Connect the 'success' Relationship to ChunkDocument. 41 | """, 42 | ), 43 | ProcessorConfiguration( 44 | processor_type="ChunkDocument", 45 | configuration=""" 46 | Set the following properties: 47 | "Chunking Strategy" = "Recursively Split by Character" 48 | "Separator" = "\\n\\n,\\n, ," 49 | "Separator Format" = "Plain Text" 50 | "Chunk Size" = "4000" 51 | "Chunk Overlap" = "200" 52 | "Keep Separator" = "false" 53 | 54 | Connect the 'success' Relationship to the appropriate destination to store data in the desired vector store. 55 | """, 56 | ), 57 | ], 58 | ) 59 | @multi_processor_use_case( 60 | description=""" 61 | Parse and chunk the textual contents of a PDF document in order to prepare it for storage in a vector store. 
The output is in "json-lines" format, 62 | containing the chunked data as text, as well as metadata pertaining to the chunk.""", 63 | notes="The input for this use case is expected to be a FlowFile whose content is a PDF document.", 64 | keywords=["pdf", "embedding", "vector", "text", "rag", "retrieval augmented generation"], 65 | configurations=[ 66 | ProcessorConfiguration( 67 | processor_type="ParseDocument", 68 | configuration=""" 69 | Set "Input Format" to "PDF" 70 | Set "Element Strategy" to "Single Document" 71 | Set "Include Extracted Metadata" to "false" 72 | 73 | Connect the 'success' Relationship to ChunkDocument. 74 | """, 75 | ), 76 | ProcessorConfiguration( 77 | processor_type="ChunkDocument", 78 | configuration=""" 79 | Set the following properties: 80 | "Chunking Strategy" = "Recursively Split by Character" 81 | "Separator" = "\\n\\n,\\n, ," 82 | "Separator Format" = "Plain Text" 83 | "Chunk Size" = "4000" 84 | "Chunk Overlap" = "200" 85 | "Keep Separator" = "false" 86 | 87 | Connect the 'success' Relationship to the appropriate destination to store data in the desired vector store. 88 | """, 89 | ), 90 | ], 91 | ) 92 | class ChunkDocument(FlowFileTransform): 93 | class Java: 94 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 95 | 96 | class ProcessorDetails: 97 | version = "2.0.0.dev0" 98 | description = """Chunks incoming documents that are formatted as JSON Lines into chunks that are appropriately sized for creating Text Embeddings. 99 | The input is expected to be in "json-lines" format, with each line having a 'text' and a 'metadata' element. 100 | Each line will then be split into one or more lines in the output.""" 101 | tags = [ 102 | "text", 103 | "split", 104 | "chunk", 105 | "langchain", 106 | "embeddings", 107 | "vector", 108 | "machine learning", 109 | "ML", 110 | "artificial intelligence", 111 | "ai", 112 | "document", 113 | ] 114 | dependencies = ["langchain"] 115 | 116 | CHUNK_STRATEGY = PropertyDescriptor( 117 | name="Chunking Strategy", 118 | description="Specifies which splitter should be used to split the text", 119 | allowable_values=[RECURSIVELY_SPLIT_BY_CHARACTER, SPLIT_BY_CHARACTER, SPLIT_CODE], 120 | required=True, 121 | default_value=RECURSIVELY_SPLIT_BY_CHARACTER, 122 | ) 123 | SEPARATOR = PropertyDescriptor( 124 | name="Separator", 125 | description="""Specifies the character sequence to use for splitting apart the text. If using a Chunking Strategy of Recursively Split by Character, 126 | it is a comma-separated list of character sequences. 
Meta-characters \\n, \\r and \\t are automatically un-escaped.""", 127 | required=True, 128 | default_value="\\n\\n,\\n, ,", 129 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 130 | dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)], 131 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 132 | ) 133 | SEPARATOR_FORMAT = PropertyDescriptor( 134 | name="Separator Format", 135 | description="Specifies how to interpret the value of the property", 136 | required=True, 137 | default_value="Plain Text", 138 | allowable_values=["Plain Text", "Regular Expression"], 139 | dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)], 140 | ) 141 | CHUNK_SIZE = PropertyDescriptor( 142 | name="Chunk Size", 143 | description="The maximum size of a chunk that should be returned", 144 | required=True, 145 | default_value="4000", 146 | validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR], 147 | ) 148 | CHUNK_OVERLAP = PropertyDescriptor( 149 | name="Chunk Overlap", 150 | description="The number of characters that should be overlapped between each chunk of text", 151 | required=True, 152 | default_value="200", 153 | validators=[StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR], 154 | ) 155 | KEEP_SEPARATOR = PropertyDescriptor( 156 | name="Keep Separator", 157 | description="Whether or not to keep the text separator in each chunk of data", 158 | required=True, 159 | default_value="false", 160 | allowable_values=["true", "false"], 161 | dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)], 162 | ) 163 | STRIP_WHITESPACE = PropertyDescriptor( 164 | name="Strip Whitespace", 165 | description="Whether or not to strip the whitespace at the beginning and end of each chunk", 166 | required=True, 167 | default_value="true", 168 | allowable_values=["true", "false"], 169 | dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)], 170 | ) 171 | LANGUAGE = PropertyDescriptor( 172 | name="Language", 173 | description="The language to use for the Code's syntax", 174 | required=True, 175 | default_value="python", 176 | allowable_values=[e.value for e in Language], 177 | dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_CODE)], 178 | ) 179 | 180 | property_descriptors = [ 181 | CHUNK_STRATEGY, 182 | SEPARATOR, 183 | SEPARATOR_FORMAT, 184 | CHUNK_SIZE, 185 | CHUNK_OVERLAP, 186 | KEEP_SEPARATOR, 187 | STRIP_WHITESPACE, 188 | LANGUAGE, 189 | ] 190 | 191 | def __init__(self, **kwargs): 192 | pass 193 | 194 | def getPropertyDescriptors(self): 195 | return self.property_descriptors 196 | 197 | def split_docs(self, context, flowfile, documents): 198 | from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter 199 | 200 | strategy = context.getProperty(self.CHUNK_STRATEGY).getValue() 201 | if strategy == SPLIT_BY_CHARACTER: 202 | text_splitter = CharacterTextSplitter( 203 | separator=context.getProperty(self.SEPARATOR).evaluateAttributeExpressions(flowfile).getValue(), 204 | keep_separator=context.getProperty(self.KEEP_SEPARATOR).asBoolean(), 205 | is_separator_regex=context.getProperty(self.SEPARATOR_FORMAT).getValue() == "Regular Expression", 206 | chunk_size=context.getProperty(self.CHUNK_SIZE).asInteger(), 207 | chunk_overlap=context.getProperty(self.CHUNK_OVERLAP).asInteger(), 208 | length_function=len, 209 | 
strip_whitespace=context.getProperty(self.STRIP_WHITESPACE).asBoolean(), 210 | ) 211 | elif strategy == SPLIT_CODE: 212 | text_splitter = RecursiveCharacterTextSplitter.from_language( 213 | language=context.getProperty(self.LANGUAGE).getValue(), 214 | chunk_size=context.getProperty(self.CHUNK_SIZE).asInteger(), 215 | chunk_overlap=context.getProperty(self.CHUNK_OVERLAP).asInteger(), 216 | ) 217 | else: 218 | separator_text = context.getProperty(self.SEPARATOR).evaluateAttributeExpressions(flowfile).getValue() 219 | splits = separator_text.split(",") 220 | unescaped = [] 221 | for split in splits: 222 | unescaped.append(split.replace("\\n", "\n").replace("\\r", "\r").replace("\\t", "\t")) 223 | text_splitter = RecursiveCharacterTextSplitter( 224 | separators=unescaped, 225 | keep_separator=context.getProperty(self.KEEP_SEPARATOR).asBoolean(), 226 | is_separator_regex=context.getProperty(self.SEPARATOR_FORMAT).getValue() == "Regular Expression", 227 | chunk_size=context.getProperty(self.CHUNK_SIZE).asInteger(), 228 | chunk_overlap=context.getProperty(self.CHUNK_OVERLAP).asInteger(), 229 | length_function=len, 230 | strip_whitespace=context.getProperty(self.STRIP_WHITESPACE).asBoolean(), 231 | ) 232 | 233 | return text_splitter.split_documents(documents) 234 | 235 | def to_json(self, docs) -> str: 236 | json_docs = [] 237 | 238 | for i, doc in enumerate(docs): 239 | doc.metadata["chunk_index"] = i 240 | doc.metadata["chunk_count"] = len(docs) 241 | 242 | json_doc = json.dumps({TEXT_KEY: doc.page_content, METADATA_KEY: doc.metadata}) 243 | json_docs.append(json_doc) 244 | 245 | return "\n".join(json_docs) 246 | 247 | def load_docs(self, flowfile): 248 | from langchain.schema import Document 249 | 250 | flowfile_contents = flowfile.getContentsAsBytes().decode() 251 | docs = [] 252 | for line in flowfile_contents.split("\n"): 253 | stripped = line.strip() 254 | if stripped == "": 255 | continue 256 | 257 | json_element = json.loads(stripped) 258 | page_content = json_element.get(TEXT_KEY) 259 | if page_content is None: 260 | continue 261 | 262 | metadata = json_element.get(METADATA_KEY) 263 | if metadata is None: 264 | metadata = {} 265 | 266 | doc = Document(page_content=page_content, metadata=metadata) 267 | docs.append(doc) 268 | 269 | return docs 270 | 271 | def transform(self, context, flowfile): 272 | documents = self.load_docs(flowfile) 273 | split_docs = self.split_docs(context, flowfile, documents) 274 | 275 | output_json = self.to_json(split_docs) 276 | attributes = {"document.count": str(len(split_docs))} 277 | return FlowFileTransformResult("success", contents=output_json, attributes=attributes) 278 | -------------------------------------------------------------------------------- /src/extensions/chunking/ParseDocument.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import io 4 | import json 5 | 6 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 7 | from nifiapi.properties import PropertyDependency, PropertyDescriptor, StandardValidators 8 | 9 | PLAIN_TEXT = "Plain Text" 10 | HTML = "HTML" 11 | MARKDOWN = "Markdown" 12 | PDF = "PDF" 13 | EXCEL = "Microsoft Excel" 14 | POWERPOINT = "Microsoft PowerPoint" 15 | WORD = "Microsoft Word" 16 | 17 | PARSING_STRATEGY_AUTO = "Automatic" 18 | PARSING_STRATEGY_HIGH_RES = "High Resolution" 19 | PARSING_STRATEGY_OCR_ONLY = "OCR Only" 20 | PARSING_STRATEGY_FAST = "Fast" 21 | 22 | SINGLE_DOCUMENT = "Single Document" 23 | 
DOCUMENT_PER_ELEMENT = "Document Per Element" 24 | 25 | TEXT_KEY = "text" 26 | METADATA_KEY = "metadata" 27 | 28 | 29 | class ParseDocument(FlowFileTransform): 30 | class Java: 31 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 32 | 33 | class ProcessorDetails: 34 | version = "2.0.0.dev0" 35 | description = """Parses incoming unstructured text documents and performs optical character recognition (OCR) in order to extract text from PDF and image files. 36 | The output is formatted as "json-lines" with two keys: 'text' and 'metadata'. 37 | Note that use of this Processor may require significant storage space and RAM utilization due to third-party dependencies necessary for processing PDF and image files. 38 | Also note that in order to process PDF or Images, Tesseract and Poppler must be installed on the system.""" 39 | tags = [ 40 | "text", 41 | "embeddings", 42 | "vector", 43 | "machine learning", 44 | "ML", 45 | "artificial intelligence", 46 | "ai", 47 | "document", 48 | "langchain", 49 | "pdf", 50 | "html", 51 | "markdown", 52 | "word", 53 | "excel", 54 | "powerpoint", 55 | ] 56 | dependencies = [ 57 | "pikepdf==8.12.0", 58 | "pypdf==4.0.1", 59 | "langchain==0.1.7", 60 | "unstructured==0.14.8", 61 | "unstructured-inference==0.7.36", 62 | "unstructured_pytesseract==0.3.12", 63 | "pillow-heif==0.15.0", 64 | "numpy==1.26.4", 65 | "opencv-python==4.9.0.80", 66 | "pdf2image==1.17.0", 67 | "pdfminer.six==20221105", 68 | "python-docx==1.1.0", 69 | "openpyxl==3.1.2", 70 | "python-pptx==0.6.23", 71 | ] 72 | 73 | INPUT_FORMAT = PropertyDescriptor( 74 | name="Input Format", 75 | description="""The format of the input FlowFile. This dictates which TextLoader will be used to parse the input. 76 | Note that in order to process images or extract tables from PDF files,you must have both 'poppler' and 'tesseract' installed on your system.""", 77 | allowable_values=[PLAIN_TEXT, HTML, MARKDOWN, PDF, WORD, EXCEL, POWERPOINT], 78 | required=True, 79 | default_value=PLAIN_TEXT, 80 | ) 81 | PDF_PARSING_STRATEGY = PropertyDescriptor( 82 | name="PDF Parsing Strategy", 83 | display_name="Parsing Strategy", 84 | description="Specifies the strategy to use when parsing a PDF", 85 | allowable_values=[ 86 | PARSING_STRATEGY_AUTO, 87 | PARSING_STRATEGY_HIGH_RES, 88 | PARSING_STRATEGY_OCR_ONLY, 89 | PARSING_STRATEGY_FAST, 90 | ], 91 | required=True, 92 | default_value=PARSING_STRATEGY_AUTO, 93 | dependencies=[PropertyDependency(INPUT_FORMAT, PDF)], 94 | ) 95 | PDF_MODEL_NAME = PropertyDescriptor( 96 | name="PDF Parsing Model", 97 | description="The model to use for parsing. 
Different models will have their own strengths and weaknesses.", 98 | allowable_values=["yolox", "detectron2_onnx", "chipper"], 99 | required=True, 100 | default_value="yolox", 101 | dependencies=[PropertyDependency(INPUT_FORMAT, PDF)], 102 | ) 103 | ELEMENT_STRATEGY = PropertyDescriptor( 104 | name="Element Strategy", 105 | description="Specifies whether the input should be loaded as a single Document, or if each element in the input should be separated out into its own Document", 106 | allowable_values=[SINGLE_DOCUMENT, DOCUMENT_PER_ELEMENT], 107 | required=True, 108 | default_value=DOCUMENT_PER_ELEMENT, 109 | dependencies=[PropertyDependency(INPUT_FORMAT, HTML, MARKDOWN)], 110 | ) 111 | INCLUDE_PAGE_BREAKS = PropertyDescriptor( 112 | name="Include Page Breaks", 113 | description="Specifies whether or not page breaks should be considered when creating Documents from the input", 114 | allowable_values=["true", "false"], 115 | required=True, 116 | default_value="false", 117 | dependencies=[ 118 | PropertyDependency(INPUT_FORMAT, HTML, MARKDOWN), 119 | PropertyDependency(ELEMENT_STRATEGY, DOCUMENT_PER_ELEMENT), 120 | ], 121 | ) 122 | PDF_INFER_TABLE_STRUCTURE = PropertyDescriptor( 123 | name="Infer Table Structure", 124 | description="If true, any table that is identified in the PDF will be parsed and translated into an HTML structure. The HTML of that table will then be added to the \ 125 | Document's metadata in a key named 'text_as_html'. Regardless of the value of this property, the textual contents of the table will be written to the contents \ 126 | without the structure.", 127 | allowable_values=["true", "false"], 128 | default_value="false", 129 | required=True, 130 | dependencies=[PropertyDependency(PDF_PARSING_STRATEGY, PARSING_STRATEGY_HIGH_RES)], 131 | ) 132 | LANGUAGES = PropertyDescriptor( 133 | name="Languages", 134 | description="A comma-separated list of language codes that should be used when using OCR to determine the text.", 135 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 136 | default_value="Eng", 137 | required=True, 138 | dependencies=[PropertyDependency(INPUT_FORMAT, PDF)], 139 | ) 140 | METADATA_FIELDS = PropertyDescriptor( 141 | name="Metadata Fields", 142 | description="A comma-separated list of FlowFile attributes that will be added to the Documents' Metadata", 143 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 144 | default_value="filename, uuid", 145 | required=True, 146 | ) 147 | EXTRACT_METADATA = PropertyDescriptor( 148 | name="Include Extracted Metadata", 149 | description="Whether or not to include the metadata that is extracted from the input in each of the Documents", 150 | allowable_values=["true", "false"], 151 | default_value="true", 152 | required=True, 153 | ) 154 | 155 | property_descriptors = [ 156 | INPUT_FORMAT, 157 | PDF_PARSING_STRATEGY, 158 | PDF_MODEL_NAME, 159 | ELEMENT_STRATEGY, 160 | INCLUDE_PAGE_BREAKS, 161 | PDF_INFER_TABLE_STRUCTURE, 162 | LANGUAGES, 163 | METADATA_FIELDS, 164 | EXTRACT_METADATA, 165 | ] 166 | 167 | def __init__(self, **kwargs): 168 | pass 169 | 170 | def getPropertyDescriptors(self): 171 | return self.property_descriptors 172 | 173 | def get_parsing_strategy(self, nifi_value: str, default_value: str) -> str: 174 | if nifi_value == PARSING_STRATEGY_OCR_ONLY: 175 | return "ocr_only" 176 | if nifi_value == PARSING_STRATEGY_HIGH_RES: 177 | return "hi_res" 178 | if nifi_value == PARSING_STRATEGY_FAST: 179 | return "fast" 180 | if nifi_value == PARSING_STRATEGY_AUTO: 181 | return "auto" 182 | return 
default_value 183 | 184 | def get_languages(self, nifi_value: str) -> list[str]: 185 | return [lang.strip() for lang in nifi_value.split(",")] 186 | 187 | def create_docs(self, context, flowFile): 188 | from langchain.schema import Document 189 | 190 | metadata = {} 191 | 192 | for attribute_name in context.getProperty(self.METADATA_FIELDS).getValue().split(","): 193 | trimmed = attribute_name.strip() 194 | value = flowFile.getAttribute(trimmed) 195 | metadata[trimmed] = value 196 | 197 | input_format = context.getProperty(self.INPUT_FORMAT).evaluateAttributeExpressions(flowFile).getValue() 198 | if input_format == PLAIN_TEXT: 199 | return [Document(page_content=flowFile.getContentsAsBytes().decode("utf-8"), metadata=metadata)] 200 | 201 | element_strategy = context.getProperty(self.ELEMENT_STRATEGY).getValue() 202 | mode = "single" if element_strategy == SINGLE_DOCUMENT else "elements" 203 | 204 | include_page_breaks = context.getProperty(self.INCLUDE_PAGE_BREAKS).asBoolean() 205 | include_metadata = context.getProperty(self.EXTRACT_METADATA).asBoolean() 206 | 207 | if input_format == HTML: 208 | from langchain.document_loaders import UnstructuredHTMLLoader 209 | 210 | loader = UnstructuredHTMLLoader( 211 | None, 212 | file=io.BytesIO(flowFile.getContentsAsBytes()), 213 | mode=mode, 214 | include_page_breaks=include_page_breaks, 215 | include_metadata=include_metadata, 216 | ) 217 | 218 | elif input_format == PDF: 219 | from langchain.document_loaders import UnstructuredPDFLoader 220 | 221 | infer_table_structure = context.getProperty(self.PDF_INFER_TABLE_STRUCTURE).asBoolean() 222 | strategy = self.get_parsing_strategy( 223 | context.getProperty(self.PDF_PARSING_STRATEGY).getValue(), PARSING_STRATEGY_AUTO 224 | ) 225 | languages = self.get_languages(context.getProperty(self.LANGUAGES).getValue()) 226 | model_name = context.getProperty(self.PDF_MODEL_NAME).getValue() 227 | 228 | loader = UnstructuredPDFLoader( 229 | None, 230 | file=io.BytesIO(flowFile.getContentsAsBytes()), 231 | mode=mode, 232 | infer_table_structure=infer_table_structure, 233 | include_page_breaks=include_page_breaks, 234 | languages=languages, 235 | strategy=strategy, 236 | include_metadata=include_metadata, 237 | model_name=model_name, 238 | ) 239 | 240 | elif input_format == MARKDOWN: 241 | from langchain.document_loaders import UnstructuredMarkdownLoader 242 | 243 | loader = UnstructuredMarkdownLoader( 244 | None, 245 | file=io.BytesIO(flowFile.getContentsAsBytes()), 246 | mode=mode, 247 | include_page_breaks=include_page_breaks, 248 | include_metadata=include_metadata, 249 | ) 250 | 251 | elif input_format == WORD: 252 | from langchain.document_loaders import UnstructuredWordDocumentLoader 253 | 254 | loader = UnstructuredWordDocumentLoader( 255 | None, 256 | file=io.BytesIO(flowFile.getContentsAsBytes()), 257 | mode=mode, 258 | include_page_breaks=include_page_breaks, 259 | include_metadata=include_metadata, 260 | ) 261 | 262 | elif input_format == EXCEL: 263 | from langchain.document_loaders import UnstructuredExcelLoader 264 | 265 | loader = UnstructuredExcelLoader( 266 | None, 267 | file=io.BytesIO(flowFile.getContentsAsBytes()), 268 | mode=mode, 269 | include_page_breaks=include_page_breaks, 270 | include_metadata=include_metadata, 271 | ) 272 | 273 | elif input_format == POWERPOINT: 274 | from langchain.document_loaders import UnstructuredPowerPointLoader 275 | 276 | loader = UnstructuredPowerPointLoader( 277 | None, 278 | file=io.BytesIO(flowFile.getContentsAsBytes()), 279 | mode=mode, 280 | 
include_page_breaks=include_page_breaks, 281 | include_metadata=include_metadata, 282 | ) 283 | 284 | else: 285 | raise ValueError("Configured Input Format is invalid: " + input_format) 286 | 287 | documents = loader.load() 288 | 289 | if len(metadata) > 0: 290 | for doc in documents: 291 | if doc.metadata is None: 292 | doc.metadata = metadata 293 | else: 294 | doc.metadata.update(metadata) 295 | 296 | return documents 297 | 298 | def to_json(self, docs) -> str: 299 | json_docs = [] 300 | 301 | for i, doc in enumerate(docs): 302 | doc.metadata["chunk_index"] = i 303 | doc.metadata["chunk_count"] = len(docs) 304 | 305 | json_doc = json.dumps({"text": doc.page_content, "metadata": doc.metadata}) 306 | json_docs.append(json_doc) 307 | 308 | return "\n".join(json_docs) 309 | 310 | def transform(self, context, flowFile): 311 | documents = self.create_docs(context, flowFile) 312 | output_json = self.to_json(documents) 313 | 314 | return FlowFileTransformResult("success", contents=output_json, attributes={"mime.type": "application/json"}) 315 | -------------------------------------------------------------------------------- /src/extensions/chunking/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | -------------------------------------------------------------------------------- /src/extensions/openai/PromptChatGPT.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | import re 5 | 6 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 7 | from nifiapi.properties import ExpressionLanguageScope, PropertyDescriptor, StandardValidators, TimeUnit 8 | 9 | FLOWFILE_CONTENT = "flowfile_content" 10 | FLOWFILE_CONTENT_REFERENCE = "{" + FLOWFILE_CONTENT + "}" 11 | # Regex to match { followed by any number of characters other than { or }, followed by }. But do not match if it starts with {{ 12 | VAR_NAME_REGEX = r"(? and we will keep a mapping from that name to 130 | # the substituted variable name so that we can later determine what the JSONPath expression was. 131 | variable_references = list(set(re.findall(VAR_NAME_REGEX, prompt))) 132 | 133 | input_variables = [] 134 | jsonpath_to_var_mapping = {} 135 | index = 0 136 | for ref in variable_references: 137 | if ref.startswith("$"): 138 | var_name = "jsonpath_var_" + str(index) 139 | index += 1 140 | input_variables.append(var_name) 141 | jsonpath_to_var_mapping[ref] = var_name 142 | prompt = prompt.replace("{" + ref + "}", "{" + var_name + "}") 143 | elif ref == FLOWFILE_CONTENT: 144 | input_variables.append(ref) 145 | else: 146 | raise ValueError( 147 | "Prompt contained an invalid variable reference: {" 148 | + ref 149 | + "}. Valid references are flowfile_content or any JSONPath expression." 
150 | ) 151 | 152 | temperature = context.getProperty(self.TEMPERATURE).evaluateAttributeExpressions(flowFile).asFloat() 153 | model_name = context.getProperty(self.MODEL).evaluateAttributeExpressions(flowFile).getValue() 154 | api_key = context.getProperty(self.API_KEY).getValue() 155 | timeout = context.getProperty(self.TIMEOUT).asTimePeriod(TimeUnit.SECONDS) 156 | max_tokens = context.getProperty(self.MAX_TOKENS).asInteger() 157 | organization = context.getProperty(self.ORGANIZATION).getValue() 158 | api_base = context.getProperty(self.API_BASE).getValue() 159 | 160 | # Build out our LLMChain 161 | llm = ChatOpenAI( 162 | model_name=model_name, 163 | temperature=temperature, 164 | openai_api_key=api_key, 165 | request_timeout=timeout, 166 | max_retries=0, 167 | max_tokens=max_tokens, 168 | openai_organization=organization, 169 | openai_api_base=api_base, 170 | ) 171 | 172 | prompt_template = PromptTemplate(template=prompt, input_variables=input_variables) 173 | 174 | llm_chain = LLMChain(llm=llm, prompt=prompt_template) 175 | 176 | # Substitute in any JSON Path Expressions or references to {flowfile_content}. 177 | llm_args = {} 178 | json_content = None 179 | for var_name in variable_references: 180 | # If variable references {flowfile_content} substitute the content 181 | if var_name == FLOWFILE_CONTENT: 182 | llm_args[FLOWFILE_CONTENT] = flowFile.getContentsAsBytes().decode() 183 | if var_name.startswith("$"): 184 | # Load the FlowFile's contents into the json_content variable only once 185 | if json_content is None: 186 | json_content = json.loads(flowFile.getContentsAsBytes().decode()) 187 | 188 | # Import jsonpath_ng so that we can evaluate JSONPath against the FlowFile content. 189 | from jsonpath_ng import parse 190 | 191 | try: 192 | jsonpath_expression = parse(var_name) 193 | matches = jsonpath_expression.find(json_content) 194 | variable_value = "\n".join([match.value for match in matches]) 195 | except: 196 | self.logger.exception(f"Invalid JSONPath reference in prompt: {var_name}") 197 | raise 198 | 199 | # Insert the resolved value into llm_args 200 | resolved_var_name = jsonpath_to_var_mapping.get(var_name) 201 | llm_args[resolved_var_name] = variable_value 202 | 203 | self.logger.debug(f"Evaluating prompt\nPrompt: {prompt}\nArgs: #{llm_args}") 204 | 205 | # Run the LLM Chain in order to prompt ChatGPT 206 | results = llm_chain(llm_args) 207 | 208 | # Create the output content or FLowFile attribute 209 | text = results["text"] 210 | attribute_name = context.getProperty(self.RESULT_ATTRIBUTE).getValue() 211 | if attribute_name is None: 212 | output_content = text 213 | output_attributes = None 214 | else: 215 | output_content = None 216 | output_attributes = {attribute_name: text} 217 | 218 | # Return the results 219 | return FlowFileTransformResult("success", contents=output_content, attributes=output_attributes) 220 | -------------------------------------------------------------------------------- /src/extensions/openai/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/ChromaUtils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 4 | 5 | # Connection Strategies 6 | LOCAL_DISK = 
"Local Disk" 7 | REMOTE_SERVER = "Remote Chroma Server" 8 | 9 | # Authentication Strategies 10 | TOKEN = "Token Authentication" 11 | BASIC_AUTH = "Basic Authentication" 12 | NONE = "None" 13 | 14 | # Transport Protocols 15 | HTTP = "http" 16 | HTTPS = "https" 17 | 18 | CONNECTION_STRATEGY = PropertyDescriptor( 19 | name="Connection Strategy", 20 | description="Specifies how to connect to the Chroma server", 21 | allowable_values=[LOCAL_DISK, REMOTE_SERVER], 22 | default_value=REMOTE_SERVER, 23 | required=True, 24 | ) 25 | DIRECTORY = PropertyDescriptor( 26 | name="Directory", 27 | description="The Directory that Chroma should use to persist data", 28 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 29 | required=True, 30 | default_value="./chroma", 31 | dependencies=[PropertyDependency(CONNECTION_STRATEGY, LOCAL_DISK)], 32 | ) 33 | HOSTNAME = PropertyDescriptor( 34 | name="Hostname", 35 | description="The hostname to connect to in order to communicate with Chroma", 36 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 37 | default_value="localhost", 38 | required=True, 39 | dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)], 40 | ) 41 | PORT = PropertyDescriptor( 42 | name="Port", 43 | description="The port that the Chroma server is listening on", 44 | validators=[StandardValidators.PORT_VALIDATOR], 45 | default_value="8000", 46 | required=True, 47 | dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)], 48 | ) 49 | TRANSPORT_PROTOCOL = PropertyDescriptor( 50 | name="Transport Protocol", 51 | description="Specifies whether connections should be made over http or https", 52 | allowable_values=[HTTP, HTTPS], 53 | default_value=HTTPS, 54 | required=True, 55 | dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)], 56 | ) 57 | AUTH_STRATEGY = PropertyDescriptor( 58 | name="Authentication Strategy", 59 | description="Specifies how to authenticate to Chroma server", 60 | allowable_values=[TOKEN, BASIC_AUTH, NONE], 61 | default_value=TOKEN, 62 | required=True, 63 | dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)], 64 | ) 65 | AUTH_TOKEN = PropertyDescriptor( 66 | name="Authentication Token", 67 | description="The token to use for authenticating to Chroma server", 68 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 69 | required=True, 70 | sensitive=True, 71 | dependencies=[PropertyDependency(AUTH_STRATEGY, TOKEN)], 72 | ) 73 | USERNAME = PropertyDescriptor( 74 | name="Username", 75 | description="The username to use for authenticating to Chroma server", 76 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 77 | required=True, 78 | dependencies=[PropertyDependency(AUTH_STRATEGY, BASIC_AUTH)], 79 | ) 80 | PASSWORD = PropertyDescriptor( 81 | name="Password", 82 | description="The password to use for authenticating to Chroma server", 83 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 84 | required=True, 85 | sensitive=True, 86 | dependencies=[PropertyDependency(AUTH_STRATEGY, BASIC_AUTH)], 87 | ) 88 | COLLECTION_NAME = PropertyDescriptor( 89 | name="Collection Name", 90 | description="The name of the Chroma Collection", 91 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 92 | required=True, 93 | default_value="nifi", 94 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 95 | ) 96 | 97 | PROPERTIES = [ 98 | CONNECTION_STRATEGY, 99 | DIRECTORY, 100 | HOSTNAME, 101 | PORT, 102 | TRANSPORT_PROTOCOL, 103 | AUTH_STRATEGY, 104 | AUTH_TOKEN, 105 | USERNAME, 106 | PASSWORD, 107 | 
COLLECTION_NAME, 108 | ] 109 | 110 | 111 | def create_client(context): 112 | import chromadb 113 | from chromadb import Settings 114 | 115 | connection_strategy = context.getProperty(CONNECTION_STRATEGY).getValue() 116 | if connection_strategy == LOCAL_DISK: 117 | directory = context.getProperty(DIRECTORY).getValue() 118 | return chromadb.PersistentClient(directory) 119 | hostname = context.getProperty(HOSTNAME).getValue() 120 | port = context.getProperty(PORT).asInteger() 121 | headers = {} 122 | ssl = context.getProperty(TRANSPORT_PROTOCOL).getValue() == HTTPS 123 | 124 | auth_strategy = context.getProperty(AUTH_STRATEGY).getValue() 125 | if auth_strategy == TOKEN: 126 | auth_provider = "chromadb.auth.token.TokenAuthClientProvider" 127 | credentials = context.getProperty(AUTH_TOKEN).getValue() 128 | elif auth_strategy == BASIC_AUTH: 129 | auth_provider = "chromadb.auth.basic.BasicAuthClientProvider" 130 | username = context.getProperty(USERNAME).getValue() 131 | password = context.getProperty(PASSWORD).getValue() 132 | credentials = username + ":" + password 133 | else: 134 | auth_provider = None 135 | credentials = None 136 | 137 | settings = Settings(chroma_client_auth_provider=auth_provider, chroma_client_auth_credentials=credentials) 138 | return chromadb.HttpClient(hostname, port, ssl, headers, settings) 139 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/EmbeddingUtils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from langchain.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings 4 | from langchain.embeddings.openai import OpenAIEmbeddings 5 | from nifiapi.properties import PropertyDependency, PropertyDescriptor, StandardValidators 6 | 7 | # Embedding Functions 8 | ONNX_ALL_MINI_LM_L6_V2 = "ONNX all-MiniLM-L6-v2 Model" 9 | HUGGING_FACE = "Hugging Face Model" 10 | OPENAI = "OpenAI Model" 11 | SENTENCE_TRANSFORMERS = "Sentence Transformers" 12 | 13 | 14 | EMBEDDING_FUNCTION = PropertyDescriptor( 15 | name="Embedding Function", 16 | description="Specifies which embedding function should be used in order to create embeddings from incoming Documents", 17 | allowable_values=[ONNX_ALL_MINI_LM_L6_V2, HUGGING_FACE, OPENAI, SENTENCE_TRANSFORMERS], 18 | default_value=ONNX_ALL_MINI_LM_L6_V2, 19 | required=True, 20 | ) 21 | HUGGING_FACE_MODEL_NAME = PropertyDescriptor( 22 | name="HuggingFace Model Name", 23 | description="The name of the HuggingFace model to use", 24 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 25 | default_value="sentence-transformers/all-MiniLM-L6-v2", 26 | required=True, 27 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, HUGGING_FACE)], 28 | ) 29 | HUGGING_FACE_API_KEY = PropertyDescriptor( 30 | name="HuggingFace API Key", 31 | description="The API Key for interacting with HuggingFace", 32 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 33 | required=True, 34 | sensitive=True, 35 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, HUGGING_FACE)], 36 | ) 37 | OPENAI_API_KEY = PropertyDescriptor( 38 | name="OpenAI API Key", 39 | description="The API Key for interacting with OpenAI", 40 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 41 | required=True, 42 | sensitive=True, 43 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 44 | ) 45 | OPENAI_MODEL_NAME = PropertyDescriptor( 46 | name="OpenAI Model Name", 47 | description="The name of the OpenAI model to use", 48 | 
validators=[StandardValidators.NON_EMPTY_VALIDATOR], 49 | default_value="text-embedding-ada-002", 50 | required=True, 51 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 52 | ) 53 | OPENAI_ORGANIZATION = PropertyDescriptor( 54 | name="OpenAI Organization ID", 55 | description="The OpenAI Organization ID", 56 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 57 | required=False, 58 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 59 | ) 60 | OPENAI_API_BASE = PropertyDescriptor( 61 | name="OpenAI API Base Path", 62 | description="The API Base to use for interacting with OpenAI. This is used for interacting with different deployments, such as an Azure deployment.", 63 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 64 | required=False, 65 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 66 | ) 67 | OPENAI_API_TYPE = PropertyDescriptor( 68 | name="OpenAI API Deployment Type", 69 | description="The type of the OpenAI API Deployment. This is used for interacting with different deployments, such as an Azure deployment.", 70 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 71 | required=False, 72 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 73 | ) 74 | OPENAI_API_VERSION = PropertyDescriptor( 75 | name="OpenAI API Version", 76 | description="The OpenAI API Version. This is used for interacting with different deployments, such as an Azure deployment.", 77 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 78 | required=False, 79 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 80 | ) 81 | SENTENCE_TRANSFORMER_MODEL_NAME = PropertyDescriptor( 82 | name="Sentence Transformer Model Name", 83 | description="The name of the Sentence Transformer model to use", 84 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 85 | default_value="all-MiniLM-L6-v2", 86 | required=True, 87 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, SENTENCE_TRANSFORMERS)], 88 | ) 89 | SENTENCE_TRANSFORMER_DEVICE = PropertyDescriptor( 90 | name="Sentence Transformer Device Type", 91 | description="""The type of device to use for performing the embeddings using the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc. 92 | If not specified, a GPU will be used if possible, otherwise a CPU.""", 93 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 94 | required=False, 95 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, SENTENCE_TRANSFORMERS)], 96 | ) 97 | EMBEDDING_MODEL = PropertyDescriptor( 98 | name="Embedding Model", 99 | description="Specifies which embedding model should be used in order to create embeddings from incoming Documents. 
Default model is OpenAI.", 100 | allowable_values=[HUGGING_FACE, OPENAI], 101 | default_value=OPENAI, 102 | required=True, 103 | ) 104 | OPENAI_MODEL = PropertyDescriptor( 105 | name="OpenAI Model", 106 | description="The name of the OpenAI model to use", 107 | default_value="text-embedding-ada-002", 108 | required=True, 109 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 110 | dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)], 111 | ) 112 | HUGGING_FACE_MODEL = PropertyDescriptor( 113 | name="HuggingFace Model", 114 | description="The name of the HuggingFace model to use", 115 | default_value="sentence-transformers/all-MiniLM-L6-v2", 116 | required=True, 117 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 118 | dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)], 119 | ) 120 | 121 | PROPERTIES = [ 122 | EMBEDDING_FUNCTION, 123 | HUGGING_FACE_MODEL_NAME, 124 | HUGGING_FACE_API_KEY, 125 | OPENAI_MODEL_NAME, 126 | OPENAI_API_KEY, 127 | OPENAI_ORGANIZATION, 128 | OPENAI_API_BASE, 129 | OPENAI_API_TYPE, 130 | OPENAI_API_VERSION, 131 | SENTENCE_TRANSFORMER_MODEL_NAME, 132 | SENTENCE_TRANSFORMER_DEVICE, 133 | EMBEDDING_MODEL, 134 | ] 135 | 136 | 137 | def create_embedding_function(context): 138 | from chromadb.utils.embedding_functions import ( 139 | HuggingFaceEmbeddingFunction, 140 | ONNXMiniLM_L6_V2, 141 | OpenAIEmbeddingFunction, 142 | SentenceTransformerEmbeddingFunction, 143 | ) 144 | 145 | function_name = context.getProperty(EMBEDDING_FUNCTION).getValue() 146 | if function_name == ONNX_ALL_MINI_LM_L6_V2: 147 | return ONNXMiniLM_L6_V2() 148 | 149 | if function_name == OPENAI: 150 | api_key = context.getProperty(OPENAI_API_KEY).getValue() 151 | model_name = context.getProperty(OPENAI_MODEL_NAME).getValue() 152 | organization_id = context.getProperty(OPENAI_ORGANIZATION).getValue() 153 | api_base = context.getProperty(OPENAI_API_BASE).getValue() 154 | api_type = context.getProperty(OPENAI_API_TYPE).getValue() 155 | api_version = context.getProperty(OPENAI_API_VERSION).getValue() 156 | return OpenAIEmbeddingFunction( 157 | api_key=api_key, 158 | model_name=model_name, 159 | organization_id=organization_id, 160 | api_base=api_base, 161 | api_type=api_type, 162 | api_version=api_version, 163 | ) 164 | 165 | if function_name == HUGGING_FACE: 166 | api_key = context.getProperty(HUGGING_FACE_API_KEY).getValue() 167 | model_name = context.getProperty(HUGGING_FACE_MODEL_NAME).getValue() 168 | return HuggingFaceEmbeddingFunction(api_key=api_key, model_name=model_name) 169 | 170 | model_name = context.getProperty(SENTENCE_TRANSFORMER_MODEL_NAME).getValue() 171 | device = context.getProperty(SENTENCE_TRANSFORMER_DEVICE).getValue() 172 | return SentenceTransformerEmbeddingFunction(model_name=model_name, device=device) 173 | 174 | 175 | def create_embedding_service(context): 176 | embedding_service = context.getProperty(EMBEDDING_MODEL).getValue() 177 | 178 | if embedding_service == OPENAI: 179 | openai_api_key = context.getProperty(OPENAI_API_KEY).getValue() 180 | openai_model = context.getProperty(OPENAI_MODEL).getValue() 181 | return OpenAIEmbeddings(openai_api_key=openai_api_key, model=openai_model) 182 | huggingface_api_key = context.getProperty(HUGGING_FACE_API_KEY).getValue() 183 | huggingface_model = context.getProperty(HUGGING_FACE_MODEL).getValue() 184 | return HuggingFaceInferenceAPIEmbeddings(api_key=huggingface_api_key, model_name=huggingface_model) 185 | -------------------------------------------------------------------------------- 
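Note on the two factory functions defined above: create_embedding_function returns a Chroma-style embedding function (a plain callable that maps a list of texts to vectors, as used by PutChroma and QueryChroma), while create_embedding_service returns a LangChain Embeddings object exposing embed_documents()/embed_query(), which the OpenSearch, Pinecone and Qdrant integrations expect. A minimal sketch of the difference follows; it assumes a configured NiFi property context is available, and build_embedders is an illustrative helper, not part of this repository.

import EmbeddingUtils

def build_embedders(context):
    """Sketch only: 'context' is the property context NiFi passes to onScheduled()."""
    # Chroma-style embedding function: a plain callable, list of texts in -> vectors out.
    embedding_function = EmbeddingUtils.create_embedding_function(context)
    document_vectors = embedding_function(["some document text"])

    # LangChain-style embedding service: exposes embed_documents()/embed_query(),
    # which OpenSearchVectorSearch, Pinecone and Qdrant vector stores consume.
    embedding_service = EmbeddingUtils.create_embedding_service(context)
    query_vector = embedding_service.embed_query("some query text")
    return document_vectors, query_vector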
/src/extensions/vectorstores/OpenSearchVectorUtils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from EmbeddingUtils import EMBEDDING_MODEL, HUGGING_FACE, OPENAI 4 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 5 | 6 | # Space types 7 | L2 = ("L2 (Euclidean distance)", "l2") 8 | L1 = ("L1 (Manhattan distance)", "l1") 9 | LINF = ("L-infinity (chessboard) distance", "linf") 10 | COSINESIMIL = ("Cosine similarity", "cosinesimil") 11 | 12 | HUGGING_FACE_API_KEY = PropertyDescriptor( 13 | name="HuggingFace API Key", 14 | description="The API Key for interacting with HuggingFace", 15 | required=True, 16 | sensitive=True, 17 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 18 | dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)], 19 | ) 20 | OPENAI_API_KEY = PropertyDescriptor( 21 | name="OpenAI API Key", 22 | description="The API Key for OpenAI in order to create embeddings", 23 | required=True, 24 | sensitive=True, 25 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 26 | dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)], 27 | ) 28 | HTTP_HOST = PropertyDescriptor( 29 | name="HTTP Host", 30 | description="URL where OpenSearch is hosted.", 31 | default_value="http://localhost:9200", 32 | required=True, 33 | validators=[StandardValidators.URL_VALIDATOR], 34 | ) 35 | USERNAME = PropertyDescriptor( 36 | name="Username", 37 | description="The username to use for authenticating to OpenSearch server", 38 | required=False, 39 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 40 | ) 41 | PASSWORD = PropertyDescriptor( 42 | name="Password", 43 | description="The password to use for authenticating to OpenSearch server", 44 | required=False, 45 | sensitive=True, 46 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 47 | ) 48 | CERTIFICATE_PATH = PropertyDescriptor( 49 | name="Certificate Path", 50 | description="The path to the CA certificate to be used.", 51 | required=False, 52 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 53 | ) 54 | INDEX_NAME = PropertyDescriptor( 55 | name="Index Name", 56 | description="The name of the OpenSearch index.", 57 | sensitive=False, 58 | required=True, 59 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 60 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 61 | ) 62 | VECTOR_FIELD = PropertyDescriptor( 63 | name="Vector Field Name", 64 | description="The name of field in the document where the embeddings are stored. 
This field need to be a 'knn_vector' typed field.", 65 | default_value="vector_field", 66 | required=True, 67 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 68 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 69 | ) 70 | TEXT_FIELD = PropertyDescriptor( 71 | name="Text Field Name", 72 | description="The name of field in the document where the text is stored.", 73 | default_value="text", 74 | required=True, 75 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 76 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 77 | ) 78 | 79 | 80 | def create_authentication_params(context): 81 | username = context.getProperty(USERNAME).getValue() 82 | password = context.getProperty(PASSWORD).getValue() 83 | certificate_path = context.getProperty(CERTIFICATE_PATH).getValue() 84 | 85 | params = {} 86 | 87 | if username is not None and password is not None: 88 | params["http_auth"] = (username, password) 89 | 90 | if certificate_path is not None: 91 | params["ca_certs"] = certificate_path 92 | 93 | return params 94 | 95 | 96 | def parse_documents(json_lines, id_field_name, file_name): 97 | import json 98 | 99 | texts = [] 100 | metadatas = [] 101 | ids = [] 102 | for i, line in enumerate(json_lines.split("\n"), start=1): 103 | try: 104 | doc = json.loads(line) 105 | except Exception as e: 106 | message = f"Could not parse line {i} as JSON" 107 | raise ValueError(message) from e 108 | 109 | text = doc.get("text") 110 | metadata = doc.get("metadata") 111 | texts.append(text) 112 | 113 | # Remove any null values, or it will cause the embedding to fail 114 | filtered_metadata = {key: value for key, value in metadata.items() if value is not None} 115 | metadatas.append(filtered_metadata) 116 | 117 | doc_id = None 118 | if id_field_name is not None: 119 | doc_id = metadata.get(id_field_name) 120 | if doc_id is None: 121 | doc_id = file_name + "-" + str(i) 122 | ids.append(doc_id) 123 | 124 | return {"texts": texts, "metadatas": metadatas, "ids": ids} 125 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/PutChroma.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import ChromaUtils 6 | import EmbeddingUtils 7 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 8 | from nifiapi.properties import ExpressionLanguageScope, PropertyDescriptor, StandardValidators 9 | 10 | 11 | class PutChroma(FlowFileTransform): 12 | class Java: 13 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 14 | 15 | class ProcessorDetails: 16 | version = "2.0.0.dev0" 17 | description = """Publishes JSON data to a Chroma VectorDB. The Incoming data must be in single JSON per Line format, each with two keys: 'text' and 'metadata'. 18 | The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored. If the collection name specified 19 | does not exist, the Processor will automatically create the collection.""" 20 | tags = [ 21 | "chroma", 22 | "vector", 23 | "vectordb", 24 | "embeddings", 25 | "ai", 26 | "artificial intelligence", 27 | "ml", 28 | "machine learning", 29 | "text", 30 | "LLM", 31 | ] 32 | 33 | STORE_TEXT = PropertyDescriptor( 34 | name="Store Document Text", 35 | description="""Specifies whether or not the text of the document should be stored in Chroma. 
If so, both the document's text and its embedding will be stored. If not, 36 | only the vector/embedding will be stored.""", 37 | allowable_values=["true", "false"], 38 | required=True, 39 | default_value="true", 40 | ) 41 | DISTANCE_METHOD = PropertyDescriptor( 42 | name="Distance Method", 43 | description="If the specified collection does not exist, it will be created using this Distance Method. If the collection exists, this property will be ignored.", 44 | allowable_values=["cosine", "l2", "ip"], 45 | default_value="cosine", 46 | required=True, 47 | ) 48 | DOC_ID_FIELD_NAME = PropertyDescriptor( 49 | name="Document ID Field Name", 50 | description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. 51 | If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.""", 52 | required=False, 53 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 54 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 55 | ) 56 | 57 | client = None 58 | embedding_function = None 59 | 60 | def __init__(self, **kwargs): # noqa: ARG002 61 | self.property_descriptors = list(ChromaUtils.PROPERTIES) + [ 62 | prop for prop in EmbeddingUtils.PROPERTIES if prop != EmbeddingUtils.EMBEDDING_MODEL 63 | ] 64 | self.property_descriptors.append(self.STORE_TEXT) 65 | self.property_descriptors.append(self.DISTANCE_METHOD) 66 | self.property_descriptors.append(self.DOC_ID_FIELD_NAME) 67 | 68 | def getPropertyDescriptors(self): 69 | return self.property_descriptors 70 | 71 | def onScheduled(self, context): 72 | self.client = ChromaUtils.create_client(context) 73 | self.embedding_function = EmbeddingUtils.create_embedding_function(context) 74 | 75 | def transform(self, context, flowfile): 76 | client = self.client 77 | embedding_function = self.embedding_function 78 | collection_name = ( 79 | context.getProperty(ChromaUtils.COLLECTION_NAME).evaluateAttributeExpressions(flowfile).getValue() 80 | ) 81 | distance_method = context.getProperty(self.DISTANCE_METHOD).getValue() 82 | id_field_name = context.getProperty(self.DOC_ID_FIELD_NAME).evaluateAttributeExpressions(flowfile).getValue() 83 | 84 | collection = client.get_or_create_collection( 85 | name=collection_name, embedding_function=embedding_function, metadata={"hnsw:space": distance_method} 86 | ) 87 | 88 | json_lines = flowfile.getContentsAsBytes().decode() 89 | i = 0 90 | texts = [] 91 | metadatas = [] 92 | ids = [] 93 | for line in json_lines.split("\n"): 94 | doc = json.loads(line) 95 | text = doc.get("text") 96 | metadata = doc.get("metadata") 97 | texts.append(text) 98 | 99 | # Remove any null values, or it will cause the embedding to fail 100 | filtered_metadata = {} 101 | for key, value in metadata.items(): 102 | if value is not None: 103 | if isinstance(value, list): 104 | for i, element in enumerate(value): 105 | element_count = i + 1 106 | indexed_key = f"{key}_{element_count}" 107 | filtered_metadata[indexed_key] = element 108 | else: 109 | filtered_metadata[key] = value 110 | 111 | metadatas.append(filtered_metadata) 112 | 113 | doc_id = None 114 | if id_field_name is not None: 115 | doc_id = metadata.get(id_field_name) 116 | if doc_id is None: 117 | doc_id = flowfile.getAttribute("filename") + "-" + str(i) 118 | ids.append(doc_id) 119 | 120 | i += 1 121 | 122 | embeddings = embedding_function(texts) 123 | if not context.getProperty(self.STORE_TEXT).asBoolean(): 124 | texts = None 125 | 126 | collection.upsert(ids, embeddings, metadatas, 
texts) 127 | 128 | return FlowFileTransformResult(relationship="success") 129 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/PutOpenSearchVector.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from EmbeddingUtils import EMBEDDING_MODEL, HUGGING_FACE_MODEL, OPENAI_MODEL, create_embedding_service 4 | from langchain.vectorstores import OpenSearchVectorSearch 5 | from nifiapi.documentation import use_case 6 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 7 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 8 | from OpenSearchVectorUtils import ( 9 | CERTIFICATE_PATH, 10 | COSINESIMIL, 11 | HTTP_HOST, 12 | HUGGING_FACE_API_KEY, 13 | INDEX_NAME, 14 | L1, 15 | L2, 16 | LINF, 17 | OPENAI_API_KEY, 18 | PASSWORD, 19 | TEXT_FIELD, 20 | USERNAME, 21 | VECTOR_FIELD, 22 | create_authentication_params, 23 | parse_documents, 24 | ) 25 | 26 | 27 | @use_case( 28 | description="Create vectors/embeddings that represent text content and send the vectors to OpenSearch", 29 | notes="This use case assumes that the data has already been formatted in JSONL format with the text to store in OpenSearch provided in the 'text' field.", 30 | keywords=["opensearch", "embedding", "vector", "text", "vectorstore", "insert"], 31 | configuration=""" 32 | Configure the 'HTTP Host' to an appropriate URL where OpenSearch is accessible. 33 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 34 | Configure the 'OpenAI API Key' or 'HuggingFace API Key', depending on the chosen Embedding Model. 35 | Set 'Index Name' to the name of your OpenSearch Index. 36 | Set 'Vector Field Name' to the name of the field in the document which will store the vector data. 37 | Set 'Text Field Name' to the name of the field in the document which will store the text data. 38 | 39 | If the documents to send to OpenSearch contain a unique identifier, set the 'Document ID Field Name' property to the name of the field that contains the document ID. 40 | This property can be left blank, in which case a unique ID will be generated based on the FlowFile's filename. 41 | 42 | If the provided index does not exists in OpenSearch then the processor is capable to create it. The 'New Index Strategy' property defines 43 | that the index needs to be created from the default template or it should be configured with custom values. 44 | """, 45 | ) 46 | @use_case( 47 | description="Update vectors/embeddings in OpenSearch", 48 | notes="This use case assumes that the data has already been formatted in JSONL format with the text to store in OpenSearch provided in the 'text' field.", 49 | keywords=["opensearch", "embedding", "vector", "text", "vectorstore", "update", "upsert"], 50 | configuration=""" 51 | Configure the 'HTTP Host' to an appropriate URL where OpenSearch is accessible. 52 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 53 | Configure the 'OpenAI API Key' or 'HuggingFace API Key', depending on the chosen Embedding Model. 54 | Set 'Index Name' to the name of your OpenSearch Index. 
55 | Set 'Vector Field Name' to the name of the field in the document which will store the vector data. 56 | Set 'Text Field Name' to the name of the field in the document which will store the text data. 57 | Set the 'Document ID Field Name' property to the name of the field that contains the identifier of the document in OpenSearch to update. 58 | """, 59 | ) 60 | class PutOpenSearchVector(FlowFileTransform): 61 | class Java: 62 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 63 | 64 | class ProcessorDetails: 65 | version = "2.0.0.dev0" 66 | description = """Publishes JSON data to OpenSearch. The Incoming data must be in single JSON per Line format, each with two keys: 'text' and 'metadata'. 67 | The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored.""" 68 | tags = [ 69 | "opensearch", 70 | "vector", 71 | "vectordb", 72 | "vectorstore", 73 | "embeddings", 74 | "ai", 75 | "artificial intelligence", 76 | "ml", 77 | "machine learning", 78 | "text", 79 | "LLM", 80 | ] 81 | 82 | # Engine types 83 | NMSLIB = ("nmslib (Non-Metric Space Library)", "nmslib") 84 | FAISS = ("faiss (Facebook AI Similarity Search)", "faiss") 85 | LUCENE = ("lucene", "lucene") 86 | 87 | ENGINE_VALUES = dict([NMSLIB, FAISS, LUCENE]) 88 | 89 | # Space types 90 | INNERPRODUCT = ("Inner product", "innerproduct") 91 | 92 | NMSLIB_SPACE_TYPE_VALUES = dict([L2, L1, LINF, COSINESIMIL, INNERPRODUCT]) 93 | FAISS_SPACE_TYPE_VALUES = dict([L2, INNERPRODUCT]) 94 | LUCENE_SPACE_TYPE_VALUES = dict([L2, COSINESIMIL]) 95 | 96 | # New Index Mapping Strategy 97 | DEFAULT_INDEX_MAPPING = "Default index mapping" 98 | CUSTOM_INDEX_MAPPING = "Custom index mapping" 99 | 100 | DOC_ID_FIELD_NAME = PropertyDescriptor( 101 | name="Document ID Field Name", 102 | description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. 103 | If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.""", 104 | required=False, 105 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 106 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 107 | ) 108 | NEW_INDEX_STRATEGY = PropertyDescriptor( 109 | name="New Index Strategy", 110 | description="""Specifies the Mapping strategy to use for new index creation. 
The default template values are the following: 111 | {engine: nmslib, space_type: l2, ef_search: 512, ef_construction: 512, m: 16}""", 112 | allowable_values=[DEFAULT_INDEX_MAPPING, CUSTOM_INDEX_MAPPING], 113 | default_value=DEFAULT_INDEX_MAPPING, 114 | required=False, 115 | ) 116 | ENGINE = PropertyDescriptor( 117 | name="Engine", 118 | description="The approximate k-NN library to use for indexing and search.", 119 | allowable_values=ENGINE_VALUES.keys(), 120 | default_value=NMSLIB[0], 121 | required=False, 122 | dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)], 123 | ) 124 | NMSLIB_SPACE_TYPE = PropertyDescriptor( 125 | name="NMSLIB Space Type", 126 | description="The vector space used to calculate the distance between vectors.", 127 | allowable_values=NMSLIB_SPACE_TYPE_VALUES.keys(), 128 | default_value=L2[0], 129 | required=False, 130 | dependencies=[ 131 | PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING), 132 | PropertyDependency(ENGINE, NMSLIB[0]), 133 | ], 134 | ) 135 | FAISS_SPACE_TYPE = PropertyDescriptor( 136 | name="FAISS Space Type", 137 | description="The vector space used to calculate the distance between vectors.", 138 | allowable_values=FAISS_SPACE_TYPE_VALUES.keys(), 139 | default_value=L2[0], 140 | required=False, 141 | dependencies=[ 142 | PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING), 143 | PropertyDependency(ENGINE, FAISS[0]), 144 | ], 145 | ) 146 | LUCENE_SPACE_TYPE = PropertyDescriptor( 147 | name="Lucene Space Type", 148 | description="The vector space used to calculate the distance between vectors.", 149 | allowable_values=LUCENE_SPACE_TYPE_VALUES.keys(), 150 | default_value=L2[0], 151 | required=False, 152 | dependencies=[ 153 | PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING), 154 | PropertyDependency(ENGINE, LUCENE[0]), 155 | ], 156 | ) 157 | EF_SEARCH = PropertyDescriptor( 158 | name="EF Search", 159 | description="The size of the dynamic list used during k-NN searches. Higher values lead to more accurate but slower searches.", 160 | default_value="512", 161 | required=False, 162 | validators=[StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR], 163 | dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)], 164 | ) 165 | EF_CONSTRUCTION = PropertyDescriptor( 166 | name="EF Construction", 167 | description="The size of the dynamic list used during k-NN graph creation. Higher values lead to a more accurate graph but slower indexing speed.", 168 | default_value="512", 169 | required=False, 170 | validators=[StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR], 171 | dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)], 172 | ) 173 | M = PropertyDescriptor( 174 | name="M", 175 | description="The number of bidirectional links that the plugin creates for each new element. Increasing and " 176 | "decreasing this value can have a large impact on memory consumption. 
Keep this value between 2 and 100.", 177 | default_value="16", 178 | required=False, 179 | validators=[StandardValidators._standard_validators.createLongValidator(2, 100, True)], 180 | dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)], 181 | ) 182 | 183 | properties = [ 184 | EMBEDDING_MODEL, 185 | OPENAI_API_KEY, 186 | OPENAI_MODEL, 187 | HUGGING_FACE_API_KEY, 188 | HUGGING_FACE_MODEL, 189 | HTTP_HOST, 190 | USERNAME, 191 | PASSWORD, 192 | CERTIFICATE_PATH, 193 | INDEX_NAME, 194 | DOC_ID_FIELD_NAME, 195 | VECTOR_FIELD, 196 | TEXT_FIELD, 197 | NEW_INDEX_STRATEGY, 198 | ENGINE, 199 | NMSLIB_SPACE_TYPE, 200 | FAISS_SPACE_TYPE, 201 | LUCENE_SPACE_TYPE, 202 | EF_SEARCH, 203 | EF_CONSTRUCTION, 204 | M, 205 | ] 206 | 207 | embeddings = None 208 | 209 | def __init__(self, **kwargs): 210 | pass 211 | 212 | def getPropertyDescriptors(self): 213 | return self.properties 214 | 215 | def onScheduled(self, context): 216 | self.embeddings = create_embedding_service(context) 217 | 218 | def transform(self, context, flowfile): 219 | file_name = flowfile.getAttribute("filename") 220 | http_host = context.getProperty(HTTP_HOST).evaluateAttributeExpressions(flowfile).getValue() 221 | index_name = context.getProperty(INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue() 222 | id_field_name = context.getProperty(self.DOC_ID_FIELD_NAME).evaluateAttributeExpressions(flowfile).getValue() 223 | vector_field = context.getProperty(VECTOR_FIELD).evaluateAttributeExpressions(flowfile).getValue() 224 | text_field = context.getProperty(TEXT_FIELD).evaluateAttributeExpressions(flowfile).getValue() 225 | new_index_strategy = context.getProperty(self.NEW_INDEX_STRATEGY).evaluateAttributeExpressions().getValue() 226 | 227 | params = {"vector_field": vector_field, "text_field": text_field} 228 | params.update(create_authentication_params(context)) 229 | 230 | if new_index_strategy == self.CUSTOM_INDEX_MAPPING: 231 | engine = context.getProperty(self.ENGINE).evaluateAttributeExpressions().getValue() 232 | params["engine"] = self.ENGINE_VALUES.get(engine) 233 | 234 | if engine == self.NMSLIB[0]: 235 | space_type = context.getProperty(self.NMSLIB_SPACE_TYPE).evaluateAttributeExpressions().getValue() 236 | params["space_type"] = self.NMSLIB_SPACE_TYPE_VALUES.get(space_type) 237 | if engine == self.FAISS[0]: 238 | space_type = context.getProperty(self.FAISS_SPACE_TYPE).evaluateAttributeExpressions().getValue() 239 | params["space_type"] = self.FAISS_SPACE_TYPE_VALUES.get(space_type) 240 | if engine == self.LUCENE[0]: 241 | space_type = context.getProperty(self.LUCENE_SPACE_TYPE).evaluateAttributeExpressions().getValue() 242 | params["space_type"] = self.LUCENE_SPACE_TYPE_VALUES.get(space_type) 243 | 244 | ef_search = context.getProperty(self.EF_SEARCH).evaluateAttributeExpressions().asInteger() 245 | params["ef_search"] = ef_search 246 | 247 | ef_construction = context.getProperty(self.EF_CONSTRUCTION).evaluateAttributeExpressions().asInteger() 248 | params["ef_construction"] = ef_construction 249 | 250 | m = context.getProperty(self.M).evaluateAttributeExpressions().asInteger() 251 | params["m"] = m 252 | 253 | # Read the FlowFile content as "json-lines". 
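# Illustrative example of one incoming JSONL line (placeholder values, not from the repository):
#   {"text": "Apache NiFi routes and transforms data.", "metadata": {"source": "intro.txt"}}
# parse_documents() below splits such lines into parallel 'texts', 'metadatas' and 'ids' lists,
# falling back to an ID of "<filename>-<line number>" when no Document ID field is configured.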
254 | json_lines = flowfile.getContentsAsBytes().decode() 255 | parsed_documents = parse_documents(json_lines, id_field_name, file_name) 256 | 257 | vectorstore = OpenSearchVectorSearch( 258 | opensearch_url=http_host, index_name=index_name, embedding_function=self.embeddings, **params 259 | ) 260 | vectorstore.add_texts( 261 | texts=parsed_documents["texts"], 262 | metadatas=parsed_documents["metadatas"], 263 | ids=parsed_documents["ids"], 264 | **params, 265 | ) 266 | 267 | return FlowFileTransformResult(relationship="success") 268 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/PutPinecone.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import langchain.vectorstores 6 | from EmbeddingUtils import ( 7 | EMBEDDING_MODEL, 8 | HUGGING_FACE, 9 | HUGGING_FACE_MODEL, 10 | OPENAI, 11 | OPENAI_MODEL, 12 | create_embedding_service, 13 | ) 14 | from nifiapi.documentation import use_case 15 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 16 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 17 | from pinecone import Pinecone 18 | 19 | 20 | @use_case( 21 | description="Create vectors/embeddings that represent text content and send the vectors to Pinecone", 22 | notes="This use case assumes that the data has already been formatted in JSONL format with the text to store in Pinecone provided in the 'text' field.", 23 | keywords=["pinecone", "embedding", "vector", "text", "vectorstore", "insert"], 24 | configuration=""" 25 | Configure the 'Pinecone API Key' to the appropriate authentication token for interacting with Pinecone. 26 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 27 | Configure the 'OpenAI API Key' or 'HuggingFace API Key', depending on the chosen Embedding Model. 28 | Set 'Pinecone Environment' to the name of your Pinecone environment 29 | Set 'Index Name' to the name of your Pinecone Index. 30 | Set 'Namespace' to appropriate namespace, or leave it empty to use the default Namespace. 31 | 32 | If the documents to send to Pinecone contain a unique identifier, set the 'Document ID Field Name' property to the name of the field that contains the document ID. 33 | This property can be left blank, in which case a unique ID will be generated based on the FlowFile's filename. 34 | """, 35 | ) 36 | @use_case( 37 | description="Update vectors/embeddings in Pinecone", 38 | notes="This use case assumes that the data has already been formatted in JSONL format with the text to store in Pinecone provided in the 'text' field.", 39 | keywords=["pinecone", "embedding", "vector", "text", "vectorstore", "update", "upsert"], 40 | configuration=""" 41 | Configure the 'Pinecone API Key' to the appropriate authentication token for interacting with Pinecone. 42 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 43 | Configure the 'OpenAI API Key' or 'HuggingFace API Key', depending on the chosen Embedding Model. 44 | Set 'Pinecone Environment' to the name of your Pinecone environment 45 | Set 'Index Name' to the name of your Pinecone Index. 
46 | Set 'Namespace' to appropriate namespace, or leave it empty to use the default Namespace. 47 | Set the 'Document ID Field Name' property to the name of the field that contains the identifier of the document in Pinecone to update. 48 | """, 49 | ) 50 | class PutPinecone(FlowFileTransform): 51 | class Java: 52 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 53 | 54 | class ProcessorDetails: 55 | version = "2.0.0.dev0" 56 | description = """Publishes JSON data to Pinecone. The Incoming data must be in single JSON per Line format, each with two keys: 'text' and 'metadata'. 57 | The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored.""" 58 | tags = [ 59 | "pinecone", 60 | "vector", 61 | "vectordb", 62 | "vectorstore", 63 | "embeddings", 64 | "ai", 65 | "artificial intelligence", 66 | "ml", 67 | "machine learning", 68 | "text", 69 | "LLM", 70 | ] 71 | 72 | PINECONE_API_KEY = PropertyDescriptor( 73 | name="Pinecone API Key", 74 | description="The API Key to use in order to authentication with Pinecone", 75 | sensitive=True, 76 | required=True, 77 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 78 | ) 79 | HUGGING_FACE_API_KEY = PropertyDescriptor( 80 | name="HuggingFace API Key", 81 | description="The API Key for interacting with HuggingFace", 82 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 83 | required=True, 84 | sensitive=True, 85 | dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)], 86 | ) 87 | OPENAI_API_KEY = PropertyDescriptor( 88 | name="OpenAI API Key", 89 | description="The API Key for OpenAI in order to create embeddings", 90 | sensitive=True, 91 | required=True, 92 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 93 | dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)], 94 | ) 95 | PINECONE_ENV = PropertyDescriptor( 96 | name="Pinecone Environment", 97 | description="The name of the Pinecone Environment. This can be found in the Pinecone console next to the API Key.", 98 | sensitive=False, 99 | required=True, 100 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 101 | ) 102 | INDEX_NAME = PropertyDescriptor( 103 | name="Index Name", 104 | description="The name of the Pinecone index.", 105 | sensitive=False, 106 | required=True, 107 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 108 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 109 | ) 110 | TEXT_KEY = PropertyDescriptor( 111 | name="Text Key", 112 | description="The key in the document that contains the text to create embeddings for.", 113 | required=True, 114 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 115 | default_value="text", 116 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 117 | ) 118 | NAMESPACE = PropertyDescriptor( 119 | name="Namespace", 120 | description="The name of the Pinecone Namespace to put the documents to.", 121 | required=False, 122 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 123 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 124 | ) 125 | DOC_ID_FIELD_NAME = PropertyDescriptor( 126 | name="Document ID Field Name", 127 | description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. 
128 | If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.""", 129 | required=False, 130 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 131 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 132 | ) 133 | 134 | properties = [ 135 | PINECONE_API_KEY, 136 | EMBEDDING_MODEL, 137 | OPENAI_API_KEY, 138 | OPENAI_MODEL, 139 | HUGGING_FACE_API_KEY, 140 | HUGGING_FACE_MODEL, 141 | PINECONE_ENV, 142 | INDEX_NAME, 143 | TEXT_KEY, 144 | NAMESPACE, 145 | DOC_ID_FIELD_NAME, 146 | ] 147 | 148 | embeddings = None 149 | pc = None 150 | 151 | def __init__(self, **kwargs): 152 | pass 153 | 154 | def getPropertyDescriptors(self): 155 | return self.properties 156 | 157 | def onScheduled(self, context): 158 | # initialize pinecone 159 | self.pc = Pinecone( 160 | api_key=context.getProperty(self.PINECONE_API_KEY).getValue(), 161 | environment=context.getProperty(self.PINECONE_ENV).getValue(), 162 | ) 163 | # initialize embedding service 164 | self.embeddings = create_embedding_service(context) 165 | 166 | def transform(self, context, flowfile): 167 | # First, check if our index already exists. If it doesn't, we create it 168 | index_name = context.getProperty(self.INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue() 169 | namespace = context.getProperty(self.NAMESPACE).evaluateAttributeExpressions(flowfile).getValue() 170 | id_field_name = context.getProperty(self.DOC_ID_FIELD_NAME).evaluateAttributeExpressions(flowfile).getValue() 171 | 172 | index = self.pc.Index(index_name) 173 | 174 | # Read the FlowFile content as "json-lines". 175 | json_lines = flowfile.getContentsAsBytes().decode() 176 | i = 1 177 | texts = [] 178 | metadatas = [] 179 | ids = [] 180 | for line in json_lines.split("\n"): 181 | try: 182 | doc = json.loads(line) 183 | except Exception as e: 184 | message = f"Could not parse line {i} as JSON" 185 | raise ValueError(message) from e 186 | 187 | text = doc.get("text") 188 | metadata = doc.get("metadata") 189 | texts.append(text) 190 | 191 | # Remove any null values, or it will cause the embedding to fail 192 | filtered_metadata = {} 193 | for key, value in metadata.items(): 194 | if value is not None: 195 | filtered_metadata[key] = value 196 | 197 | metadatas.append(filtered_metadata) 198 | 199 | doc_id = None 200 | if id_field_name is not None: 201 | doc_id = metadata.get(id_field_name) 202 | if doc_id is None: 203 | doc_id = flowfile.getAttribute("filename") + "-" + str(i) 204 | ids.append(doc_id) 205 | 206 | i += 1 207 | 208 | text_key = context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue() 209 | vectorstore = langchain.vectorstores.Pinecone(index, self.embeddings.embed_query, text_key) 210 | vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids, namespace=namespace) 211 | return FlowFileTransformResult(relationship="success") 212 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/PutQdrant.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import QdrantUtils 6 | from EmbeddingUtils import ( 7 | create_embedding_service, 8 | ) 9 | from langchain.vectorstores.qdrant import Qdrant 10 | from nifiapi.documentation import use_case 11 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 12 | from nifiapi.properties import ( 13 | ExpressionLanguageScope, 14 | PropertyDescriptor, 15 | 
StandardValidators, 16 | ) 17 | from qdrant_client.models import Distance 18 | 19 | 20 | @use_case( 21 | description="Create embeddings that semantically represent text content and upload to Qdrant - https://qdrant.tech/", 22 | notes="This processor assumes that the data has already been formatted in JSONL format with the text to store in Qdrant provided in the 'text' field.", 23 | keywords=["qdrant", "embedding", "vector", "text", "vectorstore", "insert"], 24 | configuration=""" 25 | Configure 'Collection Name' to the name of the Qdrant collection to use. 26 | Configure 'Qdrant URL' to the fully qualified URL of the Qdrant instance. 27 | Configure 'Qdrant API Key' to the API Key to use in order to authenticate with Qdrant. 28 | Configure 'Prefer gRPC' to True if you want to use gRPC for interfacing with Qdrant. 29 | Configure 'Use HTTPS' to True if you want to use TLS(HTTPS) while interfacing with Qdrant. 30 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 31 | Configure 'HuggingFace API Key' or 'OpenAI API Key', depending on the chosen Embedding Model. 32 | Configure 'HuggingFace Model' or 'OpenAI Model' to the name of the model to use. 33 | Configure 'Force Recreate Collection' to True if you want to recreate the collection if it already exists. 34 | Configure 'Similarity Metric' to the similarity metric to use when querying Qdrant. 35 | 36 | If the documents to send to Qdrant contain a unique identifier(UUID), set the 'Document ID Field Name' property to the name of the field that contains the document ID. 37 | This property can be left blank, in which case a UUID will be generated based on the FlowFile's filename. 38 | """, 39 | ) 40 | class PutQdrant(FlowFileTransform): 41 | class Java: 42 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 43 | 44 | class ProcessorDetails: 45 | version = "2.0.0.dev0" 46 | description = """Publishes JSON data to Qdrant. The Incoming data must be in single JSON per Line format, each with two keys: 'text' and 'metadata'. 47 | The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored.""" 48 | tags = [ 49 | "qdrant", 50 | "vector", 51 | "vectordb", 52 | "vectorstore", 53 | "embeddings", 54 | "ai", 55 | "artificial intelligence", 56 | "ml", 57 | "machine learning", 58 | "text", 59 | "LLM", 60 | ] 61 | 62 | DOC_ID_FIELD_NAME = PropertyDescriptor( 63 | name="Document ID Field Name", 64 | description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. 65 | If not specified, a UUID will be generated based on the FlowFile's filename and an incremental number.""", 66 | required=False, 67 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 68 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 69 | ) 70 | FORCE_RECREATE_COLLECTION = PropertyDescriptor( 71 | name="Force Recreate Collection", 72 | description="Specifies whether to recreate the collection if it already exists. 
Essentially clearing the existing data.", 73 | required=True, 74 | default_value="False", 75 | allowable_values=["True", "False"], 76 | validators=[StandardValidators.BOOLEAN_VALIDATOR], 77 | ) 78 | SIMILARITY_METRIC = PropertyDescriptor( 79 | name="Similarity Metric", 80 | description="Specifies the similarity metric when creating the collection.", 81 | required=True, 82 | default_value=Distance.COSINE, 83 | allowable_values=[ 84 | Distance.COSINE, 85 | Distance.EUCLID, 86 | Distance.DOT, 87 | Distance.MANHATTAN, 88 | ], 89 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 90 | ) 91 | 92 | properties = ( 93 | QdrantUtils.QDRANT_PROPERTIES 94 | + QdrantUtils.EMBEDDING_MODEL_PROPERTIES 95 | + [ 96 | FORCE_RECREATE_COLLECTION, 97 | SIMILARITY_METRIC, 98 | DOC_ID_FIELD_NAME, 99 | ] 100 | ) 101 | 102 | def __init__(self, **kwargs): 103 | pass 104 | 105 | def getPropertyDescriptors(self): 106 | return self.properties 107 | 108 | def onScheduled(self, context): 109 | # The Qdrant#construct_instance() internally checks if the collection exists 110 | # and creates it if it doesn't with the appropriate dimesions and configurations. 111 | self.vector_store = Qdrant.construct_instance( 112 | texts=["Some text to obtain the embeddings dimension when creating the collection"], 113 | embedding=create_embedding_service(context), 114 | collection_name=context.getProperty(QdrantUtils.COLLECTION_NAME).getValue(), 115 | url=context.getProperty(QdrantUtils.QDRANT_URL).getValue(), 116 | api_key=context.getProperty(QdrantUtils.QDRANT_API_KEY).getValue(), 117 | prefer_grpc=context.getProperty(QdrantUtils.PREFER_GRPC).asBoolean(), 118 | https=context.getProperty(QdrantUtils.HTTPS).asBoolean(), 119 | force_recreate=context.getProperty(self.FORCE_RECREATE_COLLECTION).asBoolean(), 120 | distance_func=context.getProperty(self.SIMILARITY_METRIC).getValue(), 121 | ) 122 | 123 | def transform(self, context, flowfile): 124 | id_field_name = context.getProperty(self.DOC_ID_FIELD_NAME).evaluateAttributeExpressions(flowfile).getValue() 125 | 126 | # Read the FlowFile content as "json-lines". 
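# Illustrative example (placeholder values): for a FlowFile named "docs.jsonl", a line without a
# configured Document ID field gets the generated ID "docs.jsonl-1", which QdrantUtils.convert_id()
# maps to a deterministic UUID, since Qdrant only accepts UUID strings or unsigned integers as
# point IDs. If a Document ID field is configured, its value is used as-is and should therefore
# already be a valid Qdrant point ID.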
127 | json_lines = flowfile.getContentsAsBytes().decode() 128 | i = 1 129 | texts, metadatas, ids = [], [], [] 130 | for line in json_lines.split("\n"): 131 | try: 132 | doc = json.loads(line) 133 | except Exception as e: 134 | message = f"Could not parse line {i} as JSON" 135 | raise ValueError(message) from e 136 | 137 | metadata = doc.get("metadata") 138 | texts.append(doc.get("text")) 139 | metadatas.append(metadata) 140 | 141 | doc_id = None 142 | if id_field_name is not None: 143 | doc_id = metadata.get(id_field_name) 144 | if doc_id is None: 145 | doc_id = QdrantUtils.convert_id(flowfile.getAttribute("filename") + "-" + str(i)) 146 | ids.append(doc_id) 147 | 148 | i += 1 149 | 150 | self.vector_store.add_texts(texts=texts, metadatas=metadatas, ids=ids) 151 | return FlowFileTransformResult(relationship="success") 152 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QdrantUtils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import uuid 4 | 5 | from EmbeddingUtils import ( 6 | EMBEDDING_MODEL, 7 | HUGGING_FACE, 8 | HUGGING_FACE_MODEL, 9 | OPENAI, 10 | OPENAI_MODEL, 11 | ) 12 | from nifiapi.properties import ( 13 | ExpressionLanguageScope, 14 | PropertyDependency, 15 | PropertyDescriptor, 16 | StandardValidators, 17 | ) 18 | 19 | DEFAULT_COLLECTION_NAME = "apache-nifi" 20 | 21 | 22 | COLLECTION_NAME = PropertyDescriptor( 23 | name="Collection Name", 24 | description="The name of the Qdrant collection to use.", 25 | sensitive=False, 26 | required=True, 27 | default_value=DEFAULT_COLLECTION_NAME, 28 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 29 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 30 | ) 31 | QDRANT_URL = PropertyDescriptor( 32 | name="Qdrant URL", 33 | description="The fully qualified URL to the Qdrant instance.", 34 | sensitive=False, 35 | required=True, 36 | default_value="http://localhost:6333", 37 | validators=[StandardValidators.URL_VALIDATOR], 38 | ) 39 | QDRANT_API_KEY = PropertyDescriptor( 40 | name="Qdrant API Key", 41 | description="The API Key to use in order to authentication with Qdrant. 
Can be empty.", 42 | sensitive=True, 43 | required=True, 44 | ) 45 | 46 | PREFER_GRPC = PropertyDescriptor( 47 | name="Prefer gRPC", 48 | description="Specifies whether to use gRPC for interfacing with Qdrant.", 49 | required=True, 50 | default_value=False, 51 | allowable_values=["True", "False"], 52 | validators=[StandardValidators.BOOLEAN_VALIDATOR], 53 | ) 54 | HTTPS = PropertyDescriptor( 55 | name="Use HTTPS", 56 | description="Specifies whether to TLS(HTTPS) while interfacing with Qdrant.", 57 | required=True, 58 | default_value=False, 59 | allowable_values=["True", "False"], 60 | validators=[StandardValidators.BOOLEAN_VALIDATOR], 61 | ) 62 | 63 | QDRANT_PROPERTIES = [COLLECTION_NAME, QDRANT_URL, QDRANT_API_KEY, PREFER_GRPC, HTTPS] 64 | 65 | HUGGING_FACE_API_KEY = PropertyDescriptor( 66 | name="HuggingFace API Key", 67 | description="The API Key for interacting with HuggingFace", 68 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 69 | required=True, 70 | sensitive=True, 71 | dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)], 72 | ) 73 | OPENAI_API_KEY = PropertyDescriptor( 74 | name="OpenAI API Key", 75 | description="The API Key for OpenAI in order to create embeddings.", 76 | sensitive=True, 77 | required=True, 78 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 79 | dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)], 80 | ) 81 | 82 | EMBEDDING_MODEL_PROPERTIES = [ 83 | EMBEDDING_MODEL, 84 | HUGGING_FACE_API_KEY, 85 | HUGGING_FACE_MODEL, 86 | OPENAI_API_KEY, 87 | OPENAI_MODEL, 88 | ] 89 | 90 | 91 | def convert_id(_id: str) -> str: 92 | """ 93 | Converts any string into a UUID string deterministically. 94 | 95 | Qdrant accepts UUID strings and unsigned integers as point ID. 96 | This allows us to overwrite the same point with the original ID. 97 | """ 98 | return str(uuid.uuid5(uuid.NAMESPACE_DNS, _id)) 99 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QueryChroma.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import ChromaUtils 6 | import EmbeddingUtils 7 | import QueryUtils 8 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 9 | from nifiapi.properties import ExpressionLanguageScope, PropertyDescriptor, StandardValidators 10 | 11 | 12 | class QueryChroma(FlowFileTransform): 13 | class Java: 14 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 15 | 16 | class ProcessorDetails: 17 | version = "2.0.0.dev0" 18 | description = "Queries a Chroma Vector Database in order to gather a specified number of documents that are most closely related to the given query." 19 | tags = [ 20 | "chroma", 21 | "vector", 22 | "vectordb", 23 | "embeddings", 24 | "enrich", 25 | "enrichment", 26 | "ai", 27 | "artificial intelligence", 28 | "ml", 29 | "machine learning", 30 | "text", 31 | "LLM", 32 | ] 33 | 34 | QUERY = PropertyDescriptor( 35 | name="Query", 36 | description="""The query to issue to the Chroma VectorDB. The query is always converted into embeddings using the configured embedding function, and the embedding is 37 | then sent to Chroma. 
The text itself is not sent to Chroma.""", 38 | required=True, 39 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 40 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 41 | ) 42 | NUMBER_OF_RESULTS = PropertyDescriptor( 43 | name="Number of Results", 44 | description="The number of results to return from Chroma", 45 | required=True, 46 | validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR], 47 | default_value="10", 48 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 49 | ) 50 | METADATA_FILTER = PropertyDescriptor( 51 | name="Metadata Filter", 52 | description="""A JSON representation of a Metadata Filter that can be applied against the Chroma documents in order to narrow down the documents that can be returned. 53 | For example: { "metadata_field": "some_value" }""", 54 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 55 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 56 | required=False, 57 | ) 58 | DOCUMENT_FILTER = PropertyDescriptor( 59 | name="Document Filter", 60 | description="""A JSON representation of a Document Filter that can be applied against the Chroma documents' text in order to narrow down the documents that can be returned. 61 | For example: { "$contains": "search_string" }""", 62 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 63 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 64 | required=False, 65 | ) 66 | 67 | client = None 68 | embedding_function = None 69 | include_ids = None 70 | include_metadatas = None 71 | include_documents = None 72 | include_distances = None 73 | include_embeddings = None 74 | results_field = None 75 | 76 | property_descriptors = ( 77 | list(ChromaUtils.PROPERTIES) 78 | + [prop for prop in EmbeddingUtils.PROPERTIES if prop != EmbeddingUtils.EMBEDDING_MODEL] 79 | + [ 80 | QUERY, 81 | NUMBER_OF_RESULTS, 82 | QueryUtils.OUTPUT_STRATEGY, 83 | QueryUtils.RESULTS_FIELD, 84 | METADATA_FILTER, 85 | DOCUMENT_FILTER, 86 | QueryUtils.INCLUDE_IDS, 87 | QueryUtils.INCLUDE_METADATAS, 88 | QueryUtils.INCLUDE_DOCUMENTS, 89 | QueryUtils.INCLUDE_DISTANCES, 90 | QueryUtils.INCLUDE_EMBEDDINGS, 91 | ] 92 | ) 93 | 94 | def __init__(self, **kwargs): 95 | pass 96 | 97 | def getPropertyDescriptors(self): 98 | return self.property_descriptors 99 | 100 | def onScheduled(self, context): 101 | self.client = ChromaUtils.create_client(context) 102 | self.embedding_function = EmbeddingUtils.create_embedding_function(context) 103 | self.include_ids = context.getProperty(QueryUtils.INCLUDE_IDS).asBoolean() 104 | self.include_metadatas = context.getProperty(QueryUtils.INCLUDE_METADATAS).asBoolean() 105 | self.include_documents = context.getProperty(QueryUtils.INCLUDE_DOCUMENTS).asBoolean() 106 | self.include_distances = context.getProperty(QueryUtils.INCLUDE_DISTANCES).asBoolean() 107 | self.include_embeddings = context.getProperty(QueryUtils.INCLUDE_EMBEDDINGS).asBoolean() 108 | self.results_field = context.getProperty(QueryUtils.RESULTS_FIELD).getValue() 109 | self.query_utils = QueryUtils.QueryUtils(context) 110 | 111 | def transform(self, context, flowfile): 112 | client = self.client 113 | embedding_function = self.embedding_function 114 | collection_name = ( 115 | context.getProperty(ChromaUtils.COLLECTION_NAME).evaluateAttributeExpressions(flowfile).getValue() 116 | ) 117 | 118 | collection = client.get_collection(name=collection_name, embedding_function=embedding_function) 119 | 120 | query_text = 
context.getProperty(self.QUERY).evaluateAttributeExpressions(flowfile).getValue() 121 | embeddings = embedding_function([query_text]) 122 | 123 | included_fields = [] 124 | if self.include_distances: 125 | included_fields.append("distances") 126 | if self.include_documents: 127 | included_fields.append("documents") 128 | if self.include_embeddings: 129 | included_fields.append("embeddings") 130 | if self.include_metadatas: 131 | included_fields.append("metadatas") 132 | 133 | where = None 134 | where_clause = context.getProperty(self.METADATA_FILTER).evaluateAttributeExpressions(flowfile).getValue() 135 | if where_clause is not None: 136 | where = json.loads(where_clause) 137 | 138 | where_document = None 139 | where_document_clause = ( 140 | context.getProperty(self.DOCUMENT_FILTER).evaluateAttributeExpressions(flowfile).getValue() 141 | ) 142 | if where_document_clause is not None: 143 | where_document = json.loads(where_document_clause) 144 | 145 | query_results = collection.query( 146 | query_embeddings=embeddings, 147 | n_results=context.getProperty(self.NUMBER_OF_RESULTS).evaluateAttributeExpressions(flowfile).asInteger(), 148 | include=included_fields, 149 | where_document=where_document, 150 | where=where, 151 | ) 152 | 153 | ids = query_results["ids"][0] 154 | distances = ( 155 | None 156 | if (not self.include_distances or query_results["distances"] is None) 157 | else query_results["distances"][0] 158 | ) 159 | metadatas = ( 160 | None 161 | if (not self.include_metadatas or query_results["metadatas"] is None) 162 | else query_results["metadatas"][0] 163 | ) 164 | documents = ( 165 | None 166 | if (not self.include_documents or query_results["documents"] is None) 167 | else query_results["documents"][0] 168 | ) 169 | embeddings = ( 170 | None 171 | if (not self.include_embeddings or query_results["embeddings"] is None) 172 | else query_results["embeddings"][0] 173 | ) 174 | 175 | (output_contents, mime_type) = self.query_utils.create_json( 176 | flowfile, documents, metadatas, embeddings, distances, ids 177 | ) 178 | 179 | # Return the results 180 | attributes = {"mime.type": mime_type} 181 | return FlowFileTransformResult(relationship="success", contents=output_contents, attributes=attributes) 182 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QueryOpenSearchVector.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | from EmbeddingUtils import EMBEDDING_MODEL, HUGGING_FACE_MODEL, OPENAI_MODEL, create_embedding_service 6 | from langchain.vectorstores import OpenSearchVectorSearch 7 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 8 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 9 | from OpenSearchVectorUtils import ( 10 | CERTIFICATE_PATH, 11 | COSINESIMIL, 12 | HTTP_HOST, 13 | HUGGING_FACE_API_KEY, 14 | INDEX_NAME, 15 | L1, 16 | L2, 17 | LINF, 18 | OPENAI_API_KEY, 19 | PASSWORD, 20 | TEXT_FIELD, 21 | USERNAME, 22 | VECTOR_FIELD, 23 | create_authentication_params, 24 | ) 25 | from QueryUtils import INCLUDE_DISTANCES, INCLUDE_METADATAS, OUTPUT_STRATEGY, RESULTS_FIELD, QueryUtils 26 | 27 | 28 | class QueryOpenSearchVector(FlowFileTransform): 29 | class Java: 30 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 31 | 32 | class ProcessorDetails: 33 | version = "2.0.0.dev0" 34 | description = 
"Queries OpenSearch in order to gather a specified number of documents that are most closely related to the given query." 35 | tags = [ 36 | "opensearch", 37 | "vector", 38 | "vectordb", 39 | "vectorstore", 40 | "embeddings", 41 | "ai", 42 | "artificial intelligence", 43 | "ml", 44 | "machine learning", 45 | "text", 46 | "LLM", 47 | ] 48 | 49 | # Search types 50 | APPROXIMATE_SEARCH = ("Approximate Search", "approximate_search") 51 | SCRIPT_SCORING_SEARCH = ("Script Scoring Search", "script_scoring") 52 | PAINLESS_SCRIPTING_SEARCH = ("Painless Scripting Search", "painless_scripting") 53 | 54 | SEARCH_TYPE_VALUES = dict([APPROXIMATE_SEARCH, SCRIPT_SCORING_SEARCH, PAINLESS_SCRIPTING_SEARCH]) 55 | 56 | # Script Scoring Search space types 57 | HAMMINGBIT = ("Hamming distance", "hammingbit") 58 | 59 | SCRIPT_SCORING_SPACE_TYPE_VALUES = dict([L2, L1, LINF, COSINESIMIL, HAMMINGBIT]) 60 | 61 | # Painless Scripting Search space types 62 | L2_SQUARED = ("L2 (Euclidean distance)", "l2Squared") 63 | L1_NORM = ("L1 (Manhattan distance)", "l1Norm") 64 | COSINE_SIMILARITY = ("Cosine similarity", "cosineSimilarity") 65 | 66 | PAINLESS_SCRIPTING_SPACE_TYPE_VALUES = dict([L2_SQUARED, L1_NORM, COSINE_SIMILARITY]) 67 | 68 | QUERY = PropertyDescriptor( 69 | name="Query", 70 | description="The text of the query to send to OpenSearch.", 71 | required=True, 72 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 73 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 74 | ) 75 | NUMBER_OF_RESULTS = PropertyDescriptor( 76 | name="Number of Results", 77 | description="The number of results to return from OpenSearch", 78 | default_value="10", 79 | required=True, 80 | validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR], 81 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 82 | ) 83 | SEARCH_TYPE = PropertyDescriptor( 84 | name="Search Type", 85 | description="Specifies the type of the search to be performed.", 86 | allowable_values=SEARCH_TYPE_VALUES.keys(), 87 | default_value=APPROXIMATE_SEARCH[0], 88 | required=True, 89 | ) 90 | SCRIPT_SCORING_SPACE_TYPE = PropertyDescriptor( 91 | name="Script Scoring Space Type", 92 | description="Used to measure the distance between two points in order to determine the k-nearest neighbors.", 93 | allowable_values=SCRIPT_SCORING_SPACE_TYPE_VALUES.keys(), 94 | default_value=L2[0], 95 | required=False, 96 | dependencies=[PropertyDependency(SEARCH_TYPE, SCRIPT_SCORING_SEARCH[0])], 97 | ) 98 | PAINLESS_SCRIPTING_SPACE_TYPE = PropertyDescriptor( 99 | name="Painless Scripting Space Type", 100 | description="Used to measure the distance between two points in order to determine the k-nearest neighbors.", 101 | allowable_values=PAINLESS_SCRIPTING_SPACE_TYPE_VALUES.keys(), 102 | default_value=L2_SQUARED[0], 103 | required=False, 104 | dependencies=[PropertyDependency(SEARCH_TYPE, PAINLESS_SCRIPTING_SEARCH[0])], 105 | ) 106 | BOOLEAN_FILTER = PropertyDescriptor( 107 | name="Boolean Filter", 108 | description="A Boolean filter is a post filter consists of a Boolean query that contains a k-NN query and a filter. 
" 109 | "The value of the field must be a JSON representation of the filter.", 110 | required=False, 111 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 112 | dependencies=[PropertyDependency(SEARCH_TYPE, APPROXIMATE_SEARCH[0])], 113 | ) 114 | EFFICIENT_FILTER = PropertyDescriptor( 115 | name="Efficient Filter", 116 | description="The Lucene Engine or Faiss Engine decides whether to perform an exact k-NN search with " 117 | "pre-filtering or an approximate search with modified post-filtering. The value of the field must " 118 | "be a JSON representation of the filter.", 119 | required=False, 120 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 121 | dependencies=[PropertyDependency(SEARCH_TYPE, APPROXIMATE_SEARCH[0])], 122 | ) 123 | PRE_FILTER = PropertyDescriptor( 124 | name="Pre Filter", 125 | description="Script Score query to pre-filter documents before identifying nearest neighbors. The value of " 126 | "the field must be a JSON representation of the filter.", 127 | default_value='{"match_all": {}}', 128 | required=False, 129 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 130 | dependencies=[PropertyDependency(SEARCH_TYPE, SCRIPT_SCORING_SEARCH[0], PAINLESS_SCRIPTING_SEARCH[0])], 131 | ) 132 | 133 | properties = [ 134 | EMBEDDING_MODEL, 135 | OPENAI_API_KEY, 136 | OPENAI_MODEL, 137 | HUGGING_FACE_API_KEY, 138 | HUGGING_FACE_MODEL, 139 | HTTP_HOST, 140 | USERNAME, 141 | PASSWORD, 142 | CERTIFICATE_PATH, 143 | INDEX_NAME, 144 | QUERY, 145 | VECTOR_FIELD, 146 | TEXT_FIELD, 147 | NUMBER_OF_RESULTS, 148 | SEARCH_TYPE, 149 | SCRIPT_SCORING_SPACE_TYPE, 150 | PAINLESS_SCRIPTING_SPACE_TYPE, 151 | BOOLEAN_FILTER, 152 | EFFICIENT_FILTER, 153 | PRE_FILTER, 154 | OUTPUT_STRATEGY, 155 | RESULTS_FIELD, 156 | INCLUDE_METADATAS, 157 | INCLUDE_DISTANCES, 158 | ] 159 | 160 | embeddings = None 161 | query_utils = None 162 | 163 | def __init__(self, **kwargs): 164 | pass 165 | 166 | def getPropertyDescriptors(self): 167 | return self.properties 168 | 169 | def onScheduled(self, context): 170 | # initialize embedding service 171 | self.embeddings = create_embedding_service(context) 172 | self.query_utils = QueryUtils(context) 173 | 174 | def transform(self, context, flowfile): 175 | http_host = context.getProperty(HTTP_HOST).evaluateAttributeExpressions(flowfile).getValue() 176 | index_name = context.getProperty(INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue() 177 | query = context.getProperty(self.QUERY).evaluateAttributeExpressions(flowfile).getValue() 178 | num_results = context.getProperty(self.NUMBER_OF_RESULTS).evaluateAttributeExpressions(flowfile).asInteger() 179 | vector_field = context.getProperty(VECTOR_FIELD).evaluateAttributeExpressions(flowfile).getValue() 180 | text_field = context.getProperty(TEXT_FIELD).evaluateAttributeExpressions(flowfile).getValue() 181 | search_type = context.getProperty(self.SEARCH_TYPE).evaluateAttributeExpressions().getValue() 182 | 183 | params = { 184 | "vector_field": vector_field, 185 | "text_field": text_field, 186 | "search_type": self.SEARCH_TYPE_VALUES.get(search_type), 187 | } 188 | params.update(create_authentication_params(context)) 189 | 190 | if search_type == self.APPROXIMATE_SEARCH[0]: 191 | boolean_filter = context.getProperty(self.BOOLEAN_FILTER).evaluateAttributeExpressions().getValue() 192 | if boolean_filter is not None: 193 | params["boolean_filter"] = json.loads(boolean_filter) 194 | 195 | efficient_filter = context.getProperty(self.EFFICIENT_FILTER).evaluateAttributeExpressions().getValue() 196 | if 
efficient_filter is not None: 197 | params["efficient_filter"] = json.loads(efficient_filter) 198 | else: 199 | pre_filter = context.getProperty(self.PRE_FILTER).evaluateAttributeExpressions().getValue() 200 | if pre_filter is not None: 201 | params["pre_filter"] = json.loads(pre_filter) 202 | if search_type == self.SCRIPT_SCORING_SEARCH[0]: 203 | space_type = ( 204 | context.getProperty(self.SCRIPT_SCORING_SPACE_TYPE).evaluateAttributeExpressions().getValue() 205 | ) 206 | params["space_type"] = self.SCRIPT_SCORING_SPACE_TYPE_VALUES.get(space_type) 207 | elif search_type == self.PAINLESS_SCRIPTING_SEARCH[0]: 208 | space_type = ( 209 | context.getProperty(self.PAINLESS_SCRIPTING_SPACE_TYPE).evaluateAttributeExpressions().getValue() 210 | ) 211 | params["space_type"] = self.PAINLESS_SCRIPTING_SPACE_TYPE_VALUES.get(space_type) 212 | 213 | vectorstore = OpenSearchVectorSearch( 214 | index_name=index_name, embedding_function=self.embeddings, opensearch_url=http_host, **params 215 | ) 216 | 217 | results = vectorstore.similarity_search_with_score(query=query, k=num_results, **params) 218 | 219 | documents = [] 220 | for result in results: 221 | documents.append(result[0].page_content) 222 | 223 | if context.getProperty(INCLUDE_METADATAS).asBoolean(): 224 | metadatas = [] 225 | for result in results: 226 | metadatas.append(result[0].metadata) 227 | else: 228 | metadatas = None 229 | 230 | if context.getProperty(INCLUDE_DISTANCES).asBoolean(): 231 | distances = [] 232 | for result in results: 233 | distances.append(result[1]) 234 | else: 235 | distances = None 236 | 237 | (output_contents, mime_type) = self.query_utils.create_json( 238 | flowfile, documents, metadatas, None, distances, None 239 | ) 240 | attributes = {"mime.type": mime_type} 241 | 242 | return FlowFileTransformResult(relationship="success", contents=output_contents, attributes=attributes) 243 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QueryPinecone.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import langchain.vectorstores 6 | import QueryUtils 7 | from EmbeddingUtils import ( 8 | EMBEDDING_MODEL, 9 | HUGGING_FACE, 10 | HUGGING_FACE_MODEL, 11 | OPENAI, 12 | OPENAI_MODEL, 13 | create_embedding_service, 14 | ) 15 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 16 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 17 | from pinecone import Pinecone 18 | 19 | 20 | class QueryPinecone(FlowFileTransform): 21 | class Java: 22 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 23 | 24 | class ProcessorDetails: 25 | version = "2.0.0.dev0" 26 | description = "Queries Pinecone in order to gather a specified number of documents that are most closely related to the given query." 
27 | tags = [ 28 | "pinecone", 29 | "vector", 30 | "vectordb", 31 | "vectorstore", 32 | "embeddings", 33 | "ai", 34 | "artificial intelligence", 35 | "ml", 36 | "machine learning", 37 | "text", 38 | "LLM", 39 | ] 40 | 41 | PINECONE_API_KEY = PropertyDescriptor( 42 | name="Pinecone API Key", 43 | description="The API Key to use in order to authenticate with Pinecone", 44 | sensitive=True, 45 | required=True, 46 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 47 | ) 48 | OPENAI_API_KEY = PropertyDescriptor( 49 | name="OpenAI API Key", 50 | description="The API Key for OpenAI, used to create embeddings", 51 | sensitive=True, 52 | required=True, 53 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 54 | dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)], 55 | ) 56 | HUGGING_FACE_API_KEY = PropertyDescriptor( 57 | name="HuggingFace API Key", 58 | description="The API Key for interacting with HuggingFace", 59 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 60 | required=True, 61 | sensitive=True, 62 | dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)], 63 | ) 64 | PINECONE_ENV = PropertyDescriptor( 65 | name="Pinecone Environment", 66 | description="The name of the Pinecone Environment. This can be found in the Pinecone console next to the API Key.", 67 | sensitive=False, 68 | required=True, 69 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 70 | ) 71 | INDEX_NAME = PropertyDescriptor( 72 | name="Index Name", 73 | description="The name of the Pinecone index.", 74 | sensitive=False, 75 | required=True, 76 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 77 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 78 | ) 79 | QUERY = PropertyDescriptor( 80 | name="Query", 81 | description="The text of the query to send to Pinecone.", 82 | required=True, 83 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 84 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 85 | ) 86 | NUMBER_OF_RESULTS = PropertyDescriptor( 87 | name="Number of Results", 88 | description="The number of results to return from Pinecone", 89 | required=True, 90 | validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR], 91 | default_value="10", 92 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 93 | ) 94 | TEXT_KEY = PropertyDescriptor( 95 | name="Text Key", 96 | description="The key in the document that contains the text to create embeddings for.", 97 | required=True, 98 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 99 | default_value="text", 100 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 101 | ) 102 | NAMESPACE = PropertyDescriptor( 103 | name="Namespace", 104 | description="The name of the Pinecone Namespace to query into.", 105 | required=False, 106 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 107 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 108 | ) 109 | FILTER = PropertyDescriptor( 110 | name="Metadata Filter", 111 | description='Optional metadata filter to apply with the query. 
For example: { "author": {"$eq": "john.doe"} }', 112 | required=False, 113 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 114 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 115 | ) 116 | 117 | properties = [ 118 | PINECONE_API_KEY, 119 | EMBEDDING_MODEL, 120 | OPENAI_API_KEY, 121 | OPENAI_MODEL, 122 | HUGGING_FACE_API_KEY, 123 | HUGGING_FACE_MODEL, 124 | PINECONE_ENV, 125 | INDEX_NAME, 126 | QUERY, 127 | FILTER, 128 | NUMBER_OF_RESULTS, 129 | NAMESPACE, 130 | TEXT_KEY, 131 | QueryUtils.OUTPUT_STRATEGY, 132 | QueryUtils.RESULTS_FIELD, 133 | QueryUtils.INCLUDE_METADATAS, 134 | QueryUtils.INCLUDE_DISTANCES, 135 | ] 136 | 137 | embeddings = None 138 | query_utils = None 139 | pc = None 140 | 141 | def __init__(self, **kwargs): 142 | pass 143 | 144 | def getPropertyDescriptors(self): 145 | return self.properties 146 | 147 | def onScheduled(self, context): 148 | # initialize pinecone 149 | self.pc = Pinecone( 150 | api_key=context.getProperty(self.PINECONE_API_KEY).getValue(), 151 | environment=context.getProperty(self.PINECONE_ENV).getValue(), 152 | ) 153 | # initialize embedding service 154 | self.embeddings = create_embedding_service(context) 155 | self.query_utils = QueryUtils.QueryUtils(context) 156 | 157 | def transform(self, context, flowfile): 158 | # Resolve the index, query, and filter parameters from the processor properties and FlowFile attributes 159 | index_name = context.getProperty(self.INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue() 160 | query = context.getProperty(self.QUERY).evaluateAttributeExpressions(flowfile).getValue() 161 | namespace = context.getProperty(self.NAMESPACE).evaluateAttributeExpressions(flowfile).getValue() 162 | num_results = context.getProperty(self.NUMBER_OF_RESULTS).evaluateAttributeExpressions(flowfile).asInteger() 163 | 164 | index = self.pc.Index(index_name) 165 | 166 | text_key = context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue() 167 | filter_definition = context.getProperty(self.FILTER).evaluateAttributeExpressions(flowfile).getValue() 168 | vectorstore = langchain.vectorstores.Pinecone(index, self.embeddings.embed_query, text_key, namespace=namespace) 169 | results = vectorstore.similarity_search_with_score( 170 | query, num_results, filter=None if filter_definition is None else json.loads(filter_definition) 171 | ) 172 | 173 | documents = [] 174 | for result in results: 175 | documents.append(result[0].page_content) 176 | 177 | if context.getProperty(QueryUtils.INCLUDE_METADATAS).asBoolean(): 178 | metadatas = [] 179 | for result in results: 180 | metadatas.append(result[0].metadata) 181 | else: 182 | metadatas = None 183 | 184 | if context.getProperty(QueryUtils.INCLUDE_DISTANCES).asBoolean(): 185 | distances = [] 186 | for result in results: 187 | distances.append(result[1]) 188 | else: 189 | distances = None 190 | 191 | (output_contents, mime_type) = self.query_utils.create_json( 192 | flowfile, documents, metadatas, None, distances, None 193 | ) 194 | attributes = {"mime.type": mime_type} 195 | 196 | return FlowFileTransformResult(relationship="success", contents=output_contents, attributes=attributes) 197 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QueryQdrant.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import QdrantUtils 6 | import QueryUtils 7 | from EmbeddingUtils import ( 8 | create_embedding_service, 9 | ) 10 | from 
langchain.vectorstores.qdrant import Qdrant 11 | from nifiapi.documentation import use_case 12 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 13 | from nifiapi.properties import ( 14 | ExpressionLanguageScope, 15 | PropertyDescriptor, 16 | StandardValidators, 17 | ) 18 | from qdrant_client import QdrantClient 19 | 20 | 21 | @use_case( 22 | description="Semantically search for documents stored in Qdrant - https://qdrant.tech/", 23 | keywords=["qdrant", "embedding", "vector", "text", "vectorstore", "search"], 24 | configuration=""" 25 | Configure 'Collection Name' to the name of the Qdrant collection to use. 26 | Configure 'Qdrant URL' to the fully qualified URL of the Qdrant instance. 27 | Configure 'Qdrant API Key' to the API Key to use in order to authenticate with Qdrant. 28 | Configure 'Prefer gRPC' to True if you want to use gRPC for interfacing with Qdrant. 29 | Configure 'Use HTTPS' to True if you want to use TLS(HTTPS) while interfacing with Qdrant. 30 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 31 | Configure 'HuggingFace API Key' or 'OpenAI API Key', depending on the chosen Embedding Model. 32 | Configure 'HuggingFace Model' or 'OpenAI Model' to the name of the model to use. 33 | Configure 'Query' to the text of the query to send to Qdrant. 34 | Configure 'Number of Results' to the number of results to return from Qdrant. 35 | Configure 'Metadata Filter' to apply an optional metadata filter with the query. For example: { "author": "john.doe" } 36 | Configure 'Output Strategy' to indicate how the output should be formatted: 'Row-Oriented', 'Text', or 'Column-Oriented'. 37 | Configure 'Results Field' to the name of the field to insert the results, if the input FlowFile is JSON Formatted,. 38 | Configure 'Include Metadatas' to True if metadata should be included in the output. 39 | Configure 'Include Distances' to True if distances should be included in the output. 40 | """, 41 | ) 42 | class QueryQdrant(FlowFileTransform): 43 | class Java: 44 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 45 | 46 | class ProcessorDetails: 47 | version = "2.0.0.dev0" 48 | description = "Queries Qdrant in order to gather a specified number of documents that are most closely related to the given query." 49 | tags = [ 50 | "qdrant", 51 | "vector", 52 | "vectordb", 53 | "vectorstore", 54 | "embeddings", 55 | "ai", 56 | "artificial intelligence", 57 | "ml", 58 | "machine learning", 59 | "text", 60 | "LLM", 61 | ] 62 | 63 | QUERY = PropertyDescriptor( 64 | name="Query", 65 | description="The text of the query to send to Qdrant.", 66 | required=True, 67 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 68 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 69 | ) 70 | NUMBER_OF_RESULTS = PropertyDescriptor( 71 | name="Number of Results", 72 | description="The number of results to return from Qdrant.", 73 | required=True, 74 | validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR], 75 | default_value="10", 76 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 77 | ) 78 | FILTER = PropertyDescriptor( 79 | name="Metadata Filter", 80 | description='Optional metadata filter to apply with the query. 
For example: { "author": "john.doe" }', 81 | required=False, 82 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 83 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 84 | ) 85 | 86 | properties = ( 87 | QdrantUtils.QDRANT_PROPERTIES 88 | + QdrantUtils.EMBEDDING_MODEL_PROPERTIES 89 | + [ 90 | QUERY, 91 | FILTER, 92 | NUMBER_OF_RESULTS, 93 | QueryUtils.OUTPUT_STRATEGY, 94 | QueryUtils.RESULTS_FIELD, 95 | QueryUtils.INCLUDE_METADATAS, 96 | QueryUtils.INCLUDE_DISTANCES, 97 | ] 98 | ) 99 | 100 | embeddings = None 101 | query_utils = None 102 | client = None 103 | 104 | def __init__(self, **kwargs): 105 | pass 106 | 107 | def getPropertyDescriptors(self): 108 | return self.properties 109 | 110 | def onScheduled(self, context): 111 | self.client = QdrantClient( 112 | url=context.getProperty(QdrantUtils.QDRANT_URL).getValue(), 113 | api_key=context.getProperty(QdrantUtils.QDRANT_API_KEY).getValue(), 114 | prefer_grpc=context.getProperty(QdrantUtils.PREFER_GRPC).asBoolean(), 115 | https=context.getProperty(QdrantUtils.HTTPS).asBoolean(), 116 | ) 117 | self.embeddings = create_embedding_service(context) 118 | self.query_utils = QueryUtils.QueryUtils(context) 119 | 120 | def transform(self, context, flowfile): 121 | collection_name = ( 122 | context.getProperty(QdrantUtils.COLLECTION_NAME).evaluateAttributeExpressions(flowfile).getValue() 123 | ) 124 | query = context.getProperty(self.QUERY).evaluateAttributeExpressions(flowfile).getValue() 125 | num_results = context.getProperty(self.NUMBER_OF_RESULTS).evaluateAttributeExpressions(flowfile).asInteger() 126 | filter_definition = context.getProperty(self.FILTER).evaluateAttributeExpressions(flowfile).getValue() 127 | vector_store = Qdrant( 128 | client=self.client, 129 | collection_name=collection_name, 130 | embeddings=self.embeddings, 131 | ) 132 | results = vector_store.similarity_search_with_score( 133 | query=query, 134 | k=num_results, 135 | filter=None if filter_definition is None else json.loads(filter_definition), 136 | ) 137 | 138 | documents = [] 139 | for result in results: 140 | documents.append(result[0].page_content) 141 | 142 | if context.getProperty(QueryUtils.INCLUDE_METADATAS).asBoolean(): 143 | metadatas = [] 144 | for result in results: 145 | metadatas.append(result[0].metadata) 146 | else: 147 | metadatas = None 148 | 149 | if context.getProperty(QueryUtils.INCLUDE_DISTANCES).asBoolean(): 150 | distances = [] 151 | for result in results: 152 | distances.append(result[1]) 153 | else: 154 | distances = None 155 | 156 | (output_contents, mime_type) = self.query_utils.create_json( 157 | flowfile, documents, metadatas, None, distances, None 158 | ) 159 | attributes = {"mime.type": mime_type} 160 | 161 | return FlowFileTransformResult(relationship="success", contents=output_contents, attributes=attributes) 162 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QueryUtils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | from nifiapi.properties import PropertyDependency, PropertyDescriptor, StandardValidators 6 | 7 | ROW_ORIENTED = "Row-Oriented" 8 | TEXT = "Text" 9 | COLUMN_ORIENTED = "Column-Oriented" 10 | 11 | 12 | OUTPUT_STRATEGY = PropertyDescriptor( 13 | name="Output Strategy", 14 | description="""Specifies whether the output should contain only the text of the documents (each document separated by \\n\\n), or if it 15 | should be 
formatted as either a single column-oriented JSON object, 16 | consisting of the keys 'ids', 'embeddings', 'documents', 'distances', and 'metadatas'; or if the results should be row-oriented, 17 | one JSON object per line, each consisting of a single id, document, metadata, embedding, and distance.""", 18 | allowable_values=[ROW_ORIENTED, TEXT, COLUMN_ORIENTED], 19 | default_value=ROW_ORIENTED, 20 | required=True, 21 | ) 22 | RESULTS_FIELD = PropertyDescriptor( 23 | name="Results Field", 24 | description="""If the input FlowFile is JSON Formatted, this represents the name of the field into which to insert the results. This allows the results to be inserted into 25 | an existing input in order to enrich it. If this property is unset, the results will be written to the FlowFile contents, overwriting any pre-existing content.""", 26 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 27 | required=False, 28 | ) 29 | 30 | INCLUDE_IDS = PropertyDescriptor( 31 | name="Include Document IDs", 32 | description="Whether or not to include the Documents' IDs in the response", 33 | allowable_values=["true", "false"], 34 | default_value="true", 35 | required=False, 36 | dependencies=[PropertyDependency(OUTPUT_STRATEGY, ROW_ORIENTED, COLUMN_ORIENTED)], 37 | ) 38 | INCLUDE_METADATAS = PropertyDescriptor( 39 | name="Include Metadata", 40 | description="Whether or not to include the Documents' Metadata in the response", 41 | allowable_values=["true", "false"], 42 | default_value="true", 43 | required=False, 44 | dependencies=[PropertyDependency(OUTPUT_STRATEGY, ROW_ORIENTED, COLUMN_ORIENTED)], 45 | ) 46 | INCLUDE_DOCUMENTS = PropertyDescriptor( 47 | name="Include Document", 48 | description="Whether or not to include the Documents' Text in the response", 49 | allowable_values=["true", "false"], 50 | default_value="true", 51 | required=False, 52 | dependencies=[PropertyDependency(OUTPUT_STRATEGY, ROW_ORIENTED, COLUMN_ORIENTED)], 53 | ) 54 | INCLUDE_DISTANCES = PropertyDescriptor( 55 | name="Include Distances", 56 | description="Whether or not to include the Documents' Distances (i.e., how far the Document was away from the query) in the response", 57 | allowable_values=["true", "false"], 58 | default_value="true", 59 | required=False, 60 | dependencies=[PropertyDependency(OUTPUT_STRATEGY, ROW_ORIENTED, COLUMN_ORIENTED)], 61 | ) 62 | INCLUDE_EMBEDDINGS = PropertyDescriptor( 63 | name="Include Embeddings", 64 | description="Whether or not to include the Documents' Embeddings in the response", 65 | allowable_values=["true", "false"], 66 | default_value="false", 67 | required=False, 68 | dependencies=[PropertyDependency(OUTPUT_STRATEGY, ROW_ORIENTED, COLUMN_ORIENTED)], 69 | ) 70 | 71 | 72 | class QueryUtils: 73 | context = None 74 | 75 | def __init__(self, context): 76 | self.context = context 77 | self.results_field = context.getProperty(RESULTS_FIELD).getValue() 78 | self.output_strategy = context.getProperty(OUTPUT_STRATEGY).getValue() 79 | 80 | ids_property = context.getProperty(INCLUDE_IDS) 81 | self.include_ids = ids_property.asBoolean() if ids_property else False 82 | 83 | embeddings_property = context.getProperty(INCLUDE_EMBEDDINGS) 84 | self.include_embeddings = embeddings_property.asBoolean() if embeddings_property else False 85 | 86 | self.include_distances = context.getProperty(INCLUDE_DISTANCES).asBoolean() 87 | 88 | documents_property = context.getProperty(INCLUDE_DOCUMENTS) 89 | self.include_documents = documents_property.asBoolean() if documents_property else True 90 | self.include_metadatas = 
context.getProperty(INCLUDE_METADATAS).asBoolean() 91 | 92 | def create_json(self, flowfile, documents, metadatas, embeddings, distances, ids) -> tuple[str, str]: 93 | input_json = None if self.results_field is None else json.loads(flowfile.getContentsAsBytes().decode()) 94 | 95 | if self.output_strategy == TEXT: 96 | # Delete any document that is None or an empty-string 97 | documents = [doc for doc in documents if doc is not None and doc != ""] 98 | 99 | # Join the documents with two newlines 100 | text = "\n\n".join(documents) 101 | 102 | # Create either JSON or text output, based on whether or not an results field was specified 103 | if input_json is None: 104 | mime_type = "text/plain" 105 | output_contents = text 106 | else: 107 | input_json[self.results_field] = text 108 | output_contents = json.dumps(input_json) 109 | mime_type = "application/json" 110 | elif self.output_strategy == COLUMN_ORIENTED: 111 | doc = {} 112 | if self.include_ids: 113 | doc["ids"] = ids 114 | if self.include_distances: 115 | doc["distances"] = distances 116 | if self.include_documents: 117 | doc["documents"] = documents 118 | if self.include_metadatas: 119 | doc["metadatas"] = metadatas 120 | if self.include_embeddings: 121 | doc["embeddings"] = embeddings 122 | 123 | # Create the JSON from the Document 124 | if input_json is None: 125 | output_contents = json.dumps(doc) 126 | else: 127 | input_json[self.results_field] = doc 128 | output_contents = json.dumps(input_json) 129 | 130 | mime_type = "application/json" 131 | else: 132 | # Build the Documents 133 | docs = [] 134 | 135 | count = len(ids) if ids else len(documents) 136 | for i in range(count): 137 | doc_id = None if ids is None else ids[i] 138 | distance = None if distances is None else distances[i] 139 | metadata = None if metadatas is None else metadatas[i] 140 | document = None if documents is None else documents[i] 141 | embedding = None if embeddings is None else embeddings[i] 142 | 143 | # Create the document but do not include any key that we don't want to include in the output. 144 | doc = {} 145 | if self.include_ids: 146 | doc["id"] = doc_id 147 | if self.include_distances: 148 | doc["distance"] = distance 149 | if self.include_documents: 150 | doc["document"] = document 151 | if self.include_metadatas: 152 | doc["metadata"] = metadata 153 | if self.include_embeddings: 154 | doc["embedding"] = embedding 155 | 156 | docs.append(doc) 157 | 158 | # If input_json is None, we just create JSON based on the Documents. 159 | # If input_json is populated, we insert the documents into the input JSON using the specified key. 
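            # Illustrative note, with invented values (not taken from the source files):
            # with the default Include properties (IDs, documents, metadatas, and distances enabled,
            # embeddings disabled), row-oriented output is one JSON object per line, roughly:
            #   {"id": "doc-1", "distance": 0.08, "document": "...", "metadata": {"source": "intro.md"}}
            # When a Results Field is configured and the input is JSON, the same list of objects is
            # nested under that field instead of being written out line by line.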
160 | if input_json is None: 161 | jsons = [] 162 | for doc in docs: 163 | jsons.append(json.dumps(doc)) 164 | output_contents = "\n".join(jsons) 165 | else: 166 | input_json[self.results_field] = docs 167 | output_contents = json.dumps(input_json) 168 | 169 | mime_type = "application/json" 170 | 171 | return output_contents, mime_type 172 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/requirements.txt: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Shared requirements 4 | openai==1.9.0 5 | tiktoken 6 | langchain==0.1.11 7 | 8 | # Chroma requirements 9 | chromadb==0.4.22 10 | numpy==1.26.4 11 | onnxruntime 12 | tokenizers 13 | tqdm 14 | requests 15 | 16 | # Pinecone requirements 17 | pinecone-client==3.0.1 18 | tiktoken 19 | langchain==0.1.11 20 | 21 | # OpenSearch requirements 22 | opensearch-py==2.5.0 23 | 24 | # Qdrant requirements 25 | qdrant-client==1.9.1 26 | --------------------------------------------------------------------------------
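Note on the Chroma filter properties: the 'Metadata Filter' and 'Document Filter' described in QueryChroma.py map onto the where and where_document arguments of Chroma's collection.query() call shown above. The standalone sketch below illustrates that mapping against an in-memory Chroma client; the collection name, documents, and filter values are invented for the example and are not part of this repository.

import chromadb

# In-memory client for illustration only; QueryChroma builds its client via ChromaUtils.create_client()
client = chromadb.Client()
collection = client.get_or_create_collection(name="nifi-demo")

collection.add(
    ids=["doc-1", "doc-2"],
    documents=["NiFi moves data between systems.", "Chroma stores embeddings for search."],
    metadatas=[{"topic": "nifi"}, {"topic": "chroma"}],
)

results = collection.query(
    query_texts=["how does nifi move data"],
    n_results=2,
    where={"topic": "nifi"},  # corresponds to the 'Metadata Filter' property, e.g. { "metadata_field": "some_value" }
    where_document={"$contains": "data"},  # corresponds to the 'Document Filter' property, e.g. { "$contains": "search_string" }
    include=["documents", "metadatas", "distances"],
)

# Each field comes back as a list of lists, one inner list per query text,
# which is why QueryChroma indexes every field with [0].
print(results["ids"][0], results["distances"][0])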
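Note on the OpenSearch filter properties: 'Boolean Filter', 'Efficient Filter', and 'Pre Filter' in QueryOpenSearchVector.py each accept a JSON string that the processor parses with json.loads() and forwards to langchain's OpenSearchVectorSearch. The sketch below shows the kind of values that could be supplied, following the OpenSearch query DSL; the field names and values are assumptions made for illustration and are not taken from this repository.

import json

# 'Boolean Filter' (Approximate Search): a post filter wrapping a Boolean query (assumed field names)
boolean_filter = {"bool": {"filter": {"term": {"metadata.topic": "nifi"}}}}

# 'Efficient Filter' (Approximate Search with the Lucene or Faiss engine), again with assumed fields
efficient_filter = {"bool": {"must": [{"range": {"metadata.year": {"gte": 2023}}}]}}

# 'Pre Filter' (Script Scoring or Painless Scripting searches); the property's default is a match-all query
pre_filter = {"match_all": {}}

# The processor receives each value as a string property and json.loads() it before querying
for name, value in [("boolean_filter", boolean_filter), ("efficient_filter", efficient_filter), ("pre_filter", pre_filter)]:
    print(name, json.dumps(value))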
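Note on the Output Strategy property defined in QueryUtils.py: the three strategies serialize the same query results in different shapes. The plain-Python sketch below mirrors the formatting performed by QueryUtils.create_json when no Results Field is set; the result values are invented for the example.

import json

# Hypothetical query results
ids = ["doc-1", "doc-2"]
documents = ["NiFi routes and transforms data.", "Processors are configurable components."]
metadatas = [{"source": "intro.md"}, {"source": "arch.md"}]
distances = [0.08, 0.31]

# 'Text': document texts joined with two newlines
text_output = "\n\n".join(documents)

# 'Column-Oriented': a single JSON object keyed by field name
column_output = json.dumps(
    {"ids": ids, "documents": documents, "metadatas": metadatas, "distances": distances}
)

# 'Row-Oriented': one JSON object per line, one line per matching document
row_output = "\n".join(
    json.dumps({"id": i, "document": d, "metadata": m, "distance": s})
    for i, d, m, s in zip(ids, documents, metadatas, distances)
)

print(text_output, column_output, row_output, sep="\n---\n")

When a Results Field is configured and the incoming FlowFile is JSON, create_json nests these structures under that field of the input instead of replacing the FlowFile contents.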