├── .asf.yaml
├── .github
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows
│       └── build.yml
├── .gitignore
├── .ratignore
├── LICENSE
├── NOTICE
├── README.md
├── check-licenses.sh
├── pyproject.toml
└── src
    ├── __about__.py
    ├── __init__.py
    └── extensions
        ├── chunking
        │   ├── ChunkDocument.py
        │   ├── ParseDocument.py
        │   └── __init__.py
        ├── openai
        │   ├── PromptChatGPT.py
        │   └── __init__.py
        └── vectorstores
            ├── ChromaUtils.py
            ├── EmbeddingUtils.py
            ├── OpenSearchVectorUtils.py
            ├── PutChroma.py
            ├── PutOpenSearchVector.py
            ├── PutPinecone.py
            ├── PutQdrant.py
            ├── QdrantUtils.py
            ├── QueryChroma.py
            ├── QueryOpenSearchVector.py
            ├── QueryPinecone.py
            ├── QueryQdrant.py
            ├── QueryUtils.py
            ├── __init__.py
            └── requirements.txt
/.asf.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | github: 4 | description: "Apache NiFi Python Extensions" 5 | homepage: https://nifi.apache.org/ 6 | labels: 7 | - apache 8 | - nifi 9 | - python 10 | - hacktoberfest 11 | features: 12 | wiki: false 13 | issues: false 14 | projects: false 15 | enabled_merge_buttons: 16 | squash: true 17 | autolink_jira: 18 | - NIFI 19 | protected_branches: 20 | main: 21 | required_signatures: true 22 | required_linear_history: true 23 | notifications: 24 | commits: commits@nifi.apache.org 25 | issues: issues@nifi.apache.org 26 | pullrequests: issues@nifi.apache.org 27 | jira_options: link worklog 28 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Summary 4 | 5 | [NIFI-00000](https://issues.apache.org/jira/browse/NIFI-00000) 6 | 7 | # Tracking 8 | 9 | Please complete the following tracking steps prior to pull request creation.
10 | 11 | ### Issue Tracking 12 | 13 | - [ ] [Apache NiFi Jira](https://issues.apache.org/jira/browse/NIFI) issue created 14 | 15 | ### Pull Request Tracking 16 | 17 | - [ ] Pull Request title starts with Apache NiFi Jira issue number, such as `NIFI-00000` 18 | - [ ] Pull Request commit message starts with Apache NiFi Jira issue number, such as `NIFI-00000` 19 | 20 | ### Pull Request Formatting 21 | 22 | - [ ] Pull Request based on current revision of the `main` branch 23 | - [ ] Pull Request refers to a feature branch with one commit containing changes 24 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: build 4 | 5 | on: 6 | push: 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | permissions: 16 | security-events: write 17 | contents: read 18 | pull-requests: read 19 | 20 | jobs: 21 | build: 22 | name: Python ${{ matrix.python }} on ${{ matrix.os }} 23 | runs-on: ${{ matrix.os }} 24 | strategy: 25 | fail-fast: false 26 | matrix: 27 | os: 28 | - ubuntu-22.04 29 | - macos-14 30 | python: 31 | - '3.11' 32 | - '3.12' 33 | steps: 34 | - name: Checkout Sources 35 | uses: actions/checkout@v4 36 | - name: Check Licenses 37 | run: sh check-licenses.sh 38 | - name: Setup Python ${{ matrix.python }} 39 | uses: actions/setup-python@v5 40 | with: 41 | python-version: ${{ matrix.python }} 42 | - name: Install Hatch 43 | run: | 44 | python -m pip install --upgrade pip 45 | pip install hatch 46 | - name: Check Formatting 47 | run: hatch fmt --check 48 | - name: Build Distribution 49 | run: hatch build 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 57 | __pypackages__/ 58 | 59 | # Environments 60 | .env 61 | .venv 62 | env/ 63 | venv/ 64 | ENV/ 65 | env.bak/ 66 | venv.bak/ 67 | 68 | # mkdocs documentation 69 | /site 70 | 71 | # mypy 72 | .mypy_cache/ 73 | .dmypy.json 74 | dmypy.json 75 | 76 | # Pyre type checker 77 | .pyre/ 78 | 79 | # pytype static type analyzer 80 | .pytype/ 81 | 82 | # Cython debug symbols 83 | cython_debug/ 84 | 85 | .idea/ 86 | -------------------------------------------------------------------------------- /.ratignore: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | __pycache__/* 3 | build/* 4 | dist/* 5 | downloads/* 6 | eggs/* 7 | lib/* 8 | lib64/* 9 | parts/* 10 | sdist/* 11 | var/* 12 | wheels/* 13 | share/python-wheels/* 14 | .idea/* 15 | .git/* 16 | .cache/* 17 | .ruff_cache/* 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Apache NiFi Python Extensions 2 | Copyright 2024 The Apache Software Foundation 3 | 4 | This product includes software developed at 5 | The Apache Software Foundation (http://www.apache.org/). 
6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apache NiFi Python Extensions 2 | 3 | [![license](https://img.shields.io/github/license/apache/nifi-python-extensions)](https://github.com/apache/nifi-python-extensions/blob/main/LICENSE) 4 | [![build](https://github.com/apache/nifi-python-extensions/actions/workflows/build.yml/badge.svg)](https://github.com/apache/nifi-python-extensions/actions/workflows/build.yml) 5 | [![Hatch](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg)](https://github.com/pypa/hatch) 6 | [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) 7 | 8 | The [Apache NiFi](https://nifi.apache.org) Python Extensions repository contains Processors implemented in [Python](https://www.python.org/) 9 | for deployment in Apache NiFi 2. 10 | 11 | ## Building 12 | 13 | This project uses [Hatch](https://hatch.pypa.io) to build distribution packages. 14 | 15 | ``` 16 | hatch build 17 | ``` 18 | 19 | The build command creates a source distribution in the `dist` directory. 20 | 21 | The source distribution contains an `extensions` directory that can be copied into Apache NiFi to use the packaged Processors. 22 | 23 | ## Developing 24 | 25 | The Apache NiFi [Python Developer's Guide](https://nifi.apache.org/documentation/nifi-2.0.0-M3/html/python-developer-guide.html) 26 | provides the API and implementation guidelines for Python Processors. 27 | 28 | The Hatch fmt command evaluates Python Processors against configured formatting and linting rules. 29 | 30 | ``` 31 | hatch fmt --check 32 | ``` 33 | 34 | ## Documentation 35 | 36 | The Apache NiFi [Documentation](https://nifi.apache.org/documentation/) includes reference information for project capabilities. 37 | 38 | ## Contributing 39 | 40 | The Apache NiFi [Contributor Guide](https://cwiki.apache.org/confluence/display/NIFI/Contributor+Guide) 41 | describes the process for getting involved in the development of this project. 42 | 43 | ## Issues 44 | 45 | This project uses [Jira](https://issues.apache.org/jira/browse/NIFI) for tracking bugs and features. 46 | 47 | ## Licensing 48 | 49 | This project is released under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). 50 | -------------------------------------------------------------------------------- /check-licenses.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | APACHE_RAT_VERSION="0.16.1" 5 | APACHE_RAT_JAR="apache-rat-$APACHE_RAT_VERSION.jar" 6 | APACHE_RAT_JAR_URL="https://repo1.maven.org/maven2/org/apache/rat/apache-rat/$APACHE_RAT_VERSION/$APACHE_RAT_JAR" 7 | CACHE_DIRECTORY=".cache" 8 | APACHE_RAT_JAR_PATH="$CACHE_DIRECTORY/$APACHE_RAT_JAR" 9 | APACHE_RAT_EXCLUDE_FILE=".ratignore" 10 | 11 | # Set Java command 12 | if [ -n "${JAVA_HOME-}" ]; then 13 | JAVACMD="$JAVA_HOME/bin/java" 14 | if [ !
-x "$JAVACMD" ]; then 15 | die "Java command [$JAVACMD}] not found" 16 | fi 17 | elif command -v java > /dev/null; then 18 | JAVACMD=$(command -v java) 19 | else 20 | die "Environment variable [JAVA_HOME] and command [java] not found" 21 | fi 22 | 23 | # Set curl command 24 | if command -v curl > /dev/null; then 25 | CURLCMD=$(command -v curl) 26 | else 27 | die "Command [curl] not found" 28 | fi 29 | 30 | # Download Apache Rat JAR 31 | if [ ! -d $CACHE_DIRECTORY ]; then 32 | mkdir $CACHE_DIRECTORY 33 | fi 34 | if [ ! -f $APACHE_RAT_JAR_PATH ]; then 35 | echo "Downloading Apache Rat from [$APACHE_RAT_JAR_URL]" 36 | CURL_RESULTS=$(exec $CURLCMD -f --silent --show-error -o "$APACHE_RAT_JAR_PATH" "$APACHE_RAT_JAR_URL") 37 | if [ $? -ne 0 ]; then 38 | echo "Failed to download Apache Rat from [$APACHE_RAT_JAR_URL]" 39 | exit $? 40 | fi 41 | fi 42 | 43 | # Run Apache Rat 44 | REPORT_RESULTS=$(exec $JAVACMD -jar $APACHE_RAT_JAR_PATH --scan-hidden-directories --exclude-file $APACHE_RAT_EXCLUDE_FILE --dir . 2>&1) 45 | if [ $? -ne 0 ]; then 46 | echo "$REPORT_RESULTS" 47 | exit $? 48 | fi 49 | 50 | UNKNOWN_LICENSES_FOUND=$(echo "$REPORT_RESULTS" | grep --count "??") 51 | echo "Unknown Licenses Found: $UNKNOWN_LICENSES_FOUND" 52 | 53 | if [ $UNKNOWN_LICENSES_FOUND -eq 0 ]; then 54 | RESULT_CODE=0 55 | else 56 | RESULT_CODE=1 57 | echo "$REPORT_RESULTS" 58 | fi 59 | 60 | exit $RESULT_CODE 61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | [build-system] 4 | requires = ["hatchling"] 5 | build-backend = "hatchling.build" 6 | 7 | [project] 8 | name = "nifi-python-extensions" 9 | dynamic = ["version"] 10 | description = "Apache NiFi Processors implemented in Python" 11 | requires-python = ">=3.11" 12 | keywords = ["apache", "nifi", "extensions", "processors"] 13 | readme = "README.md" 14 | authors = [ 15 | { name = "Apache NiFi Developers", email = "dev@nifi.apache.org" }, 16 | ] 17 | maintainers = [ 18 | { name = "Apache NiFi Developers", email = "dev@nifi.apache.org" }, 19 | ] 20 | classifiers = [ 21 | "Development Status :: 5 - Production/Stable", 22 | "License :: OSI Approved :: Apache Software License", 23 | "Intended Audience :: Developers", 24 | "Programming Language :: Python", 25 | "Programming Language :: Python :: 3.11", 26 | "Programming Language :: Python :: 3.12", 27 | "Framework :: Hatch", 28 | ] 29 | 30 | [project.urls] 31 | Homepage = "https://nifi.apache.org" 32 | Issues = "https://issues.apache.org/jira/projects/NIFI/issues" 33 | Source = "https://github.com/apache/nifi-python-extensions" 34 | 35 | [tool.hatch.version] 36 | path = "src/__about__.py" 37 | 38 | [[tool.hatch.envs.all.matrix]] 39 | python = ["3.11", "3.12"] 40 | 41 | [tool.hatch.build.targets.wheel] 42 | packages = ["src/extensions"] 43 | 44 | [tool.hatch.build.targets.sdist] 45 | exclude = [ 46 | ".asf.yaml", 47 | ".github", 48 | ".ratignore", 49 | "check-licenses.sh", 50 | ] 51 | 52 | [tool.ruff] 53 | preview = true 54 | lint.pep8-naming.extend-ignore-names = [ 55 | "flowFile", 56 | "getPropertyDescriptors", 57 | "onScheduled", 58 | ] 59 | lint.flake8-self.extend-ignore-names = [ 60 | "_standard_validators" 61 | ] 62 | lint.extend-select = [ 63 | "CPY001" 64 | ] 65 | lint.ignore = [ 66 | "G004", # Allow f-string for logging 67 | "N999", # Allow Processor module names that do not follow pep8-naming 68 | "PERF401", # Allow manual list 
comprehension 69 | "RUF012", # Allow mutable class attributes without typing.ClassVar 70 | "S105", # Avoid checking for hardcoded-password-string values 71 | ] 72 | 73 | [tool.ruff.lint.flake8-copyright] 74 | notice-rgx = "# SPDX-License-Identifier: Apache-2.0\n" 75 | -------------------------------------------------------------------------------- /src/__about__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | __version__ = "2.0.0.dev0" 4 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | -------------------------------------------------------------------------------- /src/extensions/chunking/ChunkDocument.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | from langchain.text_splitter import Language 6 | from nifiapi.documentation import ProcessorConfiguration, multi_processor_use_case, use_case 7 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 8 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 9 | 10 | SPLIT_BY_CHARACTER = "Split by Character" 11 | SPLIT_CODE = "Split Code" 12 | RECURSIVELY_SPLIT_BY_CHARACTER = "Recursively Split by Character" 13 | 14 | TEXT_KEY = "text" 15 | METADATA_KEY = "metadata" 16 | 17 | 18 | @use_case( 19 | description="Create chunks of text from a single larger chunk.", 20 | notes="The input for this use case is expected to be a FlowFile whose content is a JSON Lines document, with each line having a 'text' and a 'metadata' element.", 21 | keywords=["embedding", "vector", "text", "rag", "retrieval augmented generation"], 22 | configuration=""" 23 | Set "Input Format" to "Plain Text" 24 | Set "Element Strategy" to "Single Document" 25 | """, 26 | ) 27 | @multi_processor_use_case( 28 | description=""" 29 | Chunk Plaintext data in order to prepare it for storage in a vector store. The output is in "json-lines" format, 30 | containing the chunked data as text, as well as metadata pertaining to the chunk.""", 31 | notes="The input for this use case is expected to be a FlowFile whose content is a plaintext document.", 32 | keywords=["embedding", "vector", "text", "rag", "retrieval augmented generation"], 33 | configurations=[ 34 | ProcessorConfiguration( 35 | processor_type="ParseDocument", 36 | configuration=""" 37 | Set "Input Format" to "Plain Text" 38 | Set "Element Strategy" to "Single Document" 39 | 40 | Connect the 'success' Relationship to ChunkDocument. 41 | """, 42 | ), 43 | ProcessorConfiguration( 44 | processor_type="ChunkDocument", 45 | configuration=""" 46 | Set the following properties: 47 | "Chunking Strategy" = "Recursively Split by Character" 48 | "Separator" = "\\n\\n,\\n, ," 49 | "Separator Format" = "Plain Text" 50 | "Chunk Size" = "4000" 51 | "Chunk Overlap" = "200" 52 | "Keep Separator" = "false" 53 | 54 | Connect the 'success' Relationship to the appropriate destination to store data in the desired vector store. 55 | """, 56 | ), 57 | ], 58 | ) 59 | @multi_processor_use_case( 60 | description=""" 61 | Parse and chunk the textual contents of a PDF document in order to prepare it for storage in a vector store. 
The output is in "json-lines" format, 62 | containing the chunked data as text, as well as metadata pertaining to the chunk.""", 63 | notes="The input for this use case is expected to be a FlowFile whose content is a PDF document.", 64 | keywords=["pdf", "embedding", "vector", "text", "rag", "retrieval augmented generation"], 65 | configurations=[ 66 | ProcessorConfiguration( 67 | processor_type="ParseDocument", 68 | configuration=""" 69 | Set "Input Format" to "PDF" 70 | Set "Element Strategy" to "Single Document" 71 | Set "Include Extracted Metadata" to "false" 72 | 73 | Connect the 'success' Relationship to ChunkDocument. 74 | """, 75 | ), 76 | ProcessorConfiguration( 77 | processor_type="ChunkDocument", 78 | configuration=""" 79 | Set the following properties: 80 | "Chunking Strategy" = "Recursively Split by Character" 81 | "Separator" = "\\n\\n,\\n, ," 82 | "Separator Format" = "Plain Text" 83 | "Chunk Size" = "4000" 84 | "Chunk Overlap" = "200" 85 | "Keep Separator" = "false" 86 | 87 | Connect the 'success' Relationship to the appropriate destination to store data in the desired vector store. 88 | """, 89 | ), 90 | ], 91 | ) 92 | class ChunkDocument(FlowFileTransform): 93 | class Java: 94 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 95 | 96 | class ProcessorDetails: 97 | version = "2.0.0.dev0" 98 | description = """Chunks incoming documents that are formatted as JSON Lines into chunks that are appropriately sized for creating Text Embeddings. 99 | The input is expected to be in "json-lines" format, with each line having a 'text' and a 'metadata' element. 100 | Each line will then be split into one or more lines in the output.""" 101 | tags = [ 102 | "text", 103 | "split", 104 | "chunk", 105 | "langchain", 106 | "embeddings", 107 | "vector", 108 | "machine learning", 109 | "ML", 110 | "artificial intelligence", 111 | "ai", 112 | "document", 113 | ] 114 | dependencies = ["langchain"] 115 | 116 | CHUNK_STRATEGY = PropertyDescriptor( 117 | name="Chunking Strategy", 118 | description="Specifies which splitter should be used to split the text", 119 | allowable_values=[RECURSIVELY_SPLIT_BY_CHARACTER, SPLIT_BY_CHARACTER, SPLIT_CODE], 120 | required=True, 121 | default_value=RECURSIVELY_SPLIT_BY_CHARACTER, 122 | ) 123 | SEPARATOR = PropertyDescriptor( 124 | name="Separator", 125 | description="""Specifies the character sequence to use for splitting apart the text. If using a Chunking Strategy of Recursively Split by Character, 126 | it is a comma-separated list of character sequences. 
Meta-characters \\n, \\r and \\t are automatically un-escaped.""", 127 | required=True, 128 | default_value="\\n\\n,\\n, ,", 129 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 130 | dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)], 131 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 132 | ) 133 | SEPARATOR_FORMAT = PropertyDescriptor( 134 | name="Separator Format", 135 | description="Specifies how to interpret the value of the property", 136 | required=True, 137 | default_value="Plain Text", 138 | allowable_values=["Plain Text", "Regular Expression"], 139 | dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)], 140 | ) 141 | CHUNK_SIZE = PropertyDescriptor( 142 | name="Chunk Size", 143 | description="The maximum size of a chunk that should be returned", 144 | required=True, 145 | default_value="4000", 146 | validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR], 147 | ) 148 | CHUNK_OVERLAP = PropertyDescriptor( 149 | name="Chunk Overlap", 150 | description="The number of characters that should be overlapped between each chunk of text", 151 | required=True, 152 | default_value="200", 153 | validators=[StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR], 154 | ) 155 | KEEP_SEPARATOR = PropertyDescriptor( 156 | name="Keep Separator", 157 | description="Whether or not to keep the text separator in each chunk of data", 158 | required=True, 159 | default_value="false", 160 | allowable_values=["true", "false"], 161 | dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)], 162 | ) 163 | STRIP_WHITESPACE = PropertyDescriptor( 164 | name="Strip Whitespace", 165 | description="Whether or not to strip the whitespace at the beginning and end of each chunk", 166 | required=True, 167 | default_value="true", 168 | allowable_values=["true", "false"], 169 | dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)], 170 | ) 171 | LANGUAGE = PropertyDescriptor( 172 | name="Language", 173 | description="The language to use for the Code's syntax", 174 | required=True, 175 | default_value="python", 176 | allowable_values=[e.value for e in Language], 177 | dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_CODE)], 178 | ) 179 | 180 | property_descriptors = [ 181 | CHUNK_STRATEGY, 182 | SEPARATOR, 183 | SEPARATOR_FORMAT, 184 | CHUNK_SIZE, 185 | CHUNK_OVERLAP, 186 | KEEP_SEPARATOR, 187 | STRIP_WHITESPACE, 188 | LANGUAGE, 189 | ] 190 | 191 | def __init__(self, **kwargs): 192 | pass 193 | 194 | def getPropertyDescriptors(self): 195 | return self.property_descriptors 196 | 197 | def split_docs(self, context, flowfile, documents): 198 | from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter 199 | 200 | strategy = context.getProperty(self.CHUNK_STRATEGY).getValue() 201 | if strategy == SPLIT_BY_CHARACTER: 202 | text_splitter = CharacterTextSplitter( 203 | separator=context.getProperty(self.SEPARATOR).evaluateAttributeExpressions(flowfile).getValue(), 204 | keep_separator=context.getProperty(self.KEEP_SEPARATOR).asBoolean(), 205 | is_separator_regex=context.getProperty(self.SEPARATOR_FORMAT).getValue() == "Regular Expression", 206 | chunk_size=context.getProperty(self.CHUNK_SIZE).asInteger(), 207 | chunk_overlap=context.getProperty(self.CHUNK_OVERLAP).asInteger(), 208 | length_function=len, 209 | 
strip_whitespace=context.getProperty(self.STRIP_WHITESPACE).asBoolean(), 210 | ) 211 | elif strategy == SPLIT_CODE: 212 | text_splitter = RecursiveCharacterTextSplitter.from_language( 213 | language=context.getProperty(self.LANGUAGE).getValue(), 214 | chunk_size=context.getProperty(self.CHUNK_SIZE).asInteger(), 215 | chunk_overlap=context.getProperty(self.CHUNK_OVERLAP).asInteger(), 216 | ) 217 | else: 218 | separator_text = context.getProperty(self.SEPARATOR).evaluateAttributeExpressions(flowfile).getValue() 219 | splits = separator_text.split(",") 220 | unescaped = [] 221 | for split in splits: 222 | unescaped.append(split.replace("\\n", "\n").replace("\\r", "\r").replace("\\t", "\t")) 223 | text_splitter = RecursiveCharacterTextSplitter( 224 | separators=unescaped, 225 | keep_separator=context.getProperty(self.KEEP_SEPARATOR).asBoolean(), 226 | is_separator_regex=context.getProperty(self.SEPARATOR_FORMAT).getValue() == "Regular Expression", 227 | chunk_size=context.getProperty(self.CHUNK_SIZE).asInteger(), 228 | chunk_overlap=context.getProperty(self.CHUNK_OVERLAP).asInteger(), 229 | length_function=len, 230 | strip_whitespace=context.getProperty(self.STRIP_WHITESPACE).asBoolean(), 231 | ) 232 | 233 | return text_splitter.split_documents(documents) 234 | 235 | def to_json(self, docs) -> str: 236 | json_docs = [] 237 | 238 | for i, doc in enumerate(docs): 239 | doc.metadata["chunk_index"] = i 240 | doc.metadata["chunk_count"] = len(docs) 241 | 242 | json_doc = json.dumps({TEXT_KEY: doc.page_content, METADATA_KEY: doc.metadata}) 243 | json_docs.append(json_doc) 244 | 245 | return "\n".join(json_docs) 246 | 247 | def load_docs(self, flowfile): 248 | from langchain.schema import Document 249 | 250 | flowfile_contents = flowfile.getContentsAsBytes().decode() 251 | docs = [] 252 | for line in flowfile_contents.split("\n"): 253 | stripped = line.strip() 254 | if stripped == "": 255 | continue 256 | 257 | json_element = json.loads(stripped) 258 | page_content = json_element.get(TEXT_KEY) 259 | if page_content is None: 260 | continue 261 | 262 | metadata = json_element.get(METADATA_KEY) 263 | if metadata is None: 264 | metadata = {} 265 | 266 | doc = Document(page_content=page_content, metadata=metadata) 267 | docs.append(doc) 268 | 269 | return docs 270 | 271 | def transform(self, context, flowfile): 272 | documents = self.load_docs(flowfile) 273 | split_docs = self.split_docs(context, flowfile, documents) 274 | 275 | output_json = self.to_json(split_docs) 276 | attributes = {"document.count": str(len(split_docs))} 277 | return FlowFileTransformResult("success", contents=output_json, attributes=attributes) 278 | -------------------------------------------------------------------------------- /src/extensions/chunking/ParseDocument.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import io 4 | import json 5 | 6 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 7 | from nifiapi.properties import PropertyDependency, PropertyDescriptor, StandardValidators 8 | 9 | PLAIN_TEXT = "Plain Text" 10 | HTML = "HTML" 11 | MARKDOWN = "Markdown" 12 | PDF = "PDF" 13 | EXCEL = "Microsoft Excel" 14 | POWERPOINT = "Microsoft PowerPoint" 15 | WORD = "Microsoft Word" 16 | 17 | PARSING_STRATEGY_AUTO = "Automatic" 18 | PARSING_STRATEGY_HIGH_RES = "High Resolution" 19 | PARSING_STRATEGY_OCR_ONLY = "OCR Only" 20 | PARSING_STRATEGY_FAST = "Fast" 21 | 22 | SINGLE_DOCUMENT = "Single Document" 23 | 
DOCUMENT_PER_ELEMENT = "Document Per Element" 24 | 25 | TEXT_KEY = "text" 26 | METADATA_KEY = "metadata" 27 | 28 | 29 | class ParseDocument(FlowFileTransform): 30 | class Java: 31 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 32 | 33 | class ProcessorDetails: 34 | version = "2.0.0.dev0" 35 | description = """Parses incoming unstructured text documents and performs optical character recognition (OCR) in order to extract text from PDF and image files. 36 | The output is formatted as "json-lines" with two keys: 'text' and 'metadata'. 37 | Note that use of this Processor may require significant storage space and RAM utilization due to third-party dependencies necessary for processing PDF and image files. 38 | Also note that in order to process PDF or Images, Tesseract and Poppler must be installed on the system.""" 39 | tags = [ 40 | "text", 41 | "embeddings", 42 | "vector", 43 | "machine learning", 44 | "ML", 45 | "artificial intelligence", 46 | "ai", 47 | "document", 48 | "langchain", 49 | "pdf", 50 | "html", 51 | "markdown", 52 | "word", 53 | "excel", 54 | "powerpoint", 55 | ] 56 | dependencies = [ 57 | "pikepdf==8.12.0", 58 | "pypdf==4.0.1", 59 | "langchain==0.1.7", 60 | "unstructured==0.14.8", 61 | "unstructured-inference==0.7.36", 62 | "unstructured_pytesseract==0.3.12", 63 | "pillow-heif==0.15.0", 64 | "numpy==1.26.4", 65 | "opencv-python==4.9.0.80", 66 | "pdf2image==1.17.0", 67 | "pdfminer.six==20221105", 68 | "python-docx==1.1.0", 69 | "openpyxl==3.1.2", 70 | "python-pptx==0.6.23", 71 | ] 72 | 73 | INPUT_FORMAT = PropertyDescriptor( 74 | name="Input Format", 75 | description="""The format of the input FlowFile. This dictates which TextLoader will be used to parse the input. 76 | Note that in order to process images or extract tables from PDF files,you must have both 'poppler' and 'tesseract' installed on your system.""", 77 | allowable_values=[PLAIN_TEXT, HTML, MARKDOWN, PDF, WORD, EXCEL, POWERPOINT], 78 | required=True, 79 | default_value=PLAIN_TEXT, 80 | ) 81 | PDF_PARSING_STRATEGY = PropertyDescriptor( 82 | name="PDF Parsing Strategy", 83 | display_name="Parsing Strategy", 84 | description="Specifies the strategy to use when parsing a PDF", 85 | allowable_values=[ 86 | PARSING_STRATEGY_AUTO, 87 | PARSING_STRATEGY_HIGH_RES, 88 | PARSING_STRATEGY_OCR_ONLY, 89 | PARSING_STRATEGY_FAST, 90 | ], 91 | required=True, 92 | default_value=PARSING_STRATEGY_AUTO, 93 | dependencies=[PropertyDependency(INPUT_FORMAT, PDF)], 94 | ) 95 | PDF_MODEL_NAME = PropertyDescriptor( 96 | name="PDF Parsing Model", 97 | description="The model to use for parsing. 
Different models will have their own strengths and weaknesses.", 98 | allowable_values=["yolox", "detectron2_onnx", "chipper"], 99 | required=True, 100 | default_value="yolox", 101 | dependencies=[PropertyDependency(INPUT_FORMAT, PDF)], 102 | ) 103 | ELEMENT_STRATEGY = PropertyDescriptor( 104 | name="Element Strategy", 105 | description="Specifies whether the input should be loaded as a single Document, or if each element in the input should be separated out into its own Document", 106 | allowable_values=[SINGLE_DOCUMENT, DOCUMENT_PER_ELEMENT], 107 | required=True, 108 | default_value=DOCUMENT_PER_ELEMENT, 109 | dependencies=[PropertyDependency(INPUT_FORMAT, HTML, MARKDOWN)], 110 | ) 111 | INCLUDE_PAGE_BREAKS = PropertyDescriptor( 112 | name="Include Page Breaks", 113 | description="Specifies whether or not page breaks should be considered when creating Documents from the input", 114 | allowable_values=["true", "false"], 115 | required=True, 116 | default_value="false", 117 | dependencies=[ 118 | PropertyDependency(INPUT_FORMAT, HTML, MARKDOWN), 119 | PropertyDependency(ELEMENT_STRATEGY, DOCUMENT_PER_ELEMENT), 120 | ], 121 | ) 122 | PDF_INFER_TABLE_STRUCTURE = PropertyDescriptor( 123 | name="Infer Table Structure", 124 | description="If true, any table that is identified in the PDF will be parsed and translated into an HTML structure. The HTML of that table will then be added to the \ 125 | Document's metadata in a key named 'text_as_html'. Regardless of the value of this property, the textual contents of the table will be written to the contents \ 126 | without the structure.", 127 | allowable_values=["true", "false"], 128 | default_value="false", 129 | required=True, 130 | dependencies=[PropertyDependency(PDF_PARSING_STRATEGY, PARSING_STRATEGY_HIGH_RES)], 131 | ) 132 | LANGUAGES = PropertyDescriptor( 133 | name="Languages", 134 | description="A comma-separated list of language codes that should be used when using OCR to determine the text.", 135 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 136 | default_value="Eng", 137 | required=True, 138 | dependencies=[PropertyDependency(INPUT_FORMAT, PDF)], 139 | ) 140 | METADATA_FIELDS = PropertyDescriptor( 141 | name="Metadata Fields", 142 | description="A comma-separated list of FlowFile attributes that will be added to the Documents' Metadata", 143 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 144 | default_value="filename, uuid", 145 | required=True, 146 | ) 147 | EXTRACT_METADATA = PropertyDescriptor( 148 | name="Include Extracted Metadata", 149 | description="Whether or not to include the metadata that is extracted from the input in each of the Documents", 150 | allowable_values=["true", "false"], 151 | default_value="true", 152 | required=True, 153 | ) 154 | 155 | property_descriptors = [ 156 | INPUT_FORMAT, 157 | PDF_PARSING_STRATEGY, 158 | PDF_MODEL_NAME, 159 | ELEMENT_STRATEGY, 160 | INCLUDE_PAGE_BREAKS, 161 | PDF_INFER_TABLE_STRUCTURE, 162 | LANGUAGES, 163 | METADATA_FIELDS, 164 | EXTRACT_METADATA, 165 | ] 166 | 167 | def __init__(self, **kwargs): 168 | pass 169 | 170 | def getPropertyDescriptors(self): 171 | return self.property_descriptors 172 | 173 | def get_parsing_strategy(self, nifi_value: str, default_value: str) -> str: 174 | if nifi_value == PARSING_STRATEGY_OCR_ONLY: 175 | return "ocr_only" 176 | if nifi_value == PARSING_STRATEGY_HIGH_RES: 177 | return "hi_res" 178 | if nifi_value == PARSING_STRATEGY_FAST: 179 | return "fast" 180 | if nifi_value == PARSING_STRATEGY_AUTO: 181 | return "auto" 182 | return 
default_value 183 | 184 | def get_languages(self, nifi_value: str) -> list[str]: 185 | return [lang.strip() for lang in nifi_value.split(",")] 186 | 187 | def create_docs(self, context, flowFile): 188 | from langchain.schema import Document 189 | 190 | metadata = {} 191 | 192 | for attribute_name in context.getProperty(self.METADATA_FIELDS).getValue().split(","): 193 | trimmed = attribute_name.strip() 194 | value = flowFile.getAttribute(trimmed) 195 | metadata[trimmed] = value 196 | 197 | input_format = context.getProperty(self.INPUT_FORMAT).evaluateAttributeExpressions(flowFile).getValue() 198 | if input_format == PLAIN_TEXT: 199 | return [Document(page_content=flowFile.getContentsAsBytes().decode("utf-8"), metadata=metadata)] 200 | 201 | element_strategy = context.getProperty(self.ELEMENT_STRATEGY).getValue() 202 | mode = "single" if element_strategy == SINGLE_DOCUMENT else "elements" 203 | 204 | include_page_breaks = context.getProperty(self.INCLUDE_PAGE_BREAKS).asBoolean() 205 | include_metadata = context.getProperty(self.EXTRACT_METADATA).asBoolean() 206 | 207 | if input_format == HTML: 208 | from langchain.document_loaders import UnstructuredHTMLLoader 209 | 210 | loader = UnstructuredHTMLLoader( 211 | None, 212 | file=io.BytesIO(flowFile.getContentsAsBytes()), 213 | mode=mode, 214 | include_page_breaks=include_page_breaks, 215 | include_metadata=include_metadata, 216 | ) 217 | 218 | elif input_format == PDF: 219 | from langchain.document_loaders import UnstructuredPDFLoader 220 | 221 | infer_table_structure = context.getProperty(self.PDF_INFER_TABLE_STRUCTURE).asBoolean() 222 | strategy = self.get_parsing_strategy( 223 | context.getProperty(self.PDF_PARSING_STRATEGY).getValue(), PARSING_STRATEGY_AUTO 224 | ) 225 | languages = self.get_languages(context.getProperty(self.LANGUAGES).getValue()) 226 | model_name = context.getProperty(self.PDF_MODEL_NAME).getValue() 227 | 228 | loader = UnstructuredPDFLoader( 229 | None, 230 | file=io.BytesIO(flowFile.getContentsAsBytes()), 231 | mode=mode, 232 | infer_table_structure=infer_table_structure, 233 | include_page_breaks=include_page_breaks, 234 | languages=languages, 235 | strategy=strategy, 236 | include_metadata=include_metadata, 237 | model_name=model_name, 238 | ) 239 | 240 | elif input_format == MARKDOWN: 241 | from langchain.document_loaders import UnstructuredMarkdownLoader 242 | 243 | loader = UnstructuredMarkdownLoader( 244 | None, 245 | file=io.BytesIO(flowFile.getContentsAsBytes()), 246 | mode=mode, 247 | include_page_breaks=include_page_breaks, 248 | include_metadata=include_metadata, 249 | ) 250 | 251 | elif input_format == WORD: 252 | from langchain.document_loaders import UnstructuredWordDocumentLoader 253 | 254 | loader = UnstructuredWordDocumentLoader( 255 | None, 256 | file=io.BytesIO(flowFile.getContentsAsBytes()), 257 | mode=mode, 258 | include_page_breaks=include_page_breaks, 259 | include_metadata=include_metadata, 260 | ) 261 | 262 | elif input_format == EXCEL: 263 | from langchain.document_loaders import UnstructuredExcelLoader 264 | 265 | loader = UnstructuredExcelLoader( 266 | None, 267 | file=io.BytesIO(flowFile.getContentsAsBytes()), 268 | mode=mode, 269 | include_page_breaks=include_page_breaks, 270 | include_metadata=include_metadata, 271 | ) 272 | 273 | elif input_format == POWERPOINT: 274 | from langchain.document_loaders import UnstructuredPowerPointLoader 275 | 276 | loader = UnstructuredPowerPointLoader( 277 | None, 278 | file=io.BytesIO(flowFile.getContentsAsBytes()), 279 | mode=mode, 280 | 
include_page_breaks=include_page_breaks, 281 | include_metadata=include_metadata, 282 | ) 283 | 284 | else: 285 | raise ValueError("Configured Input Format is invalid: " + input_format) 286 | 287 | documents = loader.load() 288 | 289 | if len(metadata) > 0: 290 | for doc in documents: 291 | if doc.metadata is None: 292 | doc.metadata = metadata 293 | else: 294 | doc.metadata.update(metadata) 295 | 296 | return documents 297 | 298 | def to_json(self, docs) -> str: 299 | json_docs = [] 300 | 301 | for i, doc in enumerate(docs): 302 | doc.metadata["chunk_index"] = i 303 | doc.metadata["chunk_count"] = len(docs) 304 | 305 | json_doc = json.dumps({"text": doc.page_content, "metadata": doc.metadata}) 306 | json_docs.append(json_doc) 307 | 308 | return "\n".join(json_docs) 309 | 310 | def transform(self, context, flowFile): 311 | documents = self.create_docs(context, flowFile) 312 | output_json = self.to_json(documents) 313 | 314 | return FlowFileTransformResult("success", contents=output_json, attributes={"mime.type": "application/json"}) 315 | -------------------------------------------------------------------------------- /src/extensions/chunking/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | -------------------------------------------------------------------------------- /src/extensions/openai/PromptChatGPT.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | import re 5 | 6 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 7 | from nifiapi.properties import ExpressionLanguageScope, PropertyDescriptor, StandardValidators, TimeUnit 8 | 9 | FLOWFILE_CONTENT = "flowfile_content" 10 | FLOWFILE_CONTENT_REFERENCE = "{" + FLOWFILE_CONTENT + "}" 11 | # Regex to match { followed by any number of characters other than { or }, followed by }. But do not match if it starts with {{ 12 | VAR_NAME_REGEX = r"(? and we will keep a mapping from that name to 130 | # the substituted variable name so that we can later determine what the JSONPath expression was. 131 | variable_references = list(set(re.findall(VAR_NAME_REGEX, prompt))) 132 | 133 | input_variables = [] 134 | jsonpath_to_var_mapping = {} 135 | index = 0 136 | for ref in variable_references: 137 | if ref.startswith("$"): 138 | var_name = "jsonpath_var_" + str(index) 139 | index += 1 140 | input_variables.append(var_name) 141 | jsonpath_to_var_mapping[ref] = var_name 142 | prompt = prompt.replace("{" + ref + "}", "{" + var_name + "}") 143 | elif ref == FLOWFILE_CONTENT: 144 | input_variables.append(ref) 145 | else: 146 | raise ValueError( 147 | "Prompt contained an invalid variable reference: {" 148 | + ref 149 | + "}. Valid references are flowfile_content or any JSONPath expression." 
150 | ) 151 | 152 | temperature = context.getProperty(self.TEMPERATURE).evaluateAttributeExpressions(flowFile).asFloat() 153 | model_name = context.getProperty(self.MODEL).evaluateAttributeExpressions(flowFile).getValue() 154 | api_key = context.getProperty(self.API_KEY).getValue() 155 | timeout = context.getProperty(self.TIMEOUT).asTimePeriod(TimeUnit.SECONDS) 156 | max_tokens = context.getProperty(self.MAX_TOKENS).asInteger() 157 | organization = context.getProperty(self.ORGANIZATION).getValue() 158 | api_base = context.getProperty(self.API_BASE).getValue() 159 | 160 | # Build out our LLMChain 161 | llm = ChatOpenAI( 162 | model_name=model_name, 163 | temperature=temperature, 164 | openai_api_key=api_key, 165 | request_timeout=timeout, 166 | max_retries=0, 167 | max_tokens=max_tokens, 168 | openai_organization=organization, 169 | openai_api_base=api_base, 170 | ) 171 | 172 | prompt_template = PromptTemplate(template=prompt, input_variables=input_variables) 173 | 174 | llm_chain = LLMChain(llm=llm, prompt=prompt_template) 175 | 176 | # Substitute in any JSON Path Expressions or references to {flowfile_content}. 177 | llm_args = {} 178 | json_content = None 179 | for var_name in variable_references: 180 | # If variable references {flowfile_content} substitute the content 181 | if var_name == FLOWFILE_CONTENT: 182 | llm_args[FLOWFILE_CONTENT] = flowFile.getContentsAsBytes().decode() 183 | if var_name.startswith("$"): 184 | # Load the FlowFile's contents into the json_content variable only once 185 | if json_content is None: 186 | json_content = json.loads(flowFile.getContentsAsBytes().decode()) 187 | 188 | # Import jsonpath_ng so that we can evaluate JSONPath against the FlowFile content. 189 | from jsonpath_ng import parse 190 | 191 | try: 192 | jsonpath_expression = parse(var_name) 193 | matches = jsonpath_expression.find(json_content) 194 | variable_value = "\n".join([match.value for match in matches]) 195 | except: 196 | self.logger.exception(f"Invalid JSONPath reference in prompt: {var_name}") 197 | raise 198 | 199 | # Insert the resolved value into llm_args 200 | resolved_var_name = jsonpath_to_var_mapping.get(var_name) 201 | llm_args[resolved_var_name] = variable_value 202 | 203 | self.logger.debug(f"Evaluating prompt\nPrompt: {prompt}\nArgs: #{llm_args}") 204 | 205 | # Run the LLM Chain in order to prompt ChatGPT 206 | results = llm_chain(llm_args) 207 | 208 | # Create the output content or FLowFile attribute 209 | text = results["text"] 210 | attribute_name = context.getProperty(self.RESULT_ATTRIBUTE).getValue() 211 | if attribute_name is None: 212 | output_content = text 213 | output_attributes = None 214 | else: 215 | output_content = None 216 | output_attributes = {attribute_name: text} 217 | 218 | # Return the results 219 | return FlowFileTransformResult("success", contents=output_content, attributes=output_attributes) 220 | -------------------------------------------------------------------------------- /src/extensions/openai/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/ChromaUtils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 4 | 5 | # Connection Strategies 6 | LOCAL_DISK = 
"Local Disk" 7 | REMOTE_SERVER = "Remote Chroma Server" 8 | 9 | # Authentication Strategies 10 | TOKEN = "Token Authentication" 11 | BASIC_AUTH = "Basic Authentication" 12 | NONE = "None" 13 | 14 | # Transport Protocols 15 | HTTP = "http" 16 | HTTPS = "https" 17 | 18 | CONNECTION_STRATEGY = PropertyDescriptor( 19 | name="Connection Strategy", 20 | description="Specifies how to connect to the Chroma server", 21 | allowable_values=[LOCAL_DISK, REMOTE_SERVER], 22 | default_value=REMOTE_SERVER, 23 | required=True, 24 | ) 25 | DIRECTORY = PropertyDescriptor( 26 | name="Directory", 27 | description="The Directory that Chroma should use to persist data", 28 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 29 | required=True, 30 | default_value="./chroma", 31 | dependencies=[PropertyDependency(CONNECTION_STRATEGY, LOCAL_DISK)], 32 | ) 33 | HOSTNAME = PropertyDescriptor( 34 | name="Hostname", 35 | description="The hostname to connect to in order to communicate with Chroma", 36 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 37 | default_value="localhost", 38 | required=True, 39 | dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)], 40 | ) 41 | PORT = PropertyDescriptor( 42 | name="Port", 43 | description="The port that the Chroma server is listening on", 44 | validators=[StandardValidators.PORT_VALIDATOR], 45 | default_value="8000", 46 | required=True, 47 | dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)], 48 | ) 49 | TRANSPORT_PROTOCOL = PropertyDescriptor( 50 | name="Transport Protocol", 51 | description="Specifies whether connections should be made over http or https", 52 | allowable_values=[HTTP, HTTPS], 53 | default_value=HTTPS, 54 | required=True, 55 | dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)], 56 | ) 57 | AUTH_STRATEGY = PropertyDescriptor( 58 | name="Authentication Strategy", 59 | description="Specifies how to authenticate to Chroma server", 60 | allowable_values=[TOKEN, BASIC_AUTH, NONE], 61 | default_value=TOKEN, 62 | required=True, 63 | dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)], 64 | ) 65 | AUTH_TOKEN = PropertyDescriptor( 66 | name="Authentication Token", 67 | description="The token to use for authenticating to Chroma server", 68 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 69 | required=True, 70 | sensitive=True, 71 | dependencies=[PropertyDependency(AUTH_STRATEGY, TOKEN)], 72 | ) 73 | USERNAME = PropertyDescriptor( 74 | name="Username", 75 | description="The username to use for authenticating to Chroma server", 76 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 77 | required=True, 78 | dependencies=[PropertyDependency(AUTH_STRATEGY, BASIC_AUTH)], 79 | ) 80 | PASSWORD = PropertyDescriptor( 81 | name="Password", 82 | description="The password to use for authenticating to Chroma server", 83 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 84 | required=True, 85 | sensitive=True, 86 | dependencies=[PropertyDependency(AUTH_STRATEGY, BASIC_AUTH)], 87 | ) 88 | COLLECTION_NAME = PropertyDescriptor( 89 | name="Collection Name", 90 | description="The name of the Chroma Collection", 91 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 92 | required=True, 93 | default_value="nifi", 94 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 95 | ) 96 | 97 | PROPERTIES = [ 98 | CONNECTION_STRATEGY, 99 | DIRECTORY, 100 | HOSTNAME, 101 | PORT, 102 | TRANSPORT_PROTOCOL, 103 | AUTH_STRATEGY, 104 | AUTH_TOKEN, 105 | USERNAME, 106 | PASSWORD, 107 | 
COLLECTION_NAME, 108 | ] 109 | 110 | 111 | def create_client(context): 112 | import chromadb 113 | from chromadb import Settings 114 | 115 | connection_strategy = context.getProperty(CONNECTION_STRATEGY).getValue() 116 | if connection_strategy == LOCAL_DISK: 117 | directory = context.getProperty(DIRECTORY).getValue() 118 | return chromadb.PersistentClient(directory) 119 | hostname = context.getProperty(HOSTNAME).getValue() 120 | port = context.getProperty(PORT).asInteger() 121 | headers = {} 122 | ssl = context.getProperty(TRANSPORT_PROTOCOL).getValue() == HTTPS 123 | 124 | auth_strategy = context.getProperty(AUTH_STRATEGY).getValue() 125 | if auth_strategy == TOKEN: 126 | auth_provider = "chromadb.auth.token.TokenAuthClientProvider" 127 | credentials = context.getProperty(AUTH_TOKEN).getValue() 128 | elif auth_strategy == BASIC_AUTH: 129 | auth_provider = "chromadb.auth.basic.BasicAuthClientProvider" 130 | username = context.getProperty(USERNAME).getValue() 131 | password = context.getProperty(PASSWORD).getValue() 132 | credentials = username + ":" + password 133 | else: 134 | auth_provider = None 135 | credentials = None 136 | 137 | settings = Settings(chroma_client_auth_provider=auth_provider, chroma_client_auth_credentials=credentials) 138 | return chromadb.HttpClient(hostname, port, ssl, headers, settings) 139 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/EmbeddingUtils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from langchain.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings 4 | from langchain.embeddings.openai import OpenAIEmbeddings 5 | from nifiapi.properties import PropertyDependency, PropertyDescriptor, StandardValidators 6 | 7 | # Embedding Functions 8 | ONNX_ALL_MINI_LM_L6_V2 = "ONNX all-MiniLM-L6-v2 Model" 9 | HUGGING_FACE = "Hugging Face Model" 10 | OPENAI = "OpenAI Model" 11 | SENTENCE_TRANSFORMERS = "Sentence Transformers" 12 | 13 | 14 | EMBEDDING_FUNCTION = PropertyDescriptor( 15 | name="Embedding Function", 16 | description="Specifies which embedding function should be used in order to create embeddings from incoming Documents", 17 | allowable_values=[ONNX_ALL_MINI_LM_L6_V2, HUGGING_FACE, OPENAI, SENTENCE_TRANSFORMERS], 18 | default_value=ONNX_ALL_MINI_LM_L6_V2, 19 | required=True, 20 | ) 21 | HUGGING_FACE_MODEL_NAME = PropertyDescriptor( 22 | name="HuggingFace Model Name", 23 | description="The name of the HuggingFace model to use", 24 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 25 | default_value="sentence-transformers/all-MiniLM-L6-v2", 26 | required=True, 27 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, HUGGING_FACE)], 28 | ) 29 | HUGGING_FACE_API_KEY = PropertyDescriptor( 30 | name="HuggingFace API Key", 31 | description="The API Key for interacting with HuggingFace", 32 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 33 | required=True, 34 | sensitive=True, 35 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, HUGGING_FACE)], 36 | ) 37 | OPENAI_API_KEY = PropertyDescriptor( 38 | name="OpenAI API Key", 39 | description="The API Key for interacting with OpenAI", 40 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 41 | required=True, 42 | sensitive=True, 43 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 44 | ) 45 | OPENAI_MODEL_NAME = PropertyDescriptor( 46 | name="OpenAI Model Name", 47 | description="The name of the OpenAI model to use", 48 | 
validators=[StandardValidators.NON_EMPTY_VALIDATOR], 49 | default_value="text-embedding-ada-002", 50 | required=True, 51 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 52 | ) 53 | OPENAI_ORGANIZATION = PropertyDescriptor( 54 | name="OpenAI Organization ID", 55 | description="The OpenAI Organization ID", 56 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 57 | required=False, 58 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 59 | ) 60 | OPENAI_API_BASE = PropertyDescriptor( 61 | name="OpenAI API Base Path", 62 | description="The API Base to use for interacting with OpenAI. This is used for interacting with different deployments, such as an Azure deployment.", 63 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 64 | required=False, 65 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 66 | ) 67 | OPENAI_API_TYPE = PropertyDescriptor( 68 | name="OpenAI API Deployment Type", 69 | description="The type of the OpenAI API Deployment. This is used for interacting with different deployments, such as an Azure deployment.", 70 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 71 | required=False, 72 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 73 | ) 74 | OPENAI_API_VERSION = PropertyDescriptor( 75 | name="OpenAI API Version", 76 | description="The OpenAI API Version. This is used for interacting with different deployments, such as an Azure deployment.", 77 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 78 | required=False, 79 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, OPENAI)], 80 | ) 81 | SENTENCE_TRANSFORMER_MODEL_NAME = PropertyDescriptor( 82 | name="Sentence Transformer Model Name", 83 | description="The name of the Sentence Transformer model to use", 84 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 85 | default_value="all-MiniLM-L6-v2", 86 | required=True, 87 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, SENTENCE_TRANSFORMERS)], 88 | ) 89 | SENTENCE_TRANSFORMER_DEVICE = PropertyDescriptor( 90 | name="Sentence Transformer Device Type", 91 | description="""The type of device to use for performing the embeddings using the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc. 92 | If not specified, a GPU will be used if possible, otherwise a CPU.""", 93 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 94 | required=False, 95 | dependencies=[PropertyDependency(EMBEDDING_FUNCTION, SENTENCE_TRANSFORMERS)], 96 | ) 97 | EMBEDDING_MODEL = PropertyDescriptor( 98 | name="Embedding Model", 99 | description="Specifies which embedding model should be used in order to create embeddings from incoming Documents. 
Default model is OpenAI.", 100 | allowable_values=[HUGGING_FACE, OPENAI], 101 | default_value=OPENAI, 102 | required=True, 103 | ) 104 | OPENAI_MODEL = PropertyDescriptor( 105 | name="OpenAI Model", 106 | description="The name of the OpenAI model to use", 107 | default_value="text-embedding-ada-002", 108 | required=True, 109 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 110 | dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)], 111 | ) 112 | HUGGING_FACE_MODEL = PropertyDescriptor( 113 | name="HuggingFace Model", 114 | description="The name of the HuggingFace model to use", 115 | default_value="sentence-transformers/all-MiniLM-L6-v2", 116 | required=True, 117 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 118 | dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)], 119 | ) 120 | 121 | PROPERTIES = [ 122 | EMBEDDING_FUNCTION, 123 | HUGGING_FACE_MODEL_NAME, 124 | HUGGING_FACE_API_KEY, 125 | OPENAI_MODEL_NAME, 126 | OPENAI_API_KEY, 127 | OPENAI_ORGANIZATION, 128 | OPENAI_API_BASE, 129 | OPENAI_API_TYPE, 130 | OPENAI_API_VERSION, 131 | SENTENCE_TRANSFORMER_MODEL_NAME, 132 | SENTENCE_TRANSFORMER_DEVICE, 133 | EMBEDDING_MODEL, 134 | ] 135 | 136 | 137 | def create_embedding_function(context): 138 | from chromadb.utils.embedding_functions import ( 139 | HuggingFaceEmbeddingFunction, 140 | ONNXMiniLM_L6_V2, 141 | OpenAIEmbeddingFunction, 142 | SentenceTransformerEmbeddingFunction, 143 | ) 144 | 145 | function_name = context.getProperty(EMBEDDING_FUNCTION).getValue() 146 | if function_name == ONNX_ALL_MINI_LM_L6_V2: 147 | return ONNXMiniLM_L6_V2() 148 | 149 | if function_name == OPENAI: 150 | api_key = context.getProperty(OPENAI_API_KEY).getValue() 151 | model_name = context.getProperty(OPENAI_MODEL_NAME).getValue() 152 | organization_id = context.getProperty(OPENAI_ORGANIZATION).getValue() 153 | api_base = context.getProperty(OPENAI_API_BASE).getValue() 154 | api_type = context.getProperty(OPENAI_API_TYPE).getValue() 155 | api_version = context.getProperty(OPENAI_API_VERSION).getValue() 156 | return OpenAIEmbeddingFunction( 157 | api_key=api_key, 158 | model_name=model_name, 159 | organization_id=organization_id, 160 | api_base=api_base, 161 | api_type=api_type, 162 | api_version=api_version, 163 | ) 164 | 165 | if function_name == HUGGING_FACE: 166 | api_key = context.getProperty(HUGGING_FACE_API_KEY).getValue() 167 | model_name = context.getProperty(HUGGING_FACE_MODEL_NAME).getValue() 168 | return HuggingFaceEmbeddingFunction(api_key=api_key, model_name=model_name) 169 | 170 | model_name = context.getProperty(SENTENCE_TRANSFORMER_MODEL_NAME).getValue() 171 | device = context.getProperty(SENTENCE_TRANSFORMER_DEVICE).getValue() 172 | return SentenceTransformerEmbeddingFunction(model_name=model_name, device=device) 173 | 174 | 175 | def create_embedding_service(context): 176 | embedding_service = context.getProperty(EMBEDDING_MODEL).getValue() 177 | 178 | if embedding_service == OPENAI: 179 | openai_api_key = context.getProperty(OPENAI_API_KEY).getValue() 180 | openai_model = context.getProperty(OPENAI_MODEL).getValue() 181 | return OpenAIEmbeddings(openai_api_key=openai_api_key, model=openai_model) 182 | huggingface_api_key = context.getProperty(HUGGING_FACE_API_KEY).getValue() 183 | huggingface_model = context.getProperty(HUGGING_FACE_MODEL).getValue() 184 | return HuggingFaceInferenceAPIEmbeddings(api_key=huggingface_api_key, model_name=huggingface_model) 185 | -------------------------------------------------------------------------------- 
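Note on the two factory functions defined above: create_embedding_function returns a Chroma-style embedding function (a plain callable that maps a list of texts to vectors, as used by PutChroma and QueryChroma), while create_embedding_service returns a LangChain Embeddings object exposing embed_documents()/embed_query(), which the OpenSearch, Pinecone and Qdrant integrations expect. A minimal sketch of the difference follows; it assumes a configured NiFi property context is available, and build_embedders is an illustrative helper, not part of this repository.

import EmbeddingUtils

def build_embedders(context):
    """Sketch only: 'context' is the property context NiFi passes to onScheduled()."""
    # Chroma-style embedding function: a plain callable, list of texts in -> vectors out.
    embedding_function = EmbeddingUtils.create_embedding_function(context)
    document_vectors = embedding_function(["some document text"])

    # LangChain-style embedding service: exposes embed_documents()/embed_query(),
    # which OpenSearchVectorSearch, Pinecone and Qdrant vector stores consume.
    embedding_service = EmbeddingUtils.create_embedding_service(context)
    query_vector = embedding_service.embed_query("some query text")
    return document_vectors, query_vector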
/src/extensions/vectorstores/OpenSearchVectorUtils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from EmbeddingUtils import EMBEDDING_MODEL, HUGGING_FACE, OPENAI 4 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 5 | 6 | # Space types 7 | L2 = ("L2 (Euclidean distance)", "l2") 8 | L1 = ("L1 (Manhattan distance)", "l1") 9 | LINF = ("L-infinity (chessboard) distance", "linf") 10 | COSINESIMIL = ("Cosine similarity", "cosinesimil") 11 | 12 | HUGGING_FACE_API_KEY = PropertyDescriptor( 13 | name="HuggingFace API Key", 14 | description="The API Key for interacting with HuggingFace", 15 | required=True, 16 | sensitive=True, 17 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 18 | dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)], 19 | ) 20 | OPENAI_API_KEY = PropertyDescriptor( 21 | name="OpenAI API Key", 22 | description="The API Key for OpenAI in order to create embeddings", 23 | required=True, 24 | sensitive=True, 25 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 26 | dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)], 27 | ) 28 | HTTP_HOST = PropertyDescriptor( 29 | name="HTTP Host", 30 | description="URL where OpenSearch is hosted.", 31 | default_value="http://localhost:9200", 32 | required=True, 33 | validators=[StandardValidators.URL_VALIDATOR], 34 | ) 35 | USERNAME = PropertyDescriptor( 36 | name="Username", 37 | description="The username to use for authenticating to OpenSearch server", 38 | required=False, 39 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 40 | ) 41 | PASSWORD = PropertyDescriptor( 42 | name="Password", 43 | description="The password to use for authenticating to OpenSearch server", 44 | required=False, 45 | sensitive=True, 46 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 47 | ) 48 | CERTIFICATE_PATH = PropertyDescriptor( 49 | name="Certificate Path", 50 | description="The path to the CA certificate to be used.", 51 | required=False, 52 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 53 | ) 54 | INDEX_NAME = PropertyDescriptor( 55 | name="Index Name", 56 | description="The name of the OpenSearch index.", 57 | sensitive=False, 58 | required=True, 59 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 60 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 61 | ) 62 | VECTOR_FIELD = PropertyDescriptor( 63 | name="Vector Field Name", 64 | description="The name of field in the document where the embeddings are stored. 
This field need to be a 'knn_vector' typed field.", 65 | default_value="vector_field", 66 | required=True, 67 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 68 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 69 | ) 70 | TEXT_FIELD = PropertyDescriptor( 71 | name="Text Field Name", 72 | description="The name of field in the document where the text is stored.", 73 | default_value="text", 74 | required=True, 75 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 76 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 77 | ) 78 | 79 | 80 | def create_authentication_params(context): 81 | username = context.getProperty(USERNAME).getValue() 82 | password = context.getProperty(PASSWORD).getValue() 83 | certificate_path = context.getProperty(CERTIFICATE_PATH).getValue() 84 | 85 | params = {} 86 | 87 | if username is not None and password is not None: 88 | params["http_auth"] = (username, password) 89 | 90 | if certificate_path is not None: 91 | params["ca_certs"] = certificate_path 92 | 93 | return params 94 | 95 | 96 | def parse_documents(json_lines, id_field_name, file_name): 97 | import json 98 | 99 | texts = [] 100 | metadatas = [] 101 | ids = [] 102 | for i, line in enumerate(json_lines.split("\n"), start=1): 103 | try: 104 | doc = json.loads(line) 105 | except Exception as e: 106 | message = f"Could not parse line {i} as JSON" 107 | raise ValueError(message) from e 108 | 109 | text = doc.get("text") 110 | metadata = doc.get("metadata") 111 | texts.append(text) 112 | 113 | # Remove any null values, or it will cause the embedding to fail 114 | filtered_metadata = {key: value for key, value in metadata.items() if value is not None} 115 | metadatas.append(filtered_metadata) 116 | 117 | doc_id = None 118 | if id_field_name is not None: 119 | doc_id = metadata.get(id_field_name) 120 | if doc_id is None: 121 | doc_id = file_name + "-" + str(i) 122 | ids.append(doc_id) 123 | 124 | return {"texts": texts, "metadatas": metadatas, "ids": ids} 125 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/PutChroma.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import ChromaUtils 6 | import EmbeddingUtils 7 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 8 | from nifiapi.properties import ExpressionLanguageScope, PropertyDescriptor, StandardValidators 9 | 10 | 11 | class PutChroma(FlowFileTransform): 12 | class Java: 13 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 14 | 15 | class ProcessorDetails: 16 | version = "2.0.0.dev0" 17 | description = """Publishes JSON data to a Chroma VectorDB. The Incoming data must be in single JSON per Line format, each with two keys: 'text' and 'metadata'. 18 | The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored. If the collection name specified 19 | does not exist, the Processor will automatically create the collection.""" 20 | tags = [ 21 | "chroma", 22 | "vector", 23 | "vectordb", 24 | "embeddings", 25 | "ai", 26 | "artificial intelligence", 27 | "ml", 28 | "machine learning", 29 | "text", 30 | "LLM", 31 | ] 32 | 33 | STORE_TEXT = PropertyDescriptor( 34 | name="Store Document Text", 35 | description="""Specifies whether or not the text of the document should be stored in Chroma. 
If so, both the document's text and its embedding will be stored. If not, 36 | only the vector/embedding will be stored.""", 37 | allowable_values=["true", "false"], 38 | required=True, 39 | default_value="true", 40 | ) 41 | DISTANCE_METHOD = PropertyDescriptor( 42 | name="Distance Method", 43 | description="If the specified collection does not exist, it will be created using this Distance Method. If the collection exists, this property will be ignored.", 44 | allowable_values=["cosine", "l2", "ip"], 45 | default_value="cosine", 46 | required=True, 47 | ) 48 | DOC_ID_FIELD_NAME = PropertyDescriptor( 49 | name="Document ID Field Name", 50 | description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. 51 | If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.""", 52 | required=False, 53 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 54 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 55 | ) 56 | 57 | client = None 58 | embedding_function = None 59 | 60 | def __init__(self, **kwargs): # noqa: ARG002 61 | self.property_descriptors = list(ChromaUtils.PROPERTIES) + [ 62 | prop for prop in EmbeddingUtils.PROPERTIES if prop != EmbeddingUtils.EMBEDDING_MODEL 63 | ] 64 | self.property_descriptors.append(self.STORE_TEXT) 65 | self.property_descriptors.append(self.DISTANCE_METHOD) 66 | self.property_descriptors.append(self.DOC_ID_FIELD_NAME) 67 | 68 | def getPropertyDescriptors(self): 69 | return self.property_descriptors 70 | 71 | def onScheduled(self, context): 72 | self.client = ChromaUtils.create_client(context) 73 | self.embedding_function = EmbeddingUtils.create_embedding_function(context) 74 | 75 | def transform(self, context, flowfile): 76 | client = self.client 77 | embedding_function = self.embedding_function 78 | collection_name = ( 79 | context.getProperty(ChromaUtils.COLLECTION_NAME).evaluateAttributeExpressions(flowfile).getValue() 80 | ) 81 | distance_method = context.getProperty(self.DISTANCE_METHOD).getValue() 82 | id_field_name = context.getProperty(self.DOC_ID_FIELD_NAME).evaluateAttributeExpressions(flowfile).getValue() 83 | 84 | collection = client.get_or_create_collection( 85 | name=collection_name, embedding_function=embedding_function, metadata={"hnsw:space": distance_method} 86 | ) 87 | 88 | json_lines = flowfile.getContentsAsBytes().decode() 89 | i = 0 90 | texts = [] 91 | metadatas = [] 92 | ids = [] 93 | for line in json_lines.split("\n"): 94 | doc = json.loads(line) 95 | text = doc.get("text") 96 | metadata = doc.get("metadata") 97 | texts.append(text) 98 | 99 | # Remove any null values, or it will cause the embedding to fail 100 | filtered_metadata = {} 101 | for key, value in metadata.items(): 102 | if value is not None: 103 | if isinstance(value, list): 104 | for i, element in enumerate(value): 105 | element_count = i + 1 106 | indexed_key = f"{key}_{element_count}" 107 | filtered_metadata[indexed_key] = element 108 | else: 109 | filtered_metadata[key] = value 110 | 111 | metadatas.append(filtered_metadata) 112 | 113 | doc_id = None 114 | if id_field_name is not None: 115 | doc_id = metadata.get(id_field_name) 116 | if doc_id is None: 117 | doc_id = flowfile.getAttribute("filename") + "-" + str(i) 118 | ids.append(doc_id) 119 | 120 | i += 1 121 | 122 | embeddings = embedding_function(texts) 123 | if not context.getProperty(self.STORE_TEXT).asBoolean(): 124 | texts = None 125 | 126 | collection.upsert(ids, embeddings, metadatas, 
texts) 127 | 128 | return FlowFileTransformResult(relationship="success") 129 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/PutOpenSearchVector.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from EmbeddingUtils import EMBEDDING_MODEL, HUGGING_FACE_MODEL, OPENAI_MODEL, create_embedding_service 4 | from langchain.vectorstores import OpenSearchVectorSearch 5 | from nifiapi.documentation import use_case 6 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 7 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 8 | from OpenSearchVectorUtils import ( 9 | CERTIFICATE_PATH, 10 | COSINESIMIL, 11 | HTTP_HOST, 12 | HUGGING_FACE_API_KEY, 13 | INDEX_NAME, 14 | L1, 15 | L2, 16 | LINF, 17 | OPENAI_API_KEY, 18 | PASSWORD, 19 | TEXT_FIELD, 20 | USERNAME, 21 | VECTOR_FIELD, 22 | create_authentication_params, 23 | parse_documents, 24 | ) 25 | 26 | 27 | @use_case( 28 | description="Create vectors/embeddings that represent text content and send the vectors to OpenSearch", 29 | notes="This use case assumes that the data has already been formatted in JSONL format with the text to store in OpenSearch provided in the 'text' field.", 30 | keywords=["opensearch", "embedding", "vector", "text", "vectorstore", "insert"], 31 | configuration=""" 32 | Configure the 'HTTP Host' to an appropriate URL where OpenSearch is accessible. 33 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 34 | Configure the 'OpenAI API Key' or 'HuggingFace API Key', depending on the chosen Embedding Model. 35 | Set 'Index Name' to the name of your OpenSearch Index. 36 | Set 'Vector Field Name' to the name of the field in the document which will store the vector data. 37 | Set 'Text Field Name' to the name of the field in the document which will store the text data. 38 | 39 | If the documents to send to OpenSearch contain a unique identifier, set the 'Document ID Field Name' property to the name of the field that contains the document ID. 40 | This property can be left blank, in which case a unique ID will be generated based on the FlowFile's filename. 41 | 42 | If the provided index does not exists in OpenSearch then the processor is capable to create it. The 'New Index Strategy' property defines 43 | that the index needs to be created from the default template or it should be configured with custom values. 44 | """, 45 | ) 46 | @use_case( 47 | description="Update vectors/embeddings in OpenSearch", 48 | notes="This use case assumes that the data has already been formatted in JSONL format with the text to store in OpenSearch provided in the 'text' field.", 49 | keywords=["opensearch", "embedding", "vector", "text", "vectorstore", "update", "upsert"], 50 | configuration=""" 51 | Configure the 'HTTP Host' to an appropriate URL where OpenSearch is accessible. 52 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 53 | Configure the 'OpenAI API Key' or 'HuggingFace API Key', depending on the chosen Embedding Model. 54 | Set 'Index Name' to the name of your OpenSearch Index. 
55 | Set 'Vector Field Name' to the name of the field in the document which will store the vector data. 56 | Set 'Text Field Name' to the name of the field in the document which will store the text data. 57 | Set the 'Document ID Field Name' property to the name of the field that contains the identifier of the document in OpenSearch to update. 58 | """, 59 | ) 60 | class PutOpenSearchVector(FlowFileTransform): 61 | class Java: 62 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 63 | 64 | class ProcessorDetails: 65 | version = "2.0.0.dev0" 66 | description = """Publishes JSON data to OpenSearch. The Incoming data must be in single JSON per Line format, each with two keys: 'text' and 'metadata'. 67 | The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored.""" 68 | tags = [ 69 | "opensearch", 70 | "vector", 71 | "vectordb", 72 | "vectorstore", 73 | "embeddings", 74 | "ai", 75 | "artificial intelligence", 76 | "ml", 77 | "machine learning", 78 | "text", 79 | "LLM", 80 | ] 81 | 82 | # Engine types 83 | NMSLIB = ("nmslib (Non-Metric Space Library)", "nmslib") 84 | FAISS = ("faiss (Facebook AI Similarity Search)", "faiss") 85 | LUCENE = ("lucene", "lucene") 86 | 87 | ENGINE_VALUES = dict([NMSLIB, FAISS, LUCENE]) 88 | 89 | # Space types 90 | INNERPRODUCT = ("Inner product", "innerproduct") 91 | 92 | NMSLIB_SPACE_TYPE_VALUES = dict([L2, L1, LINF, COSINESIMIL, INNERPRODUCT]) 93 | FAISS_SPACE_TYPE_VALUES = dict([L2, INNERPRODUCT]) 94 | LUCENE_SPACE_TYPE_VALUES = dict([L2, COSINESIMIL]) 95 | 96 | # New Index Mapping Strategy 97 | DEFAULT_INDEX_MAPPING = "Default index mapping" 98 | CUSTOM_INDEX_MAPPING = "Custom index mapping" 99 | 100 | DOC_ID_FIELD_NAME = PropertyDescriptor( 101 | name="Document ID Field Name", 102 | description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. 103 | If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.""", 104 | required=False, 105 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 106 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 107 | ) 108 | NEW_INDEX_STRATEGY = PropertyDescriptor( 109 | name="New Index Strategy", 110 | description="""Specifies the Mapping strategy to use for new index creation. 
The default template values are the following: 111 | {engine: nmslib, space_type: l2, ef_search: 512, ef_construction: 512, m: 16}""", 112 | allowable_values=[DEFAULT_INDEX_MAPPING, CUSTOM_INDEX_MAPPING], 113 | default_value=DEFAULT_INDEX_MAPPING, 114 | required=False, 115 | ) 116 | ENGINE = PropertyDescriptor( 117 | name="Engine", 118 | description="The approximate k-NN library to use for indexing and search.", 119 | allowable_values=ENGINE_VALUES.keys(), 120 | default_value=NMSLIB[0], 121 | required=False, 122 | dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)], 123 | ) 124 | NMSLIB_SPACE_TYPE = PropertyDescriptor( 125 | name="NMSLIB Space Type", 126 | description="The vector space used to calculate the distance between vectors.", 127 | allowable_values=NMSLIB_SPACE_TYPE_VALUES.keys(), 128 | default_value=L2[0], 129 | required=False, 130 | dependencies=[ 131 | PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING), 132 | PropertyDependency(ENGINE, NMSLIB[0]), 133 | ], 134 | ) 135 | FAISS_SPACE_TYPE = PropertyDescriptor( 136 | name="FAISS Space Type", 137 | description="The vector space used to calculate the distance between vectors.", 138 | allowable_values=FAISS_SPACE_TYPE_VALUES.keys(), 139 | default_value=L2[0], 140 | required=False, 141 | dependencies=[ 142 | PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING), 143 | PropertyDependency(ENGINE, FAISS[0]), 144 | ], 145 | ) 146 | LUCENE_SPACE_TYPE = PropertyDescriptor( 147 | name="Lucene Space Type", 148 | description="The vector space used to calculate the distance between vectors.", 149 | allowable_values=LUCENE_SPACE_TYPE_VALUES.keys(), 150 | default_value=L2[0], 151 | required=False, 152 | dependencies=[ 153 | PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING), 154 | PropertyDependency(ENGINE, LUCENE[0]), 155 | ], 156 | ) 157 | EF_SEARCH = PropertyDescriptor( 158 | name="EF Search", 159 | description="The size of the dynamic list used during k-NN searches. Higher values lead to more accurate but slower searches.", 160 | default_value="512", 161 | required=False, 162 | validators=[StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR], 163 | dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)], 164 | ) 165 | EF_CONSTRUCTION = PropertyDescriptor( 166 | name="EF Construction", 167 | description="The size of the dynamic list used during k-NN graph creation. Higher values lead to a more accurate graph but slower indexing speed.", 168 | default_value="512", 169 | required=False, 170 | validators=[StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR], 171 | dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)], 172 | ) 173 | M = PropertyDescriptor( 174 | name="M", 175 | description="The number of bidirectional links that the plugin creates for each new element. Increasing and " 176 | "decreasing this value can have a large impact on memory consumption. 
Keep this value between 2 and 100.", 177 | default_value="16", 178 | required=False, 179 | validators=[StandardValidators._standard_validators.createLongValidator(2, 100, True)], 180 | dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)], 181 | ) 182 | 183 | properties = [ 184 | EMBEDDING_MODEL, 185 | OPENAI_API_KEY, 186 | OPENAI_MODEL, 187 | HUGGING_FACE_API_KEY, 188 | HUGGING_FACE_MODEL, 189 | HTTP_HOST, 190 | USERNAME, 191 | PASSWORD, 192 | CERTIFICATE_PATH, 193 | INDEX_NAME, 194 | DOC_ID_FIELD_NAME, 195 | VECTOR_FIELD, 196 | TEXT_FIELD, 197 | NEW_INDEX_STRATEGY, 198 | ENGINE, 199 | NMSLIB_SPACE_TYPE, 200 | FAISS_SPACE_TYPE, 201 | LUCENE_SPACE_TYPE, 202 | EF_SEARCH, 203 | EF_CONSTRUCTION, 204 | M, 205 | ] 206 | 207 | embeddings = None 208 | 209 | def __init__(self, **kwargs): 210 | pass 211 | 212 | def getPropertyDescriptors(self): 213 | return self.properties 214 | 215 | def onScheduled(self, context): 216 | self.embeddings = create_embedding_service(context) 217 | 218 | def transform(self, context, flowfile): 219 | file_name = flowfile.getAttribute("filename") 220 | http_host = context.getProperty(HTTP_HOST).evaluateAttributeExpressions(flowfile).getValue() 221 | index_name = context.getProperty(INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue() 222 | id_field_name = context.getProperty(self.DOC_ID_FIELD_NAME).evaluateAttributeExpressions(flowfile).getValue() 223 | vector_field = context.getProperty(VECTOR_FIELD).evaluateAttributeExpressions(flowfile).getValue() 224 | text_field = context.getProperty(TEXT_FIELD).evaluateAttributeExpressions(flowfile).getValue() 225 | new_index_strategy = context.getProperty(self.NEW_INDEX_STRATEGY).evaluateAttributeExpressions().getValue() 226 | 227 | params = {"vector_field": vector_field, "text_field": text_field} 228 | params.update(create_authentication_params(context)) 229 | 230 | if new_index_strategy == self.CUSTOM_INDEX_MAPPING: 231 | engine = context.getProperty(self.ENGINE).evaluateAttributeExpressions().getValue() 232 | params["engine"] = self.ENGINE_VALUES.get(engine) 233 | 234 | if engine == self.NMSLIB[0]: 235 | space_type = context.getProperty(self.NMSLIB_SPACE_TYPE).evaluateAttributeExpressions().getValue() 236 | params["space_type"] = self.NMSLIB_SPACE_TYPE_VALUES.get(space_type) 237 | if engine == self.FAISS[0]: 238 | space_type = context.getProperty(self.FAISS_SPACE_TYPE).evaluateAttributeExpressions().getValue() 239 | params["space_type"] = self.FAISS_SPACE_TYPE_VALUES.get(space_type) 240 | if engine == self.LUCENE[0]: 241 | space_type = context.getProperty(self.LUCENE_SPACE_TYPE).evaluateAttributeExpressions().getValue() 242 | params["space_type"] = self.LUCENE_SPACE_TYPE_VALUES.get(space_type) 243 | 244 | ef_search = context.getProperty(self.EF_SEARCH).evaluateAttributeExpressions().asInteger() 245 | params["ef_search"] = ef_search 246 | 247 | ef_construction = context.getProperty(self.EF_CONSTRUCTION).evaluateAttributeExpressions().asInteger() 248 | params["ef_construction"] = ef_construction 249 | 250 | m = context.getProperty(self.M).evaluateAttributeExpressions().asInteger() 251 | params["m"] = m 252 | 253 | # Read the FlowFile content as "json-lines". 
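# Illustrative example of one incoming JSONL line (placeholder values, not from the repository):
#   {"text": "Apache NiFi routes and transforms data.", "metadata": {"source": "intro.txt"}}
# parse_documents() below splits such lines into parallel 'texts', 'metadatas' and 'ids' lists,
# falling back to an ID of "<filename>-<line number>" when no Document ID field is configured.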
254 | json_lines = flowfile.getContentsAsBytes().decode() 255 | parsed_documents = parse_documents(json_lines, id_field_name, file_name) 256 | 257 | vectorstore = OpenSearchVectorSearch( 258 | opensearch_url=http_host, index_name=index_name, embedding_function=self.embeddings, **params 259 | ) 260 | vectorstore.add_texts( 261 | texts=parsed_documents["texts"], 262 | metadatas=parsed_documents["metadatas"], 263 | ids=parsed_documents["ids"], 264 | **params, 265 | ) 266 | 267 | return FlowFileTransformResult(relationship="success") 268 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/PutPinecone.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import langchain.vectorstores 6 | from EmbeddingUtils import ( 7 | EMBEDDING_MODEL, 8 | HUGGING_FACE, 9 | HUGGING_FACE_MODEL, 10 | OPENAI, 11 | OPENAI_MODEL, 12 | create_embedding_service, 13 | ) 14 | from nifiapi.documentation import use_case 15 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 16 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 17 | from pinecone import Pinecone 18 | 19 | 20 | @use_case( 21 | description="Create vectors/embeddings that represent text content and send the vectors to Pinecone", 22 | notes="This use case assumes that the data has already been formatted in JSONL format with the text to store in Pinecone provided in the 'text' field.", 23 | keywords=["pinecone", "embedding", "vector", "text", "vectorstore", "insert"], 24 | configuration=""" 25 | Configure the 'Pinecone API Key' to the appropriate authentication token for interacting with Pinecone. 26 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 27 | Configure the 'OpenAI API Key' or 'HuggingFace API Key', depending on the chosen Embedding Model. 28 | Set 'Pinecone Environment' to the name of your Pinecone environment 29 | Set 'Index Name' to the name of your Pinecone Index. 30 | Set 'Namespace' to appropriate namespace, or leave it empty to use the default Namespace. 31 | 32 | If the documents to send to Pinecone contain a unique identifier, set the 'Document ID Field Name' property to the name of the field that contains the document ID. 33 | This property can be left blank, in which case a unique ID will be generated based on the FlowFile's filename. 34 | """, 35 | ) 36 | @use_case( 37 | description="Update vectors/embeddings in Pinecone", 38 | notes="This use case assumes that the data has already been formatted in JSONL format with the text to store in Pinecone provided in the 'text' field.", 39 | keywords=["pinecone", "embedding", "vector", "text", "vectorstore", "update", "upsert"], 40 | configuration=""" 41 | Configure the 'Pinecone API Key' to the appropriate authentication token for interacting with Pinecone. 42 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 43 | Configure the 'OpenAI API Key' or 'HuggingFace API Key', depending on the chosen Embedding Model. 44 | Set 'Pinecone Environment' to the name of your Pinecone environment 45 | Set 'Index Name' to the name of your Pinecone Index. 
46 | Set 'Namespace' to appropriate namespace, or leave it empty to use the default Namespace. 47 | Set the 'Document ID Field Name' property to the name of the field that contains the identifier of the document in Pinecone to update. 48 | """, 49 | ) 50 | class PutPinecone(FlowFileTransform): 51 | class Java: 52 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 53 | 54 | class ProcessorDetails: 55 | version = "2.0.0.dev0" 56 | description = """Publishes JSON data to Pinecone. The Incoming data must be in single JSON per Line format, each with two keys: 'text' and 'metadata'. 57 | The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored.""" 58 | tags = [ 59 | "pinecone", 60 | "vector", 61 | "vectordb", 62 | "vectorstore", 63 | "embeddings", 64 | "ai", 65 | "artificial intelligence", 66 | "ml", 67 | "machine learning", 68 | "text", 69 | "LLM", 70 | ] 71 | 72 | PINECONE_API_KEY = PropertyDescriptor( 73 | name="Pinecone API Key", 74 | description="The API Key to use in order to authentication with Pinecone", 75 | sensitive=True, 76 | required=True, 77 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 78 | ) 79 | HUGGING_FACE_API_KEY = PropertyDescriptor( 80 | name="HuggingFace API Key", 81 | description="The API Key for interacting with HuggingFace", 82 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 83 | required=True, 84 | sensitive=True, 85 | dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)], 86 | ) 87 | OPENAI_API_KEY = PropertyDescriptor( 88 | name="OpenAI API Key", 89 | description="The API Key for OpenAI in order to create embeddings", 90 | sensitive=True, 91 | required=True, 92 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 93 | dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)], 94 | ) 95 | PINECONE_ENV = PropertyDescriptor( 96 | name="Pinecone Environment", 97 | description="The name of the Pinecone Environment. This can be found in the Pinecone console next to the API Key.", 98 | sensitive=False, 99 | required=True, 100 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 101 | ) 102 | INDEX_NAME = PropertyDescriptor( 103 | name="Index Name", 104 | description="The name of the Pinecone index.", 105 | sensitive=False, 106 | required=True, 107 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 108 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 109 | ) 110 | TEXT_KEY = PropertyDescriptor( 111 | name="Text Key", 112 | description="The key in the document that contains the text to create embeddings for.", 113 | required=True, 114 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 115 | default_value="text", 116 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 117 | ) 118 | NAMESPACE = PropertyDescriptor( 119 | name="Namespace", 120 | description="The name of the Pinecone Namespace to put the documents to.", 121 | required=False, 122 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 123 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 124 | ) 125 | DOC_ID_FIELD_NAME = PropertyDescriptor( 126 | name="Document ID Field Name", 127 | description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. 
128 | If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.""", 129 | required=False, 130 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 131 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 132 | ) 133 | 134 | properties = [ 135 | PINECONE_API_KEY, 136 | EMBEDDING_MODEL, 137 | OPENAI_API_KEY, 138 | OPENAI_MODEL, 139 | HUGGING_FACE_API_KEY, 140 | HUGGING_FACE_MODEL, 141 | PINECONE_ENV, 142 | INDEX_NAME, 143 | TEXT_KEY, 144 | NAMESPACE, 145 | DOC_ID_FIELD_NAME, 146 | ] 147 | 148 | embeddings = None 149 | pc = None 150 | 151 | def __init__(self, **kwargs): 152 | pass 153 | 154 | def getPropertyDescriptors(self): 155 | return self.properties 156 | 157 | def onScheduled(self, context): 158 | # initialize pinecone 159 | self.pc = Pinecone( 160 | api_key=context.getProperty(self.PINECONE_API_KEY).getValue(), 161 | environment=context.getProperty(self.PINECONE_ENV).getValue(), 162 | ) 163 | # initialize embedding service 164 | self.embeddings = create_embedding_service(context) 165 | 166 | def transform(self, context, flowfile): 167 | # First, check if our index already exists. If it doesn't, we create it 168 | index_name = context.getProperty(self.INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue() 169 | namespace = context.getProperty(self.NAMESPACE).evaluateAttributeExpressions(flowfile).getValue() 170 | id_field_name = context.getProperty(self.DOC_ID_FIELD_NAME).evaluateAttributeExpressions(flowfile).getValue() 171 | 172 | index = self.pc.Index(index_name) 173 | 174 | # Read the FlowFile content as "json-lines". 175 | json_lines = flowfile.getContentsAsBytes().decode() 176 | i = 1 177 | texts = [] 178 | metadatas = [] 179 | ids = [] 180 | for line in json_lines.split("\n"): 181 | try: 182 | doc = json.loads(line) 183 | except Exception as e: 184 | message = f"Could not parse line {i} as JSON" 185 | raise ValueError(message) from e 186 | 187 | text = doc.get("text") 188 | metadata = doc.get("metadata") 189 | texts.append(text) 190 | 191 | # Remove any null values, or it will cause the embedding to fail 192 | filtered_metadata = {} 193 | for key, value in metadata.items(): 194 | if value is not None: 195 | filtered_metadata[key] = value 196 | 197 | metadatas.append(filtered_metadata) 198 | 199 | doc_id = None 200 | if id_field_name is not None: 201 | doc_id = metadata.get(id_field_name) 202 | if doc_id is None: 203 | doc_id = flowfile.getAttribute("filename") + "-" + str(i) 204 | ids.append(doc_id) 205 | 206 | i += 1 207 | 208 | text_key = context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue() 209 | vectorstore = langchain.vectorstores.Pinecone(index, self.embeddings.embed_query, text_key) 210 | vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids, namespace=namespace) 211 | return FlowFileTransformResult(relationship="success") 212 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/PutQdrant.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import QdrantUtils 6 | from EmbeddingUtils import ( 7 | create_embedding_service, 8 | ) 9 | from langchain.vectorstores.qdrant import Qdrant 10 | from nifiapi.documentation import use_case 11 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 12 | from nifiapi.properties import ( 13 | ExpressionLanguageScope, 14 | PropertyDescriptor, 15 | 
StandardValidators, 16 | ) 17 | from qdrant_client.models import Distance 18 | 19 | 20 | @use_case( 21 | description="Create embeddings that semantically represent text content and upload to Qdrant - https://qdrant.tech/", 22 | notes="This processor assumes that the data has already been formatted in JSONL format with the text to store in Qdrant provided in the 'text' field.", 23 | keywords=["qdrant", "embedding", "vector", "text", "vectorstore", "insert"], 24 | configuration=""" 25 | Configure 'Collection Name' to the name of the Qdrant collection to use. 26 | Configure 'Qdrant URL' to the fully qualified URL of the Qdrant instance. 27 | Configure 'Qdrant API Key' to the API Key to use in order to authenticate with Qdrant. 28 | Configure 'Prefer gRPC' to True if you want to use gRPC for interfacing with Qdrant. 29 | Configure 'Use HTTPS' to True if you want to use TLS(HTTPS) while interfacing with Qdrant. 30 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 31 | Configure 'HuggingFace API Key' or 'OpenAI API Key', depending on the chosen Embedding Model. 32 | Configure 'HuggingFace Model' or 'OpenAI Model' to the name of the model to use. 33 | Configure 'Force Recreate Collection' to True if you want to recreate the collection if it already exists. 34 | Configure 'Similarity Metric' to the similarity metric to use when querying Qdrant. 35 | 36 | If the documents to send to Qdrant contain a unique identifier(UUID), set the 'Document ID Field Name' property to the name of the field that contains the document ID. 37 | This property can be left blank, in which case a UUID will be generated based on the FlowFile's filename. 38 | """, 39 | ) 40 | class PutQdrant(FlowFileTransform): 41 | class Java: 42 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 43 | 44 | class ProcessorDetails: 45 | version = "2.0.0.dev0" 46 | description = """Publishes JSON data to Qdrant. The Incoming data must be in single JSON per Line format, each with two keys: 'text' and 'metadata'. 47 | The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored.""" 48 | tags = [ 49 | "qdrant", 50 | "vector", 51 | "vectordb", 52 | "vectorstore", 53 | "embeddings", 54 | "ai", 55 | "artificial intelligence", 56 | "ml", 57 | "machine learning", 58 | "text", 59 | "LLM", 60 | ] 61 | 62 | DOC_ID_FIELD_NAME = PropertyDescriptor( 63 | name="Document ID Field Name", 64 | description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. 65 | If not specified, a UUID will be generated based on the FlowFile's filename and an incremental number.""", 66 | required=False, 67 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 68 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 69 | ) 70 | FORCE_RECREATE_COLLECTION = PropertyDescriptor( 71 | name="Force Recreate Collection", 72 | description="Specifies whether to recreate the collection if it already exists. 
Essentially clearing the existing data.", 73 | required=True, 74 | default_value="False", 75 | allowable_values=["True", "False"], 76 | validators=[StandardValidators.BOOLEAN_VALIDATOR], 77 | ) 78 | SIMILARITY_METRIC = PropertyDescriptor( 79 | name="Similarity Metric", 80 | description="Specifies the similarity metric when creating the collection.", 81 | required=True, 82 | default_value=Distance.COSINE, 83 | allowable_values=[ 84 | Distance.COSINE, 85 | Distance.EUCLID, 86 | Distance.DOT, 87 | Distance.MANHATTAN, 88 | ], 89 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 90 | ) 91 | 92 | properties = ( 93 | QdrantUtils.QDRANT_PROPERTIES 94 | + QdrantUtils.EMBEDDING_MODEL_PROPERTIES 95 | + [ 96 | FORCE_RECREATE_COLLECTION, 97 | SIMILARITY_METRIC, 98 | DOC_ID_FIELD_NAME, 99 | ] 100 | ) 101 | 102 | def __init__(self, **kwargs): 103 | pass 104 | 105 | def getPropertyDescriptors(self): 106 | return self.properties 107 | 108 | def onScheduled(self, context): 109 | # The Qdrant#construct_instance() internally checks if the collection exists 110 | # and creates it if it doesn't with the appropriate dimesions and configurations. 111 | self.vector_store = Qdrant.construct_instance( 112 | texts=["Some text to obtain the embeddings dimension when creating the collection"], 113 | embedding=create_embedding_service(context), 114 | collection_name=context.getProperty(QdrantUtils.COLLECTION_NAME).getValue(), 115 | url=context.getProperty(QdrantUtils.QDRANT_URL).getValue(), 116 | api_key=context.getProperty(QdrantUtils.QDRANT_API_KEY).getValue(), 117 | prefer_grpc=context.getProperty(QdrantUtils.PREFER_GRPC).asBoolean(), 118 | https=context.getProperty(QdrantUtils.HTTPS).asBoolean(), 119 | force_recreate=context.getProperty(self.FORCE_RECREATE_COLLECTION).asBoolean(), 120 | distance_func=context.getProperty(self.SIMILARITY_METRIC).getValue(), 121 | ) 122 | 123 | def transform(self, context, flowfile): 124 | id_field_name = context.getProperty(self.DOC_ID_FIELD_NAME).evaluateAttributeExpressions(flowfile).getValue() 125 | 126 | # Read the FlowFile content as "json-lines". 
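# Illustrative example (placeholder values): for a FlowFile named "docs.jsonl", a line without a
# configured Document ID field gets the generated ID "docs.jsonl-1", which QdrantUtils.convert_id()
# maps to a deterministic UUID, since Qdrant only accepts UUID strings or unsigned integers as
# point IDs. If a Document ID field is configured, its value is used as-is and should therefore
# already be a valid Qdrant point ID.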
127 | json_lines = flowfile.getContentsAsBytes().decode() 128 | i = 1 129 | texts, metadatas, ids = [], [], [] 130 | for line in json_lines.split("\n"): 131 | try: 132 | doc = json.loads(line) 133 | except Exception as e: 134 | message = f"Could not parse line {i} as JSON" 135 | raise ValueError(message) from e 136 | 137 | metadata = doc.get("metadata") 138 | texts.append(doc.get("text")) 139 | metadatas.append(metadata) 140 | 141 | doc_id = None 142 | if id_field_name is not None: 143 | doc_id = metadata.get(id_field_name) 144 | if doc_id is None: 145 | doc_id = QdrantUtils.convert_id(flowfile.getAttribute("filename") + "-" + str(i)) 146 | ids.append(doc_id) 147 | 148 | i += 1 149 | 150 | self.vector_store.add_texts(texts=texts, metadatas=metadatas, ids=ids) 151 | return FlowFileTransformResult(relationship="success") 152 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QdrantUtils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import uuid 4 | 5 | from EmbeddingUtils import ( 6 | EMBEDDING_MODEL, 7 | HUGGING_FACE, 8 | HUGGING_FACE_MODEL, 9 | OPENAI, 10 | OPENAI_MODEL, 11 | ) 12 | from nifiapi.properties import ( 13 | ExpressionLanguageScope, 14 | PropertyDependency, 15 | PropertyDescriptor, 16 | StandardValidators, 17 | ) 18 | 19 | DEFAULT_COLLECTION_NAME = "apache-nifi" 20 | 21 | 22 | COLLECTION_NAME = PropertyDescriptor( 23 | name="Collection Name", 24 | description="The name of the Qdrant collection to use.", 25 | sensitive=False, 26 | required=True, 27 | default_value=DEFAULT_COLLECTION_NAME, 28 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 29 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 30 | ) 31 | QDRANT_URL = PropertyDescriptor( 32 | name="Qdrant URL", 33 | description="The fully qualified URL to the Qdrant instance.", 34 | sensitive=False, 35 | required=True, 36 | default_value="http://localhost:6333", 37 | validators=[StandardValidators.URL_VALIDATOR], 38 | ) 39 | QDRANT_API_KEY = PropertyDescriptor( 40 | name="Qdrant API Key", 41 | description="The API Key to use in order to authentication with Qdrant. 
Can be empty.", 42 | sensitive=True, 43 | required=True, 44 | ) 45 | 46 | PREFER_GRPC = PropertyDescriptor( 47 | name="Prefer gRPC", 48 | description="Specifies whether to use gRPC for interfacing with Qdrant.", 49 | required=True, 50 | default_value=False, 51 | allowable_values=["True", "False"], 52 | validators=[StandardValidators.BOOLEAN_VALIDATOR], 53 | ) 54 | HTTPS = PropertyDescriptor( 55 | name="Use HTTPS", 56 | description="Specifies whether to TLS(HTTPS) while interfacing with Qdrant.", 57 | required=True, 58 | default_value=False, 59 | allowable_values=["True", "False"], 60 | validators=[StandardValidators.BOOLEAN_VALIDATOR], 61 | ) 62 | 63 | QDRANT_PROPERTIES = [COLLECTION_NAME, QDRANT_URL, QDRANT_API_KEY, PREFER_GRPC, HTTPS] 64 | 65 | HUGGING_FACE_API_KEY = PropertyDescriptor( 66 | name="HuggingFace API Key", 67 | description="The API Key for interacting with HuggingFace", 68 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 69 | required=True, 70 | sensitive=True, 71 | dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)], 72 | ) 73 | OPENAI_API_KEY = PropertyDescriptor( 74 | name="OpenAI API Key", 75 | description="The API Key for OpenAI in order to create embeddings.", 76 | sensitive=True, 77 | required=True, 78 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 79 | dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)], 80 | ) 81 | 82 | EMBEDDING_MODEL_PROPERTIES = [ 83 | EMBEDDING_MODEL, 84 | HUGGING_FACE_API_KEY, 85 | HUGGING_FACE_MODEL, 86 | OPENAI_API_KEY, 87 | OPENAI_MODEL, 88 | ] 89 | 90 | 91 | def convert_id(_id: str) -> str: 92 | """ 93 | Converts any string into a UUID string deterministically. 94 | 95 | Qdrant accepts UUID strings and unsigned integers as point ID. 96 | This allows us to overwrite the same point with the original ID. 97 | """ 98 | return str(uuid.uuid5(uuid.NAMESPACE_DNS, _id)) 99 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QueryChroma.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import ChromaUtils 6 | import EmbeddingUtils 7 | import QueryUtils 8 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 9 | from nifiapi.properties import ExpressionLanguageScope, PropertyDescriptor, StandardValidators 10 | 11 | 12 | class QueryChroma(FlowFileTransform): 13 | class Java: 14 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 15 | 16 | class ProcessorDetails: 17 | version = "2.0.0.dev0" 18 | description = "Queries a Chroma Vector Database in order to gather a specified number of documents that are most closely related to the given query." 19 | tags = [ 20 | "chroma", 21 | "vector", 22 | "vectordb", 23 | "embeddings", 24 | "enrich", 25 | "enrichment", 26 | "ai", 27 | "artificial intelligence", 28 | "ml", 29 | "machine learning", 30 | "text", 31 | "LLM", 32 | ] 33 | 34 | QUERY = PropertyDescriptor( 35 | name="Query", 36 | description="""The query to issue to the Chroma VectorDB. The query is always converted into embeddings using the configured embedding function, and the embedding is 37 | then sent to Chroma. 
The text itself is not sent to Chroma.""", 38 | required=True, 39 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 40 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 41 | ) 42 | NUMBER_OF_RESULTS = PropertyDescriptor( 43 | name="Number of Results", 44 | description="The number of results to return from Chroma", 45 | required=True, 46 | validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR], 47 | default_value="10", 48 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 49 | ) 50 | METADATA_FILTER = PropertyDescriptor( 51 | name="Metadata Filter", 52 | description="""A JSON representation of a Metadata Filter that can be applied against the Chroma documents in order to narrow down the documents that can be returned. 53 | For example: { "metadata_field": "some_value" }""", 54 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 55 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 56 | required=False, 57 | ) 58 | DOCUMENT_FILTER = PropertyDescriptor( 59 | name="Document Filter", 60 | description="""A JSON representation of a Document Filter that can be applied against the Chroma documents' text in order to narrow down the documents that can be returned. 61 | For example: { "$contains": "search_string" }""", 62 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 63 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 64 | required=False, 65 | ) 66 | 67 | client = None 68 | embedding_function = None 69 | include_ids = None 70 | include_metadatas = None 71 | include_documents = None 72 | include_distances = None 73 | include_embeddings = None 74 | results_field = None 75 | 76 | property_descriptors = ( 77 | list(ChromaUtils.PROPERTIES) 78 | + [prop for prop in EmbeddingUtils.PROPERTIES if prop != EmbeddingUtils.EMBEDDING_MODEL] 79 | + [ 80 | QUERY, 81 | NUMBER_OF_RESULTS, 82 | QueryUtils.OUTPUT_STRATEGY, 83 | QueryUtils.RESULTS_FIELD, 84 | METADATA_FILTER, 85 | DOCUMENT_FILTER, 86 | QueryUtils.INCLUDE_IDS, 87 | QueryUtils.INCLUDE_METADATAS, 88 | QueryUtils.INCLUDE_DOCUMENTS, 89 | QueryUtils.INCLUDE_DISTANCES, 90 | QueryUtils.INCLUDE_EMBEDDINGS, 91 | ] 92 | ) 93 | 94 | def __init__(self, **kwargs): 95 | pass 96 | 97 | def getPropertyDescriptors(self): 98 | return self.property_descriptors 99 | 100 | def onScheduled(self, context): 101 | self.client = ChromaUtils.create_client(context) 102 | self.embedding_function = EmbeddingUtils.create_embedding_function(context) 103 | self.include_ids = context.getProperty(QueryUtils.INCLUDE_IDS).asBoolean() 104 | self.include_metadatas = context.getProperty(QueryUtils.INCLUDE_METADATAS).asBoolean() 105 | self.include_documents = context.getProperty(QueryUtils.INCLUDE_DOCUMENTS).asBoolean() 106 | self.include_distances = context.getProperty(QueryUtils.INCLUDE_DISTANCES).asBoolean() 107 | self.include_embeddings = context.getProperty(QueryUtils.INCLUDE_EMBEDDINGS).asBoolean() 108 | self.results_field = context.getProperty(QueryUtils.RESULTS_FIELD).getValue() 109 | self.query_utils = QueryUtils.QueryUtils(context) 110 | 111 | def transform(self, context, flowfile): 112 | client = self.client 113 | embedding_function = self.embedding_function 114 | collection_name = ( 115 | context.getProperty(ChromaUtils.COLLECTION_NAME).evaluateAttributeExpressions(flowfile).getValue() 116 | ) 117 | 118 | collection = client.get_collection(name=collection_name, embedding_function=embedding_function) 119 | 120 | query_text = 
context.getProperty(self.QUERY).evaluateAttributeExpressions(flowfile).getValue() 121 | embeddings = embedding_function([query_text]) 122 | 123 | included_fields = [] 124 | if self.include_distances: 125 | included_fields.append("distances") 126 | if self.include_documents: 127 | included_fields.append("documents") 128 | if self.include_embeddings: 129 | included_fields.append("embeddings") 130 | if self.include_metadatas: 131 | included_fields.append("metadatas") 132 | 133 | where = None 134 | where_clause = context.getProperty(self.METADATA_FILTER).evaluateAttributeExpressions(flowfile).getValue() 135 | if where_clause is not None: 136 | where = json.loads(where_clause) 137 | 138 | where_document = None 139 | where_document_clause = ( 140 | context.getProperty(self.DOCUMENT_FILTER).evaluateAttributeExpressions(flowfile).getValue() 141 | ) 142 | if where_document_clause is not None: 143 | where_document = json.loads(where_document_clause) 144 | 145 | query_results = collection.query( 146 | query_embeddings=embeddings, 147 | n_results=context.getProperty(self.NUMBER_OF_RESULTS).evaluateAttributeExpressions(flowfile).asInteger(), 148 | include=included_fields, 149 | where_document=where_document, 150 | where=where, 151 | ) 152 | 153 | ids = query_results["ids"][0] 154 | distances = ( 155 | None 156 | if (not self.include_distances or query_results["distances"] is None) 157 | else query_results["distances"][0] 158 | ) 159 | metadatas = ( 160 | None 161 | if (not self.include_metadatas or query_results["metadatas"] is None) 162 | else query_results["metadatas"][0] 163 | ) 164 | documents = ( 165 | None 166 | if (not self.include_documents or query_results["documents"] is None) 167 | else query_results["documents"][0] 168 | ) 169 | embeddings = ( 170 | None 171 | if (not self.include_embeddings or query_results["embeddings"] is None) 172 | else query_results["embeddings"][0] 173 | ) 174 | 175 | (output_contents, mime_type) = self.query_utils.create_json( 176 | flowfile, documents, metadatas, embeddings, distances, ids 177 | ) 178 | 179 | # Return the results 180 | attributes = {"mime.type": mime_type} 181 | return FlowFileTransformResult(relationship="success", contents=output_contents, attributes=attributes) 182 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QueryOpenSearchVector.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | from EmbeddingUtils import EMBEDDING_MODEL, HUGGING_FACE_MODEL, OPENAI_MODEL, create_embedding_service 6 | from langchain.vectorstores import OpenSearchVectorSearch 7 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 8 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 9 | from OpenSearchVectorUtils import ( 10 | CERTIFICATE_PATH, 11 | COSINESIMIL, 12 | HTTP_HOST, 13 | HUGGING_FACE_API_KEY, 14 | INDEX_NAME, 15 | L1, 16 | L2, 17 | LINF, 18 | OPENAI_API_KEY, 19 | PASSWORD, 20 | TEXT_FIELD, 21 | USERNAME, 22 | VECTOR_FIELD, 23 | create_authentication_params, 24 | ) 25 | from QueryUtils import INCLUDE_DISTANCES, INCLUDE_METADATAS, OUTPUT_STRATEGY, RESULTS_FIELD, QueryUtils 26 | 27 | 28 | class QueryOpenSearchVector(FlowFileTransform): 29 | class Java: 30 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 31 | 32 | class ProcessorDetails: 33 | version = "2.0.0.dev0" 34 | description = 
"Queries OpenSearch in order to gather a specified number of documents that are most closely related to the given query." 35 | tags = [ 36 | "opensearch", 37 | "vector", 38 | "vectordb", 39 | "vectorstore", 40 | "embeddings", 41 | "ai", 42 | "artificial intelligence", 43 | "ml", 44 | "machine learning", 45 | "text", 46 | "LLM", 47 | ] 48 | 49 | # Search types 50 | APPROXIMATE_SEARCH = ("Approximate Search", "approximate_search") 51 | SCRIPT_SCORING_SEARCH = ("Script Scoring Search", "script_scoring") 52 | PAINLESS_SCRIPTING_SEARCH = ("Painless Scripting Search", "painless_scripting") 53 | 54 | SEARCH_TYPE_VALUES = dict([APPROXIMATE_SEARCH, SCRIPT_SCORING_SEARCH, PAINLESS_SCRIPTING_SEARCH]) 55 | 56 | # Script Scoring Search space types 57 | HAMMINGBIT = ("Hamming distance", "hammingbit") 58 | 59 | SCRIPT_SCORING_SPACE_TYPE_VALUES = dict([L2, L1, LINF, COSINESIMIL, HAMMINGBIT]) 60 | 61 | # Painless Scripting Search space types 62 | L2_SQUARED = ("L2 (Euclidean distance)", "l2Squared") 63 | L1_NORM = ("L1 (Manhattan distance)", "l1Norm") 64 | COSINE_SIMILARITY = ("Cosine similarity", "cosineSimilarity") 65 | 66 | PAINLESS_SCRIPTING_SPACE_TYPE_VALUES = dict([L2_SQUARED, L1_NORM, COSINE_SIMILARITY]) 67 | 68 | QUERY = PropertyDescriptor( 69 | name="Query", 70 | description="The text of the query to send to OpenSearch.", 71 | required=True, 72 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 73 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 74 | ) 75 | NUMBER_OF_RESULTS = PropertyDescriptor( 76 | name="Number of Results", 77 | description="The number of results to return from OpenSearch", 78 | default_value="10", 79 | required=True, 80 | validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR], 81 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 82 | ) 83 | SEARCH_TYPE = PropertyDescriptor( 84 | name="Search Type", 85 | description="Specifies the type of the search to be performed.", 86 | allowable_values=SEARCH_TYPE_VALUES.keys(), 87 | default_value=APPROXIMATE_SEARCH[0], 88 | required=True, 89 | ) 90 | SCRIPT_SCORING_SPACE_TYPE = PropertyDescriptor( 91 | name="Script Scoring Space Type", 92 | description="Used to measure the distance between two points in order to determine the k-nearest neighbors.", 93 | allowable_values=SCRIPT_SCORING_SPACE_TYPE_VALUES.keys(), 94 | default_value=L2[0], 95 | required=False, 96 | dependencies=[PropertyDependency(SEARCH_TYPE, SCRIPT_SCORING_SEARCH[0])], 97 | ) 98 | PAINLESS_SCRIPTING_SPACE_TYPE = PropertyDescriptor( 99 | name="Painless Scripting Space Type", 100 | description="Used to measure the distance between two points in order to determine the k-nearest neighbors.", 101 | allowable_values=PAINLESS_SCRIPTING_SPACE_TYPE_VALUES.keys(), 102 | default_value=L2_SQUARED[0], 103 | required=False, 104 | dependencies=[PropertyDependency(SEARCH_TYPE, PAINLESS_SCRIPTING_SEARCH[0])], 105 | ) 106 | BOOLEAN_FILTER = PropertyDescriptor( 107 | name="Boolean Filter", 108 | description="A Boolean filter is a post filter consists of a Boolean query that contains a k-NN query and a filter. 
" 109 | "The value of the field must be a JSON representation of the filter.", 110 | required=False, 111 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 112 | dependencies=[PropertyDependency(SEARCH_TYPE, APPROXIMATE_SEARCH[0])], 113 | ) 114 | EFFICIENT_FILTER = PropertyDescriptor( 115 | name="Efficient Filter", 116 | description="The Lucene Engine or Faiss Engine decides whether to perform an exact k-NN search with " 117 | "pre-filtering or an approximate search with modified post-filtering. The value of the field must " 118 | "be a JSON representation of the filter.", 119 | required=False, 120 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 121 | dependencies=[PropertyDependency(SEARCH_TYPE, APPROXIMATE_SEARCH[0])], 122 | ) 123 | PRE_FILTER = PropertyDescriptor( 124 | name="Pre Filter", 125 | description="Script Score query to pre-filter documents before identifying nearest neighbors. The value of " 126 | "the field must be a JSON representation of the filter.", 127 | default_value='{"match_all": {}}', 128 | required=False, 129 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 130 | dependencies=[PropertyDependency(SEARCH_TYPE, SCRIPT_SCORING_SEARCH[0], PAINLESS_SCRIPTING_SEARCH[0])], 131 | ) 132 | 133 | properties = [ 134 | EMBEDDING_MODEL, 135 | OPENAI_API_KEY, 136 | OPENAI_MODEL, 137 | HUGGING_FACE_API_KEY, 138 | HUGGING_FACE_MODEL, 139 | HTTP_HOST, 140 | USERNAME, 141 | PASSWORD, 142 | CERTIFICATE_PATH, 143 | INDEX_NAME, 144 | QUERY, 145 | VECTOR_FIELD, 146 | TEXT_FIELD, 147 | NUMBER_OF_RESULTS, 148 | SEARCH_TYPE, 149 | SCRIPT_SCORING_SPACE_TYPE, 150 | PAINLESS_SCRIPTING_SPACE_TYPE, 151 | BOOLEAN_FILTER, 152 | EFFICIENT_FILTER, 153 | PRE_FILTER, 154 | OUTPUT_STRATEGY, 155 | RESULTS_FIELD, 156 | INCLUDE_METADATAS, 157 | INCLUDE_DISTANCES, 158 | ] 159 | 160 | embeddings = None 161 | query_utils = None 162 | 163 | def __init__(self, **kwargs): 164 | pass 165 | 166 | def getPropertyDescriptors(self): 167 | return self.properties 168 | 169 | def onScheduled(self, context): 170 | # initialize embedding service 171 | self.embeddings = create_embedding_service(context) 172 | self.query_utils = QueryUtils(context) 173 | 174 | def transform(self, context, flowfile): 175 | http_host = context.getProperty(HTTP_HOST).evaluateAttributeExpressions(flowfile).getValue() 176 | index_name = context.getProperty(INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue() 177 | query = context.getProperty(self.QUERY).evaluateAttributeExpressions(flowfile).getValue() 178 | num_results = context.getProperty(self.NUMBER_OF_RESULTS).evaluateAttributeExpressions(flowfile).asInteger() 179 | vector_field = context.getProperty(VECTOR_FIELD).evaluateAttributeExpressions(flowfile).getValue() 180 | text_field = context.getProperty(TEXT_FIELD).evaluateAttributeExpressions(flowfile).getValue() 181 | search_type = context.getProperty(self.SEARCH_TYPE).evaluateAttributeExpressions().getValue() 182 | 183 | params = { 184 | "vector_field": vector_field, 185 | "text_field": text_field, 186 | "search_type": self.SEARCH_TYPE_VALUES.get(search_type), 187 | } 188 | params.update(create_authentication_params(context)) 189 | 190 | if search_type == self.APPROXIMATE_SEARCH[0]: 191 | boolean_filter = context.getProperty(self.BOOLEAN_FILTER).evaluateAttributeExpressions().getValue() 192 | if boolean_filter is not None: 193 | params["boolean_filter"] = json.loads(boolean_filter) 194 | 195 | efficient_filter = context.getProperty(self.EFFICIENT_FILTER).evaluateAttributeExpressions().getValue() 196 | if 
efficient_filter is not None: 197 | params["efficient_filter"] = json.loads(efficient_filter) 198 | else: 199 | pre_filter = context.getProperty(self.PRE_FILTER).evaluateAttributeExpressions().getValue() 200 | if pre_filter is not None: 201 | params["pre_filter"] = json.loads(pre_filter) 202 | if search_type == self.SCRIPT_SCORING_SEARCH[0]: 203 | space_type = ( 204 | context.getProperty(self.SCRIPT_SCORING_SPACE_TYPE).evaluateAttributeExpressions().getValue() 205 | ) 206 | params["space_type"] = self.SCRIPT_SCORING_SPACE_TYPE_VALUES.get(space_type) 207 | elif search_type == self.PAINLESS_SCRIPTING_SEARCH[0]: 208 | space_type = ( 209 | context.getProperty(self.PAINLESS_SCRIPTING_SPACE_TYPE).evaluateAttributeExpressions().getValue() 210 | ) 211 | params["space_type"] = self.PAINLESS_SCRIPTING_SPACE_TYPE_VALUES.get(space_type) 212 | 213 | vectorstore = OpenSearchVectorSearch( 214 | index_name=index_name, embedding_function=self.embeddings, opensearch_url=http_host, **params 215 | ) 216 | 217 | results = vectorstore.similarity_search_with_score(query=query, k=num_results, **params) 218 | 219 | documents = [] 220 | for result in results: 221 | documents.append(result[0].page_content) 222 | 223 | if context.getProperty(INCLUDE_METADATAS).asBoolean(): 224 | metadatas = [] 225 | for result in results: 226 | metadatas.append(result[0].metadata) 227 | else: 228 | metadatas = None 229 | 230 | if context.getProperty(INCLUDE_DISTANCES).asBoolean(): 231 | distances = [] 232 | for result in results: 233 | distances.append(result[1]) 234 | else: 235 | distances = None 236 | 237 | (output_contents, mime_type) = self.query_utils.create_json( 238 | flowfile, documents, metadatas, None, distances, None 239 | ) 240 | attributes = {"mime.type": mime_type} 241 | 242 | return FlowFileTransformResult(relationship="success", contents=output_contents, attributes=attributes) 243 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QueryPinecone.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import langchain.vectorstores 6 | import QueryUtils 7 | from EmbeddingUtils import ( 8 | EMBEDDING_MODEL, 9 | HUGGING_FACE, 10 | HUGGING_FACE_MODEL, 11 | OPENAI, 12 | OPENAI_MODEL, 13 | create_embedding_service, 14 | ) 15 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 16 | from nifiapi.properties import ExpressionLanguageScope, PropertyDependency, PropertyDescriptor, StandardValidators 17 | from pinecone import Pinecone 18 | 19 | 20 | class QueryPinecone(FlowFileTransform): 21 | class Java: 22 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 23 | 24 | class ProcessorDetails: 25 | version = "2.0.0.dev0" 26 | description = "Queries Pinecone in order to gather a specified number of documents that are most closely related to the given query." 
27 | tags = [ 28 | "pinecone", 29 | "vector", 30 | "vectordb", 31 | "vectorstore", 32 | "embeddings", 33 | "ai", 34 | "artificial intelligence", 35 | "ml", 36 | "machine learning", 37 | "text", 38 | "LLM", 39 | ] 40 | 41 | PINECONE_API_KEY = PropertyDescriptor( 42 | name="Pinecone API Key", 43 | description="The API Key to use in order to authenticate with Pinecone", 44 | sensitive=True, 45 | required=True, 46 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 47 | ) 48 | OPENAI_API_KEY = PropertyDescriptor( 49 | name="OpenAI API Key", 50 | description="The API Key for OpenAI, used to create embeddings", 51 | sensitive=True, 52 | required=True, 53 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 54 | dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)], 55 | ) 56 | HUGGING_FACE_API_KEY = PropertyDescriptor( 57 | name="HuggingFace API Key", 58 | description="The API Key for interacting with HuggingFace", 59 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 60 | required=True, 61 | sensitive=True, 62 | dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)], 63 | ) 64 | PINECONE_ENV = PropertyDescriptor( 65 | name="Pinecone Environment", 66 | description="The name of the Pinecone Environment. This can be found in the Pinecone console next to the API Key.", 67 | sensitive=False, 68 | required=True, 69 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 70 | ) 71 | INDEX_NAME = PropertyDescriptor( 72 | name="Index Name", 73 | description="The name of the Pinecone index.", 74 | sensitive=False, 75 | required=True, 76 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 77 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 78 | ) 79 | QUERY = PropertyDescriptor( 80 | name="Query", 81 | description="The text of the query to send to Pinecone.", 82 | required=True, 83 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 84 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 85 | ) 86 | NUMBER_OF_RESULTS = PropertyDescriptor( 87 | name="Number of Results", 88 | description="The number of results to return from Pinecone", 89 | required=True, 90 | validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR], 91 | default_value="10", 92 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 93 | ) 94 | TEXT_KEY = PropertyDescriptor( 95 | name="Text Key", 96 | description="The key in the document that contains the text to create embeddings for.", 97 | required=True, 98 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 99 | default_value="text", 100 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 101 | ) 102 | NAMESPACE = PropertyDescriptor( 103 | name="Namespace", 104 | description="The name of the Pinecone Namespace to query into.", 105 | required=False, 106 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 107 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 108 | ) 109 | FILTER = PropertyDescriptor( 110 | name="Metadata Filter", 111 | description='Optional metadata filter to apply with the query. 
For example: { "author": {"$eq": "john.doe"} }', 112 | required=False, 113 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 114 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 115 | ) 116 | 117 | properties = [ 118 | PINECONE_API_KEY, 119 | EMBEDDING_MODEL, 120 | OPENAI_API_KEY, 121 | OPENAI_MODEL, 122 | HUGGING_FACE_API_KEY, 123 | HUGGING_FACE_MODEL, 124 | PINECONE_ENV, 125 | INDEX_NAME, 126 | QUERY, 127 | FILTER, 128 | NUMBER_OF_RESULTS, 129 | NAMESPACE, 130 | TEXT_KEY, 131 | QueryUtils.OUTPUT_STRATEGY, 132 | QueryUtils.RESULTS_FIELD, 133 | QueryUtils.INCLUDE_METADATAS, 134 | QueryUtils.INCLUDE_DISTANCES, 135 | ] 136 | 137 | embeddings = None 138 | query_utils = None 139 | pc = None 140 | 141 | def __init__(self, **kwargs): 142 | pass 143 | 144 | def getPropertyDescriptors(self): 145 | return self.properties 146 | 147 | def onScheduled(self, context): 148 | # initialize pinecone 149 | self.pc = Pinecone( 150 | api_key=context.getProperty(self.PINECONE_API_KEY).getValue(), 151 | environment=context.getProperty(self.PINECONE_ENV).getValue(), 152 | ) 153 | # initialize embedding service 154 | self.embeddings = create_embedding_service(context) 155 | self.query_utils = QueryUtils.QueryUtils(context) 156 | 157 | def transform(self, context, flowfile): 158 | # Resolve the index, query, and filter parameters from the processor properties and FlowFile attributes 159 | index_name = context.getProperty(self.INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue() 160 | query = context.getProperty(self.QUERY).evaluateAttributeExpressions(flowfile).getValue() 161 | namespace = context.getProperty(self.NAMESPACE).evaluateAttributeExpressions(flowfile).getValue() 162 | num_results = context.getProperty(self.NUMBER_OF_RESULTS).evaluateAttributeExpressions(flowfile).asInteger() 163 | 164 | index = self.pc.Index(index_name) 165 | 166 | text_key = context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue() 167 | filter_definition = context.getProperty(self.FILTER).evaluateAttributeExpressions(flowfile).getValue() 168 | vectorstore = langchain.vectorstores.Pinecone(index, self.embeddings.embed_query, text_key, namespace=namespace) 169 | results = vectorstore.similarity_search_with_score( 170 | query, num_results, filter=None if filter_definition is None else json.loads(filter_definition) 171 | ) 172 | 173 | documents = [] 174 | for result in results: 175 | documents.append(result[0].page_content) 176 | 177 | if context.getProperty(QueryUtils.INCLUDE_METADATAS).asBoolean(): 178 | metadatas = [] 179 | for result in results: 180 | metadatas.append(result[0].metadata) 181 | else: 182 | metadatas = None 183 | 184 | if context.getProperty(QueryUtils.INCLUDE_DISTANCES).asBoolean(): 185 | distances = [] 186 | for result in results: 187 | distances.append(result[1]) 188 | else: 189 | distances = None 190 | 191 | (output_contents, mime_type) = self.query_utils.create_json( 192 | flowfile, documents, metadatas, None, distances, None 193 | ) 194 | attributes = {"mime.type": mime_type} 195 | 196 | return FlowFileTransformResult(relationship="success", contents=output_contents, attributes=attributes) 197 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QueryQdrant.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | import QdrantUtils 6 | import QueryUtils 7 | from EmbeddingUtils import ( 8 | create_embedding_service, 9 | ) 10 | from 
langchain.vectorstores.qdrant import Qdrant 11 | from nifiapi.documentation import use_case 12 | from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult 13 | from nifiapi.properties import ( 14 | ExpressionLanguageScope, 15 | PropertyDescriptor, 16 | StandardValidators, 17 | ) 18 | from qdrant_client import QdrantClient 19 | 20 | 21 | @use_case( 22 | description="Semantically search for documents stored in Qdrant - https://qdrant.tech/", 23 | keywords=["qdrant", "embedding", "vector", "text", "vectorstore", "search"], 24 | configuration=""" 25 | Configure 'Collection Name' to the name of the Qdrant collection to use. 26 | Configure 'Qdrant URL' to the fully qualified URL of the Qdrant instance. 27 | Configure 'Qdrant API Key' to the API Key to use in order to authenticate with Qdrant. 28 | Configure 'Prefer gRPC' to True if you want to use gRPC for interfacing with Qdrant. 29 | Configure 'Use HTTPS' to True if you want to use TLS(HTTPS) while interfacing with Qdrant. 30 | Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' 31 | Configure 'HuggingFace API Key' or 'OpenAI API Key', depending on the chosen Embedding Model. 32 | Configure 'HuggingFace Model' or 'OpenAI Model' to the name of the model to use. 33 | Configure 'Query' to the text of the query to send to Qdrant. 34 | Configure 'Number of Results' to the number of results to return from Qdrant. 35 | Configure 'Metadata Filter' to apply an optional metadata filter with the query. For example: { "author": "john.doe" } 36 | Configure 'Output Strategy' to indicate how the output should be formatted: 'Row-Oriented', 'Text', or 'Column-Oriented'. 37 | Configure 'Results Field' to the name of the field to insert the results, if the input FlowFile is JSON Formatted,. 38 | Configure 'Include Metadatas' to True if metadata should be included in the output. 39 | Configure 'Include Distances' to True if distances should be included in the output. 40 | """, 41 | ) 42 | class QueryQdrant(FlowFileTransform): 43 | class Java: 44 | implements = ["org.apache.nifi.python.processor.FlowFileTransform"] 45 | 46 | class ProcessorDetails: 47 | version = "2.0.0.dev0" 48 | description = "Queries Qdrant in order to gather a specified number of documents that are most closely related to the given query." 49 | tags = [ 50 | "qdrant", 51 | "vector", 52 | "vectordb", 53 | "vectorstore", 54 | "embeddings", 55 | "ai", 56 | "artificial intelligence", 57 | "ml", 58 | "machine learning", 59 | "text", 60 | "LLM", 61 | ] 62 | 63 | QUERY = PropertyDescriptor( 64 | name="Query", 65 | description="The text of the query to send to Qdrant.", 66 | required=True, 67 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 68 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 69 | ) 70 | NUMBER_OF_RESULTS = PropertyDescriptor( 71 | name="Number of Results", 72 | description="The number of results to return from Qdrant.", 73 | required=True, 74 | validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR], 75 | default_value="10", 76 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 77 | ) 78 | FILTER = PropertyDescriptor( 79 | name="Metadata Filter", 80 | description='Optional metadata filter to apply with the query. 
For example: { "author": "john.doe" }', 81 | required=False, 82 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 83 | expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, 84 | ) 85 | 86 | properties = ( 87 | QdrantUtils.QDRANT_PROPERTIES 88 | + QdrantUtils.EMBEDDING_MODEL_PROPERTIES 89 | + [ 90 | QUERY, 91 | FILTER, 92 | NUMBER_OF_RESULTS, 93 | QueryUtils.OUTPUT_STRATEGY, 94 | QueryUtils.RESULTS_FIELD, 95 | QueryUtils.INCLUDE_METADATAS, 96 | QueryUtils.INCLUDE_DISTANCES, 97 | ] 98 | ) 99 | 100 | embeddings = None 101 | query_utils = None 102 | client = None 103 | 104 | def __init__(self, **kwargs): 105 | pass 106 | 107 | def getPropertyDescriptors(self): 108 | return self.properties 109 | 110 | def onScheduled(self, context): 111 | self.client = QdrantClient( 112 | url=context.getProperty(QdrantUtils.QDRANT_URL).getValue(), 113 | api_key=context.getProperty(QdrantUtils.QDRANT_API_KEY).getValue(), 114 | prefer_grpc=context.getProperty(QdrantUtils.PREFER_GRPC).asBoolean(), 115 | https=context.getProperty(QdrantUtils.HTTPS).asBoolean(), 116 | ) 117 | self.embeddings = create_embedding_service(context) 118 | self.query_utils = QueryUtils.QueryUtils(context) 119 | 120 | def transform(self, context, flowfile): 121 | collection_name = ( 122 | context.getProperty(QdrantUtils.COLLECTION_NAME).evaluateAttributeExpressions(flowfile).getValue() 123 | ) 124 | query = context.getProperty(self.QUERY).evaluateAttributeExpressions(flowfile).getValue() 125 | num_results = context.getProperty(self.NUMBER_OF_RESULTS).evaluateAttributeExpressions(flowfile).asInteger() 126 | filter_definition = context.getProperty(self.FILTER).evaluateAttributeExpressions(flowfile).getValue() 127 | vector_store = Qdrant( 128 | client=self.client, 129 | collection_name=collection_name, 130 | embeddings=self.embeddings, 131 | ) 132 | results = vector_store.similarity_search_with_score( 133 | query=query, 134 | k=num_results, 135 | filter=None if filter_definition is None else json.loads(filter_definition), 136 | ) 137 | 138 | documents = [] 139 | for result in results: 140 | documents.append(result[0].page_content) 141 | 142 | if context.getProperty(QueryUtils.INCLUDE_METADATAS).asBoolean(): 143 | metadatas = [] 144 | for result in results: 145 | metadatas.append(result[0].metadata) 146 | else: 147 | metadatas = None 148 | 149 | if context.getProperty(QueryUtils.INCLUDE_DISTANCES).asBoolean(): 150 | distances = [] 151 | for result in results: 152 | distances.append(result[1]) 153 | else: 154 | distances = None 155 | 156 | (output_contents, mime_type) = self.query_utils.create_json( 157 | flowfile, documents, metadatas, None, distances, None 158 | ) 159 | attributes = {"mime.type": mime_type} 160 | 161 | return FlowFileTransformResult(relationship="success", contents=output_contents, attributes=attributes) 162 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/QueryUtils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import json 4 | 5 | from nifiapi.properties import PropertyDependency, PropertyDescriptor, StandardValidators 6 | 7 | ROW_ORIENTED = "Row-Oriented" 8 | TEXT = "Text" 9 | COLUMN_ORIENTED = "Column-Oriented" 10 | 11 | 12 | OUTPUT_STRATEGY = PropertyDescriptor( 13 | name="Output Strategy", 14 | description="""Specifies whether the output should contain only the text of the documents (each document separated by \\n\\n), or if it 15 | should be 
formatted as either a single column-oriented JSON object, 16 | consisting of the keys 'ids', 'embeddings', 'documents', 'distances', and 'metadatas'; or if the results should be row-oriented, 17 | one JSON object per line, each consisting of a single id, document, metadata, embedding, and distance.""", 18 | allowable_values=[ROW_ORIENTED, TEXT, COLUMN_ORIENTED], 19 | default_value=ROW_ORIENTED, 20 | required=True, 21 | ) 22 | RESULTS_FIELD = PropertyDescriptor( 23 | name="Results Field", 24 | description="""If the input FlowFile is JSON Formatted, this represents the name of the field into which to insert the results. This allows the results to be inserted into 25 | an existing input in order to enrich it. If this property is unset, the results will be written to the FlowFile contents, overwriting any pre-existing content.""", 26 | validators=[StandardValidators.NON_EMPTY_VALIDATOR], 27 | required=False, 28 | ) 29 | 30 | INCLUDE_IDS = PropertyDescriptor( 31 | name="Include Document IDs", 32 | description="Whether or not to include the Documents' IDs in the response", 33 | allowable_values=["true", "false"], 34 | default_value="true", 35 | required=False, 36 | dependencies=[PropertyDependency(OUTPUT_STRATEGY, ROW_ORIENTED, COLUMN_ORIENTED)], 37 | ) 38 | INCLUDE_METADATAS = PropertyDescriptor( 39 | name="Include Metadata", 40 | description="Whether or not to include the Documents' Metadata in the response", 41 | allowable_values=["true", "false"], 42 | default_value="true", 43 | required=False, 44 | dependencies=[PropertyDependency(OUTPUT_STRATEGY, ROW_ORIENTED, COLUMN_ORIENTED)], 45 | ) 46 | INCLUDE_DOCUMENTS = PropertyDescriptor( 47 | name="Include Document", 48 | description="Whether or not to include the Documents' Text in the response", 49 | allowable_values=["true", "false"], 50 | default_value="true", 51 | required=False, 52 | dependencies=[PropertyDependency(OUTPUT_STRATEGY, ROW_ORIENTED, COLUMN_ORIENTED)], 53 | ) 54 | INCLUDE_DISTANCES = PropertyDescriptor( 55 | name="Include Distances", 56 | description="Whether or not to include the Documents' Distances (i.e., how far the Document was away from the query) in the response", 57 | allowable_values=["true", "false"], 58 | default_value="true", 59 | required=False, 60 | dependencies=[PropertyDependency(OUTPUT_STRATEGY, ROW_ORIENTED, COLUMN_ORIENTED)], 61 | ) 62 | INCLUDE_EMBEDDINGS = PropertyDescriptor( 63 | name="Include Embeddings", 64 | description="Whether or not to include the Documents' Embeddings in the response", 65 | allowable_values=["true", "false"], 66 | default_value="false", 67 | required=False, 68 | dependencies=[PropertyDependency(OUTPUT_STRATEGY, ROW_ORIENTED, COLUMN_ORIENTED)], 69 | ) 70 | 71 | 72 | class QueryUtils: 73 | context = None 74 | 75 | def __init__(self, context): 76 | self.context = context 77 | self.results_field = context.getProperty(RESULTS_FIELD).getValue() 78 | self.output_strategy = context.getProperty(OUTPUT_STRATEGY).getValue() 79 | 80 | ids_property = context.getProperty(INCLUDE_IDS) 81 | self.include_ids = ids_property.asBoolean() if ids_property else False 82 | 83 | embeddings_property = context.getProperty(INCLUDE_EMBEDDINGS) 84 | self.include_embeddings = embeddings_property.asBoolean() if embeddings_property else False 85 | 86 | self.include_distances = context.getProperty(INCLUDE_DISTANCES).asBoolean() 87 | 88 | documents_property = context.getProperty(INCLUDE_DOCUMENTS) 89 | self.include_documents = documents_property.asBoolean() if documents_property else True 90 | self.include_metadatas = 
context.getProperty(INCLUDE_METADATAS).asBoolean() 91 | 92 | def create_json(self, flowfile, documents, metadatas, embeddings, distances, ids) -> tuple[str, str]: 93 | input_json = None if self.results_field is None else json.loads(flowfile.getContentsAsBytes().decode()) 94 | 95 | if self.output_strategy == TEXT: 96 | # Delete any document that is None or an empty-string 97 | documents = [doc for doc in documents if doc is not None and doc != ""] 98 | 99 | # Join the documents with two newlines 100 | text = "\n\n".join(documents) 101 | 102 | # Create either JSON or text output, based on whether or not an results field was specified 103 | if input_json is None: 104 | mime_type = "text/plain" 105 | output_contents = text 106 | else: 107 | input_json[self.results_field] = text 108 | output_contents = json.dumps(input_json) 109 | mime_type = "application/json" 110 | elif self.output_strategy == COLUMN_ORIENTED: 111 | doc = {} 112 | if self.include_ids: 113 | doc["ids"] = ids 114 | if self.include_distances: 115 | doc["distances"] = distances 116 | if self.include_documents: 117 | doc["documents"] = documents 118 | if self.include_metadatas: 119 | doc["metadatas"] = metadatas 120 | if self.include_embeddings: 121 | doc["embeddings"] = embeddings 122 | 123 | # Create the JSON from the Document 124 | if input_json is None: 125 | output_contents = json.dumps(doc) 126 | else: 127 | input_json[self.results_field] = doc 128 | output_contents = json.dumps(input_json) 129 | 130 | mime_type = "application/json" 131 | else: 132 | # Build the Documents 133 | docs = [] 134 | 135 | count = len(ids) if ids else len(documents) 136 | for i in range(count): 137 | doc_id = None if ids is None else ids[i] 138 | distance = None if distances is None else distances[i] 139 | metadata = None if metadatas is None else metadatas[i] 140 | document = None if documents is None else documents[i] 141 | embedding = None if embeddings is None else embeddings[i] 142 | 143 | # Create the document but do not include any key that we don't want to include in the output. 144 | doc = {} 145 | if self.include_ids: 146 | doc["id"] = doc_id 147 | if self.include_distances: 148 | doc["distance"] = distance 149 | if self.include_documents: 150 | doc["document"] = document 151 | if self.include_metadatas: 152 | doc["metadata"] = metadata 153 | if self.include_embeddings: 154 | doc["embedding"] = embedding 155 | 156 | docs.append(doc) 157 | 158 | # If input_json is None, we just create JSON based on the Documents. 159 | # If input_json is populated, we insert the documents into the input JSON using the specified key. 
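            # Illustrative note, with invented values (not taken from the source files):
            # with the default Include properties (IDs, documents, metadatas, and distances enabled,
            # embeddings disabled), row-oriented output is one JSON object per line, roughly:
            #   {"id": "doc-1", "distance": 0.08, "document": "...", "metadata": {"source": "intro.md"}}
            # When a Results Field is configured and the input is JSON, the same list of objects is
            # nested under that field instead of being written out line by line.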
160 | if input_json is None: 161 | jsons = [] 162 | for doc in docs: 163 | jsons.append(json.dumps(doc)) 164 | output_contents = "\n".join(jsons) 165 | else: 166 | input_json[self.results_field] = docs 167 | output_contents = json.dumps(input_json) 168 | 169 | mime_type = "application/json" 170 | 171 | return output_contents, mime_type 172 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | -------------------------------------------------------------------------------- /src/extensions/vectorstores/requirements.txt: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Shared requirements 4 | openai==1.9.0 5 | tiktoken 6 | langchain==0.1.11 7 | 8 | # Chroma requirements 9 | chromadb==0.4.22 10 | numpy==1.26.4 11 | onnxruntime 12 | tokenizers 13 | tqdm 14 | requests 15 | 16 | # Pinecone requirements 17 | pinecone-client==3.0.1 18 | tiktoken 19 | langchain==0.1.11 20 | 21 | # OpenSearch requirements 22 | opensearch-py==2.5.0 23 | 24 | # Qdrant requirements 25 | qdrant-client==1.9.1 26 | --------------------------------------------------------------------------------
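Note on the Chroma filter properties: the 'Metadata Filter' and 'Document Filter' described in QueryChroma.py map onto the where and where_document arguments of Chroma's collection.query() call shown above. The standalone sketch below illustrates that mapping against an in-memory Chroma client; the collection name, documents, and filter values are invented for the example and are not part of this repository.

import chromadb

# In-memory client for illustration only; QueryChroma builds its client via ChromaUtils.create_client()
client = chromadb.Client()
collection = client.get_or_create_collection(name="nifi-demo")

collection.add(
    ids=["doc-1", "doc-2"],
    documents=["NiFi moves data between systems.", "Chroma stores embeddings for search."],
    metadatas=[{"topic": "nifi"}, {"topic": "chroma"}],
)

results = collection.query(
    query_texts=["how does nifi move data"],
    n_results=2,
    where={"topic": "nifi"},  # corresponds to the 'Metadata Filter' property, e.g. { "metadata_field": "some_value" }
    where_document={"$contains": "data"},  # corresponds to the 'Document Filter' property, e.g. { "$contains": "search_string" }
    include=["documents", "metadatas", "distances"],
)

# Each field comes back as a list of lists, one inner list per query text,
# which is why QueryChroma indexes every field with [0].
print(results["ids"][0], results["distances"][0])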
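Note on the OpenSearch filter properties: 'Boolean Filter', 'Efficient Filter', and 'Pre Filter' in QueryOpenSearchVector.py each accept a JSON string that the processor parses with json.loads() and forwards to langchain's OpenSearchVectorSearch. The sketch below shows the kind of values that could be supplied, following the OpenSearch query DSL; the field names and values are assumptions made for illustration and are not taken from this repository.

import json

# 'Boolean Filter' (Approximate Search): a post filter wrapping a Boolean query (assumed field names)
boolean_filter = {"bool": {"filter": {"term": {"metadata.topic": "nifi"}}}}

# 'Efficient Filter' (Approximate Search with the Lucene or Faiss engine), again with assumed fields
efficient_filter = {"bool": {"must": [{"range": {"metadata.year": {"gte": 2023}}}]}}

# 'Pre Filter' (Script Scoring or Painless Scripting searches); the property's default is a match-all query
pre_filter = {"match_all": {}}

# The processor receives each value as a string property and json.loads() it before querying
for name, value in [("boolean_filter", boolean_filter), ("efficient_filter", efficient_filter), ("pre_filter", pre_filter)]:
    print(name, json.dumps(value))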
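Note on the Output Strategy property defined in QueryUtils.py: the three strategies serialize the same query results in different shapes. The plain-Python sketch below mirrors the formatting performed by QueryUtils.create_json when no Results Field is set; the result values are invented for the example.

import json

# Hypothetical query results
ids = ["doc-1", "doc-2"]
documents = ["NiFi routes and transforms data.", "Processors are configurable components."]
metadatas = [{"source": "intro.md"}, {"source": "arch.md"}]
distances = [0.08, 0.31]

# 'Text': document texts joined with two newlines
text_output = "\n\n".join(documents)

# 'Column-Oriented': a single JSON object keyed by field name
column_output = json.dumps(
    {"ids": ids, "documents": documents, "metadatas": metadatas, "distances": distances}
)

# 'Row-Oriented': one JSON object per line, one line per matching document
row_output = "\n".join(
    json.dumps({"id": i, "document": d, "metadata": m, "distance": s})
    for i, d, m, s in zip(ids, documents, metadatas, distances)
)

print(text_output, column_output, row_output, sep="\n---\n")

When a Results Field is configured and the incoming FlowFile is JSON, create_json nests these structures under that field of the input instead of replacing the FlowFile contents.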