├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── env
├── pyproject.toml
├── requirements.dev.txt
├── requirements.txt
└── src
    └── vectra_py
        ├── __init__.py
        ├── all_MiniLM_L6_v2_tokenizer.py
        ├── custom_types.py
        ├── file_fetcher.py
        ├── gpt3_tokenizer.py
        ├── item_selector.py
        ├── local_document.py
        ├── local_document_index.py
        ├── local_document_result.py
        ├── local_index.py
        ├── openai_embeddings.py
        ├── oss_embeddings.py
        ├── text_splitter.py
        ├── vectra-cli.py
        ├── vectra-pipeline.py
        └── web_fetcher.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | # keyfiles
163 | *.keys


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 brian schleckser
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Update: Revisiting this very stale project in Fall 2024. Going to start with a general review of the source vectra project, then go from there. As such I'll probably reply to the issues and close them, pending new discussion. 
  2 | 
  3 | # vectra-py
  4 | This is a faithful port of Steven Ickman's [Vectra](https://github.com/Stevenic/vectra) in memory vector index project. Only modifications were to port into python, adjust for format, and generate some python friendly example code. Below readme follows on from his, with similar pythonic adjustments.
  5 | 
  6 | Thanks for the inspiriation Steve!
  7 | 
  8 | 
  9 | Vectra-py is a local vector database for Python with features similar to [Pinecone](https://www.pinecone.io/) or [Qdrant](https://qdrant.tech/) but built using local files. Each Vectra index is a folder on disk. There's an `index.json` file in the folder that contains all the vectors for the index along with any indexed metadata.  When you create an index you can specify which metadata properties to index and only those fields will be stored in the `index.json` file. All of the other metadata for an item will be stored on disk in a separate file keyed by a GUID.
 10 | 
 11 | When queryng Vectra you'll be able to use the same subset of [Mongo DB query operators](https://www.mongodb.com/docs/manual/reference/operator/query/) that Pinecone supports and the results will be returned sorted by similarity. Every item in the index will first be filtered by metadata and then ranked for similarity. Even though every item is evaluated its all in memory so it should by nearly instantanious. Likely 1ms - 2ms for even a rather large index. Smaller indexes should be <1ms.
 12 | 
 13 | Keep in mind that your entire Vectra index is loaded into memory so it's not well suited for scenarios like long term chat bot memory. Use a real vector DB for that. Vectra is intended to be used in scenarios where you have a small corpus of mostly static data that you'd like to include in your prompt. Infinite few shot examples would be a great use case for Vectra or even just a single document you want to ask questions over.
 14 | 
 15 | Pinecone style namespaces aren't directly supported but you could easily mimic them by creating a separate Vectra index (and folder) for each namespace.
 16 | 
 17 | ## Installation
 18 | 
 19 | ```
 20 | $ pip install vectra-py
 21 | ```
 22 | 
 23 | ## Prep
 24 | 
 25 | Use dotenv or set env var to store your openAI API Key.
 26 | 
 27 | ## Usage
 28 | 
 29 | First create an instance of `LocalIndex` with the path to the folder where you want you're items stored:
 30 | 
 31 | ```python
 32 | from vectra_py import LocalIndex
 33 | 
 34 | index = LocalIndex(os.path.join(os.getcwd(), 'index'))
 35 | ```
 36 | 
 37 | Next, from inside an async function, create your index:
 38 | 
 39 | ```python
 40 | if not index.isIndexCreated():
 41 |         index.createIndex()
 42 | ```
 43 | 
 44 | Add some items to your index:
 45 | 
 46 | ```python
 47 | openai.api_key = os.environ.get("OPENAI_APIKEY")
 48 | 
 49 | async def get_vector(text: str):
 50 |     print(text)
 51 |     model = "text-embedding-ada-002"
 52 |     response = await openai_async.embeddings(
 53 |                                             openai.api_key,
 54 |                                             timeout=2,
 55 |                                             payload={"model": model,
 56 |                                                      "input": [text]},
 57 |                                         )
 58 |     return response.json()['data'][0]['embedding']
 59 | 
 60 | 
 61 | async def add_item(text: str):
 62 |     vector = await get_vector(text)
 63 |     metadata = {'text': text}
 64 |     print(vector, metadata)
 65 |     await index.insertItem({'vector': vector,
 66 |                             'metadata': metadata})
 67 | 
 68 | // Add items
 69 | await add_item('apple');
 70 | await add_item('oranges');
 71 | await add_item('red');
 72 | await add_item('blue');
 73 | ```
 74 | 
 75 | Then query for items:
 76 | 
 77 | ```python
 78 | async def query(text: str):
 79 |     vector = await get_vector(text)
 80 |     results = await index.queryItems(vector, 3)
 81 |     if len(results) > 0:
 82 |         for result in results:
 83 |             print(f"[{result['score']}] \
 84 |                   {result.get('item')['metadata']['text']}")
 85 |     else:
 86 |         print("No results found.")
 87 | 
 88 | await query('green')
 89 | /*
 90 | [0.9036569942401076] blue
 91 | [0.8758153664568566] red
 92 | [0.8323828606103998] apple
 93 | */
 94 | 
 95 | await query('banana')
 96 | /*
 97 | [0.9033128691220631] apple
 98 | [0.8493374123092652] oranges
 99 | [0.8415324469533297] blue
100 | */
101 | ```
102 | 
103 | Creating a document index is a bit more involved. 
104 | 
105 | First, set up configurations. Pass in an example list of Filing objects as a list_file like:
106 | ```json
107 | {
108 |     "filings": [
109 |         {
110 |             "company_name": "DigitalBridge Group, Inc.",
111 |             "form_type": "10-Q",
112 |             "filing_date": "20230505",
113 |             "url": "https://www.sec.gov/Archives/edgar/data/0001679688/000167968823000049/dbrg-20230331.htm"
114 |         }
115 |     ]
116 | }
117 | ```
118 | 
119 | ```python
120 | import os
121 | import json
122 | import asyncio
123 | from typing import List
124 | from dataclasses import dataclass
125 | 
126 | from all_MiniLM_L6_v2_tokenizer import OSSTokenizer
127 | from oss_embeddings import OSSEmbeddings, OSSEmbeddingsOptions
128 | from openai_embeddings import OpenAIEmbeddings, OpenAIEmbeddingsOptions
129 | from local_index import LocalIndex, CreateIndexConfig
130 | from local_document_index import LocalDocumentIndex, LocalDocumentIndexConfig
131 | from file_fetcher import FileFetcher
132 | from web_fetcher import WebFetcher
133 | 
134 | # test defaults
135 | keys_file = "vectra.keys"
136 | uri = None
137 | list_file = "test_filings_1.json"
138 | item_type = "html"
139 | 
140 | openai_options = OpenAIEmbeddingsOptions(
141 |     api_key=os.environ.get("OPENAI_API_KEY"),
142 |     model="text-embedding-ada-002",
143 |     retry_policy=[2000, 5000],
144 |     request_config={"timeout": 30}
145 | )
146 | 
147 | oss_options = OSSEmbeddingsOptions(
148 |     tokenizer=OSSTokenizer(model_name="sentence-transformers/all-MiniLM-L6-v2"),
149 |     model="sentence-transformers/all-MiniLM-L6-v2"
150 | )
151 | 
152 | 
153 | @dataclass
154 | class Filing:
155 |     company_name: str
156 |     form_type: str
157 |     filing_date: str
158 |     url: str
159 | ```
160 | 
161 | Next, write a basic way to organize the filings.
162 | ```python
163 | def get_item_list(uri: str, list_file: str, item_type: str) -> List[str]:
164 |     """Get a list of URIs from a specified URI or list file"""
165 |     if uri:
166 |         return [uri]
167 |     elif list_file:
168 |         with open(list_file, "r", encoding="utf-8") as file:
169 |             filings = json.load(file)['filings']
170 |             return [Filing(**filing) for filing in filings]
171 | 
172 |     else:
173 |         raise Exception(f"Please provide a {item_type} URI or list file")
174 | ```
175 | 
176 | Then, handle the operations to create, manage, and populate the doc index.
177 | 
178 | ```python
179 | async def add_docs_to_index(uri: str = None, list_file: str = None, item_type: str = None):
180 |     """
181 |     Handle operations.
182 |     Establish the index, prepare the config, fetch the docs, and add them to the index.
183 |     """
184 |     print("Adding Web Pages to Index")
185 | 
186 |     # Create embeddings and tokenizer
187 |     # embeddings = OpenAIEmbeddings(options=openai_options)
188 |     # tokenizer = None  # the tokenizer is wrapped in the openai embedding.
189 |     embeddings = OSSEmbeddings(options=oss_options)
190 |     tokenizer = embeddings.tokenizer
191 |     # Initialize index in current directory
192 |     # update the index_config to include the embeddings
193 |     doc_index_config = LocalDocumentIndexConfig(folder_path=(os.path.join(os.getcwd(), 'index')),
194 |                                                 tokenizer=tokenizer,
195 |                                                 embeddings=embeddings)
196 |     simple_index_config = CreateIndexConfig(version=1, 
197 |                                             delete_if_exists=True,
198 |                                             metadata_config={"model_framework": embeddings.__class__.__name__,
199 |                                                              "model_name": embeddings.options.model},
200 |                                             )
201 |     index = LocalDocumentIndex(doc_index_config)
202 |     await index.create_index(simple_index_config)
203 | 
204 |     # Get list of URIs
205 |     uris = get_item_list(uri, list_file, item_type)
206 |     print('uris', uris)
207 | 
208 |     # Fetch web pages
209 |     file_fetcher = FileFetcher()
210 |     web_fetcher = WebFetcher()
211 |     for uri in uris:
212 |         try:
213 |             url = uri.url if isinstance(uri, Filing) else uri
214 |             print(f"Fetching {url}")
215 |             fetcher = web_fetcher if url.startswith("http") else file_fetcher
216 |             fetched_doc = fetcher.fetch(url)
217 |             await index.upsert_document(url,
218 |                                         fetched_doc,
219 |                                         doc_type=item_type)
220 |         except Exception as err:
221 |             print(f"Error adding: {uri}\n{str(err)}")
222 | 
223 | 
224 | async def main():
225 |     await add_docs_to_index(list_file=list_file, item_type=item_type)
226 | 
227 | if __name__ == "__main__":
228 |     asyncio.run(main())
229 | 
230 | ```
231 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BMS-geodev/vectra-py/285bd341f1da469d9695fdee584b2d526f24a4ee/__init__.py


--------------------------------------------------------------------------------
/env:
--------------------------------------------------------------------------------
1 | # copy to .env, then replace ### with your API key.
2 | OPENAI_APIKEY=###


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "vectra_py"
 3 | version = "0.0.5"
 4 | authors = [
 5 |   { name="Brian Schleckser", email="brian.schleckser+vectrapy@gmail.com" },
 6 | ]
 7 | description = "An in memory vector index project, simliar to Pinecone DB."
 8 | readme = "README.md"
 9 | requires-python = ">=3.9"
10 | classifiers = [
11 |     "Programming Language :: Python :: 3",
12 |     "License :: OSI Approved :: MIT License",
13 |     "Operating System :: OS Independent",
14 | ]
15 | 
16 | [project.urls]
17 | "Homepage" = "https://github.com/BMS-geodev/vectra-py"
18 | "Bug Tracker" = "https://github.com/BMS-geodev/vectra-py/issues"


--------------------------------------------------------------------------------
/requirements.dev.txt:
--------------------------------------------------------------------------------
 1 | openai
 2 | tiktoken
 3 | openai-async
 4 | python-dotenv
 5 | bs4
 6 | markdownify
 7 | transformers
 8 | sentence-transformers
 9 | colorize
10 | aiofiles


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tiktoken
2 | openai
3 | bs4
4 | markdownify
5 | transformers
6 | sentence-transformers
7 | aiofiles


--------------------------------------------------------------------------------
/src/vectra_py/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BMS-geodev/vectra-py/285bd341f1da469d9695fdee584b2d526f24a4ee/src/vectra_py/__init__.py


--------------------------------------------------------------------------------
/src/vectra_py/all_MiniLM_L6_v2_tokenizer.py:
--------------------------------------------------------------------------------
 1 | from sentence_transformers import SentenceTransformer
 2 | from transformers import AutoTokenizer
 3 | 
 4 | 
 5 | class OSSTokenizer:
 6 |     def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
 7 |         # Load model from HuggingFace Hub
 8 |         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
 9 | 
10 |     def decode(self, tokens):
11 |         pass
12 | 
13 |     def encode(self, text):
14 |         try:
15 |             if len(text) > 1:  # if text is a list of strings
16 |                 data = [self.tokenizer.encode(item) for item in text]
17 |                 return data
18 |             else:
19 |                 data = self.tokenizer.encode(text)
20 |                 return data
21 |         except Exception as e:
22 |             print('encoding error', e)
23 |             return None
24 | 


--------------------------------------------------------------------------------
/src/vectra_py/custom_types.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | from typing import List, Union, Dict, Optional, Any
  3 | 
  4 | 
  5 | @dataclass
  6 | class EmbeddingsModel:
  7 |     max_tokens: int
  8 | 
  9 |     # async def create_embeddings(self, inputs: Union[str, List[str]]) -> 'EmbeddingsResponse':
 10 |     #     pass
 11 | 
 12 | 
 13 | @dataclass
 14 | class EmbeddingsResponse:
 15 |     status: str
 16 |     output: List[List[float]] = None
 17 |     message: str = None
 18 | 
 19 | 
 20 | @dataclass
 21 | class TextChunk:
 22 |     text: str
 23 |     tokens: List[int]
 24 |     start_pos: int
 25 |     end_pos: int
 26 |     start_overlap: List[int]
 27 |     end_overlap: List[int]
 28 | 
 29 | 
 30 | @dataclass
 31 | class TextFetcher:
 32 |     async def fetch(self, uri: str) -> Dict[str, Union[str, None]]:
 33 |         pass
 34 | 
 35 | 
 36 | @dataclass
 37 | class IndexStats:
 38 |     version: int
 39 |     metadata_config: Dict[str, Optional[List[str]]]
 40 |     items: int
 41 | 
 42 | 
 43 | @dataclass
 44 | class IndexItem:
 45 |     id: str
 46 |     metadata: Dict[str, Any]
 47 |     vector: List[float]
 48 |     norm: float
 49 |     metadata_file: str = None
 50 | 
 51 | 
 52 | @dataclass
 53 | class MetadataFilter:
 54 |     eq: Union[int, str, bool] = None  # Equal to (number, string, boolean)
 55 |     ne: Union[int, str, bool] = None  # Not equal to (number, string, boolean)
 56 |     gt: int = None  # Greater than (number)
 57 |     gte: int = None  # Greater than or equal to (number)
 58 |     lt: int = None  # Less than (number)
 59 |     lte: int = None  # Less than or equal to (number)
 60 |     _in: List[Union[int, str]] = None  # In array (string or number)
 61 |     nin: List[Union[int, str]] = None  # Not in array (string or number)
 62 |     _and: List['MetadataFilter'] = None  # AND (MetadataFilter[])
 63 |     _or: List['MetadataFilter'] = None  # OR (MetadataFilter[])
 64 |     extra: Dict[str, Any] = None
 65 | 
 66 | 
 67 | @dataclass
 68 | class MetadataTypes:
 69 |     value: Union[int, str, bool]
 70 | 
 71 | 
 72 | @dataclass
 73 | class QueryResult:
 74 |     item: IndexItem
 75 |     score: float
 76 | 
 77 | 
 78 | @dataclass
 79 | class Tokenizer:
 80 |     def decode(self, tokens: List[int]) -> str:
 81 |         pass
 82 | 
 83 |     def encode(self, text: str) -> List[int]:
 84 |         pass
 85 | 
 86 | 
 87 | @dataclass
 88 | class DocumentChunkMetadata:
 89 |     document_id: str
 90 |     start_pos: int
 91 |     end_pos: int
 92 |     extra: Dict[str, Any] = None
 93 | 
 94 | 
 95 | @dataclass
 96 | class DocumentCatalogStats:
 97 |     version: int
 98 |     documents: int
 99 |     chunks: int
100 |     metadata_config: Dict[str, Optional[List[str]]]
101 |     extra: Dict[str, Any] = None
102 | 
103 | 
104 | @dataclass
105 | class DocumentTextSection:
106 |     text: str
107 |     token_count: int
108 |     score: float
109 | 


--------------------------------------------------------------------------------
/src/vectra_py/file_fetcher.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | class FileFetcher:
 5 |     async def fetch(self, uri):
 6 |         # Check if the path exists and whether it's a directory
 7 |         if os.path.exists(uri):
 8 |             if os.path.isdir(uri):
 9 |                 # If it's a directory, read all files and recurse
10 |                 files = os.listdir(uri)
11 |                 for file in files:
12 |                     file_path = os.path.join(uri, file)
13 |                     await self.fetch(file_path)
14 |                 return True
15 |             else:
16 |                 # If it's a file, read its contents
17 |                 with open(uri, 'r', encoding='utf-8') as file:
18 |                     text = file.read()
19 |                     # Determine the document type based on the file extension
20 |                     _, file_extension = os.path.splitext(uri)
21 |                     doc_type = file_extension[1:].lower() if file_extension else None
22 |                     return uri, text, doc_type
23 |         else:
24 |             # Handle the case where the path doesn't exist
25 |             return None
26 | 


--------------------------------------------------------------------------------
/src/vectra_py/gpt3_tokenizer.py:
--------------------------------------------------------------------------------
 1 | import tiktoken
 2 | # from tiktoken import encode, decode
 3 | 
 4 | 
 5 | class GPT3Tokenizer:
 6 |     def __init__(self, model_name: str = "gpt-3.5-turbo"):
 7 |         self.encoding = tiktoken.encoding_for_model(model_name)
 8 | 
 9 |     def decode(self, tokens):
10 |         return self.encoding.decode(tokens)
11 | 
12 |     def encode(self, text):
13 |         return self.encoding.encode(text)
14 | 


--------------------------------------------------------------------------------
/src/vectra_py/item_selector.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | import math
  3 | 
  4 | 
  5 | class ItemSelector:
  6 |     """
  7 |     A class for selecting items based on their similarity.
  8 |     """
  9 |     @staticmethod
 10 |     def cosine_similarity(vector1: List[int],
 11 |                           vector2: List[int]) -> float:
 12 |         """
 13 |         Returns the similarity between two vectors using the cosine similarity.
 14 |         """
 15 |         # the quotient of the dot product and the product of the norms
 16 |         return (ItemSelector.dot_product(vector1, vector2) /
 17 |                 (ItemSelector.normalize(vector1) *
 18 |                  ItemSelector.normalize(vector2)))
 19 | 
 20 |     @staticmethod
 21 |     def normalize(vector: List[int]) -> float:
 22 |         """
 23 |         The norm of a vector is
 24 |             the square root of the sum of the squares of the elements.
 25 |         Returns the normalized value of a vector.
 26 |         """
 27 |         # crutch to santize lists of lists that come from some embedding models
 28 |         # this will almost certainly have consequences
 29 |         if isinstance(vector[0], list):
 30 |             vector = vector[0]
 31 |         # Initialize a variable to store the sum of the squares
 32 |         sum = 0
 33 |         # Loop through the elements of the array
 34 |         for i in range(len(vector)):
 35 |             # Square the element and add it to the sum
 36 |             sum += vector[i] * vector[i]
 37 |         # Return the square root of the sum
 38 |         return math.sqrt(sum)
 39 | 
 40 |     @staticmethod
 41 |     def normalized_cosine_similarity(vector1: List[int],
 42 |                                      norm1: float,
 43 |                                      vector2: List[int],
 44 |                                      norm2: float) -> float:
 45 |         """
 46 |         Returns the similarity between two vectors using the cosine similarity,
 47 |             considers norms.
 48 |         """
 49 |         # Return the quotient of the dot product and the product of the norms
 50 |         return ItemSelector.dot_product(vector1, vector2) / (norm1 * norm2)
 51 | 
 52 |     @staticmethod
 53 |     def select(metadata: dict,
 54 |                filter: dict) -> bool:
 55 |         """
 56 |         Handles filter logic.
 57 |         """
 58 |         if filter is None:
 59 |             return True
 60 |         for key in filter:
 61 |             if key == '$and':
 62 |                 if not all(ItemSelector.select(metadata, f)
 63 |                            for f in filter['$and']):
 64 |                     return False
 65 |             elif key == '$or':
 66 |                 if not any(ItemSelector.select(metadata, f)
 67 |                            for f in filter['$or']):
 68 |                     return False
 69 |             else:
 70 |                 value = filter[key]
 71 |                 if value is None:
 72 |                     return False
 73 |                 elif isinstance(value, dict):
 74 |                     if not ItemSelector.metadataFilter(metadata.get(key),
 75 |                                                        value):
 76 |                         return False
 77 |                 else:
 78 |                     if metadata.get(key) != value:
 79 |                         return False
 80 |         return True
 81 | 
 82 |     @staticmethod
 83 |     def dot_product(vector1: List[int],
 84 |                     vector2: List[int]) -> int:
 85 |         """
 86 |         Returns the dot product of two vectors.
 87 |         """
 88 |         # Zip the two vectors and multiply each pair, then sum the products
 89 |         return sum(a * b for a, b in zip(vector1, vector2))
 90 | 
 91 |     @staticmethod
 92 |     def metadata_filter(value,
 93 |                         filter) -> bool:
 94 |         """
 95 |         Handles metadata filter logic.
 96 |         """
 97 |         if value is None:
 98 |             return False
 99 | 
100 |         for key in filter:
101 |             if key == "$eq":
102 |                 if value != filter[key]:
103 |                     return False
104 |             elif key == "$ne":
105 |                 if value == filter[key]:
106 |                     return False
107 |             elif key == "$gt":
108 |                 if not isinstance(value, int) or value <= filter[key]:
109 |                     return False
110 |             elif key == "$gte":
111 |                 if not isinstance(value, int) or value < filter[key]:
112 |                     return False
113 |             elif key == "$lt":
114 |                 if not isinstance(value, int) or value >= filter[key]:
115 |                     return False
116 |             elif key == "$lte":
117 |                 if not isinstance(value, int) or value > filter[key]:
118 |                     return False
119 |             elif key == "$in":
120 |                 if not isinstance(value, bool) or value not in filter[key]:
121 |                     return False
122 |             elif key == "$nin":
123 |                 if not isinstance(value, bool) or value in filter[key]:
124 |                     return False
125 |             else:
126 |                 if value != filter[key]:
127 |                     return False
128 | 
129 |         return True
130 | 


--------------------------------------------------------------------------------
/src/vectra_py/local_document.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import os
 3 | import json
 4 | 
 5 | 
 6 | class LocalDocument:
 7 |     def __init__(self, folder_path, id, uri):
 8 |         self._folder_path = folder_path
 9 |         self._id = id
10 |         self._uri = uri
11 |         self._metadata = None
12 |         self._text = None
13 | 
14 |     @property
15 |     def folder_path(self):
16 |         return self._folder_path
17 | 
18 |     @property
19 |     def id(self):
20 |         return self._id
21 | 
22 |     @property
23 |     def uri(self):
24 |         return self._uri
25 | 
26 |     async def has_metadata(self):
27 |         try:
28 |             await asyncio.to_thread(os.access, os.path.join(self.folder_path, f"{self.id}.json"), os.R_OK)
29 |             return True
30 |         except Exception as err:
31 |             print(f'Error checking metadata for document "{self.uri}": {str(err)}')
32 |             return False
33 | 
34 |     async def load_metadata(self):
35 |         if self._metadata is None:
36 |             try:
37 |                 with open(os.path.join(self.folder_path, f"{self.id}.json"), 'r') as file:
38 |                     json_str = await asyncio.to_thread(file.read)
39 |                     self._metadata = json.loads(json_str)
40 |             except Exception as err:
41 |                 raise Exception(f'Error reading metadata for document "{self.uri}": {str(err)}')
42 | 
43 |         return self._metadata
44 | 
45 |     async def load_text(self):
46 |         if self._text is None:
47 |             try:
48 |                 with open(os.path.join(self.folder_path, f"{self.id}.txt"), 'r') as file:
49 |                     self._text = await asyncio.to_thread(file.read)
50 |             except Exception as err:
51 |                 raise Exception(f'Error reading text file for document "{self.uri}": {str(err)}')
52 | 
53 |         return self._text
54 | 


--------------------------------------------------------------------------------
/src/vectra_py/local_document_index.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import pathlib
  3 | from pathlib import Path
  4 | import time
  5 | import aiofiles.os
  6 | import json
  7 | import asyncio
  8 | from uuid import uuid4
  9 | from gpt3_tokenizer import GPT3Tokenizer
 10 | from local_index import LocalIndex, CreateIndexConfig
 11 | from text_splitter import TextSplitter, TextSplitterConfig
 12 | from custom_types import (
 13 |     MetadataFilter,
 14 |     EmbeddingsModel,
 15 |     Tokenizer,
 16 |     MetadataTypes,
 17 |     EmbeddingsResponse,
 18 |     QueryResult,
 19 |     DocumentChunkMetadata,
 20 |     DocumentCatalogStats,
 21 | )
 22 | from local_document_result import LocalDocumentResult
 23 | from local_document import LocalDocument
 24 | from typing import Dict, Optional, List, Union
 25 | from dataclasses import dataclass
 26 | 
 27 | 
 28 | @dataclass
 29 | class DocumentQueryOptions:
 30 |     max_documents: Optional[int] = None
 31 |     max_chunks: Optional[int] = None
 32 |     filter: Optional[MetadataFilter] = None
 33 | 
 34 | 
 35 | @dataclass
 36 | class LocalDocumentIndexConfig:
 37 |     folder_path: str
 38 |     tokenizer: Tokenizer
 39 |     embeddings: Optional[EmbeddingsModel] = None
 40 |     chunking_config: Optional[TextSplitterConfig] = None
 41 | 
 42 | 
 43 | @dataclass
 44 | class DocumentCatalog:
 45 |     version: int
 46 |     count: int
 47 |     uri_to_id: Dict[str, str]
 48 |     id_to_uri: Dict[str, str]
 49 | 
 50 | 
 51 | def is_catalog_created():
 52 |     # TODO: pass in appropriate path
 53 |     catalog_path = "/Users/brian/Documents/GitHub/vectra-py/index/catalog.json"
 54 |     exists = os.path.exists(catalog_path)
 55 |     if exists:
 56 |         print(f"exists: {exists}")
 57 |     # time.sleep(1)
 58 |     return exists
 59 | 
 60 | 
 61 | class LocalDocumentIndex(LocalIndex):
 62 |     def __init__(self, doc_index_config: LocalDocumentIndexConfig):
 63 |         super().__init__(doc_index_config.folder_path)
 64 |         self._embeddings = doc_index_config.embeddings
 65 |         self._chunking_config = {
 66 |             "keep_separators": True,
 67 |             "chunk_size": 512,
 68 |             "chunk_overlap": 0,
 69 |             **(doc_index_config.chunking_config or {}),
 70 |         }
 71 |         self._tokenizer = doc_index_config.tokenizer or self._chunking_config.get("tokenizer") or GPT3Tokenizer()
 72 |         self._chunking_config["tokenizer"] = self._tokenizer
 73 |         self._catalog = None
 74 |         self._new_catalog = None
 75 | 
 76 |     async def get_document_id(self, uri: str) -> Optional[str]:
 77 |         await self.load_index_data()
 78 |         return self._catalog["uri_to_id"].get(uri)
 79 | 
 80 |     async def get_document_uri(self, document_id: str) -> Optional[str]:
 81 |         await self.load_index_data()
 82 |         return self._catalog.id_to_uri.get(document_id)
 83 | 
 84 |     async def create_index(self, config: Optional[CreateIndexConfig] = None) -> None:
 85 |         await super().create_index(config)
 86 |         await self.load_index_data()
 87 | 
 88 |     async def delete_document(self, uri: str) -> None:
 89 |         document_id = await self.get_document_id(uri)
 90 |         if document_id is None:
 91 |             return
 92 | 
 93 |         await self.begin_update()
 94 |         try:
 95 |             chunks = await self.list_items_by_metadata(DocumentChunkMetadata(document_id=document_id))
 96 |             for chunk in chunks:
 97 |                 await self.deleteItem(chunk.id)
 98 | 
 99 |             del self._new_catalog.uri_to_id[uri]
100 |             del self._new_catalog.id_to_uri[document_id]
101 |             self._new_catalog.count -= 1
102 | 
103 |             await self.end_update()
104 |         except Exception as err:
105 |             self.cancel_update()
106 |             raise Exception(f'Error deleting document "{uri}": {str(err)}')
107 | 
108 |         try:
109 |             os.unlink(os.path.join(self.folder_path, f'{document_id}.txt'))
110 |         except Exception as err:
111 |             raise Exception(f'Error removing text file for document "{uri}" from disk: {str(err)}')
112 | 
113 |         try:
114 |             os.unlink(os.path.join(self.folder_path, f'{document_id}.json'))
115 |         except Exception as err:
116 |             raise Exception(f'Error removing json metadata file for document "{uri}" from disk: {str(err)}')
117 | 
118 |     async def get_catalog_stats(self) -> DocumentCatalogStats:
119 |         stats = await self.getIndexStats()
120 |         return DocumentCatalogStats(
121 |             version=self._catalog.version,
122 |             documents=self._catalog.count,
123 |             chunks=stats.items,
124 |             metadata_config=stats.metadata_config,
125 |         )
126 | 
127 |     async def upsert_document(
128 |         self,
129 |         uri: str,
130 |         text: str,
131 |         doc_type: Optional[str] = None,
132 |         metadata: Optional[Dict[str, MetadataTypes]] = None
133 |     ) -> LocalDocument:
134 |         if not self._embeddings:
135 |             raise Exception('Embeddings model not configured.')
136 | 
137 |         document_id = await self.get_document_id(uri)
138 |         if document_id is not None:
139 |             await self.delete_document(uri)
140 |         else:
141 |             document_id = str(uuid4())
142 | 
143 |         config = {
144 |             **(self._chunking_config or {}),
145 |             "doc_type": doc_type or self._chunking_config.get("doc_type"),
146 |         }
147 | 
148 |         if config["doc_type"] is None:
149 |             pos = uri.rfind('.')
150 |             if pos >= 0:
151 |                 ext = uri[pos + 1:].lower()
152 |                 config["doc_type"] = ext
153 | 
154 |         splitter = TextSplitter(config)
155 |         chunks = splitter.split(text)
156 |         total_tokens = 0
157 |         chunk_batches = []
158 |         current_batch = []
159 | 
160 |         for chunk in chunks:
161 |             total_tokens += len(chunk.tokens)
162 | 
163 |             if total_tokens > self._embeddings.max_tokens:
164 |                 chunk_batches.append(current_batch)
165 |                 current_batch = []
166 |                 total_tokens = len(chunk.tokens)
167 | 
168 |             current_batch.append(chunk.text.replace('\n', ' '))
169 | 
170 |         if current_batch:
171 |             chunk_batches.append(current_batch)
172 | 
173 |         embeddings = []
174 | 
175 |         for batch in chunk_batches:
176 |             try:
177 |                 response = await self._embeddings.create_embeddings(batch)
178 |             except Exception as err:
179 |                 raise Exception(f'Error generating embeddings: {str(err)}')
180 | 
181 |             if response.status != 'success':
182 |                 raise Exception(f'Error generating embeddings: {response.message}')
183 | 
184 |             embeddings.extend(response.output or [])
185 | 
186 |         await self.begin_update()
187 |         try:
188 |             for i, chunk in enumerate(chunks):
189 |                 embedding = embeddings[i]
190 |                 chunk_metadata = {
191 |                     "document_id": document_id,
192 |                     "start_pos": chunk.start_pos,
193 |                     "end_pos": chunk.end_pos,
194 |                     **(metadata or {}),
195 |                 }
196 |                 await self.insert_item(
197 |                     {
198 |                         "id": str(uuid4()),
199 |                         "metadata": chunk_metadata,
200 |                         "vector": embedding,
201 |                     }
202 |                 )
203 |             if metadata:
204 |                 with open(os.path.join(self.folder_path, f'{document_id}.json'), 'w') as metadata_file:
205 |                     json.dump(metadata, metadata_file)
206 | 
207 |             with open(os.path.join(self.folder_path, f'{document_id}.txt'), 'w') as text_file:
208 |                 text_file.write(text)
209 | 
210 |             self._new_catalog['uri_to_id'][uri] = document_id
211 |             self._new_catalog['id_to_uri'][document_id] = uri
212 |             self._new_catalog['count'] += 1
213 | 
214 |             await self.end_update()
215 |         except Exception as err:
216 |             self.cancel_update()
217 |             raise Exception(f'Error adding document "{uri}": {str(err)}')
218 | 
219 |         return LocalDocument(self.folder_path, document_id, uri)
220 | 
221 |     async def query_documents(self, query: str, options: DocumentQueryOptions = None) -> List[LocalDocumentResult]:
222 |         if not self._embeddings:
223 |             raise Exception('Embeddings model not configured.')
224 | 
225 |         options = options or DocumentQueryOptions(max_documents=10, max_chunks=50)
226 | 
227 |         try:
228 |             embeddings = await self._embeddings.create_embeddings(query.replace('\n', ' '))
229 |         except Exception as err:
230 |             raise Exception(f'Error generating embeddings for query: {str(err)}')
231 | 
232 |         if embeddings.status != 'success':
233 |             raise Exception(f'Error generating embeddings for query: {embeddings.message}')
234 | 
235 |         results = await self.query_items(embeddings.output[0], options.max_chunks, options.filter)
236 |         document_chunks = {}
237 | 
238 |         for result in results:
239 |             metadata = result.item.metadata
240 | 
241 |             if metadata.document_id not in document_chunks:
242 |                 document_chunks[metadata.document_id] = []
243 | 
244 |             document_chunks[metadata.document_id].append(result)
245 | 
246 |         document_results = []
247 | 
248 |         for document_id, chunks in document_chunks.items():
249 |             uri = await self.get_document_uri(document_id)
250 |             document_result = LocalDocumentResult(self.folder_path, document_id, uri, chunks, self._tokenizer)
251 |             document_results.append(document_result)
252 | 
253 |         document_results.sort(key=lambda x: x.score, reverse=True)
254 |         return document_results[:options.max_documents]
255 | 
256 |     async def begin_update(self):
257 |         await super().begin_update()
258 |         self._new_catalog = self._catalog.copy()
259 | 
260 |     def cancel_update(self):
261 |         super().cancel_update()
262 |         self._new_catalog = None
263 | 
264 |     async def end_update(self):
265 |         await super().end_update()
266 | 
267 |         try:
268 |             # Save catalog
269 |             catalog_path = os.path.join(self.folder_path, 'catalog.json')
270 |             with open(catalog_path, 'w') as catalog_file:
271 |                 json.dump(self._new_catalog, catalog_file)
272 |             self._catalog = self._new_catalog
273 |             self._new_catalog = None
274 |         except Exception as err:
275 |             raise Exception(f'Error saving document catalog: {str(err)}')
276 | 
277 |     async def load_index_data(self):
278 |         await super().load_index_data()
279 | 
280 |         if self._catalog:
281 |             return
282 | 
283 |         catalog_path = os.path.join(self.folder_path, 'catalog.json')
284 |         thread_test = await asyncio.gather(
285 |                                             asyncio.to_thread(is_catalog_created),
286 |                                             asyncio.sleep(1)
287 |                                             )
288 |         if is_catalog_created():
289 |             # Load catalog
290 |             async with aiofiles.open(catalog_path, 'r') as catalog_file:
291 |                 contents = await catalog_file.read()
292 |                 self._catalog = json.loads(contents)
293 |         else:
294 |             try:
295 |                 # Initialize catalog
296 |                 self._catalog = {
297 |                     'version': 1,
298 |                     'count': 0,
299 |                     'uri_to_id': {},
300 |                     'id_to_uri': {},
301 |                 }
302 |                 with open(catalog_path, 'w') as catalog_file:
303 |                     json.dump(self._catalog, catalog_file)
304 |             except Exception as err:
305 |                 raise Exception(f'Error creating document catalog: {str(err)}')
306 | 


--------------------------------------------------------------------------------
/src/vectra_py/local_document_result.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | from local_document import LocalDocument
  3 | from custom_types import QueryResult, DocumentChunkMetadata, Tokenizer, DocumentTextSection
  4 | 
  5 | 
  6 | class LocalDocumentResult(LocalDocument):
  7 |     def __init__(self, folder_path: str, id: str, uri: str, chunks, tokenizer: Tokenizer):  # List[QueryResult[DocumentChunkMetadata]]
  8 |         super().__init__(folder_path, id, uri)
  9 |         self._chunks = chunks
 10 |         self._tokenizer = tokenizer
 11 | 
 12 |         # Compute average score
 13 |         score = 0
 14 |         for chunk in self._chunks:
 15 |             score += chunk.score
 16 |         self._score = score / len(self._chunks)
 17 | 
 18 |     @property
 19 |     def chunks(self):  # -> List[QueryResult[DocumentChunkMetadata]]
 20 |         return self._chunks
 21 | 
 22 |     @property
 23 |     def score(self) -> float:
 24 |         return self._score
 25 | 
 26 |     async def render_sections(self, max_tokens: int, max_sections: int) -> List[DocumentTextSection]:
 27 |         # Load text from disk
 28 |         text = await self.load_text()
 29 | 
 30 |         # First check to see if the entire document is less than max_tokens
 31 |         tokens = self._tokenizer.encode(text)
 32 |         if len(tokens) < max_tokens:
 33 |             return [{
 34 |                 "text": text,
 35 |                 "token_count": len(tokens),
 36 |                 "score": 1.0
 37 |             }]
 38 | 
 39 |         # Otherwise, we need to split the document into sections
 40 |         # - Add each chunk to a temp array and filter out any chunk that's longer than max_tokens.
 41 |         # - Sort the array by start_pos to arrange chunks in document order.
 42 |         # - Generate a new array of sections by combining chunks until the max_tokens is reached for each section.
 43 |         # - Generate an aggregate score for each section by averaging the score of each chunk in the section.
 44 |         # - Sort the sections by score and limit to max_sections.
 45 |         # - For each remaining section, combine adjacent chunks of text.
 46 |         # - Dynamically add overlapping chunks of text to each section until the max_tokens is reached.
 47 |         chunks = []
 48 |         for chunk in self._chunks:
 49 |             start_pos = chunk.item.metadata.start_pos
 50 |             end_pos = chunk.item.metadata.end_pos
 51 |             chunk_text = text[start_pos:end_pos + 1]
 52 |             chunk_tokens = self._tokenizer.encode(chunk_text)
 53 |             if len(chunk_tokens) <= max_tokens:
 54 |                 chunks.append({
 55 |                     "text": chunk_text,
 56 |                     "start_pos": start_pos,
 57 |                     "end_pos": end_pos,
 58 |                     "score": chunk.score,
 59 |                     "token_count": len(chunk_tokens)
 60 |                 })
 61 | 
 62 |         chunks.sort(key=lambda x: x["start_pos"])
 63 | 
 64 |         if not chunks:
 65 |             # Take the top chunk and return a subset of its text
 66 |             top_chunk = self._chunks[0]
 67 |             start_pos = top_chunk.item.metadata.start_pos
 68 |             end_pos = top_chunk.item.metadata.end_pos
 69 |             chunk_text = text[start_pos:end_pos + 1]
 70 |             tokens = self._tokenizer.encode(chunk_text)
 71 |             return [{
 72 |                 "text": self._tokenizer.decode(tokens[:max_tokens]),
 73 |                 "token_count": max_tokens,
 74 |                 "score": top_chunk.score
 75 |             }]
 76 | 
 77 |         sections = []
 78 |         current_section = {
 79 |             "chunks": [],
 80 |             "score": 0,
 81 |             "token_count": 0
 82 |         }
 83 | 
 84 |         for chunk in chunks:
 85 |             if current_section["token_count"] + chunk["token_count"] > max_tokens:
 86 |                 sections.append(current_section.copy())
 87 |                 current_section = {
 88 |                     "chunks": [],
 89 |                     "score": 0,
 90 |                     "token_count": 0
 91 |                 }
 92 |             current_section["chunks"].append(chunk)
 93 |             current_section["score"] += chunk["score"]
 94 |             current_section["token_count"] += chunk["token_count"]
 95 | 
 96 |         # Normalize section scores
 97 |         for section in sections:
 98 |             section["score"] /= len(section["chunks"])
 99 | 
100 |         # Sort sections by score and limit to max_sections
101 |         sections.sort(key=lambda x: x["score"], reverse=True)
102 |         if len(sections) > max_sections:
103 |             sections = sections[:max_sections]
104 | 
105 |         # Combine adjacent chunks of text
106 |         for section in sections:
107 |             i = 0
108 |             while i < len(section["chunks"]) - 1:
109 |                 chunk = section["chunks"][i]
110 |                 next_chunk = section["chunks"][i + 1]
111 |                 if chunk["end_pos"] + 1 == next_chunk["start_pos"]:
112 |                     chunk["text"] += next_chunk["text"]
113 |                     chunk["end_pos"] = next_chunk["end_pos"]
114 |                     chunk["token_count"] += next_chunk["token_count"]
115 |                     section["chunks"].pop(i + 1)
116 |                 else:
117 |                     i += 1
118 | 
119 |         # Add overlapping chunks of text to each section until the max_tokens is reached
120 |         connector = {
121 |             "text": '\n\n...\n\n',
122 |             "start_pos": -1,
123 |             "end_pos": -1,
124 |             "score": 0,
125 |             "token_count": self._tokenizer.encode('\n\n...\n\n')
126 |         }
127 | 
128 |         for section in sections:
129 |             # Insert connectors between chunks
130 |             if len(section["chunks"]) > 1:
131 |                 i = 0
132 |                 while i < len(section["chunks"]) - 1:
133 |                     section["chunks"].insert(i + 1, connector)
134 |                     section["token_count"] += connector["token_count"]
135 |                     i += 2
136 | 
137 |             # Add chunks to the beginning and end of the section until max_tokens is reached
138 |             budget = max_tokens - section["token_count"]
139 |             if budget > 40:
140 |                 section_start = section["chunks"][0]["start_pos"]
141 |                 section_end = section["chunks"][-1]["end_pos"]
142 |                 if section_start > 0:
143 |                     before_text = text[:section_start]
144 |                     before_tokens = self._tokenizer.encode(before_text)
145 |                     before_budget = min(len(before_tokens), budget // 2)
146 |                     chunk = {
147 |                         "text": self._tokenizer.decode(before_tokens[-before_budget:]),
148 |                         "start_pos": section_start - before_budget,
149 |                         "end_pos": section_start - 1,
150 |                         "score": 0,
151 |                         "token_count": before_budget
152 |                     }
153 |                     section["chunks"].insert(0, chunk)
154 |                     section["token_count"] += chunk["token_count"]
155 |                     budget -= chunk["token_count"]
156 | 
157 |                 if section_end < len(text) - 1:
158 |                     after_text = text[section_end + 1:]
159 |                     after_tokens = self._tokenizer.encode(after_text)
160 |                     after_budget = min(len(after_tokens), budget)
161 |                     chunk = {
162 |                         "text": self._tokenizer.decode(after_tokens[:after_budget]),
163 |                         "start_pos": section_end + 1,
164 |                         "end_pos": section_end + after_budget,
165 |                         "score": 0,
166 |                         "token_count": after_budget
167 |                     }
168 |                     section["chunks"].append(chunk)
169 |                     section["token_count"] += chunk["token_count"]
170 |                     budget -= chunk["token_count"]
171 | 
172 |         # Return final rendered sections
173 |         rendered_sections = []
174 |         for section in sections:
175 |             text = ''
176 |             for chunk in section["chunks"]:
177 |                 text += chunk["text"]
178 |             rendered_sections.append({
179 |                 "text": text,
180 |                 "token_count": section["token_count"],
181 |                 "score": section["score"]
182 |             })
183 |         return rendered_sections
184 | 


--------------------------------------------------------------------------------
/src/vectra_py/local_index.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | import json
  4 | from uuid import uuid4
  5 | from typing import List, Optional, Dict, Union, Any
  6 | from item_selector import ItemSelector
  7 | from custom_types import IndexItem, IndexStats, MetadataFilter, MetadataTypes, QueryResult
  8 | 
  9 | 
 10 | class CreateIndexConfig:
 11 |     def __init__(self, version: int, delete_if_exists: bool = False, metadata_config: Dict = {}):
 12 |         self.version = version
 13 |         self.delete_if_exists = delete_if_exists
 14 |         self.metadata_config = metadata_config
 15 | 
 16 | 
 17 | class LocalIndex:
 18 |     def __init__(self, folder_path: str, index_name: Optional[str] = None):
 19 |         self._folder_path = folder_path
 20 |         self._index_name = index_name or "index.json"
 21 |         self._data = None
 22 |         self._update = None
 23 | 
 24 |     @property
 25 |     def folder_path(self) -> str:
 26 |         return self._folder_path
 27 | 
 28 |     @property
 29 |     def index_name(self) -> str:
 30 |         return self._index_name
 31 | 
 32 |     async def begin_update(self) -> None:
 33 |         if self._update:
 34 |             raise ValueError('Update already in progress')
 35 | 
 36 |         await self.load_index_data()
 37 |         self._update = self._data.copy()
 38 | 
 39 |     def cancel_update(self) -> None:
 40 |         self._update = None
 41 | 
 42 |     async def create_index(self, config: CreateIndexConfig = CreateIndexConfig(version=1)) -> None:
 43 |         if self.is_index_created():
 44 |             if config.delete_if_exists:
 45 |                 await self.delete_index()
 46 |             else:
 47 |                 raise ValueError('Index already exists')
 48 |         try:
 49 |             os.mkdir(self._folder_path)
 50 |             self._data = {
 51 |                 "version": config.version,
 52 |                 "metadata_config": config.metadata_config,
 53 |                 "items": []
 54 |             }
 55 |             with open(os.path.join(self._folder_path, self._index_name), 'w') as index_file:
 56 |                 json.dump(self._data, index_file)
 57 |         except Exception:
 58 |             await self.delete_index()
 59 |             raise ValueError('Error creating index')
 60 | 
 61 |     async def delete_index(self) -> None:
 62 |         self._data = None
 63 |         try:
 64 |             shutil.rmtree(self._folder_path)
 65 |         except Exception as err:
 66 |             print(err)
 67 | 
 68 |     async def delete_item(self, id: str) -> None:
 69 |         if self._update:
 70 |             index = next((i for i, item in enumerate(self._update["items"]) if item["id"] == id), None)
 71 |             if index is not None:
 72 |                 self._update["items"].pop(index)
 73 |         else:
 74 |             await self.begin_update()
 75 |             index = next((i for i, item in enumerate(self._update["items"]) if item["id"] == id), None)
 76 |             if index is not None:
 77 |                 self._update["items"].pop(index)
 78 |             await self.end_update()
 79 | 
 80 |     async def end_update(self) -> None:
 81 |         if not self._update:
 82 |             raise ValueError('No update in progress')
 83 | 
 84 |         try:
 85 |             with open(os.path.join(self._folder_path, self._index_name), 'w') as index_file:
 86 |                 json.dump(self._update, index_file)
 87 |             self._data = self._update.copy()
 88 |             self._update = None
 89 |         except Exception as err:
 90 |             raise ValueError(f'Error saving index: {str(err)}')
 91 | 
 92 |     async def get_index_stats(self) -> IndexStats:
 93 |         await self.load_index_data()
 94 |         return {
 95 |             "version": self._data["version"],
 96 |             "metadata_config": self._data["metadata_config"],
 97 |             "items": len(self._data["items"])
 98 |         }
 99 | 
100 |     async def get_item(self, id: str) -> Optional[IndexItem]:
101 |         await self.load_index_data()
102 |         item = next((item for item in self._data["items"] if item["id"] == id), None)
103 |         return item
104 | 
105 |     async def insert_item(self, item: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
106 |         if self._update:
107 |             return await self.add_item_to_update(item, True)
108 |         else:
109 |             await self.begin_update()
110 |             new_item = await self.add_item_to_update(item, True)
111 |             await self.end_update()
112 |             return new_item
113 | 
114 |     def is_index_created(self) -> bool:
115 |         return os.path.exists(os.path.join(self._folder_path, self._index_name))
116 | 
117 |     async def list_items(self) -> List[IndexItem]:
118 |         await self.load_index_data()
119 |         return self._data["items"][:]
120 | 
121 |     async def list_items_by_metadata(self, filter: MetadataFilter) -> List[IndexItem]:
122 |         await self.load_index_data()
123 |         return [item for item in self._data["items"] if ItemSelector.select(item["metadata"], filter)]
124 | 
125 |     async def query_items(self,
126 |                           vector: List[float],
127 |                           top_k: int,
128 |                           filter: Optional[MetadataFilter] = None) -> List[QueryResult]:
129 |         await self.load_index_data()
130 | 
131 |         items = self._data["items"][:]
132 |         if filter:
133 |             items = [item for item in items if ItemSelector.select(item["metadata"], filter)]
134 | 
135 |         norm = ItemSelector.normalize(vector)
136 |         distances = []
137 |         for i, item in enumerate(items):
138 |             distance = ItemSelector.normalized_cosine_similarity(vector, norm, item["vector"], item["norm"])
139 |             distances.append({"index": i, "distance": distance})
140 | 
141 |         distances.sort(key=lambda x: x["distance"], reverse=True)
142 |         top_items = distances[:top_k]
143 | 
144 |         for item in top_items:
145 |             if "metadataFile" in items[item["index"]]:
146 |                 metadata_path = os.path.join(self._folder_path, items[item["index"]]["metadataFile"])
147 |                 with open(metadata_path, 'r') as metadata_file:
148 |                     items[item["index"]]["metadata"] = json.load(metadata_file)
149 | 
150 |         return [{"item": items[item["index"]], "score": item["distance"]} for item in top_items]
151 | 
152 |     async def upsert_item(self, item: Optional[Dict[str, Any]] = None) -> IndexItem:
153 |         if self._update:
154 |             return await self.add_item_to_update(item, False)
155 |         else:
156 |             await self.begin_update()
157 |             new_item = await self.add_item_to_update(item, False)
158 |             await self.end_update()
159 |             return new_item
160 | 
161 |     async def load_index_data(self) -> None:
162 |         if self._data:
163 |             return
164 | 
165 |         if not self.is_index_created():
166 |             raise ValueError('Index does not exist')
167 | 
168 |         try:
169 |             with open(os.path.join(self._folder_path, self._index_name), 'r') as index_file:
170 |                 self._data = json.load(index_file)
171 |         except Exception:
172 |             raise ValueError('Error loading index data')
173 | 
174 |     async def add_item_to_update(self, item: Optional[Dict[str, Any]], unique: bool) -> IndexItem:
175 |         if "vector" not in item:
176 |             raise ValueError('Vector is required')
177 | 
178 |         item_id = item.get("id") or str(uuid4())
179 |         if unique:
180 |             existing_item = next((i for i in self._update["items"] if i["id"] == item_id), None)
181 |             if existing_item:
182 |                 raise ValueError(f'Item with id {item_id} already exists')
183 | 
184 |         metadata = {}
185 |         metadata_file = None
186 |         if (
187 |             "metadata" in item
188 |             and self._update["metadata_config"].get("indexed")
189 |             and len(self._update["metadata_config"]["indexed"]) > 0
190 |         ):
191 |             for key in self._update["metadata_config"]["indexed"]:
192 |                 if key in item["metadata"]:
193 |                     metadata[key] = item["metadata"][key]
194 |             if item.get("metadata"):
195 |                 metadata_file = f'{str(uuid4())}.json'
196 |                 metadata_path = os.path.join(self._folder_path, metadata_file)
197 |                 with open(metadata_path, 'w') as metadata_file:
198 |                     json.dump(item["metadata"], metadata_file)
199 |         elif item.get("metadata"):
200 |             metadata = item["metadata"]
201 |         # print('local index, after metadata')
202 |         # print('item vector type and len', type(item["vector"]), len(item["vector"]))
203 |         # print('item vector chunk inspection', item["vector"][0])
204 |         try:
205 |             new_item = {
206 |                 "id": item_id,
207 |                 "metadata": metadata,
208 |                 "vector": item["vector"],
209 |                 "norm": ItemSelector.normalize(item["vector"])
210 |             }
211 |         except Exception as e:
212 |             raise ValueError(f'Error creating item: {e}')
213 |         if metadata_file:
214 |             new_item["metadataFile"] = metadata_file
215 | 
216 |         if not unique:
217 |             existing_item = next((i for i in self._update["items"] if i["id"] == item_id), None)
218 |             if existing_item:
219 |                 existing_item.update(new_item)
220 |                 return existing_item
221 | 
222 |         self._update["items"].append(new_item)
223 |         return new_item
224 | 


--------------------------------------------------------------------------------
/src/vectra_py/openai_embeddings.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import requests
  3 | from typing import List, Union, Dict
  4 | 
  5 | 
  6 | class BaseOpenAIEmbeddingsOptions:
  7 |     def __init__(self, retry_policy: List[int] = None, request_config: Dict = None):
  8 |         self.retry_policy = retry_policy if retry_policy else [2000, 5000]
  9 |         self.request_config = request_config if request_config else {}
 10 | 
 11 | 
 12 | class OpenAIEmbeddingsOptions(BaseOpenAIEmbeddingsOptions):
 13 |     def __init__(
 14 |         self,
 15 |         api_key: str,
 16 |         model: str,
 17 |         organization: str = None,
 18 |         endpoint: str = None,
 19 |         **kwargs
 20 |     ):
 21 |         super().__init__(**kwargs)
 22 |         self.api_key = api_key
 23 |         self.model = model
 24 |         self.organization = organization
 25 |         self.endpoint = endpoint
 26 | 
 27 | 
 28 | class AzureOpenAIEmbeddingsOptions(BaseOpenAIEmbeddingsOptions):
 29 |     def __init__(
 30 |         self,
 31 |         azure_api_key: str,
 32 |         azure_endpoint: str,
 33 |         azure_deployment: str,
 34 |         azure_api_version: str = "2023-05-15",
 35 |         **kwargs
 36 |     ):
 37 |         super().__init__(**kwargs)
 38 |         self.azure_api_key = azure_api_key
 39 |         self.azure_endpoint = azure_endpoint
 40 |         self.azure_deployment = azure_deployment
 41 |         self.azure_api_version = azure_api_version
 42 | 
 43 | 
 44 | class EmbeddingsResponse:
 45 |     def __init__(self, status: str, output: List[float] = None, message: str = None):
 46 |         self.status = status
 47 |         self.output = output
 48 |         self.message = message
 49 | 
 50 | 
 51 | class CreateEmbeddingRequest:
 52 |     def __init__(self, input: Union[str, List[str]]):
 53 |         self.input = input
 54 | 
 55 | 
 56 | class CreateEmbeddingResponse:
 57 |     def __init__(self, data: List[Dict], model: str, usage: Dict):
 58 |         self.data = data
 59 |         self.model = model
 60 |         self.usage = usage
 61 | 
 62 | 
 63 | class OpenAIEmbeddings:
 64 |     def __init__(self, options: Union[OpenAIEmbeddingsOptions, AzureOpenAIEmbeddingsOptions]):
 65 |         self._use_azure = isinstance(options, AzureOpenAIEmbeddingsOptions)
 66 |         self.options = options
 67 |         self.user_agent = "AlphaWave"
 68 | 
 69 |     @property
 70 |     def max_tokens(self):
 71 |         return 8000
 72 | 
 73 |     async def create_embeddings(self, inputs: Union[str, List[str]]) -> EmbeddingsResponse:
 74 |         response = await self.create_embedding_request({"input": inputs})
 75 |         # convert the response.text to json
 76 |         json_response = response.json()
 77 |         data = response.json().get('data')
 78 |         if response.status_code < 300:
 79 |             return EmbeddingsResponse(
 80 |                 status="success",
 81 |                 output=[item["embedding"] for item in data],
 82 |                 message={"model": json_response.get('model'),
 83 |                          "usage": json_response.get('usage')}
 84 |             )
 85 |         elif response.status_code == 429:
 86 |             return EmbeddingsResponse(
 87 |                 status="rate_limited",
 88 |                 output=None,
 89 |                 message="The embeddings API returned a rate limit error.",
 90 |             )
 91 |         else:
 92 |             return EmbeddingsResponse(
 93 |                 status="error",
 94 |                 output=None,
 95 |                 message=f"The embeddings API returned an error status of {response.status_code}: {response.statusText}",
 96 |             )
 97 | 
 98 |     async def create_embedding_request(self, request: CreateEmbeddingRequest):
 99 |         if self._use_azure:
100 |             options = self.options
101 |             url = f"{options.azure_endpoint}/openai/deployments/{options.azure_deployment}/embeddings?api-version={options.azure_api_version}"
102 |             return self.post(url, request)
103 |         else:
104 |             options = self.options
105 |             url = f"{options.endpoint or 'https://api.openai.com'}/v1/embeddings"
106 |             request['model'] = options.model
107 |             test = await self.post(url, request, retry_count=0)
108 |             return test
109 | 
110 |     async def post(self, url: str, body: Dict, retry_count: int = 0):
111 |         request_config = dict(self.options.request_config)
112 | 
113 |         request_headers = request_config.setdefault("headers", {})
114 |         request_headers.setdefault("Content-Type", "application/json")
115 |         request_headers.setdefault("User-Agent", self.user_agent)
116 | 
117 |         if self._use_azure:
118 |             options = self.options
119 |             request_headers["api-key"] = options.azure_api_key
120 |         else:
121 |             options = self.options
122 |             request_headers["Authorization"] = f"Bearer {options.api_key}"
123 |             if options.organization:
124 |                 request_headers["OpenAI-Organization"] = options.organization
125 | 
126 |         response = requests.post(url, json=body, **request_config)
127 | 
128 |         if response.status_code == 429 and isinstance(self.options.retry_policy, list) and retry_count < len(self.options.retry_policy):
129 |             delay = self.options.retry_policy[retry_count]
130 |             await asyncio.sleep(delay / 1000)
131 |             return await self.post(url, body, retry_count + 1)
132 |         else:
133 |             return response
134 | 


--------------------------------------------------------------------------------
/src/vectra_py/oss_embeddings.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import requests
  3 | from typing import List, Union, Dict
  4 | from all_MiniLM_L6_v2_tokenizer import OSSTokenizer
  5 | 
  6 | 
  7 | class BaseOSSEmbeddingsOptions:
  8 |     def __init__(self, retry_policy: List[int] = None, request_config: Dict = None):
  9 |         self.retry_policy = retry_policy if retry_policy else [2000, 5000]
 10 |         self.request_config = request_config if request_config else {}
 11 | 
 12 | 
 13 | class OSSEmbeddingsOptions(BaseOSSEmbeddingsOptions):
 14 |     def __init__(
 15 |         self,
 16 |         model: str,
 17 |         tokenizer: OSSTokenizer,
 18 |         **kwargs
 19 |     ):
 20 |         super().__init__(**kwargs)
 21 |         self.tokenizer = OSSTokenizer(model_name=model)
 22 |         self.model = model
 23 | 
 24 | 
 25 | class EmbeddingsResponse:
 26 |     def __init__(self, status: str, output: List[float] = None, message: str = None):
 27 |         self.status = status
 28 |         self.output = output
 29 |         self.message = message
 30 | 
 31 | 
 32 | class CreateEmbeddingRequest:
 33 |     def __init__(self, input: Union[str, List[str]]):
 34 |         self.input = input
 35 | 
 36 | 
 37 | class CreateEmbeddingResponse:
 38 |     def __init__(self, data: List[Dict], model: str, usage: Dict):
 39 |         self.data = data
 40 |         self.model = model
 41 |         self.usage = usage
 42 | 
 43 | 
 44 | class OSSEmbeddings:
 45 |     def __init__(self, options: OSSEmbeddingsOptions):
 46 |         self._local = True  # use a locally stored model
 47 |         self.options = options
 48 |         self.model = options.model
 49 |         self.tokenizer = options.tokenizer
 50 |         # self.user_agent = "AlphaWave"
 51 | 
 52 |     @property
 53 |     def max_tokens(self):
 54 |         return 8000
 55 |     
 56 |     async def create_embeddings(self, inputs: Union[str, List[str]]) -> EmbeddingsResponse:
 57 |         # create embeddings from the local model
 58 |         try:
 59 |             data = [self.options.tokenizer.encode(item) for item in inputs]
 60 |             return EmbeddingsResponse(
 61 |                 status="success",
 62 |                 output=data,
 63 |                 message={"model": self.model,
 64 |                          "usage": 'unknown'}
 65 |             )
 66 |         except Exception as e:
 67 |             print('OSS encoding error', e)
 68 |             return EmbeddingsResponse(
 69 |                 status="error",
 70 |                 output=None,
 71 |                 message=f"Encoding error: {e}",
 72 |             )
 73 | 
 74 | 
 75 |     # async def create_embeddings(self, inputs: Union[str, List[str]]) -> EmbeddingsResponse:
 76 |     #     # print('openai create_embeddings', inputs)
 77 |     #     print('create_embeddings', OSEmbeddingsOptions.__dict__)
 78 |     #     response = await self.create_embedding_request({"input": inputs})
 79 |     #     # convert the response.text to json
 80 |     #     json_response = response.json()
 81 |     #     data = response.json().get('data')
 82 |     #     if response.status_code < 300:
 83 |     #         return EmbeddingsResponse(
 84 |     #             status="success",
 85 |     #             output=[item["embedding"] for item in data],
 86 |     #             message={"model": json_response.get('model'),
 87 |     #                      "usage": json_response.get('usage')}
 88 |     #         )
 89 |     #     elif response.status_code == 429:
 90 |     #         return EmbeddingsResponse(
 91 |     #             status="rate_limited",
 92 |     #             output=None,
 93 |     #             message="The embeddings API returned a rate limit error.",
 94 |     #         )
 95 |     #     else:
 96 |     #         return EmbeddingsResponse(
 97 |     #             status="error",
 98 |     #             output=None,
 99 |     #             message=f"The embeddings API returned an error status of {response.status_code}: {response.statusText}",
100 |     #         )
101 | 
102 |     # async def create_embedding_request(self, request: CreateEmbeddingRequest):
103 |     #     # print('openai create_embedding_request', request)
104 |     #     if self._use_azure:
105 |     #         options = self.options
106 |     #         url = f"{options.azure_endpoint}/openai/deployments/{options.azure_deployment}/embeddings?api-version={options.azure_api_version}"
107 |     #         return self.post(url, request)
108 |     #     else:
109 |     #         # print('else', self.options.__dict__)
110 |     #         options = self.options
111 |     #         # print('openai create_embedding_request else', options.__dict__)
112 |     #         url = f"{options.endpoint or 'https://api.openai.com'}/v1/embeddings"
113 |     #         # print('---------------openai create_embedding_request else request', request.keys())
114 |     #         # print('---------------openai create_embedding_request else, options', options.model)
115 |     #         request['model'] = options.model
116 |     #         # print('zaza')
117 |     #         # print(options.model)
118 |     #         test = await self.post(url, request, retry_count=0)
119 |     #         return test
120 | 
121 |     # async def post(self, url: str, body: Dict, retry_count: int = 0):
122 |     #     # print('openai post', url, body, retry_count)
123 |     #     request_config = dict(self.options.request_config)
124 | 
125 |     #     request_headers = request_config.setdefault("headers", {})
126 |     #     request_headers.setdefault("Content-Type", "application/json")
127 |     #     request_headers.setdefault("User-Agent", self.user_agent)
128 | 
129 |     #     if self._use_azure:
130 |     #         options = self.options
131 |     #         request_headers["api-key"] = options.azure_api_key
132 |     #     else:
133 |     #         options = self.options
134 |     #         request_headers["Authorization"] = f"Bearer {options.api_key}"
135 |     #         if options.organization:
136 |     #             request_headers["OpenAI-Organization"] = options.organization
137 | 
138 |     #     response = requests.post(url, json=body, **request_config)
139 |     #     # print('post', response.__dict__.keys())
140 | 
141 |     #     if response.status_code == 429 and isinstance(self.options.retry_policy, list) and retry_count < len(self.options.retry_policy):
142 |     #         delay = self.options.retry_policy[retry_count]
143 |     #         await asyncio.sleep(delay / 1000)
144 |     #         return await self.post(url, body, retry_count + 1)
145 |     #     else:
146 |     #         return response
147 | 


--------------------------------------------------------------------------------
/src/vectra_py/text_splitter.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Optional
  2 | 
  3 | from custom_types import Tokenizer
  4 | from gpt3_tokenizer import GPT3Tokenizer
  5 | from all_MiniLM_L6_v2_tokenizer import OSSTokenizer
  6 | 
  7 | ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
  8 | 
  9 | 
 10 | class TextSplitterConfig:
 11 |     def __init__(
 12 |         self,
 13 |         separators: List[str],
 14 |         keep_separators: bool,
 15 |         chunk_size: int,
 16 |         chunk_overlap: int,
 17 |         tokenizer: Tokenizer,
 18 |         doc_type: Optional[str] = None
 19 |     ):
 20 |         self.separators = separators
 21 |         self.keep_separators = keep_separators
 22 |         self.chunk_size = chunk_size
 23 |         self.chunk_overlap = chunk_overlap
 24 |         self.tokenizer = tokenizer
 25 |         self.doc_type = doc_type
 26 | 
 27 | 
 28 | class TextChunk:
 29 |     def __init__(self,
 30 |                  text: str,
 31 |                  tokens: List[int],
 32 |                  start_pos: int,
 33 |                  end_pos: int,
 34 |                  start_overlap: List[int],
 35 |                  end_overlap: List[int]):
 36 |         self.text = text
 37 |         self.tokens = tokens
 38 |         self.start_pos = start_pos
 39 |         self.end_pos = end_pos
 40 |         self.start_overlap = start_overlap
 41 |         self.end_overlap = end_overlap
 42 | 
 43 | 
 44 | class TextSplitter:
 45 |     def __init__(self, config: Optional[TextSplitterConfig] = None):
 46 |         if config is None:
 47 |             config = TextSplitterConfig(
 48 |                 separators=[],
 49 |                 keep_separators=False,
 50 |                 chunk_size=400,
 51 |                 chunk_overlap=40,
 52 |                 tokenizer=None
 53 |             )
 54 |         self.config = config
 55 |         # Create a default tokenizer if none is provided
 56 |         if not self.config.get('tokenizer'):
 57 |             print('tokenizer not found. defaulting to GPT3.')
 58 |             self.config.tokenizer = GPT3Tokenizer()
 59 | 
 60 |         # Use default separators if none are provided
 61 |         if not self.config.get('separators') or len(self.config.get('separators')) == 0:
 62 |             self.config['separators'] = self.get_separators(self.config['doc_type'])
 63 | 
 64 |         # Validate the config settings
 65 |         if self.config.get('chunk_size') < 1:
 66 |             raise ValueError("chunk_size must be >= 1")
 67 |         elif self.config.get('chunk_overlap') < 0:
 68 |             raise ValueError("chunk_overlap must be >= 0")
 69 |         elif self.config.get('chunk_overlap') > self.config.get('chunk_size'):
 70 |             raise ValueError("chunk_overlap must be <= chunk_size")
 71 | 
 72 |     def split(self, text: str) -> List[TextChunk]:
 73 |         # Get basic chunks
 74 |         chunks = self.recursive_split(text, self.config.get('separators'), 0)
 75 | 
 76 |         def get_overlap_tokens(tokens: Optional[List[int]] = None) -> List[int]:
 77 |             if tokens is not None:
 78 |                 length = min(len(tokens), self.config.get('chunk_overlap'))
 79 |                 return tokens[:length]
 80 |             else:
 81 |                 return []
 82 | 
 83 |         # Add overlap tokens and text to the start and end of each chunk
 84 |         if self.config.get('chunk_overlap') > 0:
 85 |             for i in range(1, len(chunks)):
 86 |                 previous_chunk = chunks[i - 1]
 87 |                 chunk = chunks[i]
 88 |                 next_chunk = chunks[i + 1] if i < len(chunks) - 1 else None
 89 |                 chunk.start_overlap = get_overlap_tokens(previous_chunk.tokens[::-1])[::-1]
 90 |                 chunk.end_overlap = get_overlap_tokens(next_chunk.tokens) if next_chunk else []
 91 | 
 92 |         return chunks
 93 | 
 94 |     def recursive_split(self, text: str, separators: List[str], start_pos: int) -> List[TextChunk]:
 95 |         chunks = []
 96 |         if len(text) > 0:
 97 |             # Split text into parts
 98 |             parts = []
 99 |             separator = ''
100 |             next_separators = separators[1:] if len(separators) > 1 else []
101 |             if separators:
102 |                 # Split by separator
103 |                 separator = separators[0]
104 |                 parts = text.split(separator)
105 |             else:
106 |                 # Cut text in half
107 |                 half = len(text) // 2
108 |                 parts = [text[:half], text[half:]]
109 | 
110 |             # Iterate over parts
111 |             for i in range(len(parts)):
112 |                 last_chunk = i == len(parts) - 1
113 |                 # Get chunk text and end_pos
114 |                 chunk = parts[i]
115 |                 end_pos = start_pos + (len(chunk) - 1) + (0 if last_chunk else len(separator))
116 |                 if self.config.get('keep_separators') and not last_chunk:
117 |                     chunk += separator
118 | 
119 |                 # Ensure chunk contains text
120 |                 if not self.contains_alphanumeric(chunk):
121 |                     continue
122 | 
123 |                 # Optimization to avoid encoding really large chunks
124 |                 if len(chunk) / 6 > self.config.get('chunk_size'):
125 |                     # Break the text into smaller chunks
126 |                     sub_chunks = self.recursive_split(chunk, next_separators, start_pos)
127 |                     chunks.extend(sub_chunks)
128 |                 else:
129 |                     # Encode chunk text
130 |                     tokens = self.config.get('tokenizer').encode(chunk)
131 |                     if len(tokens) > self.config.get('chunk_size'):
132 |                         # Break the text into smaller chunks
133 |                         sub_chunks = self.recursive_split(chunk, next_separators, start_pos)
134 |                         chunks.extend(sub_chunks)
135 |                     else:
136 |                         # Append chunk to output
137 |                         chunks.append(TextChunk(
138 |                             text=chunk,
139 |                             tokens=tokens,
140 |                             start_pos=start_pos,
141 |                             end_pos=end_pos,
142 |                             start_overlap=[],
143 |                             end_overlap=[],
144 |                         ))
145 |                 # Update start_pos
146 |                 start_pos = end_pos + 1
147 | 
148 |         return self.combine_chunks(chunks)
149 | 
150 |     def combine_chunks(self, chunks: List[TextChunk]) -> List[TextChunk]:
151 |         combined_chunks = []
152 |         current_chunk = None
153 |         current_length = 0
154 |         separator = '' if self.config.get('keep_separators') else ' '
155 |         for i in range(len(chunks)):
156 |             chunk = chunks[i]
157 |             if current_chunk:
158 |                 length = len(current_chunk.tokens) + len(chunk.tokens)
159 |                 if length > self.config.get('chunk_size'):
160 |                     combined_chunks.append(current_chunk)
161 |                     current_chunk = chunk
162 |                     current_length = len(chunk.tokens)
163 |                 else:
164 |                     current_chunk.text += separator + chunk.text
165 |                     current_chunk.tokens.extend(chunk.tokens)
166 |                     current_length += len(chunk.tokens)
167 |             else:
168 |                 current_chunk = chunk
169 |                 current_length = len(chunk.tokens)
170 | 
171 |         if current_chunk:
172 |             combined_chunks.append(current_chunk)
173 | 
174 |         return combined_chunks
175 | 
176 |     def contains_alphanumeric(self, text: str) -> bool:
177 |         return any(char in ALPHANUMERIC_CHARS for char in text)
178 | 
179 |     def get_separators(self, doc_type: str = "") -> List[str]:
180 |         separators = {
181 |             "cpp": [
182 |                 # Split along class definitions
183 |                 "\nclass ",
184 |                 # Split along function definitions
185 |                 "\nvoid ",
186 |                 "\nint ",
187 |                 "\nfloat ",
188 |                 "\ndouble ",
189 |                 # Split along control flow statements
190 |                 "\nif ",
191 |                 "\nfor ",
192 |                 "\nwhile ",
193 |                 "\nswitch ",
194 |                 "\ncase ",
195 |                 # Split by the normal type of lines
196 |                 "\n\n",
197 |                 "\n",
198 |                 " "
199 |             ],
200 |             "go": [
201 |                 # Split along function definitions
202 |                 "\nfunc ",
203 |                 "\nvar ",
204 |                 "\nconst ",
205 |                 "\ntype ",
206 |                 # Split along control flow statements
207 |                 "\nif ",
208 |                 "\nfor ",
209 |                 "\nswitch ",
210 |                 "\ncase ",
211 |                 # Split by the normal type of lines
212 |                 "\n\n",
213 |                 "\n",
214 |                 " "
215 |             ],
216 |             "java": [
217 |                 # Split along class definitions
218 |                 "\nclass ",
219 |                 # Split along method definitions
220 |                 "\npublic ",
221 |                 "\nprotected ",
222 |                 "\nprivate ",
223 |                 "\nstatic ",
224 |                 # Split along control flow statements
225 |                 "\nif ",
226 |                 "\nfor ",
227 |                 "\nwhile ",
228 |                 "\nswitch ",
229 |                 "\ncase ",
230 |                 # Split by the normal type of lines
231 |                 "\n\n",
232 |                 "\n",
233 |                 " "
234 |             ],
235 |             "c#": [
236 |                 # Split along class definitions
237 |                 "\nclass ",
238 |                 # Split along method definitions
239 |                 "\npublic ",
240 |                 "\nprotected ",
241 |                 "\nprivate ",
242 |                 "\nstatic ",
243 |                 # Split along control flow statements
244 |                 "\nif ",
245 |                 "\nfor ",
246 |                 "\nwhile ",
247 |                 "\nswitch ",
248 |                 "\ncase ",
249 |                 # Split by the normal type of lines
250 |                 "\n\n",
251 |                 "\n",
252 |                 " "
253 |             ],
254 |             "csharp": [
255 |                 # Split along class definitions
256 |                 "\nclass ",
257 |                 # Split along method definitions
258 |                 "\npublic ",
259 |                 "\nprotected ",
260 |                 "\nprivate ",
261 |                 "\nstatic ",
262 |                 # Split along control flow statements
263 |                 "\nif ",
264 |                 "\nfor ",
265 |                 "\nwhile ",
266 |                 "\nswitch ",
267 |                 "\ncase ",
268 |                 # Split by the normal type of lines
269 |                 "\n\n",
270 |                 "\n",
271 |                 " "
272 |             ],
273 |             "cs": [
274 |                 # Split along class definitions
275 |                 "\nclass ",
276 |                 # Split along method definitions
277 |                 "\npublic ",
278 |                 "\nprotected ",
279 |                 "\nprivate ",
280 |                 "\nstatic ",
281 |                 # Split along control flow statements
282 |                 "\nif ",
283 |                 "\nfor ",
284 |                 "\nwhile ",
285 |                 "\nswitch ",
286 |                 "\ncase ",
287 |                 # Split by the normal type of lines
288 |                 "\n\n",
289 |                 "\n",
290 |                 " "
291 |             ],
292 |             "ts": [
293 |                 # Split along class definitions
294 |                 "\nclass ",
295 |                 # Split along method definitions
296 |                 "\npublic ",
297 |                 "\nprotected ",
298 |                 "\nprivate ",
299 |                 "\nstatic ",
300 |                 # Split along control flow statements
301 |                 "\nif ",
302 |                 "\nfor ",
303 |                 "\nwhile ",
304 |                 "\nswitch ",
305 |                 "\ncase ",
306 |                 # Split by the normal type of lines
307 |                 "\n\n",
308 |                 "\n",
309 |                 " "
310 |             ],
311 |             "tsx": [
312 |                 # Split along class definitions
313 |                 "\nclass ",
314 |                 # Split along method definitions
315 |                 "\npublic ",
316 |                 "\nprotected ",
317 |                 "\nprivate ",
318 |                 "\nstatic ",
319 |                 # Split along control flow statements
320 |                 "\nif ",
321 |                 "\nfor ",
322 |                 "\nwhile ",
323 |                 "\nswitch ",
324 |                 "\ncase ",
325 |                 # Split by the normal type of lines
326 |                 "\n\n",
327 |                 "\n",
328 |                 " "
329 |             ],
330 |             "typescript": [
331 |                 # Split along class definitions
332 |                 "\nclass ",
333 |                 # Split along method definitions
334 |                 "\npublic ",
335 |                 "\nprotected ",
336 |                 "\nprivate ",
337 |                 "\nstatic ",
338 |                 # Split along control flow statements
339 |                 "\nif ",
340 |                 "\nfor ",
341 |                 "\nwhile ",
342 |                 "\nswitch ",
343 |                 "\ncase ",
344 |                 # Split by the normal type of lines
345 |                 "\n\n",
346 |                 "\n",
347 |                 " "
348 |             ],
349 |             "js": [
350 |                 # Split along class definitions
351 |                 "\nclass ",
352 |                 # Split along function definitions
353 |                 "\nfunction ",
354 |                 "\nconst ",
355 |                 "\nlet ",
356 |                 "\nvar ",
357 |                 "\nclass ",
358 |                 # Split along control flow statements
359 |                 "\nif ",
360 |                 "\nfor ",
361 |                 "\nwhile ",
362 |                 "\nswitch ",
363 |                 "\ncase ",
364 |                 "\ndefault ",
365 |                 # Split by the normal type of lines
366 |                 "\n\n",
367 |                 "\n",
368 |                 " "
369 |             ],
370 |             "jsx": [
371 |                 # Split along class definitions
372 |                 "\nclass ",
373 |                 # Split along function definitions
374 |                 "\nfunction ",
375 |                 "\nconst ",
376 |                 "\nlet ",
377 |                 "\nvar ",
378 |                 "\nclass ",
379 |                 # Split along control flow statements
380 |                 "\nif ",
381 |                 "\nfor ",
382 |                 "\nwhile ",
383 |                 "\nswitch ",
384 |                 "\ncase ",
385 |                 "\ndefault ",
386 |                 # Split by the normal type of lines
387 |                 "\n\n",
388 |                 "\n",
389 |                 " "
390 |             ],
391 |             "javascript": [
392 |                 # Split along class definitions
393 |                 "\nclass ",
394 |                 # Split along function definitions
395 |                 "\nfunction ",
396 |                 "\nconst ",
397 |                 "\nlet ",
398 |                 "\nvar ",
399 |                 "\nclass ",
400 |                 # Split along control flow statements
401 |                 "\nif ",
402 |                 "\nfor ",
403 |                 "\nwhile ",
404 |                 "\nswitch ",
405 |                 "\ncase ",
406 |                 "\ndefault ",
407 |                 # Split by the normal type of lines
408 |                 "\n\n",
409 |                 "\n",
410 |                 " "
411 |             ],
412 |             "php": [
413 |                 # Split along function definitions
414 |                 "\nfunction ",
415 |                 # Split along class definitions
416 |                 "\nclass ",
417 |                 # Split along control flow statements
418 |                 "\nif ",
419 |                 "\nforeach ",
420 |                 "\nwhile ",
421 |                 "\ndo ",
422 |                 "\nswitch ",
423 |                 "\ncase ",
424 |                 # Split by the normal type of lines
425 |                 "\n\n",
426 |                 "\n",
427 |                 " "
428 |             ],
429 |             "proto": [
430 |                 # Split along message definitions
431 |                 "\nmessage ",
432 |                 # Split along service definitions
433 |                 "\nservice ",
434 |                 # Split along enum definitions
435 |                 "\nenum ",
436 |                 # Split along option definitions
437 |                 "\noption ",
438 |                 # Split along import statements
439 |                 "\nimport ",
440 |                 # Split along syntax declarations
441 |                 "\nsyntax ",
442 |                 # Split by the normal type of lines
443 |                 "\n\n",
444 |                 "\n",
445 |                 " "
446 |             ],
447 |             "python": [
448 |                 # First, try to split along class definitions
449 |                 "\nclass ",
450 |                 "\ndef ",
451 |                 "\n\tdef ",
452 |                 # Now split by the normal type of lines
453 |                 "\n\n",
454 |                 "\n",
455 |                 " "
456 |             ],
457 |             "py": [
458 |                 # First, try to split along class definitions
459 |                 "\nclass ",
460 |                 "\ndef ",
461 |                 "\n\tdef ",
462 |                 # Now split by the normal type of lines
463 |                 "\n\n",
464 |                 "\n",
465 |                 " "
466 |             ],
467 |             "rst": [
468 |                 # Split along section titles
469 |                 "\n===\n",
470 |                 "\n---\n",
471 |                 "\n***\n",
472 |                 # Split along directive markers
473 |                 "\n.. ",
474 |                 # Split by the normal type of lines
475 |                 "\n\n",
476 |                 "\n",
477 |                 " "
478 |             ],
479 |             "ruby": [
480 |                 # Split along method definitions
481 |                 "\ndef ",
482 |                 "\nclass ",
483 |                 # Split along control flow statements
484 |                 "\nif ",
485 |                 "\nunless ",
486 |                 "\nwhile ",
487 |                 "\nfor ",
488 |                 "\ndo ",
489 |                 "\nbegin ",
490 |                 "\nrescue ",
491 |                 # Split by the normal type of lines
492 |                 "\n\n",
493 |                 "\n",
494 |                 " "
495 |             ],
496 |             "rust": [
497 |                 # Split along function definitions
498 |                 "\nfn ",
499 |                 "\nconst ",
500 |                 "\nlet ",
501 |                 # Split along control flow statements
502 |                 "\nif ",
503 |                 "\nwhile ",
504 |                 "\nfor ",
505 |                 "\nloop ",
506 |                 "\nmatch ",
507 |                 "\nconst ",
508 |                 # Split by the normal type of lines
509 |                 "\n\n",
510 |                 "\n",
511 |                 " "
512 |             ],
513 |             "scala": [
514 |                 # Split along class definitions
515 |                 "\nclass ",
516 |                 "\nobject ",
517 |                 # Split along method definitions
518 |                 "\ndef ",
519 |                 "\nval ",
520 |                 "\nvar ",
521 |                 # Split along control flow statements
522 |                 "\nif ",
523 |                 "\nfor ",
524 |                 "\nwhile ",
525 |                 "\nmatch ",
526 |                 "\ncase ",
527 |                 # Split by the normal type of lines
528 |                 "\n\n",
529 |                 "\n",
530 |                 " "
531 |             ],
532 |             "swift": [
533 |                 # Split along function definitions
534 |                 "\nfunc ",
535 |                 # Split along class definitions
536 |                 "\nclass ",
537 |                 "\nstruct ",
538 |                 "\nenum ",
539 |                 # Split along control flow statements
540 |                 "\nif ",
541 |                 "\nfor ",
542 |                 "\nwhile ",
543 |                 "\ndo ",
544 |                 "\nswitch ",
545 |                 "\ncase ",
546 |                 # Split by the normal type of lines
547 |                 "\n\n",
548 |                 "\n",
549 |                 " "
550 |             ],
551 |             "md": [
552 |                 # First, try to split along Markdown headings (starting with level 2)
553 |                 "\n## ",
554 |                 "\n### ",
555 |                 "\n#### ",
556 |                 "\n##### ",
557 |                 "\n###### ",
558 |                 # Note the alternative syntax for headings (below) is not handled here
559 |                 # Heading level 2
560 |                 # ---------------
561 |                 # End of code block
562 |                 "```\n\n",
563 |                 # Horizontal lines
564 |                 "\n\n***\n\n",
565 |                 "\n\n---\n\n",
566 |                 "\n\n___\n\n",
567 |                 # Note that this splitter doesn't handle horizontal lines defined
568 |                 # by *three or more* of ***, ---, or ___, but this is not handled
569 |                 # Github tables
570 |                 "<table>",
571 |                 # "<tr>",
572 |                 # "<td>",
573 |                 # "<td ",
574 |                 "\n\n",
575 |                 "\n",
576 |                 " "
577 |             ],
578 |             "latex": [
579 |                 # First, try to split along Latex sections
580 |                 "\n\\chapter{",
581 |                 "\n\\section{",
582 |                 "\n\\subsection{",
583 |                 "\n\\subsubsection{",
584 | 
585 |                 # Now split by environments
586 |                 "\n\\begin{enumerate}",
587 |                 "\n\\begin{itemize}",
588 |                 "\n\\begin{description}",
589 |                 "\n\\begin{list}",
590 |                 "\n\\begin{quote}",
591 |                 "\n\\begin{quotation}",
592 |                 "\n\\begin{verse}",
593 |                 "\n\\begin{verbatim}",
594 | 
595 |                 # Now split by math environments
596 |                 "\n\\begin{align}",
597 |                 "$$",
598 |                 "$",
599 | 
600 |                 # Now split by the normal type of lines
601 |                 "\n\n",
602 |                 "\n",
603 |                 " "
604 |             ],
605 |             "html": [
606 |                 # First, try to split along HTML tags
607 |                 "<body>",
608 |                 "<div>",
609 |                 "<p>",
610 |                 "<br>",
611 |                 "<li>",
612 |                 "<h1>",
613 |                 "<h2>",
614 |                 "<h3>",
615 |                 "<h4>",
616 |                 "<h5>",
617 |                 "<h6>",
618 |                 "<span>",
619 |                 "<table>",
620 |                 "<tr>",
621 |                 "<td>",
622 |                 "<th>",
623 |                 "<ul>",
624 |                 "<ol>",
625 |                 "<header>",
626 |                 "<footer>",
627 |                 "<nav>",
628 |                 # Head
629 |                 "<head>",
630 |                 "<style>",
631 |                 "<script>",
632 |                 "<meta>",
633 |                 "<title>",
634 |                 # Normal type of lines
635 |                 " "
636 |             ],
637 |             "sol": [
638 |                 # Split along compiler informations definitions
639 |                 "\npragma ",
640 |                 "\nusing ",
641 |                 # Split along contract definitions
642 |                 "\ncontract ",
643 |                 "\ninterface ",
644 |                 "\nlibrary ",
645 |                 # Split along method definitions
646 |                 "\nconstructor ",
647 |                 "\ntype ",
648 |                 "\nfunction ",
649 |                 "\nevent ",
650 |                 "\nmodifier ",
651 |                 "\nerror ",
652 |                 "\nstruct ",
653 |                 "\nenum ",
654 |                 # Split along control flow statements
655 |                 "\nif ",
656 |                 "\nfor ",
657 |                 "\nwhile ",
658 |                 "\ndo while ",
659 |                 "\nassembly ",
660 |                 # Split by the normal type of lines
661 |                 "\n\n",
662 |                 "\n",
663 |                 " "
664 |             ]
665 |         }
666 | 
667 |         return separators.get(doc_type, ["\n\n", "\n", " "])
668 | 


--------------------------------------------------------------------------------
/src/vectra_py/vectra-cli.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import os
  4 | 
  5 | from local_document_index import LocalDocumentIndex
  6 | from web_fetcher import WebFetcher
  7 | from openai_embeddings import OpenAIEmbeddings
  8 | from file_fetcher import FileFetcher
  9 | 
 10 | 
 11 | async def run():
 12 |     parser = argparse.ArgumentParser(prog="vectra")
 13 |     subparsers = parser.add_subparsers(dest="command")
 14 | 
 15 |     # Create command
 16 |     create_parser = subparsers.add_parser("create", description="Create a new local index")
 17 |     create_parser.add_argument("index", type=str, help="Path to the index folder")
 18 | 
 19 |     # Delete command
 20 |     delete_parser = subparsers.add_parser("delete", description="Delete an existing local index")
 21 |     delete_parser.add_argument("index", type=str, help="Path to the index folder")
 22 | 
 23 |     # Add command
 24 |     add_parser = subparsers.add_parser("add", description="Add one or more web pages to an index")
 25 |     add_parser.add_argument("index", type=str, help="Path to the index folder")
 26 |     add_parser.add_argument("--keys", "-k", type=str, required=True, help="Path to a JSON file containing model keys")
 27 |     add_parser.add_argument("--uri", "-u", type=str, nargs="+", help="HTTP/HTTPS links to web pages")
 28 |     add_parser.add_argument("--list", "-l", type=str, help="Path to a file containing a list of web pages")
 29 |     add_parser.add_argument("--chunk-size", "-cs", type=int, default=512, help="Size of generated chunks in tokens")
 30 | 
 31 |     # Remove command
 32 |     remove_parser = subparsers.add_parser("remove", description="Remove one or more documents from an index")
 33 |     remove_parser.add_argument("index", type=str, help="Path to the index folder")
 34 |     remove_parser.add_argument("--uri", "-u", type=str, nargs="+", help="URIs of documents to remove")
 35 |     remove_parser.add_argument("--list", "-l", type=str, help="Path to a file containing a list of documents to remove")
 36 | 
 37 |     # Stats command
 38 |     stats_parser = subparsers.add_parser("stats", description="Print the stats for a local index")
 39 |     stats_parser.add_argument("index", type=str, help="Path to the index folder")
 40 | 
 41 |     # Query command
 42 |     query_parser = subparsers.add_parser("query", description="Query a local index")
 43 |     query_parser.add_argument("index", type=str, help="Path to the index folder")
 44 |     query_parser.add_argument("query", type=str, help="Query text")
 45 |     query_parser.add_argument("--keys", "-k", type=str, required=True, help="Path to a JSON file containing model keys")
 46 |     query_parser.add_argument("--document-count", "-dc", type=int, default=10, help="Max number of documents to return")
 47 |     query_parser.add_argument("--chunk-count", "-cc", type=int, default=50, help="Max number of chunks to return")
 48 |     query_parser.add_argument("--section-count", "-sc", type=int, default=1, help="Max number of document sections to render")
 49 |     query_parser.add_argument("--tokens", "-t", type=int, default=2000, help="Max number of tokens to render for each section")
 50 |     query_parser.add_argument("--format", "-f", type=str, default="sections", choices=["sections", "stats", "chunks"], help="Format of the rendered results")
 51 | 
 52 |     args = parser.parse_args()
 53 | 
 54 |     if args.command == "create":
 55 |         folder_path = args.index
 56 |         index = LocalDocumentIndex(folder_path)
 57 |         print(f"Creating index at {folder_path}")
 58 |         await index.create_index(version=1, delete_if_exists=True)
 59 | 
 60 |     elif args.command == "delete":
 61 |         folder_path = args.index
 62 |         print(f"Deleting index at {folder_path}")
 63 |         index = LocalDocumentIndex(folder_path)
 64 |         await index.delete_index()
 65 | 
 66 |     elif args.command == "add":
 67 |         print("Adding Web Pages to Index")
 68 | 
 69 |         # Create embeddings
 70 |         with open(args.keys, "r", encoding="utf-8") as keys_file:
 71 |             keys = json.load(keys_file)
 72 |         embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", **keys)
 73 | 
 74 |         # Initialize index
 75 |         folder_path = args.index
 76 |         index = LocalDocumentIndex(
 77 |             folder_path=folder_path,
 78 |             embeddings=embeddings,
 79 |             chunking_config={"chunk_size": args.chunk_size},
 80 |         )
 81 | 
 82 |         # Get list of URIs
 83 |         uris = get_item_list(args.uri, args.list, "web page")
 84 | 
 85 |         # Fetch web pages
 86 |         file_fetcher = FileFetcher()
 87 |         web_fetcher = WebFetcher()
 88 |         for uri in uris:
 89 |             try:
 90 |                 print(f"Fetching {uri}")
 91 |                 fetcher = web_fetcher if uri.startswith("http") else file_fetcher
 92 |                 await fetcher.fetch(uri, index_upsert_document(index))
 93 |             except Exception as err:
 94 |                 print(f"Error adding: {uri}\n{str(err)}")
 95 | 
 96 |     elif args.command == "remove":
 97 |         folder_path = args.index
 98 |         index = LocalDocumentIndex(folder_path)
 99 | 
100 |         # Get list of URIs
101 |         uris = get_item_list(args.uri, args.list, "document")
102 | 
103 |         # Remove documents
104 |         for uri in uris:
105 |             print(f"Removing {uri}")
106 |             await index.delete_document(uri)
107 | 
108 |     elif args.command == "stats":
109 |         folder_path = args.index
110 |         index = LocalDocumentIndex(folder_path)
111 |         stats = await index.get_catalog_stats()
112 |         print("Index Stats")
113 |         print(stats)
114 | 
115 |     elif args.command == "query":
116 |         print("Querying Index")
117 | 
118 |         # Create embeddings
119 |         with open(args.keys, "r", encoding="utf-8") as keys_file:
120 |             keys = json.load(keys_file)
121 |         embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", **keys)
122 | 
123 |         # Initialize index
124 |         folder_path = args.index
125 |         index = LocalDocumentIndex(folder_path=folder_path, embeddings=embeddings)
126 | 
127 |         # Query index
128 |         query = args.query
129 |         results = await index.query_documents(
130 |             query,
131 |             max_documents=args.document_count,
132 |             max_chunks=args.chunk_count,
133 |         )
134 | 
135 |         # Render results
136 |         for result in results:
137 |             print(result.uri)
138 |             print("score:", result.score)
139 |             print("chunks:", len(result.chunks))
140 |             if args.format == "sections":
141 |                 sections = await result.render_sections(args.tokens, args.section_count)
142 |                 for i, section in enumerate(sections):
143 |                     print(f"Section {i + 1}" if args.section_count > 1 else "Section")
144 |                     print("score:", section.score)
145 |                     print("tokens:", section.token_count)
146 |                     print(section.text)
147 |             elif args.format == "chunks":
148 |                 text = await result.load_text()
149 |                 for i, chunk in enumerate(result.chunks):
150 |                     start_pos = chunk.item.metadata["startPos"]
151 |                     end_pos = chunk.item.metadata["endPos"]
152 |                     print(f"Chunk {i + 1}")
153 |                     print("score:", chunk.score)
154 |                     print("startPos:", start_pos)
155 |                     print("endPos:", end_pos)
156 |                     print(text[start_pos:end_pos + 1])
157 | 
158 | 
159 | def get_item_list(items, list_file, item_type):
160 |     if items is not None and len(items) > 0:
161 |         return items
162 |     elif list_file is not None and list_file.strip():
163 |         with open(list_file, "r", encoding="utf-8") as file:
164 |             item_list = [line.strip() for line in file.readlines() if line.strip()]
165 |         return item_list
166 |     else:
167 |         raise ValueError(f"You must specify either one or more '--uri <{item_type}>' for the items or a '--list <file path>' for a file containing the items.")
168 | 
169 | 
170 | def index_upsert_document(index):
171 |     async def upsert_document(uri, text, doc_type):
172 |         print(f"Indexing {uri}")
173 |         await index.upsert_document(uri, text, doc_type)
174 |         print(f"Added {uri}")
175 |         return True
176 | 
177 |     return upsert_document
178 | 
179 | if __name__ == "__main__":
180 |     run()
181 | 


--------------------------------------------------------------------------------
/src/vectra_py/vectra-pipeline.py:
--------------------------------------------------------------------------------
  1 | # Truncated operational pipeline to load docs to a local index.
  2 | # Created to reduce variables and test the vectra-py operations.
  3 | # Vectra-cli.py is the eventual operational entrypoint.
  4 | 
  5 | import os
  6 | import json
  7 | import asyncio
  8 | from typing import List
  9 | from dataclasses import dataclass
 10 | 
 11 | from all_MiniLM_L6_v2_tokenizer import OSSTokenizer
 12 | from oss_embeddings import OSSEmbeddings, OSSEmbeddingsOptions
 13 | from openai_embeddings import OpenAIEmbeddings, OpenAIEmbeddingsOptions
 14 | from local_index import LocalIndex, CreateIndexConfig
 15 | from local_document_index import LocalDocumentIndex, LocalDocumentIndexConfig
 16 | # from local_index import CreateIndexConfig
 17 | from file_fetcher import FileFetcher
 18 | from web_fetcher import WebFetcher
 19 | 
 20 | # test defaults
 21 | keys_file = "vectra.keys"
 22 | uri = None
 23 | list_file = "test_filings_1.json"
 24 | item_type = "html"
 25 | 
 26 | openai_options = OpenAIEmbeddingsOptions(
 27 |     api_key=os.environ.get("OPENAI_API_KEY"),
 28 |     model="text-embedding-ada-002",
 29 |     retry_policy=[2000, 5000],
 30 |     request_config={"timeout": 30}
 31 | )
 32 | 
 33 | oss_options = OSSEmbeddingsOptions(
 34 |     tokenizer=OSSTokenizer(model_name="sentence-transformers/all-MiniLM-L6-v2"),
 35 |     model="sentence-transformers/all-MiniLM-L6-v2"
 36 | )
 37 | 
 38 | 
 39 | @dataclass
 40 | class Filing:
 41 |     company_name: str
 42 |     form_type: str
 43 |     filing_date: str
 44 |     url: str
 45 | 
 46 | 
 47 | def get_item_list(uri: str, list_file: str, item_type: str) -> List[str]:
 48 |     """Get a list of URIs from a specified URI or list file"""
 49 |     if uri:
 50 |         return [uri]
 51 |     elif list_file:
 52 |         with open(list_file, "r", encoding="utf-8") as file:
 53 |             filings = json.load(file)['filings']
 54 |             return [Filing(**filing) for filing in filings]
 55 | 
 56 |     else:
 57 |         raise Exception(f"Please provide a {item_type} URI or list file")
 58 | 
 59 | 
 60 | async def add_docs_to_index(uri: str = None, list_file: str = None, item_type: str = None):
 61 |     """
 62 |     Handle operations.
 63 |     Establish the index, prepare the config, fetch the docs, and add them to the index.
 64 |     """
 65 |     print("Adding Web Pages to Index")
 66 | 
 67 |     # Create embeddings and tokenizer
 68 |     # embeddings = OpenAIEmbeddings(options=openai_options)
 69 |     # tokenizer = None  # the tokenizer is wrapped in the openai embedding.
 70 |     embeddings = OSSEmbeddings(options=oss_options)
 71 |     tokenizer = embeddings.tokenizer
 72 |     # Initialize index in current directory
 73 |     # update the index_config to include the embeddings
 74 |     doc_index_config = LocalDocumentIndexConfig(folder_path=(os.path.join(os.getcwd(), 'index')),
 75 |                                                 tokenizer=tokenizer,
 76 |                                                 embeddings=embeddings)
 77 |     simple_index_config = CreateIndexConfig(version=1, 
 78 |                                             delete_if_exists=True,
 79 |                                             metadata_config={"model_framework": embeddings.__class__.__name__,
 80 |                                                              "model_name": embeddings.options.model},
 81 |                                             )
 82 |     index = LocalDocumentIndex(doc_index_config)
 83 |     await index.create_index(simple_index_config)
 84 | 
 85 |     # Get list of URIs
 86 |     uris = get_item_list(uri, list_file, item_type)
 87 |     print('uris', uris)
 88 | 
 89 |     # Fetch web pages
 90 |     file_fetcher = FileFetcher()
 91 |     web_fetcher = WebFetcher()
 92 |     for uri in uris:
 93 |         try:
 94 |             url = uri.url if isinstance(uri, Filing) else uri
 95 |             print(f"Fetching {url}")
 96 |             fetcher = web_fetcher if url.startswith("http") else file_fetcher
 97 |             fetched_doc = fetcher.fetch(url)
 98 |             await index.upsert_document(url,
 99 |                                         fetched_doc,
100 |                                         doc_type=item_type)
101 |         except Exception as err:
102 |             print(f"Error adding: {uri}\n{str(err)}")
103 | 
104 | 
105 | async def main():
106 |     await add_docs_to_index(list_file=list_file, item_type=item_type)
107 | 
108 | if __name__ == "__main__":
109 |     asyncio.run(main())
110 | 


--------------------------------------------------------------------------------
/src/vectra_py/web_fetcher.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | from typing import Callable
  3 | from urllib.parse import urlparse
  4 | 
  5 | from bs4 import BeautifulSoup
  6 | from markdownify import markdownify as md
  7 | 
  8 | ALLOWED_CONTENT_TYPES = [
  9 |     "text/html",
 10 |     "application/json",
 11 |     "application/xml",
 12 |     "application/javascript",
 13 |     "text/plain",
 14 | ]
 15 | 
 16 | DEFAULT_HEADERS = {
 17 |     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
 18 |     "Accept-Encoding": "gzip, deflate",
 19 |     "Accept-Language": "en-US,en;q=0.5",
 20 |     "Alt-Used": "LEAVE-THIS-KEY-SET-BY-TOOL",
 21 |     "Connection": "keep-alive",
 22 |     "Host": "LEAVE-THIS-KEY-SET-BY-TOOL",
 23 |     "Referer": "https://www.google.com/",
 24 |     "Sec-Fetch-Dest": "document",
 25 |     "Sec-Fetch-Mode": "navigate",
 26 |     "Sec-Fetch-Site": "cross-site",
 27 |     "Upgrade-Insecure-Requests": "1",
 28 |     "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
 29 | }
 30 | 
 31 | 
 32 | class WebFetcher:
 33 |     def __init__(self, config=None):
 34 |         self._config = {
 35 |             "htmlToMarkdown": True,
 36 |             "summarizeHtml": False,
 37 |         }
 38 |         if config:
 39 |             self._config.update(config)
 40 | 
 41 |     def fetch(self, uri: str) -> str:
 42 |         headers = DEFAULT_HEADERS.copy()
 43 |         parsed_uri = urlparse(uri)
 44 |         headers["Host"] = parsed_uri.hostname
 45 |         headers["Alt-Used"] = parsed_uri.hostname
 46 | 
 47 |         response = requests.get(uri, headers=headers, **self._config.get("requestConfig", {}))
 48 |         response.raise_for_status()
 49 | 
 50 |         content_type = response.headers["content-type"]
 51 |         content_type_array = content_type.split(";")
 52 |         if not content_type_array[0] or content_type_array[0] not in ALLOWED_CONTENT_TYPES:
 53 |             raise Exception(f"Site returned an invalid content type of {content_type}")
 54 | 
 55 |         doc_type = content_type_array[0].split("/")[1] if content_type_array[0] != "text/plain" else None
 56 |         if doc_type == "html" and self._config["htmlToMarkdown"]:
 57 |             text = self.html_to_markdown(response.text, uri)
 58 |             return text
 59 |         else:
 60 |             return response.text
 61 | 
 62 |     @staticmethod
 63 |     def html_to_markdown(html: str, base_url: str) -> str:
 64 |         soup = BeautifulSoup(html, "html.parser")
 65 | 
 66 |         for script in soup.find_all("script"):
 67 |             script.extract()
 68 | 
 69 |         for a in soup.find_all("a"):
 70 |             href = a.get("href")
 71 |             if href and not href.startswith("http"):
 72 |                 try:
 73 |                     a["href"] = requests.compat.urljoin(base_url, href)
 74 |                 except ValueError:
 75 |                     pass
 76 | 
 77 |         markdown = md(str(soup.body), heading_style="ATX", bullet_style="-", code_style="backticks")
 78 |         markdown = "\n\n".join(markdown.splitlines())
 79 |         if len(markdown) > 64:
 80 |             start = markdown.find("\n")
 81 |             if start != -1:
 82 |                 markdown = markdown[start:]
 83 |             else:
 84 |                 start = markdown.find(" ")
 85 |                 if start != -1:
 86 |                     markdown = markdown[start:]
 87 | 
 88 |         return markdown
 89 | 
 90 | 
 91 | 
 92 | # Example usage:
 93 | # web_fetcher = WebFetcher()
 94 | # # result = web_fetcher.fetch("https://earthshotprize.org/the-prize/earthshots/")
 95 | # result = web_fetcher.fetch("https://www.sec.gov/Archives/edgar/data/1442145/000143774923004945/vrsk20221231_10k.htm")
 96 | # print('test', result)
 97 | 
 98 | # # save to a test file
 99 | # with open("test3.md", "w") as f:
100 | #     f.write(result)
101 | 


--------------------------------------------------------------------------------