├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── env ├── pyproject.toml ├── requirements.dev.txt ├── requirements.txt └── src └── vectra_py ├── __init__.py ├── all_MiniLM_L6_v2_tokenizer.py ├── custom_types.py ├── file_fetcher.py ├── gpt3_tokenizer.py ├── item_selector.py ├── local_document.py ├── local_document_index.py ├── local_document_result.py ├── local_index.py ├── openai_embeddings.py ├── oss_embeddings.py ├── text_splitter.py ├── vectra-cli.py ├── vectra-pipeline.py └── web_fetcher.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # keyfiles 163 | *.keys -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 brian schleckser 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Update: Revisiting this very stale project in Fall 2024. Going to start with a general review of the source vectra project, then go from there. As such I'll probably reply to the issues and close them, pending new discussion. 2 | 3 | # vectra-py 4 | This is a faithful port of Steven Ickman's [Vectra](https://github.com/Stevenic/vectra) in memory vector index project. Only modifications were to port into python, adjust for format, and generate some python friendly example code. Below readme follows on from his, with similar pythonic adjustments. 5 | 6 | Thanks for the inspiriation Steve! 7 | 8 | 9 | Vectra-py is a local vector database for Python with features similar to [Pinecone](https://www.pinecone.io/) or [Qdrant](https://qdrant.tech/) but built using local files. Each Vectra index is a folder on disk. 
There's an `index.json` file in the folder that contains all the vectors for the index along with any indexed metadata. When you create an index you can specify which metadata properties to index and only those fields will be stored in the `index.json` file. All of the other metadata for an item will be stored on disk in a separate file keyed by a GUID. 10 | 11 | When querying Vectra you'll be able to use the same subset of [MongoDB query operators](https://www.mongodb.com/docs/manual/reference/operator/query/) that Pinecone supports, and the results will be returned sorted by similarity. Every item in the index will first be filtered by metadata and then ranked for similarity. Even though every item is evaluated, it's all in memory, so it should be nearly instantaneous: likely 1-2 ms for even a rather large index, and <1 ms for smaller indexes. 12 | 13 | Keep in mind that your entire Vectra index is loaded into memory, so it's not well suited for scenarios like long-term chat bot memory. Use a real vector DB for that. Vectra is intended for scenarios where you have a small corpus of mostly static data that you'd like to include in your prompt. A library of few-shot examples would be a great use case for Vectra, or even just a single document you want to ask questions over. 14 | 15 | Pinecone-style namespaces aren't directly supported, but you can easily mimic them by creating a separate Vectra index (and folder) for each namespace. 16 | 17 | ## Installation 18 | 19 | ``` 20 | $ pip install vectra-py 21 | ``` 22 | 23 | ## Prep 24 | 25 | Use dotenv or set an environment variable to store your OpenAI API key. 26 | 27 | ## Usage 28 | 29 | First create an instance of `LocalIndex` with the path to the folder where you want your items stored: 30 | 31 | ```python 32 | import os 33 | from vectra_py import LocalIndex 34 | index = LocalIndex(os.path.join(os.getcwd(), 'index')) 35 | ``` 36 | 37 | Next, from inside an async function, create your index: 38 | 39 | ```python 40 | if not index.is_index_created(): 41 | await index.create_index() 42 | ``` 43 | 44 | Add some items to your index: 45 | 46 | ```python 47 | import openai, openai_async 48 | openai.api_key = os.environ.get("OPENAI_APIKEY") 49 | async def get_vector(text: str): 50 | print(text) 51 | model = "text-embedding-ada-002" 52 | response = await openai_async.embeddings( 53 | openai.api_key, 54 | timeout=2, 55 | payload={"model": model, 56 | "input": [text]}, 57 | ) 58 | return response.json()['data'][0]['embedding'] 59 | 60 | 61 | async def add_item(text: str): 62 | vector = await get_vector(text) 63 | metadata = {'text': text} 64 | print(vector, metadata) 65 | await index.insert_item({'vector': vector, 66 | 'metadata': metadata}) 67 | 68 | # Add items 69 | await add_item('apple') 70 | await add_item('oranges') 71 | await add_item('red') 72 | await add_item('blue') 73 | ``` 74 | 75 | Then query for items: 76 | 77 | ```python 78 | async def query(text: str): 79 | vector = await get_vector(text) 80 | results = await index.query_items(vector, 3) 81 | if len(results) > 0: 82 | for result in results: 83 | print(f"[{result['score']}] " 84 | f"{result['item']['metadata']['text']}") 85 | else: 86 | print("No results found.") 87 | 88 | await query('green') 89 | # Expected output: 90 | # [0.9036569942401076] blue 91 | # [0.8758153664568566] red 92 | # [0.8323828606103998] apple 93 | 94 | 95 | await query('banana') 96 | # Expected output: 97 | # [0.9033128691220631] apple 98 | # [0.8493374123092652] oranges 99 | # [0.8415324469533297] blue 100 | 101 | ``` 102 | 103 | Creating a document index is a bit more involved.
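One thing the examples above don't show is metadata filtering. `query_items` also accepts an optional Mongo-style filter as a third argument; plain key/value pairs act as equality checks, and operators such as `$eq`, `$ne`, `$gt`, `$gte`, `$lt`, `$lte`, `$in`, and `$nin` are handled by `item_selector.py`. A minimal sketch, assuming the items and `get_vector` helper from the example above (the `filtered_query` name is just illustrative):

```python
async def filtered_query(text: str):
    vector = await get_vector(text)
    # Only items whose metadata passes the filter are scored for similarity
    results = await index.query_items(vector, 3, {"text": {"$ne": "red"}})
    for result in results:
        print(f"[{result['score']}] {result['item']['metadata']['text']}")

await filtered_query('green')
```

With the basics covered, the rest of this README walks through building a document index.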
104 | 105 | First, set up configurations. Pass in an example list of Filing objects as a list_file like: 106 | ```json 107 | { 108 | "filings": [ 109 | { 110 | "company_name": "DigitalBridge Group, Inc.", 111 | "form_type": "10-Q", 112 | "filing_date": "20230505", 113 | "url": "https://www.sec.gov/Archives/edgar/data/0001679688/000167968823000049/dbrg-20230331.htm" 114 | } 115 | ] 116 | } 117 | ``` 118 | 119 | ```python 120 | import os 121 | import json 122 | import asyncio 123 | from typing import List 124 | from dataclasses import dataclass 125 | 126 | from all_MiniLM_L6_v2_tokenizer import OSSTokenizer 127 | from oss_embeddings import OSSEmbeddings, OSSEmbeddingsOptions 128 | from openai_embeddings import OpenAIEmbeddings, OpenAIEmbeddingsOptions 129 | from local_index import LocalIndex, CreateIndexConfig 130 | from local_document_index import LocalDocumentIndex, LocalDocumentIndexConfig 131 | from file_fetcher import FileFetcher 132 | from web_fetcher import WebFetcher 133 | 134 | # test defaults 135 | keys_file = "vectra.keys" 136 | uri = None 137 | list_file = "test_filings_1.json" 138 | item_type = "html" 139 | 140 | openai_options = OpenAIEmbeddingsOptions( 141 | api_key=os.environ.get("OPENAI_API_KEY"), 142 | model="text-embedding-ada-002", 143 | retry_policy=[2000, 5000], 144 | request_config={"timeout": 30} 145 | ) 146 | 147 | oss_options = OSSEmbeddingsOptions( 148 | tokenizer=OSSTokenizer(model_name="sentence-transformers/all-MiniLM-L6-v2"), 149 | model="sentence-transformers/all-MiniLM-L6-v2" 150 | ) 151 | 152 | 153 | @dataclass 154 | class Filing: 155 | company_name: str 156 | form_type: str 157 | filing_date: str 158 | url: str 159 | ``` 160 | 161 | Next, write a basic way to organize the filings. 162 | ```python 163 | def get_item_list(uri: str, list_file: str, item_type: str) -> List[str]: 164 | """Get a list of URIs from a specified URI or list file""" 165 | if uri: 166 | return [uri] 167 | elif list_file: 168 | with open(list_file, "r", encoding="utf-8") as file: 169 | filings = json.load(file)['filings'] 170 | return [Filing(**filing) for filing in filings] 171 | 172 | else: 173 | raise Exception(f"Please provide a {item_type} URI or list file") 174 | ``` 175 | 176 | Then, handle the operations to create, manage, and populate the doc index. 177 | 178 | ```python 179 | async def add_docs_to_index(uri: str = None, list_file: str = None, item_type: str = None): 180 | """ 181 | Handle operations. 182 | Establish the index, prepare the config, fetch the docs, and add them to the index. 183 | """ 184 | print("Adding Web Pages to Index") 185 | 186 | # Create embeddings and tokenizer 187 | # embeddings = OpenAIEmbeddings(options=openai_options) 188 | # tokenizer = None # the tokenizer is wrapped in the openai embedding. 
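# (Assumption: use one embedding backend at a time. To switch to OpenAI embeddings,
# uncomment the two OpenAIEmbeddings lines above and comment out the two OSS lines
# below; openai_options expects OPENAI_API_KEY to be set in the environment.)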
189 | embeddings = OSSEmbeddings(options=oss_options) 190 | tokenizer = embeddings.tokenizer 191 | # Initialize index in current directory 192 | # update the index_config to include the embeddings 193 | doc_index_config = LocalDocumentIndexConfig(folder_path=(os.path.join(os.getcwd(), 'index')), 194 | tokenizer=tokenizer, 195 | embeddings=embeddings) 196 | simple_index_config = CreateIndexConfig(version=1, 197 | delete_if_exists=True, 198 | metadata_config={"model_framework": embeddings.__class__.__name__, 199 | "model_name": embeddings.options.model}, 200 | ) 201 | index = LocalDocumentIndex(doc_index_config) 202 | await index.create_index(simple_index_config) 203 | 204 | # Get list of URIs 205 | uris = get_item_list(uri, list_file, item_type) 206 | print('uris', uris) 207 | 208 | # Fetch web pages 209 | file_fetcher = FileFetcher() 210 | web_fetcher = WebFetcher() 211 | for uri in uris: 212 | try: 213 | url = uri.url if isinstance(uri, Filing) else uri 214 | print(f"Fetching {url}") 215 | fetcher = web_fetcher if url.startswith("http") else file_fetcher 216 | fetched_doc = fetcher.fetch(url) 217 | await index.upsert_document(url, 218 | fetched_doc, 219 | doc_type=item_type) 220 | except Exception as err: 221 | print(f"Error adding: {uri}\n{str(err)}") 222 | 223 | 224 | async def main(): 225 | await add_docs_to_index(list_file=list_file, item_type=item_type) 226 | 227 | if __name__ == "__main__": 228 | asyncio.run(main()) 229 | 230 | ``` 231 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BMS-geodev/vectra-py/285bd341f1da469d9695fdee584b2d526f24a4ee/__init__.py -------------------------------------------------------------------------------- /env: -------------------------------------------------------------------------------- 1 | # copy to .env, then replace ### with your API key. 2 | OPENAI_APIKEY=### -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "vectra_py" 3 | version = "0.0.5" 4 | authors = [ 5 | { name="Brian Schleckser", email="brian.schleckser+vectrapy@gmail.com" }, 6 | ] 7 | description = "An in memory vector index project, simliar to Pinecone DB." 
8 | readme = "README.md" 9 | requires-python = ">=3.9" 10 | classifiers = [ 11 | "Programming Language :: Python :: 3", 12 | "License :: OSI Approved :: MIT License", 13 | "Operating System :: OS Independent", 14 | ] 15 | 16 | [project.urls] 17 | "Homepage" = "https://github.com/BMS-geodev/vectra-py" 18 | "Bug Tracker" = "https://github.com/BMS-geodev/vectra-py/issues" -------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | openai 2 | tiktoken 3 | openai-async 4 | python-dotenv 5 | bs4 6 | markdownify 7 | transformers 8 | sentence-transformers 9 | colorize 10 | aiofiles -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tiktoken 2 | openai 3 | bs4 4 | markdownify 5 | transformers 6 | sentence-transformers 7 | aiofiles -------------------------------------------------------------------------------- /src/vectra_py/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BMS-geodev/vectra-py/285bd341f1da469d9695fdee584b2d526f24a4ee/src/vectra_py/__init__.py -------------------------------------------------------------------------------- /src/vectra_py/all_MiniLM_L6_v2_tokenizer.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | from transformers import AutoTokenizer 3 | 4 | 5 | class OSSTokenizer: 6 | def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"): 7 | # Load model from HuggingFace Hub 8 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 9 | 10 | def decode(self, tokens): 11 | pass 12 | 13 | def encode(self, text): 14 | try: 15 | if len(text) > 1: # if text is a list of strings 16 | data = [self.tokenizer.encode(item) for item in text] 17 | return data 18 | else: 19 | data = self.tokenizer.encode(text) 20 | return data 21 | except Exception as e: 22 | print('encoding error', e) 23 | return None 24 | -------------------------------------------------------------------------------- /src/vectra_py/custom_types.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Union, Dict, Optional, Any 3 | 4 | 5 | @dataclass 6 | class EmbeddingsModel: 7 | max_tokens: int 8 | 9 | # async def create_embeddings(self, inputs: Union[str, List[str]]) -> 'EmbeddingsResponse': 10 | # pass 11 | 12 | 13 | @dataclass 14 | class EmbeddingsResponse: 15 | status: str 16 | output: List[List[float]] = None 17 | message: str = None 18 | 19 | 20 | @dataclass 21 | class TextChunk: 22 | text: str 23 | tokens: List[int] 24 | start_pos: int 25 | end_pos: int 26 | start_overlap: List[int] 27 | end_overlap: List[int] 28 | 29 | 30 | @dataclass 31 | class TextFetcher: 32 | async def fetch(self, uri: str) -> Dict[str, Union[str, None]]: 33 | pass 34 | 35 | 36 | @dataclass 37 | class IndexStats: 38 | version: int 39 | metadata_config: Dict[str, Optional[List[str]]] 40 | items: int 41 | 42 | 43 | @dataclass 44 | class IndexItem: 45 | id: str 46 | metadata: Dict[str, Any] 47 | vector: List[float] 48 | norm: float 49 | metadata_file: str = None 50 | 51 | 52 | @dataclass 53 | class MetadataFilter: 54 | eq: Union[int, str, bool] = None # Equal to (number, string, boolean) 55 | 
ne: Union[int, str, bool] = None # Not equal to (number, string, boolean) 56 | gt: int = None # Greater than (number) 57 | gte: int = None # Greater than or equal to (number) 58 | lt: int = None # Less than (number) 59 | lte: int = None # Less than or equal to (number) 60 | _in: List[Union[int, str]] = None # In array (string or number) 61 | nin: List[Union[int, str]] = None # Not in array (string or number) 62 | _and: List['MetadataFilter'] = None # AND (MetadataFilter[]) 63 | _or: List['MetadataFilter'] = None # OR (MetadataFilter[]) 64 | extra: Dict[str, Any] = None 65 | 66 | 67 | @dataclass 68 | class MetadataTypes: 69 | value: Union[int, str, bool] 70 | 71 | 72 | @dataclass 73 | class QueryResult: 74 | item: IndexItem 75 | score: float 76 | 77 | 78 | @dataclass 79 | class Tokenizer: 80 | def decode(self, tokens: List[int]) -> str: 81 | pass 82 | 83 | def encode(self, text: str) -> List[int]: 84 | pass 85 | 86 | 87 | @dataclass 88 | class DocumentChunkMetadata: 89 | document_id: str 90 | start_pos: int 91 | end_pos: int 92 | extra: Dict[str, Any] = None 93 | 94 | 95 | @dataclass 96 | class DocumentCatalogStats: 97 | version: int 98 | documents: int 99 | chunks: int 100 | metadata_config: Dict[str, Optional[List[str]]] 101 | extra: Dict[str, Any] = None 102 | 103 | 104 | @dataclass 105 | class DocumentTextSection: 106 | text: str 107 | token_count: int 108 | score: float 109 | -------------------------------------------------------------------------------- /src/vectra_py/file_fetcher.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class FileFetcher: 5 | async def fetch(self, uri): 6 | # Check if the path exists and whether it's a directory 7 | if os.path.exists(uri): 8 | if os.path.isdir(uri): 9 | # If it's a directory, read all files and recurse 10 | files = os.listdir(uri) 11 | for file in files: 12 | file_path = os.path.join(uri, file) 13 | await self.fetch(file_path) 14 | return True 15 | else: 16 | # If it's a file, read its contents 17 | with open(uri, 'r', encoding='utf-8') as file: 18 | text = file.read() 19 | # Determine the document type based on the file extension 20 | _, file_extension = os.path.splitext(uri) 21 | doc_type = file_extension[1:].lower() if file_extension else None 22 | return uri, text, doc_type 23 | else: 24 | # Handle the case where the path doesn't exist 25 | return None 26 | -------------------------------------------------------------------------------- /src/vectra_py/gpt3_tokenizer.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | # from tiktoken import encode, decode 3 | 4 | 5 | class GPT3Tokenizer: 6 | def __init__(self, model_name: str = "gpt-3.5-turbo"): 7 | self.encoding = tiktoken.encoding_for_model(model_name) 8 | 9 | def decode(self, tokens): 10 | return self.encoding.decode(tokens) 11 | 12 | def encode(self, text): 13 | return self.encoding.encode(text) 14 | -------------------------------------------------------------------------------- /src/vectra_py/item_selector.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import math 3 | 4 | 5 | class ItemSelector: 6 | """ 7 | A class for selecting items based on their similarity. 8 | """ 9 | @staticmethod 10 | def cosine_similarity(vector1: List[int], 11 | vector2: List[int]) -> float: 12 | """ 13 | Returns the similarity between two vectors using the cosine similarity. 
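Computed as dot(v1, v2) / (norm(v1) * norm(v2)), i.e. the dot product of the two vectors divided by the product of their Euclidean norms.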
14 | """ 15 | # the quotient of the dot product and the product of the norms 16 | return (ItemSelector.dot_product(vector1, vector2) / 17 | (ItemSelector.normalize(vector1) * 18 | ItemSelector.normalize(vector2))) 19 | 20 | @staticmethod 21 | def normalize(vector: List[int]) -> float: 22 | """ 23 | The norm of a vector is 24 | the square root of the sum of the squares of the elements. 25 | Returns the normalized value of a vector. 26 | """ 27 | # crutch to santize lists of lists that come from some embedding models 28 | # this will almost certainly have consequences 29 | if isinstance(vector[0], list): 30 | vector = vector[0] 31 | # Initialize a variable to store the sum of the squares 32 | sum = 0 33 | # Loop through the elements of the array 34 | for i in range(len(vector)): 35 | # Square the element and add it to the sum 36 | sum += vector[i] * vector[i] 37 | # Return the square root of the sum 38 | return math.sqrt(sum) 39 | 40 | @staticmethod 41 | def normalized_cosine_similarity(vector1: List[int], 42 | norm1: float, 43 | vector2: List[int], 44 | norm2: float) -> float: 45 | """ 46 | Returns the similarity between two vectors using the cosine similarity, 47 | considers norms. 48 | """ 49 | # Return the quotient of the dot product and the product of the norms 50 | return ItemSelector.dot_product(vector1, vector2) / (norm1 * norm2) 51 | 52 | @staticmethod 53 | def select(metadata: dict, 54 | filter: dict) -> bool: 55 | """ 56 | Handles filter logic. 57 | """ 58 | if filter is None: 59 | return True 60 | for key in filter: 61 | if key == '$and': 62 | if not all(ItemSelector.select(metadata, f) 63 | for f in filter['$and']): 64 | return False 65 | elif key == '$or': 66 | if not any(ItemSelector.select(metadata, f) 67 | for f in filter['$or']): 68 | return False 69 | else: 70 | value = filter[key] 71 | if value is None: 72 | return False 73 | elif isinstance(value, dict): 74 | if not ItemSelector.metadataFilter(metadata.get(key), 75 | value): 76 | return False 77 | else: 78 | if metadata.get(key) != value: 79 | return False 80 | return True 81 | 82 | @staticmethod 83 | def dot_product(vector1: List[int], 84 | vector2: List[int]) -> int: 85 | """ 86 | Returns the dot product of two vectors. 87 | """ 88 | # Zip the two vectors and multiply each pair, then sum the products 89 | return sum(a * b for a, b in zip(vector1, vector2)) 90 | 91 | @staticmethod 92 | def metadata_filter(value, 93 | filter) -> bool: 94 | """ 95 | Handles metadata filter logic. 
96 | """ 97 | if value is None: 98 | return False 99 | 100 | for key in filter: 101 | if key == "$eq": 102 | if value != filter[key]: 103 | return False 104 | elif key == "$ne": 105 | if value == filter[key]: 106 | return False 107 | elif key == "$gt": 108 | if not isinstance(value, int) or value <= filter[key]: 109 | return False 110 | elif key == "$gte": 111 | if not isinstance(value, int) or value < filter[key]: 112 | return False 113 | elif key == "$lt": 114 | if not isinstance(value, int) or value >= filter[key]: 115 | return False 116 | elif key == "$lte": 117 | if not isinstance(value, int) or value > filter[key]: 118 | return False 119 | elif key == "$in": 120 | if not isinstance(value, bool) or value not in filter[key]: 121 | return False 122 | elif key == "$nin": 123 | if not isinstance(value, bool) or value in filter[key]: 124 | return False 125 | else: 126 | if value != filter[key]: 127 | return False 128 | 129 | return True 130 | -------------------------------------------------------------------------------- /src/vectra_py/local_document.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import json 4 | 5 | 6 | class LocalDocument: 7 | def __init__(self, folder_path, id, uri): 8 | self._folder_path = folder_path 9 | self._id = id 10 | self._uri = uri 11 | self._metadata = None 12 | self._text = None 13 | 14 | @property 15 | def folder_path(self): 16 | return self._folder_path 17 | 18 | @property 19 | def id(self): 20 | return self._id 21 | 22 | @property 23 | def uri(self): 24 | return self._uri 25 | 26 | async def has_metadata(self): 27 | try: 28 | await asyncio.to_thread(os.access, os.path.join(self.folder_path, f"{self.id}.json"), os.R_OK) 29 | return True 30 | except Exception as err: 31 | print(f'Error checking metadata for document "{self.uri}": {str(err)}') 32 | return False 33 | 34 | async def load_metadata(self): 35 | if self._metadata is None: 36 | try: 37 | with open(os.path.join(self.folder_path, f"{self.id}.json"), 'r') as file: 38 | json_str = await asyncio.to_thread(file.read) 39 | self._metadata = json.loads(json_str) 40 | except Exception as err: 41 | raise Exception(f'Error reading metadata for document "{self.uri}": {str(err)}') 42 | 43 | return self._metadata 44 | 45 | async def load_text(self): 46 | if self._text is None: 47 | try: 48 | with open(os.path.join(self.folder_path, f"{self.id}.txt"), 'r') as file: 49 | self._text = await asyncio.to_thread(file.read) 50 | except Exception as err: 51 | raise Exception(f'Error reading text file for document "{self.uri}": {str(err)}') 52 | 53 | return self._text 54 | -------------------------------------------------------------------------------- /src/vectra_py/local_document_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from pathlib import Path 4 | import time 5 | import aiofiles.os 6 | import json 7 | import asyncio 8 | from uuid import uuid4 9 | from gpt3_tokenizer import GPT3Tokenizer 10 | from local_index import LocalIndex, CreateIndexConfig 11 | from text_splitter import TextSplitter, TextSplitterConfig 12 | from custom_types import ( 13 | MetadataFilter, 14 | EmbeddingsModel, 15 | Tokenizer, 16 | MetadataTypes, 17 | EmbeddingsResponse, 18 | QueryResult, 19 | DocumentChunkMetadata, 20 | DocumentCatalogStats, 21 | ) 22 | from local_document_result import LocalDocumentResult 23 | from local_document import LocalDocument 24 | from typing import Dict, Optional, 
List, Union 25 | from dataclasses import dataclass 26 | 27 | 28 | @dataclass 29 | class DocumentQueryOptions: 30 | max_documents: Optional[int] = None 31 | max_chunks: Optional[int] = None 32 | filter: Optional[MetadataFilter] = None 33 | 34 | 35 | @dataclass 36 | class LocalDocumentIndexConfig: 37 | folder_path: str 38 | tokenizer: Tokenizer 39 | embeddings: Optional[EmbeddingsModel] = None 40 | chunking_config: Optional[TextSplitterConfig] = None 41 | 42 | 43 | @dataclass 44 | class DocumentCatalog: 45 | version: int 46 | count: int 47 | uri_to_id: Dict[str, str] 48 | id_to_uri: Dict[str, str] 49 | 50 | 51 | def is_catalog_created(): 52 | # TODO: pass in appropriate path 53 | catalog_path = "/Users/brian/Documents/GitHub/vectra-py/index/catalog.json" 54 | exists = os.path.exists(catalog_path) 55 | if exists: 56 | print(f"exists: {exists}") 57 | # time.sleep(1) 58 | return exists 59 | 60 | 61 | class LocalDocumentIndex(LocalIndex): 62 | def __init__(self, doc_index_config: LocalDocumentIndexConfig): 63 | super().__init__(doc_index_config.folder_path) 64 | self._embeddings = doc_index_config.embeddings 65 | self._chunking_config = { 66 | "keep_separators": True, 67 | "chunk_size": 512, 68 | "chunk_overlap": 0, 69 | **(doc_index_config.chunking_config or {}), 70 | } 71 | self._tokenizer = doc_index_config.tokenizer or self._chunking_config.get("tokenizer") or GPT3Tokenizer() 72 | self._chunking_config["tokenizer"] = self._tokenizer 73 | self._catalog = None 74 | self._new_catalog = None 75 | 76 | async def get_document_id(self, uri: str) -> Optional[str]: 77 | await self.load_index_data() 78 | return self._catalog["uri_to_id"].get(uri) 79 | 80 | async def get_document_uri(self, document_id: str) -> Optional[str]: 81 | await self.load_index_data() 82 | return self._catalog.id_to_uri.get(document_id) 83 | 84 | async def create_index(self, config: Optional[CreateIndexConfig] = None) -> None: 85 | await super().create_index(config) 86 | await self.load_index_data() 87 | 88 | async def delete_document(self, uri: str) -> None: 89 | document_id = await self.get_document_id(uri) 90 | if document_id is None: 91 | return 92 | 93 | await self.begin_update() 94 | try: 95 | chunks = await self.list_items_by_metadata(DocumentChunkMetadata(document_id=document_id)) 96 | for chunk in chunks: 97 | await self.deleteItem(chunk.id) 98 | 99 | del self._new_catalog.uri_to_id[uri] 100 | del self._new_catalog.id_to_uri[document_id] 101 | self._new_catalog.count -= 1 102 | 103 | await self.end_update() 104 | except Exception as err: 105 | self.cancel_update() 106 | raise Exception(f'Error deleting document "{uri}": {str(err)}') 107 | 108 | try: 109 | os.unlink(os.path.join(self.folder_path, f'{document_id}.txt')) 110 | except Exception as err: 111 | raise Exception(f'Error removing text file for document "{uri}" from disk: {str(err)}') 112 | 113 | try: 114 | os.unlink(os.path.join(self.folder_path, f'{document_id}.json')) 115 | except Exception as err: 116 | raise Exception(f'Error removing json metadata file for document "{uri}" from disk: {str(err)}') 117 | 118 | async def get_catalog_stats(self) -> DocumentCatalogStats: 119 | stats = await self.getIndexStats() 120 | return DocumentCatalogStats( 121 | version=self._catalog.version, 122 | documents=self._catalog.count, 123 | chunks=stats.items, 124 | metadata_config=stats.metadata_config, 125 | ) 126 | 127 | async def upsert_document( 128 | self, 129 | uri: str, 130 | text: str, 131 | doc_type: Optional[str] = None, 132 | metadata: Optional[Dict[str, 
MetadataTypes]] = None 133 | ) -> LocalDocument: 134 | if not self._embeddings: 135 | raise Exception('Embeddings model not configured.') 136 | 137 | document_id = await self.get_document_id(uri) 138 | if document_id is not None: 139 | await self.delete_document(uri) 140 | else: 141 | document_id = str(uuid4()) 142 | 143 | config = { 144 | **(self._chunking_config or {}), 145 | "doc_type": doc_type or self._chunking_config.get("doc_type"), 146 | } 147 | 148 | if config["doc_type"] is None: 149 | pos = uri.rfind('.') 150 | if pos >= 0: 151 | ext = uri[pos + 1:].lower() 152 | config["doc_type"] = ext 153 | 154 | splitter = TextSplitter(config) 155 | chunks = splitter.split(text) 156 | total_tokens = 0 157 | chunk_batches = [] 158 | current_batch = [] 159 | 160 | for chunk in chunks: 161 | total_tokens += len(chunk.tokens) 162 | 163 | if total_tokens > self._embeddings.max_tokens: 164 | chunk_batches.append(current_batch) 165 | current_batch = [] 166 | total_tokens = len(chunk.tokens) 167 | 168 | current_batch.append(chunk.text.replace('\n', ' ')) 169 | 170 | if current_batch: 171 | chunk_batches.append(current_batch) 172 | 173 | embeddings = [] 174 | 175 | for batch in chunk_batches: 176 | try: 177 | response = await self._embeddings.create_embeddings(batch) 178 | except Exception as err: 179 | raise Exception(f'Error generating embeddings: {str(err)}') 180 | 181 | if response.status != 'success': 182 | raise Exception(f'Error generating embeddings: {response.message}') 183 | 184 | embeddings.extend(response.output or []) 185 | 186 | await self.begin_update() 187 | try: 188 | for i, chunk in enumerate(chunks): 189 | embedding = embeddings[i] 190 | chunk_metadata = { 191 | "document_id": document_id, 192 | "start_pos": chunk.start_pos, 193 | "end_pos": chunk.end_pos, 194 | **(metadata or {}), 195 | } 196 | await self.insert_item( 197 | { 198 | "id": str(uuid4()), 199 | "metadata": chunk_metadata, 200 | "vector": embedding, 201 | } 202 | ) 203 | if metadata: 204 | with open(os.path.join(self.folder_path, f'{document_id}.json'), 'w') as metadata_file: 205 | json.dump(metadata, metadata_file) 206 | 207 | with open(os.path.join(self.folder_path, f'{document_id}.txt'), 'w') as text_file: 208 | text_file.write(text) 209 | 210 | self._new_catalog['uri_to_id'][uri] = document_id 211 | self._new_catalog['id_to_uri'][document_id] = uri 212 | self._new_catalog['count'] += 1 213 | 214 | await self.end_update() 215 | except Exception as err: 216 | self.cancel_update() 217 | raise Exception(f'Error adding document "{uri}": {str(err)}') 218 | 219 | return LocalDocument(self.folder_path, document_id, uri) 220 | 221 | async def query_documents(self, query: str, options: DocumentQueryOptions = None) -> List[LocalDocumentResult]: 222 | if not self._embeddings: 223 | raise Exception('Embeddings model not configured.') 224 | 225 | options = options or DocumentQueryOptions(max_documents=10, max_chunks=50) 226 | 227 | try: 228 | embeddings = await self._embeddings.create_embeddings(query.replace('\n', ' ')) 229 | except Exception as err: 230 | raise Exception(f'Error generating embeddings for query: {str(err)}') 231 | 232 | if embeddings.status != 'success': 233 | raise Exception(f'Error generating embeddings for query: {embeddings.message}') 234 | 235 | results = await self.query_items(embeddings.output[0], options.max_chunks, options.filter) 236 | document_chunks = {} 237 | 238 | for result in results: 239 | metadata = result.item.metadata 240 | 241 | if metadata.document_id not in document_chunks: 242 | 
document_chunks[metadata.document_id] = [] 243 | 244 | document_chunks[metadata.document_id].append(result) 245 | 246 | document_results = [] 247 | 248 | for document_id, chunks in document_chunks.items(): 249 | uri = await self.get_document_uri(document_id) 250 | document_result = LocalDocumentResult(self.folder_path, document_id, uri, chunks, self._tokenizer) 251 | document_results.append(document_result) 252 | 253 | document_results.sort(key=lambda x: x.score, reverse=True) 254 | return document_results[:options.max_documents] 255 | 256 | async def begin_update(self): 257 | await super().begin_update() 258 | self._new_catalog = self._catalog.copy() 259 | 260 | def cancel_update(self): 261 | super().cancel_update() 262 | self._new_catalog = None 263 | 264 | async def end_update(self): 265 | await super().end_update() 266 | 267 | try: 268 | # Save catalog 269 | catalog_path = os.path.join(self.folder_path, 'catalog.json') 270 | with open(catalog_path, 'w') as catalog_file: 271 | json.dump(self._new_catalog, catalog_file) 272 | self._catalog = self._new_catalog 273 | self._new_catalog = None 274 | except Exception as err: 275 | raise Exception(f'Error saving document catalog: {str(err)}') 276 | 277 | async def load_index_data(self): 278 | await super().load_index_data() 279 | 280 | if self._catalog: 281 | return 282 | 283 | catalog_path = os.path.join(self.folder_path, 'catalog.json') 284 | thread_test = await asyncio.gather( 285 | asyncio.to_thread(is_catalog_created), 286 | asyncio.sleep(1) 287 | ) 288 | if is_catalog_created(): 289 | # Load catalog 290 | async with aiofiles.open(catalog_path, 'r') as catalog_file: 291 | contents = await catalog_file.read() 292 | self._catalog = json.loads(contents) 293 | else: 294 | try: 295 | # Initialize catalog 296 | self._catalog = { 297 | 'version': 1, 298 | 'count': 0, 299 | 'uri_to_id': {}, 300 | 'id_to_uri': {}, 301 | } 302 | with open(catalog_path, 'w') as catalog_file: 303 | json.dump(self._catalog, catalog_file) 304 | except Exception as err: 305 | raise Exception(f'Error creating document catalog: {str(err)}') 306 | -------------------------------------------------------------------------------- /src/vectra_py/local_document_result.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from local_document import LocalDocument 3 | from custom_types import QueryResult, DocumentChunkMetadata, Tokenizer, DocumentTextSection 4 | 5 | 6 | class LocalDocumentResult(LocalDocument): 7 | def __init__(self, folder_path: str, id: str, uri: str, chunks, tokenizer: Tokenizer): # List[QueryResult[DocumentChunkMetadata]] 8 | super().__init__(folder_path, id, uri) 9 | self._chunks = chunks 10 | self._tokenizer = tokenizer 11 | 12 | # Compute average score 13 | score = 0 14 | for chunk in self._chunks: 15 | score += chunk.score 16 | self._score = score / len(self._chunks) 17 | 18 | @property 19 | def chunks(self): # -> List[QueryResult[DocumentChunkMetadata]] 20 | return self._chunks 21 | 22 | @property 23 | def score(self) -> float: 24 | return self._score 25 | 26 | async def render_sections(self, max_tokens: int, max_sections: int) -> List[DocumentTextSection]: 27 | # Load text from disk 28 | text = await self.load_text() 29 | 30 | # First check to see if the entire document is less than max_tokens 31 | tokens = self._tokenizer.encode(text) 32 | if len(tokens) < max_tokens: 33 | return [{ 34 | "text": text, 35 | "token_count": len(tokens), 36 | "score": 1.0 37 | }] 38 | 39 | # Otherwise, we need 
to split the document into sections 40 | # - Add each chunk to a temp array and filter out any chunk that's longer than max_tokens. 41 | # - Sort the array by start_pos to arrange chunks in document order. 42 | # - Generate a new array of sections by combining chunks until the max_tokens is reached for each section. 43 | # - Generate an aggregate score for each section by averaging the score of each chunk in the section. 44 | # - Sort the sections by score and limit to max_sections. 45 | # - For each remaining section, combine adjacent chunks of text. 46 | # - Dynamically add overlapping chunks of text to each section until the max_tokens is reached. 47 | chunks = [] 48 | for chunk in self._chunks: 49 | start_pos = chunk.item.metadata.start_pos 50 | end_pos = chunk.item.metadata.end_pos 51 | chunk_text = text[start_pos:end_pos + 1] 52 | chunk_tokens = self._tokenizer.encode(chunk_text) 53 | if len(chunk_tokens) <= max_tokens: 54 | chunks.append({ 55 | "text": chunk_text, 56 | "start_pos": start_pos, 57 | "end_pos": end_pos, 58 | "score": chunk.score, 59 | "token_count": len(chunk_tokens) 60 | }) 61 | 62 | chunks.sort(key=lambda x: x["start_pos"]) 63 | 64 | if not chunks: 65 | # Take the top chunk and return a subset of its text 66 | top_chunk = self._chunks[0] 67 | start_pos = top_chunk.item.metadata.start_pos 68 | end_pos = top_chunk.item.metadata.end_pos 69 | chunk_text = text[start_pos:end_pos + 1] 70 | tokens = self._tokenizer.encode(chunk_text) 71 | return [{ 72 | "text": self._tokenizer.decode(tokens[:max_tokens]), 73 | "token_count": max_tokens, 74 | "score": top_chunk.score 75 | }] 76 | 77 | sections = [] 78 | current_section = { 79 | "chunks": [], 80 | "score": 0, 81 | "token_count": 0 82 | } 83 | 84 | for chunk in chunks: 85 | if current_section["token_count"] + chunk["token_count"] > max_tokens: 86 | sections.append(current_section.copy()) 87 | current_section = { 88 | "chunks": [], 89 | "score": 0, 90 | "token_count": 0 91 | } 92 | current_section["chunks"].append(chunk) 93 | current_section["score"] += chunk["score"] 94 | current_section["token_count"] += chunk["token_count"] 95 | 96 | # Normalize section scores 97 | for section in sections: 98 | section["score"] /= len(section["chunks"]) 99 | 100 | # Sort sections by score and limit to max_sections 101 | sections.sort(key=lambda x: x["score"], reverse=True) 102 | if len(sections) > max_sections: 103 | sections = sections[:max_sections] 104 | 105 | # Combine adjacent chunks of text 106 | for section in sections: 107 | i = 0 108 | while i < len(section["chunks"]) - 1: 109 | chunk = section["chunks"][i] 110 | next_chunk = section["chunks"][i + 1] 111 | if chunk["end_pos"] + 1 == next_chunk["start_pos"]: 112 | chunk["text"] += next_chunk["text"] 113 | chunk["end_pos"] = next_chunk["end_pos"] 114 | chunk["token_count"] += next_chunk["token_count"] 115 | section["chunks"].pop(i + 1) 116 | else: 117 | i += 1 118 | 119 | # Add overlapping chunks of text to each section until the max_tokens is reached 120 | connector = { 121 | "text": '\n\n...\n\n', 122 | "start_pos": -1, 123 | "end_pos": -1, 124 | "score": 0, 125 | "token_count": self._tokenizer.encode('\n\n...\n\n') 126 | } 127 | 128 | for section in sections: 129 | # Insert connectors between chunks 130 | if len(section["chunks"]) > 1: 131 | i = 0 132 | while i < len(section["chunks"]) - 1: 133 | section["chunks"].insert(i + 1, connector) 134 | section["token_count"] += connector["token_count"] 135 | i += 2 136 | 137 | # Add chunks to the beginning and end of the section until 
max_tokens is reached 138 | budget = max_tokens - section["token_count"] 139 | if budget > 40: 140 | section_start = section["chunks"][0]["start_pos"] 141 | section_end = section["chunks"][-1]["end_pos"] 142 | if section_start > 0: 143 | before_text = text[:section_start] 144 | before_tokens = self._tokenizer.encode(before_text) 145 | before_budget = min(len(before_tokens), budget // 2) 146 | chunk = { 147 | "text": self._tokenizer.decode(before_tokens[-before_budget:]), 148 | "start_pos": section_start - before_budget, 149 | "end_pos": section_start - 1, 150 | "score": 0, 151 | "token_count": before_budget 152 | } 153 | section["chunks"].insert(0, chunk) 154 | section["token_count"] += chunk["token_count"] 155 | budget -= chunk["token_count"] 156 | 157 | if section_end < len(text) - 1: 158 | after_text = text[section_end + 1:] 159 | after_tokens = self._tokenizer.encode(after_text) 160 | after_budget = min(len(after_tokens), budget) 161 | chunk = { 162 | "text": self._tokenizer.decode(after_tokens[:after_budget]), 163 | "start_pos": section_end + 1, 164 | "end_pos": section_end + after_budget, 165 | "score": 0, 166 | "token_count": after_budget 167 | } 168 | section["chunks"].append(chunk) 169 | section["token_count"] += chunk["token_count"] 170 | budget -= chunk["token_count"] 171 | 172 | # Return final rendered sections 173 | rendered_sections = [] 174 | for section in sections: 175 | text = '' 176 | for chunk in section["chunks"]: 177 | text += chunk["text"] 178 | rendered_sections.append({ 179 | "text": text, 180 | "token_count": section["token_count"], 181 | "score": section["score"] 182 | }) 183 | return rendered_sections 184 | -------------------------------------------------------------------------------- /src/vectra_py/local_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import json 4 | from uuid import uuid4 5 | from typing import List, Optional, Dict, Union, Any 6 | from item_selector import ItemSelector 7 | from custom_types import IndexItem, IndexStats, MetadataFilter, MetadataTypes, QueryResult 8 | 9 | 10 | class CreateIndexConfig: 11 | def __init__(self, version: int, delete_if_exists: bool = False, metadata_config: Dict = {}): 12 | self.version = version 13 | self.delete_if_exists = delete_if_exists 14 | self.metadata_config = metadata_config 15 | 16 | 17 | class LocalIndex: 18 | def __init__(self, folder_path: str, index_name: Optional[str] = None): 19 | self._folder_path = folder_path 20 | self._index_name = index_name or "index.json" 21 | self._data = None 22 | self._update = None 23 | 24 | @property 25 | def folder_path(self) -> str: 26 | return self._folder_path 27 | 28 | @property 29 | def index_name(self) -> str: 30 | return self._index_name 31 | 32 | async def begin_update(self) -> None: 33 | if self._update: 34 | raise ValueError('Update already in progress') 35 | 36 | await self.load_index_data() 37 | self._update = self._data.copy() 38 | 39 | def cancel_update(self) -> None: 40 | self._update = None 41 | 42 | async def create_index(self, config: CreateIndexConfig = CreateIndexConfig(version=1)) -> None: 43 | if self.is_index_created(): 44 | if config.delete_if_exists: 45 | await self.delete_index() 46 | else: 47 | raise ValueError('Index already exists') 48 | try: 49 | os.mkdir(self._folder_path) 50 | self._data = { 51 | "version": config.version, 52 | "metadata_config": config.metadata_config, 53 | "items": [] 54 | } 55 | with open(os.path.join(self._folder_path, self._index_name), 
'w') as index_file: 56 | json.dump(self._data, index_file) 57 | except Exception: 58 | await self.delete_index() 59 | raise ValueError('Error creating index') 60 | 61 | async def delete_index(self) -> None: 62 | self._data = None 63 | try: 64 | shutil.rmtree(self._folder_path) 65 | except Exception as err: 66 | print(err) 67 | 68 | async def delete_item(self, id: str) -> None: 69 | if self._update: 70 | index = next((i for i, item in enumerate(self._update["items"]) if item["id"] == id), None) 71 | if index is not None: 72 | self._update["items"].pop(index) 73 | else: 74 | await self.begin_update() 75 | index = next((i for i, item in enumerate(self._update["items"]) if item["id"] == id), None) 76 | if index is not None: 77 | self._update["items"].pop(index) 78 | await self.end_update() 79 | 80 | async def end_update(self) -> None: 81 | if not self._update: 82 | raise ValueError('No update in progress') 83 | 84 | try: 85 | with open(os.path.join(self._folder_path, self._index_name), 'w') as index_file: 86 | json.dump(self._update, index_file) 87 | self._data = self._update.copy() 88 | self._update = None 89 | except Exception as err: 90 | raise ValueError(f'Error saving index: {str(err)}') 91 | 92 | async def get_index_stats(self) -> IndexStats: 93 | await self.load_index_data() 94 | return { 95 | "version": self._data["version"], 96 | "metadata_config": self._data["metadata_config"], 97 | "items": len(self._data["items"]) 98 | } 99 | 100 | async def get_item(self, id: str) -> Optional[IndexItem]: 101 | await self.load_index_data() 102 | item = next((item for item in self._data["items"] if item["id"] == id), None) 103 | return item 104 | 105 | async def insert_item(self, item: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: 106 | if self._update: 107 | return await self.add_item_to_update(item, True) 108 | else: 109 | await self.begin_update() 110 | new_item = await self.add_item_to_update(item, True) 111 | await self.end_update() 112 | return new_item 113 | 114 | def is_index_created(self) -> bool: 115 | return os.path.exists(os.path.join(self._folder_path, self._index_name)) 116 | 117 | async def list_items(self) -> List[IndexItem]: 118 | await self.load_index_data() 119 | return self._data["items"][:] 120 | 121 | async def list_items_by_metadata(self, filter: MetadataFilter) -> List[IndexItem]: 122 | await self.load_index_data() 123 | return [item for item in self._data["items"] if ItemSelector.select(item["metadata"], filter)] 124 | 125 | async def query_items(self, 126 | vector: List[float], 127 | top_k: int, 128 | filter: Optional[MetadataFilter] = None) -> List[QueryResult]: 129 | await self.load_index_data() 130 | 131 | items = self._data["items"][:] 132 | if filter: 133 | items = [item for item in items if ItemSelector.select(item["metadata"], filter)] 134 | 135 | norm = ItemSelector.normalize(vector) 136 | distances = [] 137 | for i, item in enumerate(items): 138 | distance = ItemSelector.normalized_cosine_similarity(vector, norm, item["vector"], item["norm"]) 139 | distances.append({"index": i, "distance": distance}) 140 | 141 | distances.sort(key=lambda x: x["distance"], reverse=True) 142 | top_items = distances[:top_k] 143 | 144 | for item in top_items: 145 | if "metadataFile" in items[item["index"]]: 146 | metadata_path = os.path.join(self._folder_path, items[item["index"]]["metadataFile"]) 147 | with open(metadata_path, 'r') as metadata_file: 148 | items[item["index"]]["metadata"] = json.load(metadata_file) 149 | 150 | return [{"item": items[item["index"]], "score": 
item["distance"]} for item in top_items] 151 | 152 | async def upsert_item(self, item: Optional[Dict[str, Any]] = None) -> IndexItem: 153 | if self._update: 154 | return await self.add_item_to_update(item, False) 155 | else: 156 | await self.begin_update() 157 | new_item = await self.add_item_to_update(item, False) 158 | await self.end_update() 159 | return new_item 160 | 161 | async def load_index_data(self) -> None: 162 | if self._data: 163 | return 164 | 165 | if not self.is_index_created(): 166 | raise ValueError('Index does not exist') 167 | 168 | try: 169 | with open(os.path.join(self._folder_path, self._index_name), 'r') as index_file: 170 | self._data = json.load(index_file) 171 | except Exception: 172 | raise ValueError('Error loading index data') 173 | 174 | async def add_item_to_update(self, item: Optional[Dict[str, Any]], unique: bool) -> IndexItem: 175 | if "vector" not in item: 176 | raise ValueError('Vector is required') 177 | 178 | item_id = item.get("id") or str(uuid4()) 179 | if unique: 180 | existing_item = next((i for i in self._update["items"] if i["id"] == item_id), None) 181 | if existing_item: 182 | raise ValueError(f'Item with id {item_id} already exists') 183 | 184 | metadata = {} 185 | metadata_file = None 186 | if ( 187 | "metadata" in item 188 | and self._update["metadata_config"].get("indexed") 189 | and len(self._update["metadata_config"]["indexed"]) > 0 190 | ): 191 | for key in self._update["metadata_config"]["indexed"]: 192 | if key in item["metadata"]: 193 | metadata[key] = item["metadata"][key] 194 | if item.get("metadata"): 195 | metadata_file = f'{str(uuid4())}.json' 196 | metadata_path = os.path.join(self._folder_path, metadata_file) 197 | with open(metadata_path, 'w') as metadata_file: 198 | json.dump(item["metadata"], metadata_file) 199 | elif item.get("metadata"): 200 | metadata = item["metadata"] 201 | # print('local index, after metadata') 202 | # print('item vector type and len', type(item["vector"]), len(item["vector"])) 203 | # print('item vector chunk inspection', item["vector"][0]) 204 | try: 205 | new_item = { 206 | "id": item_id, 207 | "metadata": metadata, 208 | "vector": item["vector"], 209 | "norm": ItemSelector.normalize(item["vector"]) 210 | } 211 | except Exception as e: 212 | raise ValueError(f'Error creating item: {e}') 213 | if metadata_file: 214 | new_item["metadataFile"] = metadata_file 215 | 216 | if not unique: 217 | existing_item = next((i for i in self._update["items"] if i["id"] == item_id), None) 218 | if existing_item: 219 | existing_item.update(new_item) 220 | return existing_item 221 | 222 | self._update["items"].append(new_item) 223 | return new_item 224 | -------------------------------------------------------------------------------- /src/vectra_py/openai_embeddings.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import requests 3 | from typing import List, Union, Dict 4 | 5 | 6 | class BaseOpenAIEmbeddingsOptions: 7 | def __init__(self, retry_policy: List[int] = None, request_config: Dict = None): 8 | self.retry_policy = retry_policy if retry_policy else [2000, 5000] 9 | self.request_config = request_config if request_config else {} 10 | 11 | 12 | class OpenAIEmbeddingsOptions(BaseOpenAIEmbeddingsOptions): 13 | def __init__( 14 | self, 15 | api_key: str, 16 | model: str, 17 | organization: str = None, 18 | endpoint: str = None, 19 | **kwargs 20 | ): 21 | super().__init__(**kwargs) 22 | self.api_key = api_key 23 | self.model = model 24 | self.organization = 
organization 25 | self.endpoint = endpoint 26 | 27 | 28 | class AzureOpenAIEmbeddingsOptions(BaseOpenAIEmbeddingsOptions): 29 | def __init__( 30 | self, 31 | azure_api_key: str, 32 | azure_endpoint: str, 33 | azure_deployment: str, 34 | azure_api_version: str = "2023-05-15", 35 | **kwargs 36 | ): 37 | super().__init__(**kwargs) 38 | self.azure_api_key = azure_api_key 39 | self.azure_endpoint = azure_endpoint 40 | self.azure_deployment = azure_deployment 41 | self.azure_api_version = azure_api_version 42 | 43 | 44 | class EmbeddingsResponse: 45 | def __init__(self, status: str, output: List[float] = None, message: str = None): 46 | self.status = status 47 | self.output = output 48 | self.message = message 49 | 50 | 51 | class CreateEmbeddingRequest: 52 | def __init__(self, input: Union[str, List[str]]): 53 | self.input = input 54 | 55 | 56 | class CreateEmbeddingResponse: 57 | def __init__(self, data: List[Dict], model: str, usage: Dict): 58 | self.data = data 59 | self.model = model 60 | self.usage = usage 61 | 62 | 63 | class OpenAIEmbeddings: 64 | def __init__(self, options: Union[OpenAIEmbeddingsOptions, AzureOpenAIEmbeddingsOptions]): 65 | self._use_azure = isinstance(options, AzureOpenAIEmbeddingsOptions) 66 | self.options = options 67 | self.user_agent = "AlphaWave" 68 | 69 | @property 70 | def max_tokens(self): 71 | return 8000 72 | 73 | async def create_embeddings(self, inputs: Union[str, List[str]]) -> EmbeddingsResponse: 74 | response = await self.create_embedding_request({"input": inputs}) 75 | # convert the response.text to json 76 | json_response = response.json() 77 | data = response.json().get('data') 78 | if response.status_code < 300: 79 | return EmbeddingsResponse( 80 | status="success", 81 | output=[item["embedding"] for item in data], 82 | message={"model": json_response.get('model'), 83 | "usage": json_response.get('usage')} 84 | ) 85 | elif response.status_code == 429: 86 | return EmbeddingsResponse( 87 | status="rate_limited", 88 | output=None, 89 | message="The embeddings API returned a rate limit error.", 90 | ) 91 | else: 92 | return EmbeddingsResponse( 93 | status="error", 94 | output=None, 95 | message=f"The embeddings API returned an error status of {response.status_code}: {response.statusText}", 96 | ) 97 | 98 | async def create_embedding_request(self, request: CreateEmbeddingRequest): 99 | if self._use_azure: 100 | options = self.options 101 | url = f"{options.azure_endpoint}/openai/deployments/{options.azure_deployment}/embeddings?api-version={options.azure_api_version}" 102 | return self.post(url, request) 103 | else: 104 | options = self.options 105 | url = f"{options.endpoint or 'https://api.openai.com'}/v1/embeddings" 106 | request['model'] = options.model 107 | test = await self.post(url, request, retry_count=0) 108 | return test 109 | 110 | async def post(self, url: str, body: Dict, retry_count: int = 0): 111 | request_config = dict(self.options.request_config) 112 | 113 | request_headers = request_config.setdefault("headers", {}) 114 | request_headers.setdefault("Content-Type", "application/json") 115 | request_headers.setdefault("User-Agent", self.user_agent) 116 | 117 | if self._use_azure: 118 | options = self.options 119 | request_headers["api-key"] = options.azure_api_key 120 | else: 121 | options = self.options 122 | request_headers["Authorization"] = f"Bearer {options.api_key}" 123 | if options.organization: 124 | request_headers["OpenAI-Organization"] = options.organization 125 | 126 | response = requests.post(url, json=body, **request_config) 
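# Note: retry_policy entries are backoff delays in milliseconds; on HTTP 429 the
# branch below sleeps delay / 1000 seconds and retries until the policy is exhausted.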
127 |         # retry_policy entries are backoff delays in milliseconds
128 |         if response.status_code == 429 and isinstance(self.options.retry_policy, list) and retry_count < len(self.options.retry_policy):
129 |             delay = self.options.retry_policy[retry_count]
130 |             await asyncio.sleep(delay / 1000)
131 |             return await self.post(url, body, retry_count + 1)
132 |         else:
133 |             return response
134 | 
--------------------------------------------------------------------------------
/src/vectra_py/oss_embeddings.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import requests
  3 | from typing import List, Union, Dict
  4 | from all_MiniLM_L6_v2_tokenizer import OSSTokenizer
  5 | 
  6 | 
  7 | class BaseOSSEmbeddingsOptions:
  8 |     def __init__(self, retry_policy: List[int] = None, request_config: Dict = None):
  9 |         self.retry_policy = retry_policy if retry_policy else [2000, 5000]
 10 |         self.request_config = request_config if request_config else {}
 11 | 
 12 | 
 13 | class OSSEmbeddingsOptions(BaseOSSEmbeddingsOptions):
 14 |     def __init__(
 15 |         self,
 16 |         model: str,
 17 |         tokenizer: OSSTokenizer = None,
 18 |         **kwargs
 19 |     ):
 20 |         super().__init__(**kwargs)
 21 |         self.tokenizer = tokenizer if tokenizer is not None else OSSTokenizer(model_name=model)
 22 |         self.model = model
 23 | 
 24 | 
 25 | class EmbeddingsResponse:
 26 |     def __init__(self, status: str, output: List[float] = None, message: str = None):
 27 |         self.status = status
 28 |         self.output = output
 29 |         self.message = message
 30 | 
 31 | 
 32 | class CreateEmbeddingRequest:
 33 |     def __init__(self, input: Union[str, List[str]]):
 34 |         self.input = input
 35 | 
 36 | 
 37 | class CreateEmbeddingResponse:
 38 |     def __init__(self, data: List[Dict], model: str, usage: Dict):
 39 |         self.data = data
 40 |         self.model = model
 41 |         self.usage = usage
 42 | 
 43 | 
 44 | class OSSEmbeddings:
 45 |     def __init__(self, options: OSSEmbeddingsOptions):
 46 |         self._local = True  # use a locally stored model
 47 |         self.options = options
 48 |         self.model = options.model
 49 |         self.tokenizer = options.tokenizer
 50 |         # self.user_agent = "AlphaWave"
 51 | 
 52 |     @property
 53 |     def max_tokens(self):
 54 |         return 8000
 55 | 
 56 |     async def create_embeddings(self, inputs: Union[str, List[str]]) -> EmbeddingsResponse:
 57 |         # create embeddings from the local model; accept a single string or a list of strings
 58 |         try:
 59 |             data = [self.options.tokenizer.encode(item) for item in ([inputs] if isinstance(inputs, str) else inputs)]
 60 |             return EmbeddingsResponse(
 61 |                 status="success",
 62 |                 output=data,
 63 |                 message={"model": self.model,
 64 |                          "usage": 'unknown'}
 65 |             )
 66 |         except Exception as e:
 67 |             print('OSS encoding error', e)
 68 |             return EmbeddingsResponse(
 69 |                 status="error",
 70 |                 output=None,
 71 |                 message=f"Encoding error: {e}",
 72 |             )
 73 | 
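Both embeddings clients expose the same async create_embeddings() call, which is what lets the indexing pipeline swap a hosted OpenAI model for the local MiniLM encoder. A minimal usage sketch follows; the flat module imports assume the script runs from src/vectra_py, and the API key, model identifiers and keyword argument names for OpenAIEmbeddingsOptions are illustrative assumptions inferred from the attributes used above, not values taken from this repo:

```python
import asyncio

from openai_embeddings import OpenAIEmbeddings, OpenAIEmbeddingsOptions
from oss_embeddings import OSSEmbeddings, OSSEmbeddingsOptions
from all_MiniLM_L6_v2_tokenizer import OSSTokenizer


async def main() -> None:
    # Hosted path: assumes OpenAIEmbeddingsOptions accepts api_key/model keywords,
    # matching the attributes read by OpenAIEmbeddings.post() above.
    hosted = OpenAIEmbeddings(OpenAIEmbeddingsOptions(
        api_key="sk-...",                   # placeholder key
        model="text-embedding-ada-002",     # assumed embeddings model
    ))
    hosted_result = await hosted.create_embeddings(["hello vectra"])
    print(hosted_result.status, hosted_result.message)

    # Local path: the MiniLM tokenizer doubles as the encoder.
    local = OSSEmbeddings(OSSEmbeddingsOptions(
        model="sentence-transformers/all-MiniLM-L6-v2",  # assumed model id
        tokenizer=OSSTokenizer(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    ))
    local_result = await local.create_embeddings(["hello vectra"])
    print(local_result.status, local_result.message)


if __name__ == "__main__":
    asyncio.run(main())
```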
--------------------------------------------------------------------------------
/src/vectra_py/text_splitter.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Optional
  2 | 
  3 | from custom_types import Tokenizer
  4 | from gpt3_tokenizer import GPT3Tokenizer
  5 | from all_MiniLM_L6_v2_tokenizer import OSSTokenizer
  6 | 
  7 | ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
  8 | 
  9 | 
 10 | class TextSplitterConfig:
 11 |     def __init__(
 12 |         self,
 13 |         separators: List[str],
 14 |         keep_separators: bool,
 15 |         chunk_size: int,
 16 |         chunk_overlap: int,
 17 |         tokenizer: Tokenizer,
 18 |         doc_type: Optional[str] = None
 19 |     ):
 20 |         self.separators = separators
 21 |         self.keep_separators = keep_separators
 22 |         self.chunk_size = chunk_size
 23 |         self.chunk_overlap = chunk_overlap
 24 |         self.tokenizer = tokenizer
 25 |         self.doc_type = doc_type
 26 | 
 27 |     # TextSplitter reads and writes these settings dict-style, so map get()/[] access onto the attributes
 28 |     def get(self, key, default=None): return getattr(self, key, default)
 29 |     def __getitem__(self, key): return getattr(self, key)
 30 |     def __setitem__(self, key, value): setattr(self, key, value)
 31 | 
 32 | 
 33 | class TextChunk:
 34 |     def __init__(self, text: str, tokens: List[int], start_pos: int,
 35 |                  end_pos: int, start_overlap: List[int], end_overlap: List[int]):
 36 |         self.text = text
 37 |         self.tokens = tokens
 38 |         self.start_pos = 
start_pos 39 | self.end_pos = end_pos 40 | self.start_overlap = start_overlap 41 | self.end_overlap = end_overlap 42 | 43 | 44 | class TextSplitter: 45 | def __init__(self, config: Optional[TextSplitterConfig] = None): 46 | if config is None: 47 | config = TextSplitterConfig( 48 | separators=[], 49 | keep_separators=False, 50 | chunk_size=400, 51 | chunk_overlap=40, 52 | tokenizer=None 53 | ) 54 | self.config = config 55 | # Create a default tokenizer if none is provided 56 | if not self.config.get('tokenizer'): 57 | print('tokenizer not found. defaulting to GPT3.') 58 | self.config.tokenizer = GPT3Tokenizer() 59 | 60 | # Use default separators if none are provided 61 | if not self.config.get('separators') or len(self.config.get('separators')) == 0: 62 | self.config['separators'] = self.get_separators(self.config['doc_type']) 63 | 64 | # Validate the config settings 65 | if self.config.get('chunk_size') < 1: 66 | raise ValueError("chunk_size must be >= 1") 67 | elif self.config.get('chunk_overlap') < 0: 68 | raise ValueError("chunk_overlap must be >= 0") 69 | elif self.config.get('chunk_overlap') > self.config.get('chunk_size'): 70 | raise ValueError("chunk_overlap must be <= chunk_size") 71 | 72 | def split(self, text: str) -> List[TextChunk]: 73 | # Get basic chunks 74 | chunks = self.recursive_split(text, self.config.get('separators'), 0) 75 | 76 | def get_overlap_tokens(tokens: Optional[List[int]] = None) -> List[int]: 77 | if tokens is not None: 78 | length = min(len(tokens), self.config.get('chunk_overlap')) 79 | return tokens[:length] 80 | else: 81 | return [] 82 | 83 | # Add overlap tokens and text to the start and end of each chunk 84 | if self.config.get('chunk_overlap') > 0: 85 | for i in range(1, len(chunks)): 86 | previous_chunk = chunks[i - 1] 87 | chunk = chunks[i] 88 | next_chunk = chunks[i + 1] if i < len(chunks) - 1 else None 89 | chunk.start_overlap = get_overlap_tokens(previous_chunk.tokens[::-1])[::-1] 90 | chunk.end_overlap = get_overlap_tokens(next_chunk.tokens) if next_chunk else [] 91 | 92 | return chunks 93 | 94 | def recursive_split(self, text: str, separators: List[str], start_pos: int) -> List[TextChunk]: 95 | chunks = [] 96 | if len(text) > 0: 97 | # Split text into parts 98 | parts = [] 99 | separator = '' 100 | next_separators = separators[1:] if len(separators) > 1 else [] 101 | if separators: 102 | # Split by separator 103 | separator = separators[0] 104 | parts = text.split(separator) 105 | else: 106 | # Cut text in half 107 | half = len(text) // 2 108 | parts = [text[:half], text[half:]] 109 | 110 | # Iterate over parts 111 | for i in range(len(parts)): 112 | last_chunk = i == len(parts) - 1 113 | # Get chunk text and end_pos 114 | chunk = parts[i] 115 | end_pos = start_pos + (len(chunk) - 1) + (0 if last_chunk else len(separator)) 116 | if self.config.get('keep_separators') and not last_chunk: 117 | chunk += separator 118 | 119 | # Ensure chunk contains text 120 | if not self.contains_alphanumeric(chunk): 121 | continue 122 | 123 | # Optimization to avoid encoding really large chunks 124 | if len(chunk) / 6 > self.config.get('chunk_size'): 125 | # Break the text into smaller chunks 126 | sub_chunks = self.recursive_split(chunk, next_separators, start_pos) 127 | chunks.extend(sub_chunks) 128 | else: 129 | # Encode chunk text 130 | tokens = self.config.get('tokenizer').encode(chunk) 131 | if len(tokens) > self.config.get('chunk_size'): 132 | # Break the text into smaller chunks 133 | sub_chunks = self.recursive_split(chunk, next_separators, start_pos) 134 
| chunks.extend(sub_chunks) 135 | else: 136 | # Append chunk to output 137 | chunks.append(TextChunk( 138 | text=chunk, 139 | tokens=tokens, 140 | start_pos=start_pos, 141 | end_pos=end_pos, 142 | start_overlap=[], 143 | end_overlap=[], 144 | )) 145 | # Update start_pos 146 | start_pos = end_pos + 1 147 | 148 | return self.combine_chunks(chunks) 149 | 150 | def combine_chunks(self, chunks: List[TextChunk]) -> List[TextChunk]: 151 | combined_chunks = [] 152 | current_chunk = None 153 | current_length = 0 154 | separator = '' if self.config.get('keep_separators') else ' ' 155 | for i in range(len(chunks)): 156 | chunk = chunks[i] 157 | if current_chunk: 158 | length = len(current_chunk.tokens) + len(chunk.tokens) 159 | if length > self.config.get('chunk_size'): 160 | combined_chunks.append(current_chunk) 161 | current_chunk = chunk 162 | current_length = len(chunk.tokens) 163 | else: 164 | current_chunk.text += separator + chunk.text 165 | current_chunk.tokens.extend(chunk.tokens) 166 | current_length += len(chunk.tokens) 167 | else: 168 | current_chunk = chunk 169 | current_length = len(chunk.tokens) 170 | 171 | if current_chunk: 172 | combined_chunks.append(current_chunk) 173 | 174 | return combined_chunks 175 | 176 | def contains_alphanumeric(self, text: str) -> bool: 177 | return any(char in ALPHANUMERIC_CHARS for char in text) 178 | 179 | def get_separators(self, doc_type: str = "") -> List[str]: 180 | separators = { 181 | "cpp": [ 182 | # Split along class definitions 183 | "\nclass ", 184 | # Split along function definitions 185 | "\nvoid ", 186 | "\nint ", 187 | "\nfloat ", 188 | "\ndouble ", 189 | # Split along control flow statements 190 | "\nif ", 191 | "\nfor ", 192 | "\nwhile ", 193 | "\nswitch ", 194 | "\ncase ", 195 | # Split by the normal type of lines 196 | "\n\n", 197 | "\n", 198 | " " 199 | ], 200 | "go": [ 201 | # Split along function definitions 202 | "\nfunc ", 203 | "\nvar ", 204 | "\nconst ", 205 | "\ntype ", 206 | # Split along control flow statements 207 | "\nif ", 208 | "\nfor ", 209 | "\nswitch ", 210 | "\ncase ", 211 | # Split by the normal type of lines 212 | "\n\n", 213 | "\n", 214 | " " 215 | ], 216 | "java": [ 217 | # Split along class definitions 218 | "\nclass ", 219 | # Split along method definitions 220 | "\npublic ", 221 | "\nprotected ", 222 | "\nprivate ", 223 | "\nstatic ", 224 | # Split along control flow statements 225 | "\nif ", 226 | "\nfor ", 227 | "\nwhile ", 228 | "\nswitch ", 229 | "\ncase ", 230 | # Split by the normal type of lines 231 | "\n\n", 232 | "\n", 233 | " " 234 | ], 235 | "c#": [ 236 | # Split along class definitions 237 | "\nclass ", 238 | # Split along method definitions 239 | "\npublic ", 240 | "\nprotected ", 241 | "\nprivate ", 242 | "\nstatic ", 243 | # Split along control flow statements 244 | "\nif ", 245 | "\nfor ", 246 | "\nwhile ", 247 | "\nswitch ", 248 | "\ncase ", 249 | # Split by the normal type of lines 250 | "\n\n", 251 | "\n", 252 | " " 253 | ], 254 | "csharp": [ 255 | # Split along class definitions 256 | "\nclass ", 257 | # Split along method definitions 258 | "\npublic ", 259 | "\nprotected ", 260 | "\nprivate ", 261 | "\nstatic ", 262 | # Split along control flow statements 263 | "\nif ", 264 | "\nfor ", 265 | "\nwhile ", 266 | "\nswitch ", 267 | "\ncase ", 268 | # Split by the normal type of lines 269 | "\n\n", 270 | "\n", 271 | " " 272 | ], 273 | "cs": [ 274 | # Split along class definitions 275 | "\nclass ", 276 | # Split along method definitions 277 | "\npublic ", 278 | "\nprotected ", 279 | "\nprivate ", 
280 | "\nstatic ", 281 | # Split along control flow statements 282 | "\nif ", 283 | "\nfor ", 284 | "\nwhile ", 285 | "\nswitch ", 286 | "\ncase ", 287 | # Split by the normal type of lines 288 | "\n\n", 289 | "\n", 290 | " " 291 | ], 292 | "ts": [ 293 | # Split along class definitions 294 | "\nclass ", 295 | # Split along method definitions 296 | "\npublic ", 297 | "\nprotected ", 298 | "\nprivate ", 299 | "\nstatic ", 300 | # Split along control flow statements 301 | "\nif ", 302 | "\nfor ", 303 | "\nwhile ", 304 | "\nswitch ", 305 | "\ncase ", 306 | # Split by the normal type of lines 307 | "\n\n", 308 | "\n", 309 | " " 310 | ], 311 | "tsx": [ 312 | # Split along class definitions 313 | "\nclass ", 314 | # Split along method definitions 315 | "\npublic ", 316 | "\nprotected ", 317 | "\nprivate ", 318 | "\nstatic ", 319 | # Split along control flow statements 320 | "\nif ", 321 | "\nfor ", 322 | "\nwhile ", 323 | "\nswitch ", 324 | "\ncase ", 325 | # Split by the normal type of lines 326 | "\n\n", 327 | "\n", 328 | " " 329 | ], 330 | "typescript": [ 331 | # Split along class definitions 332 | "\nclass ", 333 | # Split along method definitions 334 | "\npublic ", 335 | "\nprotected ", 336 | "\nprivate ", 337 | "\nstatic ", 338 | # Split along control flow statements 339 | "\nif ", 340 | "\nfor ", 341 | "\nwhile ", 342 | "\nswitch ", 343 | "\ncase ", 344 | # Split by the normal type of lines 345 | "\n\n", 346 | "\n", 347 | " " 348 | ], 349 | "js": [ 350 | # Split along class definitions 351 | "\nclass ", 352 | # Split along function definitions 353 | "\nfunction ", 354 | "\nconst ", 355 | "\nlet ", 356 | "\nvar ", 357 | "\nclass ", 358 | # Split along control flow statements 359 | "\nif ", 360 | "\nfor ", 361 | "\nwhile ", 362 | "\nswitch ", 363 | "\ncase ", 364 | "\ndefault ", 365 | # Split by the normal type of lines 366 | "\n\n", 367 | "\n", 368 | " " 369 | ], 370 | "jsx": [ 371 | # Split along class definitions 372 | "\nclass ", 373 | # Split along function definitions 374 | "\nfunction ", 375 | "\nconst ", 376 | "\nlet ", 377 | "\nvar ", 378 | "\nclass ", 379 | # Split along control flow statements 380 | "\nif ", 381 | "\nfor ", 382 | "\nwhile ", 383 | "\nswitch ", 384 | "\ncase ", 385 | "\ndefault ", 386 | # Split by the normal type of lines 387 | "\n\n", 388 | "\n", 389 | " " 390 | ], 391 | "javascript": [ 392 | # Split along class definitions 393 | "\nclass ", 394 | # Split along function definitions 395 | "\nfunction ", 396 | "\nconst ", 397 | "\nlet ", 398 | "\nvar ", 399 | "\nclass ", 400 | # Split along control flow statements 401 | "\nif ", 402 | "\nfor ", 403 | "\nwhile ", 404 | "\nswitch ", 405 | "\ncase ", 406 | "\ndefault ", 407 | # Split by the normal type of lines 408 | "\n\n", 409 | "\n", 410 | " " 411 | ], 412 | "php": [ 413 | # Split along function definitions 414 | "\nfunction ", 415 | # Split along class definitions 416 | "\nclass ", 417 | # Split along control flow statements 418 | "\nif ", 419 | "\nforeach ", 420 | "\nwhile ", 421 | "\ndo ", 422 | "\nswitch ", 423 | "\ncase ", 424 | # Split by the normal type of lines 425 | "\n\n", 426 | "\n", 427 | " " 428 | ], 429 | "proto": [ 430 | # Split along message definitions 431 | "\nmessage ", 432 | # Split along service definitions 433 | "\nservice ", 434 | # Split along enum definitions 435 | "\nenum ", 436 | # Split along option definitions 437 | "\noption ", 438 | # Split along import statements 439 | "\nimport ", 440 | # Split along syntax declarations 441 | "\nsyntax ", 442 | # Split by the normal type of lines 443 
| "\n\n", 444 | "\n", 445 | " " 446 | ], 447 | "python": [ 448 | # First, try to split along class definitions 449 | "\nclass ", 450 | "\ndef ", 451 | "\n\tdef ", 452 | # Now split by the normal type of lines 453 | "\n\n", 454 | "\n", 455 | " " 456 | ], 457 | "py": [ 458 | # First, try to split along class definitions 459 | "\nclass ", 460 | "\ndef ", 461 | "\n\tdef ", 462 | # Now split by the normal type of lines 463 | "\n\n", 464 | "\n", 465 | " " 466 | ], 467 | "rst": [ 468 | # Split along section titles 469 | "\n===\n", 470 | "\n---\n", 471 | "\n***\n", 472 | # Split along directive markers 473 | "\n.. ", 474 | # Split by the normal type of lines 475 | "\n\n", 476 | "\n", 477 | " " 478 | ], 479 | "ruby": [ 480 | # Split along method definitions 481 | "\ndef ", 482 | "\nclass ", 483 | # Split along control flow statements 484 | "\nif ", 485 | "\nunless ", 486 | "\nwhile ", 487 | "\nfor ", 488 | "\ndo ", 489 | "\nbegin ", 490 | "\nrescue ", 491 | # Split by the normal type of lines 492 | "\n\n", 493 | "\n", 494 | " " 495 | ], 496 | "rust": [ 497 | # Split along function definitions 498 | "\nfn ", 499 | "\nconst ", 500 | "\nlet ", 501 | # Split along control flow statements 502 | "\nif ", 503 | "\nwhile ", 504 | "\nfor ", 505 | "\nloop ", 506 | "\nmatch ", 507 | "\nconst ", 508 | # Split by the normal type of lines 509 | "\n\n", 510 | "\n", 511 | " " 512 | ], 513 | "scala": [ 514 | # Split along class definitions 515 | "\nclass ", 516 | "\nobject ", 517 | # Split along method definitions 518 | "\ndef ", 519 | "\nval ", 520 | "\nvar ", 521 | # Split along control flow statements 522 | "\nif ", 523 | "\nfor ", 524 | "\nwhile ", 525 | "\nmatch ", 526 | "\ncase ", 527 | # Split by the normal type of lines 528 | "\n\n", 529 | "\n", 530 | " " 531 | ], 532 | "swift": [ 533 | # Split along function definitions 534 | "\nfunc ", 535 | # Split along class definitions 536 | "\nclass ", 537 | "\nstruct ", 538 | "\nenum ", 539 | # Split along control flow statements 540 | "\nif ", 541 | "\nfor ", 542 | "\nwhile ", 543 | "\ndo ", 544 | "\nswitch ", 545 | "\ncase ", 546 | # Split by the normal type of lines 547 | "\n\n", 548 | "\n", 549 | " " 550 | ], 551 | "md": [ 552 | # First, try to split along Markdown headings (starting with level 2) 553 | "\n## ", 554 | "\n### ", 555 | "\n#### ", 556 | "\n##### ", 557 | "\n###### ", 558 | # Note the alternative syntax for headings (below) is not handled here 559 | # Heading level 2 560 | # --------------- 561 | # End of code block 562 | "```\n\n", 563 | # Horizontal lines 564 | "\n\n***\n\n", 565 | "\n\n---\n\n", 566 | "\n\n___\n\n", 567 | # Note that this splitter doesn't handle horizontal lines defined 568 | # by *three or more* of ***, ---, or ___, but this is not handled 569 | # Github tables 570 | "
", 573 | # " | ",
[... remainder truncated: the rest of get_separators() (apparently further separator entries whose angle-bracket strings were swallowed as markup, plus the end of the function) was lost when this dump was rendered ...]
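To tie the splitter back to the embeddings clients, here is a short sketch of driving TextSplitter with its default GPT-3 tokenizer over a markdown document. The chunk sizes mirror the defaults in the class and doc_type="md" selects the markdown separator list shown above; the file name is an illustrative assumption:

```python
from gpt3_tokenizer import GPT3Tokenizer
from text_splitter import TextSplitter, TextSplitterConfig

# Mirrors the defaults TextSplitter falls back to, but with an explicit doc_type
# so get_separators() picks the markdown-aware separator list.
config = TextSplitterConfig(
    separators=[],          # empty list -> separators are derived from doc_type
    keep_separators=False,
    chunk_size=400,         # max tokens per chunk
    chunk_overlap=40,       # tokens shared with the neighbouring chunk
    tokenizer=GPT3Tokenizer(),
    doc_type="md",
)

splitter = TextSplitter(config)
with open("README.md", encoding="utf-8") as fh:  # any markdown file will do
    chunks = splitter.split(fh.read())

for chunk in chunks:
    print(len(chunk.tokens), repr(chunk.text[:60]))
```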