├── .gitignore ├── Evaluate_ADB_Project.ipynb ├── LICENSE ├── README.md ├── Report.pdf ├── Statistics IVF.xlsx ├── proposal.pdf ├── requirements.txt ├── src ├── .gitignore ├── Draft │ ├── access.py │ ├── access_wrong.py │ ├── file.py │ ├── gen.py │ ├── genData.ipynb │ ├── generate.py │ └── test_data.py ├── IVF.py ├── Modules │ ├── IVF.py │ └── LSH.py ├── __init__.py ├── api.py ├── best_case_implementation.py ├── evaluation.py ├── main.py ├── notes.txt ├── pipeline.ipynb ├── utils.py ├── vec_db.py └── worst_case_implementation.py └── vector searching algorithms ├── LSH.ipynb ├── LSHHyperPlane.ipynb ├── Product Qunatization.ipynb ├── SplitBySign ├── Split_by_sign.ipynb └── split_by_sign.py └── clustering.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | *.vscode 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | DataBase 163 | bucket_files 164 | .vscode 165 | modules/inverted_files 166 | ``` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Ziad Sherif 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
# IntelliQuery
2 | 
3 | ## 📝 Table of Contents
4 | 
5 | - [📝 Table of Contents](#-table-of-contents)
6 | - [📙 Overview ](#-overview-)
7 | - [Get Started ](#get-started-)
8 |   - [Inference Mode](#inference-mode)
9 |   - [Run Locally](#run-locally)
10 | - [Methods](#methods)
11 |   - [Inverted File Index (IVF) ](#inverted-file-index-ivf-)
12 |   - [Locality-Sensitive Hashing (LSH) ](#locality-sensitive-hashing-lsh-)
13 |   - [Product Quantization (PQ) ](#product-quantization-pq-)
14 |   - [PQ-LSH ](#pq-lsh-)
15 | - [🕴 Contributors ](#-contributors-)
16 | - [📃 License ](#-license-)
17 | 
18 | ## 📙 Overview
19 | Given the embedding of a search query, the system efficiently retrieves the top k matching results from a database of 20M documents. The objective of this project is to design and implement an indexing system for a
20 | semantic search database.
21 | 
22 | 
23 | ## Get Started
24 | ### Inference Mode
25 | ***Check Final Notebook***
26 | ```
27 | https://github.com/ZiadSheriif/IntelliQuery/blob/main/Evaluate_ADB_Project.ipynb
28 | ```
29 | ### Run Locally
30 | 
31 | ***Clone Repo***
32 | ```
33 | git clone https://github.com/ZiadSheriif/IntelliQuery.git
34 | ```
35 | ***Install dependencies***
36 | ```
37 | pip install -r requirements.txt
38 | ```
39 | ***Run Indexer***
40 | ```
41 | $ python ./src/evaluation.py
42 | ```
43 | 
44 | 
45 | ## Methods
46 | ### Inverted File Index (IVF)
47 | This is our final approach, with some enhancements (see the sketch at the end of this section):
48 | 1. Changed MiniBatchKMeans to regular KMeans
49 | 2. We calculate the initial centroids from just the first chunk of data
50 | 3. Introduced parallel processing for different regions
51 | 
52 | 
53 | ### Locality-Sensitive Hashing (LSH)
54 | Hashes each vector into a bucket keyed by the signs of its dot products with a set of random hyperplanes, so similar vectors tend to land in the same bucket (see `src/Modules/LSH.py` and the sketch at the end of this section).
55 | 
56 | ### Product Quantization (PQ)
57 | Compresses vectors by splitting them into sub-vectors and quantizing each sub-vector against a learned codebook (see the notebooks under `vector searching algorithms/`).
58 | 
59 | ### PQ-LSH
60 | It combines both LSH and PQ.
61 | 
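The two minimal sketches below illustrate the core ideas behind IVF and LSH. They are illustrative only: the sizes, cluster counts, and variable names are assumptions, and unlike the real implementations in `src/IVF.py` and `src/Modules/LSH.py` (which stream chunks from disk and persist one binary file per region / one text file per bucket), everything here stays in memory.

```python
# IVF sketch (assumed sizes): fit KMeans on the first chunk only, assign every
# vector to its nearest centroid ("region"), then answer a query by probing the
# n_regions nearest regions and ranking the candidates by cosine similarity.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
data = rng.random((10_000, 70))  # stand-in for the real 70-dim embeddings

n_clusters = 16
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(data[:2_000])                  # centroids from the first chunk only
labels = kmeans.predict(data)             # region id for every vector
regions = {c: np.where(labels == c)[0] for c in range(n_clusters)}

def ivf_query(query, top_k=10, n_regions=3):
    # probe only the regions whose centroids are closest to the query
    dists = np.linalg.norm(kmeans.cluster_centers_ - query, axis=1)
    nearest = np.argsort(dists)[:n_regions]
    candidates = np.concatenate([regions[c] for c in nearest])
    vecs = data[candidates]
    sims = vecs @ query / (np.linalg.norm(vecs, axis=1) * np.linalg.norm(query))
    return candidates[np.argsort(sims)[::-1][:top_k]]

print(ivf_query(rng.random(70)))
```

```python
# LSH sketch (assumed sizes): vectors whose dot products with nbits random
# hyperplanes share the same signs get the same key, so near-duplicates tend
# to share a bucket; the on-disk version writes one file per key.
import numpy as np
from collections import defaultdict

rng = np.random.default_rng(0)
nbits, d = 8, 70
planes = rng.random((nbits, d)) - 0.5     # random hyperplane normals

def bucket_key(v):
    # sign pattern of the projections, e.g. "10110010"
    return "".join(((v @ planes.T) > 0).astype(int).astype(str))

buckets = defaultdict(list)
for i, v in enumerate(rng.random((1_000, d))):
    buckets[bucket_key(v)].append(i)
```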
62 | ## 🕴 Contributors
63 | 
64 | - [Ziad Sherif](https://github.com/ZiadSheriif)
65 | - Zeyad Tarek
66 | - Abdalhameed Emad
67 | - Basma Elhoseny
74 | 75 | 76 | 77 | ## 📃 License 78 | 79 | This software is licensed under MIT License, See [License](https://github.com/ZiadSheriif/sematic_search_DB/blob/main/LICENSE) for more information ©Ziad Sherif. 80 | -------------------------------------------------------------------------------- /Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZiadSheriif/IntelliQuery/e49a854ae66c9a22632927e4956bbf1f032cbcc9/Report.pdf -------------------------------------------------------------------------------- /Statistics IVF.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZiadSheriif/IntelliQuery/e49a854ae66c9a22632927e4956bbf1f032cbcc9/Statistics IVF.xlsx -------------------------------------------------------------------------------- /proposal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZiadSheriif/IntelliQuery/e49a854ae66c9a22632927e4956bbf1f032cbcc9/proposal.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZiadSheriif/IntelliQuery/e49a854ae66c9a22632927e4956bbf1f032cbcc9/requirements.txt -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | Database/ 2 | inverted_files/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | *.csv 9 | *.bin 10 | 11 | DataBase 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | #pdm.lock 114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 115 | # in version control. 116 | # https://pdm.fming.dev/#use-with-ide 117 | .pdm.toml 118 | 119 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 120 | __pypackages__/ 121 | 122 | # Celery stuff 123 | celerybeat-schedule 124 | celerybeat.pid 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | 156 | # pytype static type analyzer 157 | .pytype/ 158 | 159 | # Cython debug symbols 160 | cython_debug/ 161 | 162 | # PyCharm 163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 165 | # and can be added to the global gitignore or merged into this file. For a more nuclear 166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
167 | #.idea/ 168 | bucket_files 169 | Modules/bucket_files 170 | DataBase 171 | .vscode -------------------------------------------------------------------------------- /src/Draft/access.py: -------------------------------------------------------------------------------- 1 | import struct 2 | 3 | # Define the binary file name 4 | binary_file_name = 'records_with_index_name.bin' 5 | 6 | # Define the index of the element you want to access 7 | i = 9999 # Change this to the desired index 8 | 9 | # Calculate the position of the ith element based on record size 10 | record_size = struct.calcsize('I20s20s') # Size of packed data 11 | print(record_size) 12 | position = i * record_size 13 | 14 | # Get the address of the first block in the binary file 15 | binary_file_address = 0 16 | # with open(binary_file_name, 'rb') as file: 17 | # binary_file_address = file.tell() 18 | # print(binary_file_address) 19 | 20 | # Calculate the absolute position of the ith element 21 | absolute_position = binary_file_address + position 22 | 23 | # Open the binary file and seek to the absolute position of the ith element 24 | with open(binary_file_name, 'rb') as file: 25 | file.seek(absolute_position) 26 | 27 | # Read the packed data at the ith position 28 | packed_data = file.read(record_size) 29 | 30 | # Unpack the data 31 | index, name, phone = struct.unpack('I20s20s', packed_data) 32 | name = name.decode().strip('\0') 33 | phone = phone.decode().strip('\0') 34 | 35 | print(f'Index: {index}, Name: {name}, Phone: {phone}') 36 | -------------------------------------------------------------------------------- /src/Draft/access_wrong.py: -------------------------------------------------------------------------------- 1 | # Open a file: file 2 | file = open('records_with_index_name_phone.bin',mode='r') 3 | 4 | # read all lines at once 5 | all_of_it = file.read() 6 | 7 | # close the file 8 | file.close() -------------------------------------------------------------------------------- /src/Draft/file.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | # Generate 10,000 float values based on their index 4 | float_data = [float(i) for i in range(10000)] 5 | 6 | # Define the CSV file name 7 | csv_file_name = 'float_records_with_index.csv' 8 | 9 | # Write the float data with index to the CSV file 10 | with open(csv_file_name, 'w', newline='') as csvfile: 11 | writer = csv.writer(csvfile) 12 | writer.writerow(['Index', 'Value']) # Write a header row 13 | for index, value in enumerate(float_data): 14 | writer.writerow([index, value]) 15 | 16 | # Get the address of the first block in the CSV file 17 | csv_file_address = None 18 | with open(csv_file_name, 'rb') as file: 19 | csv_file_address = file.tell() 20 | 21 | print(f'CSV file created: {csv_file_name}') 22 | print(f'Address of the first block in the CSV file: {csv_file_address}') 23 | -------------------------------------------------------------------------------- /src/Draft/gen.py: -------------------------------------------------------------------------------- 1 | from worst_case_implementation import VecDBWorst 2 | import numpy as np 3 | 4 | # Function to generate random embeddings 5 | def generate_embeddings(num_records, embedding_dim): 6 | return [np.random.rand(embedding_dim).tolist() for _ in range(num_records)] 7 | 8 | # Create an instance of VecDB 9 | db = VecDBWorst() 10 | 11 | # Define parameters 12 | total_records = 10000 # 20 million records 13 | chunk_size = 10000 # Insert records in chunks of 10,000 14 
|
15 | # Insert records in chunks
16 | for i in range(0, total_records, chunk_size):
17 |     chunk_records = []
18 |     for j in range(i + 1, i + chunk_size + 1):
19 |         if j > total_records:
20 |             break
21 |         record = {"id": j, "embed": generate_embeddings(1, 70)[0]}
22 |         # make the size of each record fixed at 1500 bytes
23 |         # size_of_dummy_needed = 1500 - len(record["embed"])
24 | 
25 |         chunk_records.append(record)
26 | 
27 |     db.insert_records(chunk_records)
28 |     print(f"Inserted {len(chunk_records)} records. Total records inserted: {j}")
29 | 
30 | print("Insertion complete.")
31 | 
--------------------------------------------------------------------------------
/src/Draft/generate.py:
--------------------------------------------------------------------------------
1 | import struct
2 | 
3 | # Define the binary file name
4 | binary_file_name = 'records_with_index_name.bin'
5 | 
6 | # Generate and write the records to the binary file
7 | with open(binary_file_name, 'wb') as file:
8 |     for i in range(10000):
9 |         # Generate example name and phone number (you can replace with your data source)
10 |         name = f"Name-{i}"
11 |         phone = f"Phone-{i}"
12 | 
13 |         # Ensure a fixed length for name and phone
14 |         name = name.ljust(20, '\0')  # 20 characters
15 |         phone = phone.ljust(20, '\0')  # 20 characters
16 | 
17 |         # Pack data into binary format (4 bytes for index, 20 bytes for name, and 20 bytes for phone)
18 |         packed_data = struct.pack('I20s20s', i, name.encode(), phone.encode())
19 | 
20 |         # Write the packed data to the binary file
21 |         file.write(packed_data)
--------------------------------------------------------------------------------
/src/Draft/test_data.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | 
3 | dataset = load_dataset("aadityaubhat/GPT-wiki-intro")
4 | 
5 | print(dataset['train'][0])
--------------------------------------------------------------------------------
/src/IVF.py:
--------------------------------------------------------------------------------
1 | # import files
2 | from utils import *
3 | from sklearn.cluster import KMeans
4 | 
5 | # import libraries
6 | import heapq
7 | import numpy as np
8 | 
9 | 
10 | def IVF_index(file_path,K_means_metric,K_means_n_clusters,k_means_batch_size,k_means_max_iter,k_means_n_init,chunk_size,index_folder_path):
11 |     '''
12 |     file_path: path to the data .bin file
13 | 
14 |     K_means_metric: metric to be used in clustering ('cosine' or 'euclidean'); TODO use the SCANN idea. ERROR: this isn't supported in sklearn KMeans, think of another way
15 |     K_means_n_clusters: no of kmeans clusters
16 |     k_means_batch_size: kmeans batch size to be sampled at each iteration of fitting
17 |     k_means_max_iter: max iterations for kmeans, default [100] in sklearn
18 |     k_means_n_init: the number of times the algorithm will be run with different centroid seeds.
19 | 
20 |     chunk_size: no of records to be processed together while performing kmeans
21 | 
22 |     index_folder_path: folder path to store the regions of kmeans
23 |     '''
24 |     print("---IVF_index()----")
25 |     # ############################################################### ################################# ###############################################################
26 |     # ############################################################### Step(1):Clustering Data from file ###############################################################
27 |     # ############################################################### ################################# ###############################################################
28 |     kmeans = KMeans(n_clusters=K_means_n_clusters, max_iter=k_means_max_iter,n_init=k_means_n_init,random_state=42)
29 | 
30 | 
31 |     # Use the first chunk only to get the centroids
32 |     data_chunk=read_binary_file_chunk(file_path=file_path,record_format=f"I{70}f",start_index=0,chunk_size=1000000) #[{"id":,"embed":[]}]
33 |     # TODO Remove this loop
34 |     chunk_vectors=np.array([entry['embed'] for entry in data_chunk])
35 |     kmeans.fit(chunk_vectors)
36 | 
37 | 
38 | 
39 |     # We need to Read Data from File chunk by chunk
40 |     file_size = os.path.getsize(file_path)
41 |     record_size=struct.calcsize(f"I{70}f")
42 |     n_records=file_size/record_size
43 |     no_chunks=math.ceil(n_records/chunk_size)
44 | 
45 |     # # Step(1) Getting centroids:
46 |     # # Loop to get the Kmeans Centroids
47 |     # for i in range(no_chunks):
48 |     #     data_chunk=read_binary_file_chunk(file_path=file_path,record_format=f"I{70}f",start_index=i*chunk_size,chunk_size=chunk_size) #[{"id":,"embed":[]}]
49 |     #     # TODO Remove this loop
50 |     #     chunk_vectors=np.array([entry['embed'] for entry in data_chunk])
51 |     #     kmeans.partial_fit(chunk_vectors)
52 | 
53 |     # Centroids
54 |     K_means_centroids=kmeans.cluster_centers_
55 |     # Saving Centroids #TODO Check precision of centroids after read and write in the file @Basma Elhoseny
56 |     write_binary_file(file_path=index_folder_path+'/centroids.bin',data_to_write=K_means_centroids,format=f"{70}f")
57 | 
58 |     # ##################################################################
59 |     # #TEST# Centroids are Written Correctly ###########################
60 |     # ##################################################################
61 | 
62 | 
63 | 
64 |     # Step(2) Getting vectors of each region
65 |     for i in range(no_chunks):
66 |         data_chunk=read_binary_file_chunk(file_path=file_path,record_format=f"I{70}f",start_index=i*chunk_size,chunk_size=chunk_size,dictionary_format=True) #[{109: np.array([70 dim])}]
67 | 
68 |         # Get Cluster for each one
69 |         labels=kmeans.predict(list(data_chunk.values())) #Each vector's corresponding centroid
70 | 
71 | 
72 |         ids=np.array(list(data_chunk.keys()))
73 |         vectors=np.array(list(data_chunk.values()))
74 |         data_chunk=None #Clear Memory
75 | 
76 |         # Add vectors to their corresponding region
77 |         for label in set(labels):
78 |             region_ids=ids[labels==label] # get ids belonging to such region
79 |             region_vectors=vectors[labels==label] # get vectors belonging to such region
80 |             # Open file of this Region(cluster) Just Once for every Region :D
81 |             with open(index_folder_path+f'/cluster{label}.bin', "ab") as fout:
82 |                 for i in range(len(region_ids)):
83 |                     #TODO Check whether to store the id of the vector @Basma Elhoseny
84 |                     data = struct.pack(f"I{70}f", region_ids[i],*region_vectors[i,:])
85 |                     fout.write(data)
86 | 
87 | 
88 | 
89 |     return
90 | 
91 | 
92 | 
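# --- Illustrative usage sketch: the paths and hyperparameter values below are
# hypothetical assumptions (not taken from this file), chosen only to show how
# the IVF_index signature above is meant to be called.
#
# IVF_index(file_path='./DataBase/data.bin',
#           K_means_metric='cosine',
#           K_means_n_clusters=256,
#           k_means_batch_size=10000,
#           k_means_max_iter=100,
#           k_means_n_init=10,
#           chunk_size=100000,
#           index_folder_path='./DataBase/IVF')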
93 | def semantic_query_ivf(data_file_path, index_folder_path, query, top_k, n_regions):
94 |     query = np.squeeze(np.array(query))
95 | 
96 | 
97 |     # Read Centroids
98 |     K_means_centroids = read_binary_file(index_folder_path + '/centroids.bin', f"70f")
99 | 
100 | 
101 |     assert K_means_centroids.shape[0] > n_regions, "n_regions must be less than the number of regions"
102 | 
103 | 
104 |     # Calculate distances to centroids
105 |     distances = np.linalg.norm(K_means_centroids - query, axis=1)
106 |     # Get indices of the nearest centroids
107 |     nearest_regions = np.argsort(distances)[:n_regions]
108 | 
109 | 
110 |     # Use a heap to keep track of the top k scores
111 |     top_scores_heap = []
112 |     for region in nearest_regions:
113 |         records=read_binary_file_chunk(index_folder_path+f'/cluster{region}.bin', f'I{70}f', 0, chunk_size=100000000000,dictionary_format=True) # chunk_size large enough to read the whole region file
114 | 
115 | 
116 |         # Vectorize cosine similarity calculation
117 |         vectors = np.array([record for record in records.values()])
118 |         dot_products = np.dot(vectors, query)
119 |         norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query)
120 |         similarities = dot_products / norms
121 | 
122 |         # Process the scores and maintain a heap
123 |         for score, id in zip(similarities, records.keys()):
124 |             if len(top_scores_heap) < top_k:
125 |                 heapq.heappush(top_scores_heap, (score, id))
126 |             else:
127 |                 heapq.heappushpop(top_scores_heap, (score, id))
128 | 
129 |     # Sort and get the top k scores
130 |     top_scores_heap.sort(reverse=True)
131 |     top_k_ids = [id for _, id in top_scores_heap]
132 | 
133 |     return top_k_ids
134 | 
--------------------------------------------------------------------------------
/src/Modules/IVF.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from sklearn.cluster import KMeans
4 | import time
5 | from scipy.spatial.distance import cosine
6 | 
7 | 
8 | class InvertedFileSystem:
9 |     def __init__(self, n_clusters, data_dir):
10 |         self.n_clusters = n_clusters
11 |         self.data_dir = data_dir
12 |         self.inverted_file_paths = [
13 |             os.path.join(data_dir, f"inverted_file_{i}.npy") for i in range(n_clusters)
14 |         ]
15 |         self.centroids = None
16 | 
17 |     def build_index(self, data):
18 |         # Cluster the data
19 |         kmeans = KMeans(n_clusters=self.n_clusters, n_init=10)
20 |         labels = kmeans.fit_predict(data)
21 |         self.centroids = kmeans.cluster_centers_
22 |         self.data = data  # keep a reference so query() doesn't rely on a module-level global
23 |         # Build inverted files
24 |         inverted_files = [[] for _ in range(self.n_clusters)]
25 |         for idx, label in enumerate(labels):
26 |             inverted_files[label].append(idx)
27 | 
28 |         # Save inverted files to disk
29 |         for i, inverted_file in enumerate(inverted_files):
30 |             np.save(self.inverted_file_paths[i], inverted_file)
31 | 
32 |     def query(self, vector, top_k=5):
33 |         # Assign vector to nearest cluster
34 |         nearest_cluster = np.argmin(np.linalg.norm(self.centroids - vector, axis=1))
35 | 
36 |         # Load the corresponding inverted file from disk
37 |         inverted_file = np.load(self.inverted_file_paths[nearest_cluster])
38 | 
39 |         # Search in the inverted file
40 |         distances = [np.linalg.norm(vector - self.data[idx]) for idx in inverted_file]
41 |         nearest_indices = np.argsort(distances)[:top_k]
42 | 
43 |         return [inverted_file[i] for i in nearest_indices]
44 | 
45 | 
46 | def brute_force_cosine_similarity(query_vector, data, top_k=5):
47 |     # Calculate cosine similarities for each vector in the dataset
48 |     similarities = [1 - cosine(query_vector, vector) for vector in data]
49 | 
50 |     # Get the indices of the top k most similar vectors
51 |     nearest_indices = 
np.argsort(similarities)[-top_k:] 52 | 53 | # Return the indices and their cosine similarities 54 | return [idx for idx in reversed(nearest_indices)] 55 | 56 | def run_queries(n_queries, ivf, data, top_k=5): 57 | total_time_ivf = 0 58 | total_time_brute_force = 0 59 | total_score_ivf = 0 60 | ivf_results = [] 61 | brute_force_results = [] 62 | 63 | for _ in range(n_queries): 64 | query_vector = np.random.rand(70) 65 | 66 | start_time = time.time() 67 | ivf_result = ivf.query(query_vector, top_k) 68 | end_time = time.time() 69 | total_time_ivf += end_time - start_time 70 | ivf_results.append(ivf_result) 71 | 72 | start_time = time.time() 73 | brute_force_result = brute_force_cosine_similarity(query_vector, data, top_k) 74 | end_time = time.time() 75 | total_time_brute_force += end_time - start_time 76 | brute_force_results.append(brute_force_result) 77 | 78 | intersection = len(set(ivf_result).intersection(brute_force_result)) 79 | total_score_ivf += intersection / top_k 80 | 81 | avg_time_ivf = total_time_ivf / n_queries 82 | avg_score_ivf = total_score_ivf / n_queries 83 | avg_time_brute_force = total_time_brute_force / n_queries 84 | 85 | print(f"IVF: Average time = {avg_time_ivf}, Average score = {avg_score_ivf}") 86 | print(f"Brute Force: Average time = {avg_time_brute_force}") 87 | 88 | # Calculate intersection of top k results 89 | intersection = set(ivf_result).intersection(brute_force_result) 90 | print(f"Intersection of top {top_k} results: {intersection}") 91 | 92 | # !testing IVF 93 | data_dir = "inverted_files" 94 | os.makedirs(data_dir, exist_ok=True) 95 | number_of_queries=10 96 | data_set=10000 97 | 98 | data = np.random.rand(data_set, 70) 99 | ivf = InvertedFileSystem(n_clusters=5, data_dir=data_dir) 100 | ivf.build_index(data) 101 | 102 | print("Dataset in k: ",data_set//1000) 103 | print("Number of Queries: ",number_of_queries) 104 | 105 | run_queries(number_of_queries, ivf, data) 106 | 107 | 108 | # # !testing IVF 109 | # data_dir = "inverted_files" 110 | # os.makedirs(data_dir, exist_ok=True) 111 | 112 | # data = np.random.rand(100000, 70) 113 | # ivf = InvertedFileSystem(n_clusters=3, data_dir=data_dir) 114 | # ivf.build_index(data) 115 | 116 | # query_vector = np.random.rand(70) 117 | 118 | 119 | # # brute force search 120 | # start_time = time.time() 121 | # brute_force_results = brute_force_cosine_similarity(query_vector, data, top_k=10) 122 | # brute_force_time = time.time() - start_time 123 | # print("Brute force top k: ", brute_force_results) 124 | # print("Brute force time: ", brute_force_time) 125 | # print("============================================") 126 | # # Timing IVF query 127 | # start_time = time.time() 128 | # top_k_results = ivf.query(query_vector, top_k=10) 129 | # ivf_time = time.time() - start_time 130 | # print("IVF top k: ", top_k_results) 131 | # print("IVF time: ", ivf_time) 132 | 133 | 134 | # # Get intersection 135 | # brute_force_set = set(brute_force_results) 136 | # ivf_set = set(top_k_results) 137 | 138 | # intersection = brute_force_set.intersection(ivf_set) 139 | # print("Intersection of Brute Force and IVF: ", intersection) 140 | # print("length of the intersection: ", len(intersection)) 141 | 142 | # print("********************************************") 143 | -------------------------------------------------------------------------------- /src/Modules/LSH.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | 5 | from scipy.spatial.distance import cosine 6 | 
# from best_case_implementation import VecDBBest
7 | 
8 | 
9 | # TODO:
10 | # * 1) Build LSH function (indexing)
11 | # * 2) Build semantic query function (retrieval)
12 | 
13 | 
14 | def LSH_index(data, nbits, index_path, d=70):
15 |     """
16 |     Function to build the LSH index
17 |     data: [{'id':int,'embed':vector}]
18 |     nbits: number of bits of the bucket keys
19 |     index_path: path where the resulting index is saved
20 |     d: vector dimension
21 |     """
22 |     # create nbits Random hyperplanes used for partitioning
23 | 
24 |     plane_norms = np.random.rand(nbits, d) - 0.5
25 | 
26 |     #! for -1,1
27 |     #? plane_norms = 2 * np.random.rand(nbits, d) - 1.0
28 | 
29 |     # If index Folder Doesn't Exist just Create it :D
30 |     if not os.path.exists(index_path):
31 |         os.makedirs(index_path)
32 | 
33 | 
34 |     for item in data:
35 |         vector = item["embed"]
36 |         id = item["id"]
37 | 
38 |         # Dot Product with Random Planes
39 |         data_dot_product = np.dot(vector, plane_norms.T)
40 | 
41 |         # Decision Making
42 |         data_set_decision_hamming = (data_dot_product > 0) * 1
43 | 
44 |         # Bucket no. (Key)
45 |         hash_str = "".join(data_set_decision_hamming.astype(str)) # 101001101
46 | 
47 |         # Add This vector to the bucket
48 |         file_path = os.path.join(index_path, hash_str + ".txt")
49 | 
50 |         # Open File in Append Mode
51 |         with open(file_path, "a") as file:
52 |             file.write(str(id) + "\n")
53 | 
54 |     return plane_norms
55 | 
56 | def get_top_k_hamming_distances(query, buckets, top_k):
57 |     distances = []
58 |     # Calculate Hamming distance for each bucket
59 |     for bucket in buckets:
60 |         hamming_distance = sum(bit1 != bit2 for bit1, bit2 in zip(query, bucket))
61 |         distances.append((bucket, hamming_distance))
62 |     # Sort distances and get the top K
63 |     sorted_distances = sorted(distances, key=lambda x: x[1])
64 |     top_k_distances = sorted_distances[:top_k]
65 |     return top_k_distances
66 | def read_text_files_in_folder(folder_path):
67 |     text_files_content = {}
68 | 
69 |     # Iterate over all files in the folder
70 |     for filename in os.listdir(folder_path):
71 |         file_path = os.path.join(folder_path, filename)
72 | 
73 |         # Check if the file is a text file
74 |         if filename.endswith('.txt') and os.path.isfile(file_path):
75 |             # Read the content of the text file
76 |             with open(file_path, 'r', encoding='utf-8') as file:
77 |                 content = file.read()
78 |                 # Store content in the dictionary with the filename as the key
79 |                 text_files_content[filename] = content
80 | 
81 |     return text_files_content
82 | 
83 | 
84 | 
85 | 
86 | def semantic_query_lsh(query, plane_norms, index_path):
87 | 
88 | 
89 |     """
90 |     Function to query the LSH index
91 |     query: [] query vector
92 |     plane_norms: [[]]
93 |     index_path: path of the index to be searched in
94 |     """
95 |     # Dot Product with Random Planes
96 |     query_dot = np.dot(query, plane_norms.T)
97 | 
98 |     # Decision Making
99 |     query_dot = (query_dot > 0) * 1
100 | 
101 |     query_dot = query_dot.squeeze()
102 |     # Ensure query_dot is 1D for string conversion
103 |     if query_dot.ndim == 0:
104 |         query_dot = np.array([query_dot])
105 |     # Bucket no. 
(Key) 106 | # hash_str = "".join(query_dot.astype(str)) # 101001101 107 | hash_str = "".join(map(str, query_dot.astype(int))) # Converts boolean array to int and then to string 108 | 109 | file_path = os.path.join(index_path, hash_str + ".txt") 110 | result = read_text_files_in_folder(index_path) 111 | 112 | list_buckets = [] 113 | for filename, content in result.items(): 114 | list_buckets.append(list(map(int, filename[:-4]))) 115 | number_of_neighbours = 6 116 | min_hamming_buckets = get_top_k_hamming_distances(query_dot, list_buckets, number_of_neighbours) 117 | index_result =[] 118 | for (bucket, hamming_distance) in min_hamming_buckets: 119 | file_path = os.path.join(index_path, "".join(map(str,bucket)) + ".txt") 120 | try: 121 | list_1 = np.loadtxt(file_path, dtype=int) 122 | list_buckets = np.atleast_1d(list_1).tolist() 123 | index_result+=list_buckets 124 | 125 | except FileNotFoundError: 126 | # Handle the case where the file doesn't exist 127 | print(f"The file {file_path} doesn't exist. Setting index_result to a default value.") 128 | index_result = [] 129 | return hash_str, np.array(index_result) # Bucket no 130 | # return index_result 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZiadSheriif/IntelliQuery/e49a854ae66c9a22632927e4956bbf1f032cbcc9/src/__init__.py -------------------------------------------------------------------------------- /src/api.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from worst_case_implementation import VecDBWorst 3 | from best_case_implementation import VecDBBest 4 | from typing import Dict, List, Annotated 5 | 6 | class DataApi: 7 | def __init__(self, file_path, worst = False, database_path="./DataBase",delete_db = True) -> None: 8 | self.file_path = file_path 9 | self.worst = worst 10 | if worst: 11 | self.db = VecDBWorst(self.file_path,delete_db) 12 | else: 13 | self.db = VecDBBest(self.file_path,database_path,delete_db) 14 | self.chunk_size = 10000 15 | 16 | # Function to generate random embeddings 17 | def __generate_embeddings(self,num_records, embedding_dim): 18 | return [np.random.rand(embedding_dim).tolist() for _ in range(num_records)] 19 | 20 | 21 | def generate_data_file(self,num_of_records): 22 | # Insert records in chunks 23 | for i in range(0, num_of_records, self.chunk_size): 24 | chunk_records = [] 25 | for j in range(i + 1, i + self.chunk_size + 1): 26 | if j > num_of_records: 27 | break 28 | record = {"id": j, "embed": self.__generate_embeddings(1, 70)[0]} 29 | chunk_records.append(record) 30 | 31 | self.db.insert_records_binary(chunk_records) 32 | print(f"Inserted {len(chunk_records)} records. 
Total records inserted: {j}") 33 | 34 | print("Insertion complete.") 35 | 36 | 37 | def get_record_by_id(self,record_id): 38 | return self.db.read_record_by_id(record_id) 39 | 40 | def get_first_k_records(self,k): 41 | return self.db.get_top_k_records(k) 42 | 43 | def get_multiple_records_by_ids(self,record_ids): 44 | return self.db.read_multiple_records_by_id(record_ids) 45 | 46 | def insert_records_binary(self, rows: List[Dict[int, Annotated[List[float], 70]]]): 47 | return self.db.insert_records_binary(rows) 48 | 49 | def insert_records(self, rows: List[Dict[int, Annotated[List[float], 70]]]): 50 | return self.db.insert_records(rows) 51 | 52 | def retrive(self, query:Annotated[List[float], 70], top_k = 5): 53 | return self.db.retrive(query,top_k) 54 | 55 | -------------------------------------------------------------------------------- /src/best_case_implementation.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Annotated 2 | import numpy as np 3 | from utils import empty_folder 4 | from Modules.LSH import * 5 | import struct 6 | import time 7 | 8 | class VecDBBest: 9 | def __init__(self,file_path="./DataBase/data.bin", database_path = "./DataBase", new_db = True) -> None: 10 | ''' 11 | Constructor 12 | ''' 13 | self.file_path =file_path # Data File Path 14 | self.database_path= database_path # Path of the Folder to Create Indexes 15 | 16 | if new_db: 17 | # If New Data Base 18 | # Empty DataBase Folder 19 | empty_folder(self.database_path) 20 | 21 | # just open new file to delete the old one 22 | with open(self.file_path, "w") as fout: 23 | # if you need to add any head to the file 24 | pass 25 | 26 | def calculate_offset(self, record_id: int) -> int: 27 | # Calculate the offset for a given record ID 28 | record_size = struct.calcsize("I70f") 29 | return (record_id) * record_size 30 | 31 | def insert_records_binary(self, rows: List[Dict[int, Annotated[List[float], 70]]]): 32 | with open(self.file_path, "ab") as fout: # Open the file in binary mode for appending 33 | for row in rows: 34 | id, embed = row["id"], row["embed"] 35 | # Pack the data into a binary format 36 | data = struct.pack(f"I{70}f", id, *embed) 37 | fout.write(data) 38 | self._build_index() 39 | 40 | def read_multiple_records_by_id(self, records_id: List[int]): 41 | record_size = struct.calcsize("I70f") 42 | records = {} 43 | 44 | with open(self.file_path, "rb") as fin: 45 | for i in range(len(records_id)): 46 | offset = self.calculate_offset(records_id[i]) 47 | fin.seek(offset) # Move the file pointer to the calculated offset 48 | data = fin.read(record_size) 49 | if not data: 50 | records[records_id[i]] = None 51 | continue 52 | 53 | # Unpack the binary data into a dictionary 54 | unpacked_data = struct.unpack("I70f", data) 55 | id_value, floats = unpacked_data[0], unpacked_data[1:] 56 | 57 | # Create and return the record dictionary 58 | record = {"id": id_value, "embed": list(floats)} 59 | records[records_id[i]] = record 60 | return records 61 | 62 | def get_top_k_records(self,k): 63 | records = [] 64 | record_size = struct.calcsize("I70f") 65 | with open(self.file_path,'rb') as fin: 66 | fin.seek(0) 67 | for i in range(k): 68 | data = fin.read(record_size) 69 | unpacked_data = struct.unpack("I70f", data) 70 | id_value, floats = unpacked_data[0], unpacked_data[1:] 71 | 72 | record = {"id": id_value, "embed": list(floats)} 73 | records.append(record) 74 | return records 75 | 76 | def _build_index(self,Level_1_nbits=5, Level_2_nbits=3, 
Level_3_nbits=3,Level_4_nbits=3)-> None: 77 | 78 | ''' 79 | Build the Index 80 | ''' 81 | top_k_records = 2000 82 | 83 | # measure the time 84 | start = time.time() 85 | 86 | # Layer 1 Indexing 87 | # TODO: Here we are reading the whole file: Change later 88 | level_1_in = self.get_top_k_records(top_k_records) 89 | level_1_planes = LSH_index(data=level_1_in, nbits=Level_1_nbits, index_path=self.database_path + "/Level1") 90 | np.save(self.database_path + "/Level1/"+'metadata.npy',level_1_planes) 91 | print("Layer 1 Finished") 92 | return 93 | 94 | 95 | 96 | # Layer 2 Indexing 97 | for file_name in os.listdir(self.database_path + "/Level1"): 98 | file_path = os.path.join(self.database_path + "/Level1", file_name) 99 | if os.path.isfile(file_path) and file_name.lower().endswith(".txt"): 100 | read_data_2 = np.loadtxt(file_path, dtype=int, ndmin=1) 101 | level_2_in = self.read_multiple_records_by_id(read_data_2) 102 | level_2_planes = LSH_index(data=level_2_in.values(), nbits=Level_2_nbits, index_path=self.database_path + "/Level2/" + file_name[:-4]) 103 | np.save(self.database_path + "/Level2/" + file_name[:-4]+'/metadata.npy',level_2_planes) 104 | print("Layer 2 Finished") 105 | return 106 | 107 | 108 | # Layer 3 Indexing 109 | for folder_name in os.listdir(self.database_path + "/Level2"): 110 | folder_path = os.path.join(self.database_path + "/Level2", folder_name) 111 | for file_name in os.listdir(folder_path): 112 | file_path = os.path.join(folder_path, file_name) 113 | if os.path.isfile(file_path) and file_name.lower().endswith(".txt"): 114 | read_data_3 = np.loadtxt(file_path, dtype=int, ndmin=1) 115 | level_3_in = self.read_multiple_records_by_id(read_data_3) 116 | level_3_planes = LSH_index(data=level_3_in.values(), nbits=Level_3_nbits, index_path=self.database_path + "/Level3/" + folder_name + '/' + file_name[:-4]) 117 | np.save(self.database_path + "/Level3/" + folder_name + '/' + file_name[:-4]+'/metadata.npy',level_3_planes) 118 | print("Layer 3 Finished") 119 | 120 | return 121 | # Layer 4 Indexing 122 | for folder_name in os.listdir(self.database_path + "/Level3"): 123 | folder_path = os.path.join(self.database_path + "/Level3", folder_name) 124 | for folder_name_2 in os.listdir(folder_path): 125 | folder_path_2 = os.path.join(folder_path, folder_name_2) 126 | for file_name in os.listdir(folder_path_2): 127 | file_path = os.path.join(folder_path_2, file_name) 128 | if os.path.isfile(file_path) and file_name.lower().endswith(".txt"): 129 | read_data_4 = np.loadtxt(file_path, dtype=int, ndmin=1) 130 | level_4_in = self.read_multiple_records_by_id(read_data_4) 131 | level_4_planes = LSH_index(data=level_4_in.values(), nbits=Level_4_nbits, index_path=self.database_path + "/Level4/" + folder_name + '/' + folder_name_2 + '/' + file_name[:-4]) 132 | np.save(self.database_path + "/Level4/" + folder_name + '/' + folder_name_2 + '/' + file_name[:-4]+'/metadata.npy',level_4_planes) 133 | print("Layer 4 Finished") 134 | 135 | 136 | # measure the time 137 | end = time.time() 138 | print("Time taken by Indexing: ",end - start) 139 | def retrive(self, query:Annotated[List[float], 70],top_k = 5)-> [int]: 140 | ''' 141 | Get the top_k vectors similar to the Query 142 | 143 | return: list of the top_k similar vectors Ordered by Cosine Similarity 144 | ''' 145 | 146 | # Retrieve from Level 1 147 | level_1_planes = np.load(self.database_path + "/Level1"+'/metadata.npy') 148 | bucket_1,result = semantic_query_lsh(query, level_1_planes, self.database_path + "/Level1") 149 | print("length of first 
bucket",result.shape) 150 | 151 | if len(result) < top_k: 152 | print('level 1 smaller than top_k') 153 | 154 | # # Retrieve from Level 2 155 | # level_2_planes = np.load(self.database_path + "/Level2/"+bucket_1+'/metadata.npy') 156 | # bucket_2,result = semantic_query_lsh(query, level_2_planes, self.database_path + "/Level2/"+bucket_1) 157 | # print("length of second bucket",result.shape) 158 | 159 | # if len(result) < top_k: 160 | # print('level 2 smaller than top_k') 161 | 162 | # # Retrieve from Level 3 163 | # level_3_planes = np.load(self.database_path + "/Level3/"+bucket_1+'/'+bucket_2+'/metadata.npy') 164 | # bucket_3,result = semantic_query_lsh(query, level_3_planes, self.database_path + "/Level3/"+bucket_1+'/'+bucket_2) 165 | # print("length of third bucket",result.shape) 166 | 167 | # if len(result) < top_k: 168 | # print('level 3 smaller than top_k') 169 | 170 | # # Retrieve from Level 4 171 | # level_4_planes = np.load(self.database_path + "/Level4/"+bucket_1+'/'+bucket_2+'/'+bucket_3+'/metadata.npy') 172 | # bucket_4,result = semantic_query_lsh(query, level_4_planes, self.database_path + "/Level4/"+bucket_1+'/'+bucket_2+'/'+bucket_3) 173 | # print("length of fourth bucket",result.shape) 174 | 175 | # if len(result) < top_k: 176 | # print('level 4 smaller than top_k') 177 | 178 | 179 | # Retrieve from Data Base the Embeddings of the Vectors 180 | final_result= self.read_multiple_records_by_id(result) 181 | 182 | # Calculate the Cosine Similarity between the Query and the Vectors 183 | scores = [] 184 | for row in final_result.values(): 185 | id_value = row['id'] 186 | embed_values = row['embed'] 187 | score = self._cal_score(query, embed_values) 188 | scores.append((score, id_value)) 189 | scores = sorted(scores, reverse=True)[:top_k] 190 | return [s[1] for s in scores] 191 | 192 | 193 | 194 | 195 | def _cal_score(self, vec1, vec2): 196 | dot_product = np.dot(vec1, vec2) 197 | norm_vec1 = np.linalg.norm(vec1) 198 | norm_vec2 = np.linalg.norm(vec2) 199 | cosine_similarity = dot_product / (norm_vec1 * norm_vec2) 200 | return cosine_similarity 201 | 202 | -------------------------------------------------------------------------------- /src/evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from worst_case_implementation import VecDBWorst 3 | from best_case_implementation import VecDBBest 4 | import argparse 5 | from utils import extract_embeds_array 6 | import pandas as pd 7 | from api import DataApi 8 | import os 9 | import time 10 | from dataclasses import dataclass 11 | from typing import List 12 | 13 | AVG_OVERX_ROWS = 1 14 | 15 | @dataclass 16 | class Result: 17 | run_time: float 18 | top_k: int 19 | db_ids: List[int] 20 | actual_ids: List[int] 21 | 22 | # def run_queries(db1,db2, np_rows, top_k, num_runs,delete=False): 23 | def run_queries(db, np_rows, top_k, num_runs, delete=False): 24 | results = [] 25 | # results_worst = [] 26 | # results_best = [] 27 | for i in range(num_runs): 28 | if delete: 29 | query = np.random.random((1,70)) 30 | np.save( "./DataBase/q"+str(i)+'.npy',query) 31 | else: 32 | query = np.load( "./DataBase/q"+str(i)+'.npy') 33 | 34 | tic = time.time() 35 | db_ids = db.retrive(query,top_k) 36 | toc = time.time() 37 | run_time= toc - tic 38 | 39 | actual_ids = np.argsort(np_rows.dot(query.T).T / (np.linalg.norm(np_rows, axis=1) * np.linalg.norm(query)), axis=1).squeeze().tolist()[::-1] 40 | 41 | toc = time.time() 42 | np_run_time = toc - tic 43 | 44 | 
results.append(Result(run_time,top_k,db_ids,actual_ids))
45 |     return results
46 | 
47 | def eval(results: List[Result]):
48 |     # scores are negative. So getting 0 is the best score.
49 |     scores = []
50 |     run_time = []
51 |     for res in results:
52 |         run_time.append(res.run_time)
53 |         # case for retrieving a number not equal to top_k, the score will be the lowest
54 |         if len(set(res.db_ids)) != res.top_k or len(res.db_ids) != res.top_k:
55 |             scores.append( -1 * len(res.actual_ids) * res.top_k)
56 |             print('retrieving number not equal to top_k')
57 |             continue
58 | 
59 |         score = 0
60 |         for id in res.db_ids:
61 |             try:
62 |                 ind = res.actual_ids.index(id)
63 |                 if ind > res.top_k * 3:
64 |                     # print("not in top top_k*3")
65 |                     score -= ind
66 |             except:
67 |                 # print("not in ids")
68 |                 score -= len(res.actual_ids)
69 |         scores.append(score)
70 | 
71 |     return sum(scores) / len(scores), sum(run_time) / len(run_time)
72 | 
73 | def find_indices(list1, list2):
74 |     """
75 |     Find the indices of elements of list1 in list2.
76 | 
77 |     :param list1: The list containing elements whose indices are to be found.
78 |     :param list2: The list in which to search for elements from list1.
79 |     :return: A list of indices.
80 |     """
81 |     indices = []
82 |     for element in list1:
83 |         # Convert both to numpy arrays for consistent handling
84 |         np_list2 = np.array(list2)
85 |         # Find the index of element in list2
86 |         found_indices = np.where(np_list2 == element)[0]
87 |         if found_indices.size > 0:
88 |             indices.append(found_indices[0])
89 | 
90 |     return indices
91 | 
92 | 
93 | def compare_results_print(worst_res,best_res,top_k):
94 |     for i in range(len(worst_res)):
95 |         actual_ids=worst_res[i].actual_ids
96 |         db_ids_best=best_res[i].db_ids
97 |         db_ids_worst=worst_res[i].db_ids
98 | 
99 |         run_time_worst=worst_res[i].run_time
100 |         run_time_best=best_res[i].run_time
101 | 
102 | 
103 |         print("=======================================")
104 |         print("Best ids: ",db_ids_best)
105 |         print("Actual ids: ",actual_ids[:top_k])
106 |         print("Worst ids: ",db_ids_worst)
107 |         print("Intersect: ",set(actual_ids[:top_k]).intersection(set(db_ids_best)))
108 |         print("Intersection in top k indices in the best DB: ",find_indices(actual_ids[:top_k], db_ids_best))
109 | 
110 |         print("Time taken by Query (Best): ",run_time_best)
111 |         print("Time taken by Query (Worst): ",run_time_worst)
112 |         print("=======================================")
113 | 
114 | if __name__ == "__main__":
115 |     print("Hello Semantic LSH")
116 | 
117 |     number_of_records = 2000
118 |     number_of_features = 70
119 |     number_of_queries = 5
120 |     top_k = 10
121 |     print("******************************")
122 |     print("Number of records: ",number_of_records)
123 |     print("Number of queries: ",number_of_queries)
124 |     print("Top k: ",top_k)
125 |     print("******************************")
126 | 
127 | 
128 |     folder_name = "DataBase"
129 |     if not os.path.exists(folder_name):
130 |         os.makedirs(folder_name)
131 | 
132 |     # Mode
133 |     parser = argparse.ArgumentParser(description='Description of your script')
134 |     parser.add_argument('-d','--delete', help='Description of the -d flag', action='store_true')
135 |     args = parser.parse_args()
136 | 
137 |     # worst_db = VecDBWorst('./DataBase/data.csv',new_db=not args.delete)
138 |     worst_api = DataApi('./DataBase/data_worst.csv',True,'./DataBase',args.delete)
139 |     # best_db = VecDBBest('./DataBase/data.bin','./DataBase',new_db=not args.delete)
140 |     best_api = DataApi('./DataBase/data.bin', False,'./DataBase',args.delete)
141 | 
142 |     if not args.delete:
143 |         print("Reading")
144 | # records_np = pd.read_csv('./DataBase/data.csv',header=None) 145 | # rows_without_first_element = np.array([row[1:].tolist() for _, row in records_np.iterrows()]) 146 | # records_np=rows_without_first_element 147 | 148 | records_database = np.array(best_api.get_first_k_records(number_of_records)) 149 | records_np = extract_embeds_array(records_database) 150 | records_dict = records_database 151 | _len = len(records_np) 152 | else: 153 | # New 154 | 155 | # records_database = np.array(best_api.get_first_k_records(10000)) 156 | print("Generating data files") 157 | records_np = np.random.random((number_of_records, number_of_features)) 158 | # records_np = extract_embeds_array(records_database) 159 | 160 | records_dict = [{"id": i, "embed": list(row)} for i, row in enumerate(records_np)] 161 | # records_dict = records_database 162 | _len = len(records_np) 163 | 164 | worst_api.insert_records(records_dict) 165 | best_api.insert_records_binary(records_dict) 166 | 167 | 168 | # Worst 169 | res_worst = run_queries(worst_api, records_np, top_k, number_of_queries,args.delete) 170 | # Best 171 | res_best = run_queries(best_api, records_np, top_k, number_of_queries,False) 172 | 173 | compare_results_print(res_worst,res_best,top_k) 174 | print("Worst:",eval(res_worst)) 175 | print("Best:",eval(res_best)) 176 | 177 | # res = run_queries(best_api, records_np, 5, 3) 178 | # print("Best:",eval(res)) 179 | # results_worst, results_best = run_queries(worst_api,best_api, records_np, top_k, number_of_queries) 180 | # print("Worst:",eval(results_worst)) 181 | # print("Best:",eval(results_best)) 182 | 183 | # records_np = np.concatenate([records_np, np.random.random((90000, 70))]) 184 | # records_dict = [{"id": i + _len, "embed": list(row)} for i, row in enumerate(records_np[_len:])] 185 | # _len = len(records_np) 186 | # worst_db.insert_records(records_dict) 187 | # res = run_queries(worst_db, records_np, 5, 10) 188 | # print(eval(res)) 189 | 190 | # records_np = np.concatenate([records_np, np.random.random((900000, 70))]) 191 | # records_dict = [{"id": i + _len, "embed": list(row)} for i, row in enumerate(records_np[_len:])] 192 | # _len = len(records_np) 193 | # worst_db.insert_records(records_dict) 194 | # res = run_queries(worst_db, records_np, 5, 10) 195 | # eval(res) 196 | 197 | # records_np = np.concatenate([records_np, np.random.random((4000000, 70))]) 198 | # records_dict = [{"id": i + _len, "embed": list(row)} for i, row in enumerate(records_np[_len:])] 199 | # _len = len(records_np) 200 | # db.insert_records(records_dict) 201 | # res = run_queries(db, records_np, 5, 10) 202 | # eval(res) 203 | 204 | # records_np = np.concatenate([records_np, np.random.random((5000000, 70))]) 205 | # records_dict = [{"id": i + _len, "embed": list(row)} for i, row in enumerate(records_np[_len:])] 206 | # _len = len(records_np) 207 | # db.insert_records(records_dict) 208 | # res = run_queries(db, records_np, 5, 10) 209 | # eval(res) 210 | 211 | # records_np = np.concatenate([records_np, np.random.random((5000000, 70))]) 212 | # records_dict = [{"id": i + _len, "embed": list(row)} for i, row in enumerate(records_np[_len:])] 213 | # _len = len(records_np) 214 | # db.insert_records(records_dict) 215 | # res = run_queries(db, records_np, 5, 10) 216 | # eval(res) -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | from api import DataApi 2 | 3 | 4 | api_data = DataApi("test.bin") 5 | 6 | # 
api_data.generate_data_file(5000) 7 | 8 | 9 | records = api_data.get_multiple_records_by_ids([2, 1, 5, 8000]) 10 | print(records[8000]) 11 | -------------------------------------------------------------------------------- /src/notes.txt: -------------------------------------------------------------------------------- 1 | 1- 20,000,000 2 | // ==>200 3 | 4 | 2- 100,000 5 | // ==>200 6 | 7 | 8 | 3- 5,000 9 | // ==>500 10 | 11 | 12 | 4- 10 13 | 14 | ============ 15 | 16 | new query 17 | first level: 1-10 18 | 19 | second level: 1-500 ==> most load from ram <=200 20 | 21 | 22 | third level: 1-200 23 | 24 | fourth: level: 1-200 25 | 26 | 27 | final retreving: 910 record 28 | 29 | ============================= 30 | 31 | n_probe =2 32 | 33 | new query 34 | first level: 2-10 35 | second level: 2-1000 36 | third level: 2-400 37 | fourth: level: 2-400 38 | 39 | 40 | ================================== 41 | 42 | 20,000,000 record 43 | 44 | 256 buckets ===> 256 file 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /src/pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# from Modules.LSH import semantic_query_lsh\n", 10 | "# from Modules.LSH import LSH\n", 11 | "\n", 12 | "\n", 13 | "# import numpy as np\n", 14 | "# import os" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# file_path = \"./random_data.txt\"\n", 24 | "# read_data = np.loadtxt(file_path)\n", 25 | "# plane_norms = LSH(read_data, 8)\n", 26 | "# query=[read_data[0]]\n", 27 | "# folder_name = \"bucket_files\"\n", 28 | "# result = semantic_query_lsh(query, plane_norms,folder_name)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 23, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "The autoreload extension is already loaded. To reload it, use:\n", 41 | " %reload_ext autoreload\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "%load_ext autoreload\n", 47 | "%autoreload 2\n", 48 | "\n", 49 | "\n", 50 | "from utils import *\n", 51 | "from Modules.LSH import*\n", 52 | "from api import *\n", 53 | "from evaluation import *\n", 54 | "from worst_case_implementation import VecDBWorst\n", 55 | "\n", 56 | "\n", 57 | "# datafile_path=\"../DataBase/random_data_10000.txt\"" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 29, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "Inserted 10000 records. 
Total records inserted: 10000\n", 70 | "Insertion complete.\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "data_file='./DataBase/data.bin'\n", 76 | "Level_1_path='./DataBase/Level1'\n", 77 | "Level_2_path='./DataBase/Level2'\n", 78 | "Level_3_path='./DataBase/Level3'\n", 79 | "\n", 80 | "Level_1_nbits=8\n", 81 | "Level_2_nbits=3\n", 82 | "Level_3_nbits=3\n", 83 | "\n", 84 | "data_api = DataApi(data_file)\n", 85 | "data_api.generate_data_file(10000)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 33, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "\n", 95 | "# Test LSH_index\n", 96 | "# Read Data From File\n", 97 | "read_data = data_api.get_top_k_records(10000)\n", 98 | "\n", 99 | "\n", 100 | "# Layer(1)\n", 101 | "level_1_in=read_data\n", 102 | "# TODO: Save Planes to be used in query Search\n", 103 | "level_1_planes=LSH_index(data=level_1_in, nbits=Level_1_nbits,index_path=Level_1_path)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 34, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# Layer(2)\n", 113 | "# On Each Bucket Apply LSH\n", 114 | "\n", 115 | "# List all files in the directory\n", 116 | "files = os.listdir(Level_1_path)\n", 117 | "\n", 118 | "# TODO: Save Planes to be used in query Search\n", 119 | "level_2_planes={}\n", 120 | "\n", 121 | "# Loop over the files\n", 122 | "for file_name in files:\n", 123 | " file_path = os.path.join(Level_1_path, file_name)\n", 124 | " \n", 125 | " if os.path.isfile(file_path):\n", 126 | " # Read Data\n", 127 | " read_data_2 = np.loadtxt(file_path,dtype=int,ndmin=1)\n", 128 | "\n", 129 | " level_2_in=data_api.get_multiple_records_by_ids(read_data_2-1)\n", 130 | " # level_2_in = array_to_dictionary(values=vectors,keys=np.hstack(read_data_2))\n", 131 | "\n", 132 | " # # Apply LSH on this Bucket\n", 133 | " # level_2=arr[level_1]\n", 134 | " level_2_planes[file_name[:-4]]=LSH_index(data=level_2_in.values(), nbits=Level_2_nbits,index_path=Level_2_path+'/' + file_name[:-4])\n", 135 | "\n" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 9, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# Layer(3)\n", 145 | "# On Each Bucket Apply LSH\n", 146 | "\n", 147 | "# List all files in the directory\n", 148 | "folders = os.listdir(Level_2_path)\n", 149 | "\n", 150 | "# TODO: Save Planes to be used in query Search\n", 151 | "level_3_planes={}\n", 152 | "# file_3=folder{}\n", 153 | "# Loop over the folders\n", 154 | "for folder_name in folders:\n", 155 | " level_3_planes[folder_name]={}\n", 156 | " folder_path = os.path.join(Level_2_path, folder_name)\n", 157 | " files = os.listdir(folder_path)\n", 158 | " # Loop over the files\n", 159 | " for file_name in files:\n", 160 | " file_path = os.path.join(folder_path, file_name)\n", 161 | " \n", 162 | " if os.path.isfile(file_path):\n", 163 | " # Read Data\n", 164 | " read_data_3 = np.loadtxt(file_path,dtype=int,ndmin=1)\n", 165 | "\n", 166 | " level_3_in=data_api.get_multiple_records_by_ids(read_data_3)\n", 167 | "\n", 168 | " # # Apply LSH on this Bucket\n", 169 | " level_3_planes[folder_name][file_name[:-4]]=LSH_index(data=level_3_in.values(), nbits=Level_3_nbits,index_path=Level_3_path+'/'+folder_name+'/' + file_name[:-4])\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 22, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "bucket of level 1: 11110011\n", 182 | 
"=====================================\n", 183 | "bucket of level 2: 001\n", 184 | "=====================================\n", 185 | "bucket of level 3: 100\n", 186 | "Length of level 3 189\n", 187 | "Indices of level 3 [ 36 51 104 159 266 357 372 385 434 465 510 671 702 707\n", 188 | " 720 822 824 834 863 938 1034 1044 1165 1248 1264 1438 1505 1565\n", 189 | " 1613 1683 1712 1719 1771 1798 1812 1843 1953 2191 2238 2266 2330 2353\n", 190 | " 2594 2602 2624 2669 2730 2744 2825 2880 2894 2915 2942 2944 3080 3168\n", 191 | " 3286 3351 3490 3645 3648 3735 3798 3851 3859 3911 3986 4026 4030 4065\n", 192 | " 4121 4134 4187 4211 4232 4260 4391 4399 4476 4477 4489 4492 4545 4554\n", 193 | " 4591 4605 4660 4792 4905 4937 4953 4954 4970 4986 4987 5228 5249 5329\n", 194 | " 5398 5454 5471 5495 5584 5708 5712 5725 5744 5799 5899 5900 5908 5952\n", 195 | " 5987 6049 6072 6096 6144 6184 6209 6287 6344 6399 6479 6495 6536 6544\n", 196 | " 6662 6693 6848 6880 6915 6962 7080 7085 7187 7199 7213 7240 7390 7404\n", 197 | " 7417 7442 7531 7538 7554 7584 7625 7664 7708 7721 7765 7768 7808 7827\n", 198 | " 7955 8101 8170 8279 8284 8380 8444 8446 8454 8481 8552 8560 8565 8586\n", 199 | " 8676 8700 8761 8792 8912 8935 9007 9150 9336 9352 9354 9367 9586 9662\n", 200 | " 9745 9762 9794 9801 9859 9948 9972]\n", 201 | "=====================================\n", 202 | "target_vector [0.5522450804710388, 0.8917692303657532, 0.7913368344306946, 0.6000004410743713, 0.2616525888442993, 0.9615220427513123, 0.4808562695980072, 0.6019359827041626, 0.07978673279285431, 0.30365362763404846, 0.7390730381011963, 0.2133997678756714, 0.36366748809814453, 0.1835469752550125, 0.20069865882396698, 0.13891369104385376, 0.11978743225336075, 0.3913387358188629, 0.002954070921987295, 0.5194749236106873, 0.37845972180366516, 0.9680533409118652, 0.6960610747337341, 0.8805666565895081, 0.06497178226709366, 0.5662519335746765, 0.04004804417490959, 0.2919067144393921, 0.737677812576294, 0.10855083167552948, 0.3745698928833008, 0.37776005268096924, 0.9178327322006226, 0.7241680026054382, 0.12325477600097656, 0.3273957073688507, 0.9901415109634399, 0.4085298478603363, 0.6129018068313599, 0.1801413595676422, 0.9952824711799622, 0.3938077688217163, 0.913888692855835, 0.11249328404664993, 0.14214684069156647, 0.6679161787033081, 0.9495717287063599, 0.4362204968929291, 0.3122316896915436, 0.6952698230743408, 0.8448274731636047, 0.965186595916748, 0.35632771253585815, 0.9069381952285767, 0.42551901936531067, 0.9420151710510254, 0.022108066827058792, 0.6098361611366272, 0.897776186466217, 0.4446363151073456, 0.7102886438369751, 0.5624412894248962, 0.5420237183570862, 0.3291500210762024, 0.2226945161819458, 0.6429535150527954, 0.5322402119636536, 0.09856311231851578, 0.5489377379417419, 0.5590397715568542]\n" 203 | ] 204 | }, 205 | { 206 | "ename": "TypeError", 207 | "evalue": "unhashable type: 'slice'", 208 | "output_type": "error", 209 | "traceback": [ 210 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 211 | "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", 212 | "\u001b[1;32md:\\Semantic-Search-Engine\\pipeline.ipynb Cell 8\u001b[0m line \u001b[0;36m2\n\u001b[0;32m 27\u001b[0m index_result_3\u001b[39m=\u001b[39mdata_api\u001b[39m.\u001b[39mget_multiple_records_by_ids(index_result_3)\n\u001b[0;32m 28\u001b[0m level3_res_vectors\u001b[39m=\u001b[39m[entry[\u001b[39m'\u001b[39m\u001b[39membed\u001b[39m\u001b[39m'\u001b[39m] \u001b[39mfor\u001b[39;00m entry \u001b[39min\u001b[39;00m 
index_result_3\u001b[39m.\u001b[39mvalues()]\n\u001b[1;32m---> 29\u001b[0m top_result,_\u001b[39m=\u001b[39mget_top_k_similar(query,index_result_3,\u001b[39m10\u001b[39;49m)\n\u001b[0;32m     30\u001b[0m \u001b[39m# print(\"Top k results: \",top_result[0])\u001b[39;00m\n\u001b[0;32m     31\u001b[0m \u001b[39m# print(\"=====================================\")\u001b[39;00m\n\u001b[0;32m     32\u001b[0m \u001b[39m# # get the intersection of the two lists level 2 and level3\u001b[39;00m\n\u001b[0;32m     33\u001b[0m \u001b[39m# count = sum(element in index_result_2 for element in index_result_3)\u001b[39;00m\n\u001b[0;32m     34\u001b[0m \u001b[39m# print(\"Intersection of the two layers: \",count)\u001b[39;00m\n",
213 |     "File \u001b[1;32md:\\Semantic-Search-Engine\\Modules\\LSH.py:145\u001b[0m, in \u001b[0;36mget_top_k_similar\u001b[1;34m(target_vector, data, k)\u001b[0m\n\u001b[0;32m    143\u001b[0m \u001b[39m# Calculate cosine similarities using vectorized operations\u001b[39;00m\n\u001b[0;32m    144\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mtarget_vector\u001b[39m\u001b[39m\"\u001b[39m,target_vector)\n\u001b[1;32m--> 145\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mdata\u001b[39m\u001b[39m\"\u001b[39m,data[\u001b[39m1\u001b[39;49m:\u001b[39m5\u001b[39;49m])\n\u001b[0;32m    146\u001b[0m similarities \u001b[39m=\u001b[39m \u001b[39m1\u001b[39m \u001b[39m-\u001b[39m np\u001b[39m.\u001b[39marray([cosine(target_vector, vector) \u001b[39mfor\u001b[39;00m vector \u001b[39min\u001b[39;00m data])\n\u001b[0;32m    148\u001b[0m \u001b[39m# Find the indices of the top k most similar vectors\u001b[39;00m\n",
214 |     "\u001b[1;31mTypeError\u001b[0m: unhashable type: 'slice'"
215 |    ]
216 |   }
217 |  ],
218 |  "source": [
219 |     "# Query\n",
220 |     "query=data_api.get_record_by_id(5)[5]['embed']\n",
221 |     "# Layer (1)\n",
222 |     "bucket_1,index_result_1 = semantic_query_lsh(query=query,plane_norms=level_1_planes,index_path=Level_1_path)\n",
223 |     "print(\"bucket of level 1: \",bucket_1)\n",
224 |     "# print(\"Length of level 1\",len(index_result_1))\n",
225 |     "# print(\"Indices of level 1\",index_result_1)\n",
226 |     "print(\"=====================================\")\n",
227 |     "\n",
228 |     "# Layer(2)\n",
229 |     "bucket_2,index_result_2 = semantic_query_lsh(query=query,plane_norms=level_2_planes[bucket_1],index_path=Level_2_path+\"/\"+bucket_1)\n",
230 |     "print(\"bucket of level 2: \",bucket_2)\n",
231 |     "# print(\"Length of level 2\",len(index_result_2))\n",
232 |     "# print(\"Indices of level 2\",index_result_2)\n",
233 |     "print(\"=====================================\")\n",
234 |     "\n",
235 |     "# Layer(3)\n",
236 |     "bucket_3,index_result_3 = semantic_query_lsh(query=query,plane_norms=level_3_planes[bucket_1][bucket_2],index_path=Level_3_path+\"/\"+bucket_1+'/'+bucket_2)\n",
237 |     "print(\"bucket of level 3: \",bucket_3)\n",
238 |     "print(\"Length of level 3\",len(index_result_3))\n",
239 |     "print(\"Indices of level 3\",index_result_3)\n",
240 |     "\n",
241 |     "\n",
242 |     "print(\"=====================================\")\n",
243 |     "\n",
244 |     "# get top 10 results from the last layer\n",
245 |     "index_result_3=data_api.get_multiple_records_by_ids(index_result_3)\n",
246 |     "level3_res_vectors=[entry['embed'] for entry in index_result_3.values()]\n",
247 |     "top_result,_=get_top_k_similar(query,level3_res_vectors,10)\n",
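    "# get_top_k_similar ranks the bucket's candidates by cosine similarity\n",
    "# (1 - scipy's cosine distance, see Modules/LSH.py) and returns the top k;\n",
    "# it expects the plain list of embedding vectors, not the id->record dict.\n",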
248 |     "# print(\"Top k results: \",top_result[0])\n",
249 |     "# print(\"=====================================\")\n",
250 |     "# # get the intersection of the two lists level 2 and level3\n",
251 |     "# count = sum(element in index_result_2 for element in index_result_3)\n",
252 |     "# print(\"Intersection of the two layers: \",count)\n",
253 |     "\n"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": null,
259 |    "metadata": {},
260 |    "outputs": [],
261 |    "source": [
262 |     "np_rows = np.array([record['embed'] for record in read_data if 'embed' in record])\n",
263 |     "# temp=[5, 966, 536, 1088, 5073, 5549]\n",
264 |     "index_result_3_minus_one = [id - 1 for id in top_result[0]]\n",
265 |     "res=run_queries(index_result_3_minus_one, np_rows, top_k=len(top_result), num_runs=1,query=np.array([query]))\n",
266 |     "print(eval(res))"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "db = VecDBWorst()\n",
276 |     "# records_np = np.random.random((10000, 70))\n",
277 |     "records_np = np.array([record['embed'] for record in read_data if 'embed' in record])\n",
278 |     "\n",
279 |     "# records_dict = [{\"id\": i, \"embed\": list(row)} for i, row in enumerate(records_np)]\n",
280 |     "records_dict=read_data\n",
281 |     "\n",
282 |     "# _len = len(records_np)\n",
283 |     "db.insert_records(records_dict)\n",
284 |     "db_ids=db.retrive(query, top_k=1)\n",
285 |     "db_ids_minus_one = [id - 1 for id in db_ids]\n",
286 |     "res = run_queries(db_ids_minus_one, records_np, 1, 1, np.array([query]))\n",
287 |     "print(eval(res))\n"
288 |    ]
289 |   }
290 |  ],
291 |  "metadata": {
292 |   "kernelspec": {
293 |    "display_name": "Python 3",
294 |    "language": "python",
295 |    "name": "python3"
296 |   },
297 |   "language_info": {
298 |    "codemirror_mode": {
299 |     "name": "ipython",
300 |     "version": 3
301 |    },
302 |    "file_extension": ".py",
303 |    "mimetype": "text/x-python",
304 |    "name": "python",
305 |    "nbconvert_exporter": "python",
306 |    "pygments_lexer": "ipython3",
307 |    "version": "3.10.9"
308 |   }
309 |  },
310 |  "nbformat": 4,
311 |  "nbformat_minor": 2
312 | }
313 | 
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import shutil
3 | import os
4 | import math
5 | from typing import Dict, List, Annotated
6 | import struct
7 | import sys
8 | 
9 | 
10 | def save_20M_record(data):
11 |     '''
12 |     Given 20M records, save them in the file sizes required by the TA
13 |     data: (20M, 70)
14 |     '''
15 | 
16 |     folder_name='./Data_TA'
17 |     if not os.path.exists(folder_name):
18 |         os.makedirs(folder_name)
19 | 
20 |     empty_folder(folder_name)
21 | 
22 |     files=['data_100K.bin',"data_1M.bin","data_5M.bin","data_10M.bin","data_15M.bin","data_20M.bin"]
23 |     # files=["data_20M.bin"]
24 |     limits=[10**5,10**6,5*10**6,10**7,15*10**6,2*10**7]
25 |     # limits=[20*10**6]
26 |     for i,file in enumerate(files):
27 |         data_part=data[:limits[i]]
28 | 
29 |         # Append in Binary Mode
30 |         with open(folder_name+'/'+file, "ab") as fout:
31 |             for id,vector in enumerate(data_part):
32 |                 # Pack the data into a binary format
33 |                 packed_data = struct.pack(f"I{70}f", id, *vector)
34 |                 fout.write(packed_data)
35 | 
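# Layout sketch: each packed record is struct.calcsize(f"I{70}f") = 4 + 70*4 = 284
# bytes (a uint32 id followed by 70 float32 values), so record k starts at byte
# offset k * 284 in every generated file. Hypothetical example call:
#   save_20M_record(np.random.random((10**5, 70)))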
36 | # # Test
37 | # print("len(data)",len(data))
38 | # # print(data[0])
39 | # print(data[-1])
40 | # read_data=read_binary_file_chunk('./Data_TA/data_100K.bin',f"I{70}f",start_index=0,chunk_size=1000000,dictionary_format=True)
41 | # print("len(read_data)",len(read_data))
42 | # # print(read_data[0])
43 | # print(read_data[10**5-1])
44 | 
45 | 
46 | # # Test
47 | # print("len(data)",len(data))
48 | # # print(data[0])
49 | # print(data[-1])
50 | # read_data=read_binary_file_chunk('./Data_TA/data_1M.bin',f"I{70}f",start_index=0,chunk_size=1000000,dictionary_format=True)
51 | # print("len(read_data)",len(read_data))
52 | # # print(read_data[0])
53 | # print(read_data[10**6-1])
54 | 
55 | def read_binary_file(file_path,format):
56 |     '''
57 |     Read a whole binary file, given the struct format of one record
58 |     '''
59 |     try:
60 |         with open(file_path,"rb") as fin:
61 |             file_size = os.path.getsize(file_path)
62 |             record_size=struct.calcsize(format)
63 |             n_records=file_size/record_size
64 |             # print("n_records",n_records)
65 | 
66 |             fin.seek(0)  # Move pointer to the beginning of the file
67 |             data = fin.read(record_size * int(n_records))
68 |             if not data:
69 |                 print("Empty File ",file_path,"🔴🔴")
70 |                 return None
71 |             # Unpack the binary data
72 |             data=np.frombuffer(data, dtype=np.dtype(format))
73 |             return data
74 |     except FileNotFoundError:
75 |         print(f"The file '{file_path}' was not found.")
76 | 
77 | def write_binary_file(file_path,data_to_write,format):
78 |     '''
79 |     data_to_write: array of values with format as passed
80 |     format: format of each element
81 |     '''
82 |     try:
83 |         with open(file_path, "ab") as fout:
84 |             # Pack the entire array into binary data
85 |             binary_data = struct.pack(len(data_to_write)*format, *data_to_write.flatten())
86 |             fout.write(binary_data)
87 |     except FileNotFoundError:
88 |         print(f"The file '{file_path}' could not be created.")
89 | 
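# Chunked-scan sketch (the file name and process() are placeholders): stream a
# large file without loading it all at once, e.g. for an index-build pass:
#   start = 0
#   while (chunk := read_binary_file_chunk("data.bin", f"I{70}f", start, 10**5)) is not None:
#       process(chunk)
#       start += 10**5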
90 | def read_binary_file_chunk(file_path, record_format, start_index, chunk_size=10,dictionary_format=False):
91 |     """
92 |     Reads a chunk of records from a binary file.
93 |     If fewer than chunk_size records remain in the file, the remainder is returned normally.
94 | 
95 |     file_path: path of the file to be read from
96 |     record_format: format of one record, e.g. f"4I" for 4 integers
97 |     start_index: index of the record from which we start reading [0-indexed]
98 |     chunk_size: number of records to be retrieved
99 | 
100 |     @return: None if start_index is past the end of the file,
101 |              otherwise the records
102 |     """
103 | 
104 |     # Calculate record size
105 |     record_size = struct.calcsize(record_format)
106 | 
107 |     # Open the binary file for reading
108 |     with open(file_path, "rb") as fin:
109 |         fin.seek(
110 |             start_index * record_size
111 |         )  # Move the file pointer to the calculated offset
112 | 
113 |         # Read a chunk of records
114 |         # .read() moves the file pointer (cursor) forward by the number of bytes read.
115 |         chunk_data = fin.read(record_size * chunk_size)
116 |         if len(chunk_data) == 0:
117 |             print("Out Of File Index 🔥🔥")
118 |             return None
119 | 
120 |         # file_size = os.path.getsize(file_path)
121 |         # print("Current file position:", fin.tell())
122 |         # print("File size:", file_size,"record_format",record_format,"record_size",record_size,"chunk_data len",len(chunk_data))
123 | 
124 |         if dictionary_format:
125 |             records={}
126 |             for i in range(0, len(chunk_data), record_size):
127 |                 #TODO Remove this loop @Basma Elhoseny
128 |                 unpacked_record = struct.unpack(record_format, chunk_data[i : i + record_size])
129 |                 id, vector = unpacked_record[0], unpacked_record[1:]
130 |                 records[id]=np.array(vector)
131 |             return records
132 | 
133 |         # Unpack Data
134 |         records = []
135 |         for i in range(0, len(chunk_data), record_size):
136 |             unpacked_record = struct.unpack(
137 |                 record_format, chunk_data[i : i + record_size]
138 |             )
139 |             id, vector = unpacked_record[0], unpacked_record[1:]
140 |             record = {"id": id, "embed": list(vector)}
141 |             records.append(record)
142 |         return records
143 | def empty_folder(folder_path):
144 |     """
145 |     Function to empty a folder given its path
146 |     @param folder_path : path of the folder to be emptied
147 |     """
148 |     if not os.path.exists(folder_path):
149 |         os.makedirs(folder_path)
150 |         print("Created new ", folder_path, "successfully")
151 |         return
152 | 
153 |     for filename in os.listdir(folder_path):
154 |         file_path = os.path.join(folder_path, filename)
155 |         try:
156 |             if os.path.isfile(file_path):
157 |                 os.unlink(file_path)
158 |             elif os.path.isdir(file_path):
159 |                 shutil.rmtree(file_path)
160 |         except Exception as e:
161 |             print(f"Error while deleting {file_path}: {e}")
162 |     print("Deleted", folder_path, "successfully")
163 | 
164 | 
165 | def extract_embeds(records):
166 |     # {505: {'id': 505, 'embed': [0.8,....]}} --> [[0.8,....],[.......]]
167 |     return [entry["embed"] for entry in records.values()]
168 | 
169 | 
170 | def extract_embeds_array(arr):
171 |     return np.array([entry["embed"] for entry in arr])
172 | 
173 | 
174 | def cal_score(vec1, vec2):
175 |     dot_product = np.dot(vec1, vec2)
176 |     norm_vec1 = np.linalg.norm(vec1)
177 |     norm_vec2 = np.linalg.norm(vec2)
178 |     cosine_similarity = dot_product / (norm_vec1 * norm_vec2)
179 |     return cosine_similarity
180 | 
181 | 
182 | def calculate_offset(record_id: int) -> int:
183 |     # Calculate the offset for a given record ID
184 |     record_size = struct.calcsize("I70f")
185 |     return record_id * record_size
186 | 
187 | 
188 | def read_multiple_records_by_id(file_path, records_id: List[int],dictionary_format=False):
189 |     record_size = struct.calcsize("I70f")
190 |     records = {}
191 | 
192 |     records_dictionary={}
193 | 
194 |     with open(file_path, "rb") as fin:
195 |         for i in range(len(records_id)):
196 |             offset = calculate_offset(records_id[i])
197 |             fin.seek(offset)  # Move the file pointer to the calculated offset
198 |             data = fin.read(record_size)
199 |             if not data:
200 |                 records[records_id[i]] = None
201 |                 continue
202 | 
203 |             # Unpack the binary data into a dictionary
204 |             unpacked_data = struct.unpack("I70f", data)
205 |             id_value, floats = unpacked_data[0], unpacked_data[1:]
206 | 
207 |             if dictionary_format:
208 |                 records_dictionary[id_value]=list(floats)
209 |             else:
210 |                 # Create and return the record dictionary
211 |                 record = {"id": id_value, "embed": list(floats)}
212 |                 records[records_id[i]] = record
213 | 
214 |     if dictionary_format: return records_dictionary
215 |     return records
216 | 
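# Random-access sketch: calculate_offset turns an id into a byte offset, so a
# point lookup is a single seek instead of a scan. Hypothetical file name:
#   recs = read_multiple_records_by_id("data.bin", [2, 1, 5])
#   recs[5]["embed"]   # the 70-float vector stored for id 5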
217 | # def generate_random(k=100):
218 | #     # Sample data: k vectors with 70 features each
219 | #     data = np.random.uniform(-1, 1, size=(k, 70))
220 | 
221 | #     # Write data to a text file
222 | #     file_path = "../DataBase/random_data_"+str(k)+".txt"
223 | #     np.savetxt(file_path, data)
224 | 
225 | #     # Read Data from File
226 | #     # read_data = np.loadtxt(file_path)
227 | 
228 | 
229 | # def array_to_dictionary(values,keys=None):
230 | #     '''
231 | #     values: [array of values]
232 | #     keys: [array of keys] optional; if not passed the keys are indexed 0-N
233 | #     '''
234 | #     if(keys is None):
235 | #         keys=range(0,len(values))
236 | 
237 | #     if(len(values)!=len(keys)):
238 | #         print("array_to_dictionary(): Incorrect size of keys and values")
239 | #         return None
240 | 
241 | #     dictionary_data = dict(zip(keys, values))
242 | #     return dictionary_data
243 | 
244 | 
245 | # def get_vector_from_id(data_path,id):
246 | #     '''
247 | #     function to get the vector by its id [BAD: use seek instead]
248 | 
249 | #     '''
250 | #     read_data = np.loadtxt(data_path)
251 | #     return read_data[id]
252 | 
253 | 
254 | # def check_dir(path):
255 | #     if os.path.exists(path):
256 | #         shutil.rmtree(path, ignore_errors=True, onerror=lambda func, path, exc: None)
257 | #     os.makedirs(path)
258 | 
259 | 
260 | # Test generate_random()
261 | # generate_random(10000)
262 | 
--------------------------------------------------------------------------------
/src/vec_db.py:
--------------------------------------------------------------------------------
1 | 
2 | from typing import Dict, List, Annotated
3 | from utils import *
4 | import numpy as np
5 | import time
6 | import os
7 | # from Modules.LSH import LSH_index, semantic_query_lsh
8 | from IVF import IVF_index,semantic_query_ivf
9 | 
10 | NUMBER_OF_RECORDS_BRUTE_FORCE = 20*10**3
11 | 
12 | class VecDB:
13 |     def __init__(self,file_path="./DataBase", new_db = True) -> None:
14 |         '''
15 |         Constructor
16 |         '''
17 |         self.file_path = file_path+'/data.bin'  # Data File Path
18 |         self.database_path = file_path  # Path of the Folder to Create Indexes
19 |         self.n_regions = None  # Initialize n_regions
20 | 
21 | 
22 |         if new_db:
23 |             if not os.path.exists(self.database_path):
24 |                 os.makedirs(self.database_path)
25 | 
26 |             else:
27 |                 # If New DataBase, Empty DataBase Folder
28 |                 empty_folder(self.database_path)
29 | 
30 |             # just open new file to delete the old one
31 |             with open(self.file_path, "w") as fout:
32 |                 # if you need to add any head to the file
33 |                 pass
34 | 
35 | 
36 |         self.level1=None
37 | 
38 |     def insert_records(self, rows: List[Dict[int, Annotated[List[float], 70]]]):
39 |         # Append in Binary Mode
40 |         with open(self.file_path, "ab") as fout:
41 |             for row in rows:
42 |                 id, embed = row["id"], row["embed"]
43 |                 # Pack the data into a binary format
44 |                 data = struct.pack(f"I{70}f", id, *embed)
45 |                 fout.write(data)
46 |         self._build_index()
47 | 
48 | 
49 |     def _build_index(self,Level_1_nbits=5, Level_2_nbits=3, Level_3_nbits=3,Level_4_nbits=3)-> None:
50 | 
51 |         '''
52 |         Build the Index
53 |         '''
54 |         file_size = os.path.getsize(self.file_path)
55 |         record_size=struct.calcsize(f"I{70}f")
56 |         n_records=file_size/record_size
57 |         if(n_records==10*10**3):
58 |             self.number_of_clusters=10
59 |         elif(n_records==100*10**3):
60 |             self.number_of_clusters=50
61 |         elif(n_records==10**6):
62 |             self.number_of_clusters=200
63 |         elif(n_records==5*10**6):
64 |             self.number_of_clusters=500
65 |         elif(n_records==10*10**6):
66 |             self.number_of_clusters=8000
67 |         else:
68 |             self.number_of_clusters=max(1,int(n_records//NUMBER_OF_RECORDS_BRUTE_FORCE))  # fallback for untuned sizes: ~20K records per cluster (heuristic from the commented lines below)
69 |         # if(n_records==10**3):
70 |         #     self.number_of_clusters=50
71 |         # if(n_records<=100000): 72 
| # self.number_of_clusters=10 73 | # self.n_regions=5 74 | # else: 75 | # self.number_of_clusters=int(n_records/NUMBER_OF_RECORDS_BRUTE_FORCE) 76 | # self.number_of_clusters=NUMBER_OF_RECORDS_BRUTE_FORCE 77 | # self.n_regions=10 78 | print("Record Size: ",record_size) 79 | print("File Size: ",file_size) 80 | print("Building Index ..........") 81 | print("Number of records: ",n_records) 82 | print("Number of Clusters: ",self.number_of_clusters) 83 | # measure the time 84 | start = time.time() 85 | 86 | # Make Level1 Folder 87 | Level1_folder_path = self.database_path+'/Level1' 88 | if not os.path.exists(Level1_folder_path): 89 | os.makedirs(Level1_folder_path) 90 | 91 | # IVF Layer 1 Indexing 92 | chunk_size=100000 93 | print("chunk_size",chunk_size) 94 | IVF_index(file_path=self.file_path,K_means_metric='euclidean',K_means_n_clusters=self.number_of_clusters,k_means_batch_size=chunk_size,k_means_max_iter=100,k_means_n_init='auto',chunk_size=chunk_size,index_folder_path=Level1_folder_path) 95 | 96 | 97 | # # Layer 1 Indexing 98 | # # level_1_in = self.get_top_k_records(top_k_records) 99 | # level_1_planes = LSH_index(file_path=self.file_path, nbits=Level_1_nbits, chunk_size=1000,index_path=self.database_path + "/Level1") 100 | # np.save(self.database_path + "/Level1/"+'metadata.npy',level_1_planes) 101 | # print("Layer 1 Finished") 102 | # return 103 | 104 | 105 | 106 | # # Layer 2 Indexing 107 | # for file_name in os.listdir(self.database_path + "/Level1"): 108 | # file_path = os.path.join(self.database_path + "/Level1", file_name) 109 | # if os.path.isfile(file_path) and file_name.lower().endswith(".txt"): 110 | # read_data_2 = np.loadtxt(file_path, dtype=int, ndmin=1) 111 | # level_2_in = self.read_multiple_records_by_id(read_data_2) 112 | # level_2_planes = LSH_index(data=level_2_in.values(), nbits=Level_2_nbits, index_path=self.database_path + "/Level2/" + file_name[:-4]) 113 | # np.save(self.database_path + "/Level2/" + file_name[:-4]+'/metadata.npy',level_2_planes) 114 | # print("Layer 2 Finished") 115 | # return 116 | 117 | 118 | # # Layer 3 Indexing 119 | # for folder_name in os.listdir(self.database_path + "/Level2"): 120 | # folder_path = os.path.join(self.database_path + "/Level2", folder_name) 121 | # for file_name in os.listdir(folder_path): 122 | # file_path = os.path.join(folder_path, file_name) 123 | # if os.path.isfile(file_path) and file_name.lower().endswith(".txt"): 124 | # read_data_3 = np.loadtxt(file_path, dtype=int, ndmin=1) 125 | # level_3_in = self.read_multiple_records_by_id(read_data_3) 126 | # level_3_planes = LSH_index(data=level_3_in.values(), nbits=Level_3_nbits, index_path=self.database_path + "/Level3/" + folder_name + '/' + file_name[:-4]) 127 | # np.save(self.database_path + "/Level3/" + folder_name + '/' + file_name[:-4]+'/metadata.npy',level_3_planes) 128 | # print("Layer 3 Finished") 129 | # return 130 | 131 | # # Layer 4 Indexing 132 | # for folder_name in os.listdir(self.database_path + "/Level3"): 133 | # folder_path = os.path.join(self.database_path + "/Level3", folder_name) 134 | # for folder_name_2 in os.listdir(folder_path): 135 | # folder_path_2 = os.path.join(folder_path, folder_name_2) 136 | # for file_name in os.listdir(folder_path_2): 137 | # file_path = os.path.join(folder_path_2, file_name) 138 | # if os.path.isfile(file_path) and file_name.lower().endswith(".txt"): 139 | # read_data_4 = np.loadtxt(file_path, dtype=int, ndmin=1) 140 | # level_4_in = self.read_multiple_records_by_id(read_data_4) 141 | # level_4_planes = 
LSH_index(data=level_4_in.values(), nbits=Level_4_nbits, index_path=self.database_path + "/Level4/" + folder_name + '/' + folder_name_2 + '/' + file_name[:-4])
142 |         #                 np.save(self.database_path + "/Level4/" + folder_name + '/' + folder_name_2 + '/' + file_name[:-4]+'/metadata.npy',level_4_planes)
143 |         # print("Layer 4 Finished")
144 | 
145 | 
146 |         # measure the time
147 |         end = time.time()
148 |         print("Indexing Done ...... Time taken by Indexing: ",end - start)
149 |         return
150 | 
151 |     def retrive(self, query:Annotated[List[float], 70],top_k = 5)-> List[int]:
152 |         '''
153 |         Get the top_k vectors most similar to the query
154 | 
155 |         return: list of the ids of the top_k most similar vectors, ordered by cosine similarity
156 |         '''
157 |         print(f"Retrieving top {top_k} ..........")
158 |         Level1_folder_path = self.database_path+'/Level1'
159 | 
160 |         file_size = os.path.getsize(self.file_path)
161 |         record_size=struct.calcsize(f"I{70}f")
162 |         n_records=file_size/record_size
163 |         if(n_records==10*10**3):
164 |             n_probes=3
165 |         elif(n_records==100*10**3):
166 |             n_probes=10
167 |         elif(n_records==10**6):
168 |             n_probes=5
169 |         elif(n_records==5*10**6):
170 |             n_probes=15
171 |         elif(n_records==10*10**6):
172 |             n_probes=30
173 |         elif(n_records==15*10**6):
174 |             n_probes=256
175 |         elif(n_records==20*10**6):
176 |             n_probes=64
177 |         else:
178 |             # Fallback for sizes not tuned above (values taken from an earlier experiment)
179 |             n_probes=3 if n_records<=5*10**6 else 20
180 | 
181 | 
182 |         final_result=semantic_query_ivf(data_file_path=self.file_path,index_folder_path=Level1_folder_path,query=query,top_k=top_k,n_regions=n_probes)
183 | 
184 |         return final_result
185 | 
186 | 
187 | 
188 |         # # Retrieve from Level 1
189 |         # level_1_planes = np.load(self.database_path + "/Level1"+'/metadata.npy')
190 |         # bucket_1,result = semantic_query_lsh(query, level_1_planes, self.database_path + "/Level1")
191 |         # print("length of first bucket",result.shape)
192 | 
193 |         # if len(result) < top_k:
194 |         #     print('level 1 smaller than top_k')
195 | 
196 |         # # Retrieve from Level 2
197 |         # level_2_planes = np.load(self.database_path + "/Level2/"+bucket_1+'/metadata.npy')
198 |         # bucket_2,result = semantic_query_lsh(query, level_2_planes, self.database_path + "/Level2/"+bucket_1)
199 |         # print("length of second bucket",result.shape)
200 | 
201 |         # if len(result) < top_k:
202 |         #     print('level 2 smaller than top_k')
203 | 
204 |         # # Retrieve from Level 3
205 |         # level_3_planes = np.load(self.database_path + "/Level3/"+bucket_1+'/'+bucket_2+'/metadata.npy')
206 |         # bucket_3,result = semantic_query_lsh(query, level_3_planes, self.database_path + "/Level3/"+bucket_1+'/'+bucket_2)
207 |         # print("length of third bucket",result.shape)
208 | 
209 |         # if len(result) < top_k:
210 |         #     print('level 3 smaller than top_k')
211 | 
212 |         # # Retrieve from Level 4
213 |         # level_4_planes = np.load(self.database_path + "/Level4/"+bucket_1+'/'+bucket_2+'/'+bucket_3+'/metadata.npy')
214 |         # bucket_4,result = semantic_query_lsh(query, level_4_planes, self.database_path + "/Level4/"+bucket_1+'/'+bucket_2+'/'+bucket_3)
215 |         # print("length of fourth bucket",result.shape)
216 | 
217 |         # if len(result) < top_k:
218 |         #     print('level 4 smaller than top_k')
219 | 
220 | 
221 |         # # Retrieve from Data Base the Embeddings of the Vectors
222 |         # final_result= read_multiple_records_by_id(self.file_path,result)
223 | 
224 |         # Calculate the Cosine Similarity between the Query and the Vectors
225 |         # scores = []
226 |         # for row in final_result.values():
227 |         #     id_value = row['id']
228 |         #     embed_values = row['embed']
229 |         #     score = 
self._cal_score(query, embed_values) 230 | # scores.append((score, id_value)) 231 | # scores = sorted(scores, reverse=True)[:top_k] 232 | # return [s[1] for s in scores] 233 | 234 | 235 | 236 | 237 | def _cal_score(self, vec1, vec2): 238 | dot_product = np.dot(vec1, vec2) 239 | norm_vec1 = np.linalg.norm(vec1) 240 | norm_vec2 = np.linalg.norm(vec2) 241 | cosine_similarity = dot_product / (norm_vec1 * norm_vec2) 242 | return cosine_similarity -------------------------------------------------------------------------------- /src/worst_case_implementation.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Annotated 2 | import struct 3 | import numpy as np 4 | 5 | class VecDBWorst: 6 | def __init__(self, file_path = "saved_db.csv", new_db = True) -> None: 7 | self.file_path = file_path 8 | if new_db: 9 | # just open new file to delete the old one 10 | with open(self.file_path, "w") as fout: 11 | # if you need to add any head to the file 12 | pass 13 | 14 | def insert_records(self, rows: List[Dict[int, Annotated[List[float], 70]]]): 15 | with open(self.file_path, "a+") as fout: 16 | for row in rows: 17 | id, embed = row["id"], row["embed"] 18 | row_str = f"{id}," + ",".join([str(e) for e in embed]) 19 | fout.write(f"{row_str}\n") 20 | self._build_index() 21 | 22 | # def insert_records_binary(self, rows: List[Dict[int, Annotated[List[float], 70]]]): 23 | # with open(self.file_path, "ab") as fout: # Open the file in binary mode for appending 24 | # for row in rows: 25 | # id, embed = row["id"], row["embed"] 26 | # # Pack the data into a binary format 27 | # data = struct.pack(f"I{70}f", id, *embed) 28 | # fout.write(data) 29 | # self._build_index() 30 | 31 | # def calculate_offset(self, record_id: int) -> int: 32 | # # Calculate the offset for a given record ID 33 | # record_size = struct.calcsize("I70f") 34 | # return (record_id - 1) * record_size 35 | 36 | # def read_record_by_id(self, record_id: int) -> Dict[int, Annotated[List[float], 70]]: 37 | # record_size = struct.calcsize("I70f") 38 | # offset = self.calculate_offset(record_id) 39 | 40 | # with open(self.file_path, "rb") as fin: 41 | # fin.seek(offset) # Move the file pointer to the calculated offset 42 | # data = fin.read(record_size) 43 | # if not data: 44 | # return {} # Record not found 45 | 46 | # # Unpack the binary data into a dictionary 47 | # unpacked_data = struct.unpack("I70f", data) 48 | # id_value, floats = unpacked_data[0], unpacked_data[1:] 49 | 50 | # # Create and return the record dictionary 51 | # record = {"id": id_value, "embed": list(floats)} 52 | # return {record_id: record} 53 | 54 | # def read_multiple_records_by_id(self, records_id: List[int]): 55 | # record_size = struct.calcsize("I70f") 56 | # records = {} 57 | 58 | # with open(self.file_path, "rb") as fin: 59 | # for i in range(len(records_id)): 60 | # offset = self.calculate_offset(records_id[i]) 61 | # fin.seek(offset) # Move the file pointer to the calculated offset 62 | # data = fin.read(record_size) 63 | # if not data: 64 | # records[records_id[i]] = None 65 | # continue 66 | 67 | # # Unpack the binary data into a dictionary 68 | # unpacked_data = struct.unpack("I70f", data) 69 | # id_value, floats = unpacked_data[0], unpacked_data[1:] 70 | 71 | # # Create and return the record dictionary 72 | # record = {"id": id_value, "embed": list(floats)} 73 | # records[records_id[i]] = record 74 | # return records 75 | 76 | # def get_top_k_records(self,k): 77 | # records = [] 78 | # record_size = 
struct.calcsize("I70f") 79 | # with open(self.file_path,'rb') as fin: 80 | # fin.seek(0) 81 | # for i in range(k): 82 | # data = fin.read(record_size) 83 | # unpacked_data = struct.unpack("I70f", data) 84 | # id_value, floats = unpacked_data[0], unpacked_data[1:] 85 | 86 | # record = {"id": id_value, "embed": list(floats)} 87 | # records.append(record) 88 | # return records 89 | 90 | def retrive(self, query: Annotated[List[float], 70], top_k = 5): 91 | scores = [] 92 | with open(self.file_path, "r") as fin: 93 | for row in fin.readlines(): 94 | row_splits = row.split(",") 95 | id = int(row_splits[0]) 96 | embed = [float(e) for e in row_splits[1:]] 97 | score = self._cal_score(query, embed) 98 | scores.append((score, id)) 99 | # here we assume that if two rows have the same score, return the lowest ID 100 | scores = sorted(scores, reverse=True)[:top_k] 101 | return [s[1] for s in scores] 102 | 103 | def _cal_score(self, vec1, vec2): 104 | dot_product = np.dot(vec1, vec2) 105 | norm_vec1 = np.linalg.norm(vec1) 106 | norm_vec2 = np.linalg.norm(vec2) 107 | cosine_similarity = dot_product / (norm_vec1 * norm_vec2) 108 | return cosine_similarity 109 | 110 | def _build_index(self): 111 | pass 112 | 113 | 114 | -------------------------------------------------------------------------------- /vector searching algorithms/LSHHyperPlane.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 19, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "nbits = 3 # number of hyperplanes and binary vals to produce\n", 10 | "d = 70 # vector dimensions\n", 11 | "\n", 12 | "#! log2(8)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 20, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "array([[-0.16225197, -0.1159841 , -0.0379947 , 0.47482455, -0.24329282,\n", 24 | " -0.17219515, 0.0889353 , -0.34083189, -0.0857109 , 0.19710836,\n", 25 | " -0.48189731, 0.04372259, -0.40512638, -0.49822556, -0.18287792,\n", 26 | " 0.31290614, -0.3228106 , -0.24989376, -0.30750153, 0.35378187,\n", 27 | " 0.32047542, -0.3405673 , 0.01704872, -0.29642438, -0.13589619,\n", 28 | " -0.28071414, -0.05584624, 0.09991783, -0.3077081 , 0.13647533,\n", 29 | " -0.4770819 , 0.38661425, 0.00783346, -0.3927221 , 0.24110202,\n", 30 | " 0.05990217, -0.06802645, 0.03282446, 0.30179191, 0.19156282,\n", 31 | " 0.1054492 , -0.00932327, 0.32185434, 0.49241347, 0.20733614,\n", 32 | " 0.18026476, -0.26101194, -0.3778801 , 0.4936665 , -0.41047846,\n", 33 | " 0.23792759, -0.23321223, 0.39830173, -0.13084787, -0.31116184,\n", 34 | " -0.00189201, 0.08791531, -0.14980787, 0.30558995, -0.00670045,\n", 35 | " 0.2243021 , -0.41023912, 0.18223908, -0.38580149, -0.11450846,\n", 36 | " -0.43534932, -0.30870211, 0.29227875, -0.15542413, 0.20264467],\n", 37 | " [ 0.14822377, -0.3276353 , 0.26702648, 0.03217607, -0.20768427,\n", 38 | " -0.46076215, -0.1540441 , -0.14252796, -0.21732578, -0.35428344,\n", 39 | " 0.2084557 , -0.18725843, -0.14300948, -0.16831679, -0.15468043,\n", 40 | " -0.07450581, 0.00233269, -0.43596823, 0.19815002, 0.19192439,\n", 41 | " 0.13743071, 0.1589349 , -0.37645398, -0.40863437, 0.18087113,\n", 42 | " 0.26399475, -0.20873789, 0.31885534, -0.47078825, 0.19251382,\n", 43 | " 0.39492556, 0.13612851, 0.05252917, 0.1066819 , -0.20602171,\n", 44 | " 0.019199 , -0.42388983, -0.29196827, -0.08782944, 0.38928403,\n", 45 | " -0.14570291, -0.04510013, -0.11215063, 0.08435185, 
0.0814708 ,\n", 46 | " 0.30669655, 0.2207886 , 0.48463154, -0.07672077, 0.02202735,\n", 47 | " -0.43230734, -0.16302195, 0.42660854, 0.21663128, -0.37419712,\n", 48 | " -0.07194837, 0.38714646, -0.34736523, 0.17594598, -0.04035088,\n", 49 | " -0.40523731, -0.46061098, -0.18572369, 0.46951449, 0.04887038,\n", 50 | " -0.47345552, 0.21424037, 0.06851608, -0.44381182, 0.18315833],\n", 51 | " [-0.34032214, 0.20510388, -0.11437108, -0.25271871, -0.29298418,\n", 52 | " -0.39316298, -0.27235239, -0.02090773, 0.42378779, 0.36244202,\n", 53 | " -0.49353892, -0.37632702, 0.08773845, 0.23887437, -0.03724808,\n", 54 | " -0.23496996, 0.46748875, -0.2527981 , -0.39073735, 0.40062121,\n", 55 | " 0.10471371, -0.43568824, 0.14986386, 0.18503889, 0.37681242,\n", 56 | " -0.01243404, -0.39396771, 0.19966568, 0.08733691, 0.2509886 ,\n", 57 | " -0.30488297, 0.39487436, -0.38444297, -0.47168848, 0.40205414,\n", 58 | " -0.40537546, 0.03695501, -0.28056081, 0.42477745, 0.49012324,\n", 59 | " 0.29699303, -0.14461517, 0.37445295, -0.30211604, -0.39721614,\n", 60 | " 0.34692406, 0.11398823, 0.30746006, -0.22106426, -0.1443832 ,\n", 61 | " -0.07727599, 0.23807248, 0.32999453, 0.36904291, -0.01930504,\n", 62 | " -0.01021899, -0.25900161, 0.32297803, -0.08289675, 0.28510423,\n", 63 | " -0.35789496, -0.22445429, -0.43838493, -0.25795092, 0.18836288,\n", 64 | " 0.07583603, 0.24472323, 0.07588773, -0.46374612, 0.29371209]])" 65 | ] 66 | }, 67 | "execution_count": 20, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "import numpy as np\n", 74 | "# create a set of 4 hyperplanes, with 2 dimensions\n", 75 | "plane_norms = np.random.rand(nbits, d) - .5\n", 76 | "plane_norms" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 21, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "[[0.24346879 0.20847865 0.09572784 0.55319155 0.07226476 0.87621505\n", 89 | " 0.58913737 0.46246208 0.3333929 0.96904595 0.89600564 0.59480015\n", 90 | " 0.77746555 0.54283576 0.67569602 0.23497124 0.07950847 0.31373656\n", 91 | " 0.14113477 0.408343 0.81078156 0.57261091 0.35708278 0.41438797\n", 92 | " 0.44477819 0.65966904 0.12290202 0.22656549 0.07834719 0.83130173\n", 93 | " 0.4011682 0.72348172 0.58621532 0.97192664 0.12448728 0.19814639\n", 94 | " 0.84652828 0.19786014 0.70758209 0.91906712 0.93421752 0.70310207\n", 95 | " 0.27726954 0.96761661 0.06944151 0.32237123 0.95214921 0.55453475\n", 96 | " 0.22046089 0.75767774 0.76303426 0.11226826 0.96778584 0.18364252\n", 97 | " 0.86959947 0.35425893 0.05710316 0.67524358 0.84838244 0.84003201\n", 98 | " 0.71003473 0.74628258 0.48265117 0.48526062 0.41682251 0.24458015\n", 99 | " 0.56040784 0.47847743 0.11747069 0.2841326 ]]\n", 100 | "[[0.08021692 0.11425865 0.66780691 0.64591999 0.72730137 0.37124926\n", 101 | " 0.90060496 0.29506503 0.14570761 0.05080957 0.84666837 0.89391903\n", 102 | " 0.24643966 0.44699371 0.64934757 0.67063548 0.75535129 0.75834603\n", 103 | " 0.36194954 0.35376051 0.86358269 0.67616135 0.0427738 0.70108469\n", 104 | " 0.27322976 0.30507254 0.39729602 0.43346906 0.20055928 0.0809176\n", 105 | " 0.05108797 0.16594553 0.60731689 0.25100448 0.60585635 0.18616766\n", 106 | " 0.35069704 0.88805346 0.07440514 0.91451748 0.14379966 0.30309109\n", 107 | " 0.6960067 0.5342824 0.90332074 0.95252648 0.74792365 0.09408956\n", 108 | " 0.44708244 0.75941983 0.49066048 0.6441386 0.63012995 0.98668287\n", 109 | " 0.60327176 0.62091147 0.72095104 0.59418854 
0.84847164 0.18768005\n", 110 | " 0.50059785 0.83818945 0.58380742 0.79375409 0.38927237 0.30760971\n", 111 | " 0.90913216 0.75463438 0.32010021 0.71195468]]\n", 112 | "[array([[0.24346879, 0.20847865, 0.09572784, 0.55319155, 0.07226476,\n", 113 | " 0.87621505, 0.58913737, 0.46246208, 0.3333929 , 0.96904595,\n", 114 | " 0.89600564, 0.59480015, 0.77746555, 0.54283576, 0.67569602,\n", 115 | " 0.23497124, 0.07950847, 0.31373656, 0.14113477, 0.408343 ,\n", 116 | " 0.81078156, 0.57261091, 0.35708278, 0.41438797, 0.44477819,\n", 117 | " 0.65966904, 0.12290202, 0.22656549, 0.07834719, 0.83130173,\n", 118 | " 0.4011682 , 0.72348172, 0.58621532, 0.97192664, 0.12448728,\n", 119 | " 0.19814639, 0.84652828, 0.19786014, 0.70758209, 0.91906712,\n", 120 | " 0.93421752, 0.70310207, 0.27726954, 0.96761661, 0.06944151,\n", 121 | " 0.32237123, 0.95214921, 0.55453475, 0.22046089, 0.75767774,\n", 122 | " 0.76303426, 0.11226826, 0.96778584, 0.18364252, 0.86959947,\n", 123 | " 0.35425893, 0.05710316, 0.67524358, 0.84838244, 0.84003201,\n", 124 | " 0.71003473, 0.74628258, 0.48265117, 0.48526062, 0.41682251,\n", 125 | " 0.24458015, 0.56040784, 0.47847743, 0.11747069, 0.2841326 ]]), array([[0.08021692, 0.11425865, 0.66780691, 0.64591999, 0.72730137,\n", 126 | " 0.37124926, 0.90060496, 0.29506503, 0.14570761, 0.05080957,\n", 127 | " 0.84666837, 0.89391903, 0.24643966, 0.44699371, 0.64934757,\n", 128 | " 0.67063548, 0.75535129, 0.75834603, 0.36194954, 0.35376051,\n", 129 | " 0.86358269, 0.67616135, 0.0427738 , 0.70108469, 0.27322976,\n", 130 | " 0.30507254, 0.39729602, 0.43346906, 0.20055928, 0.0809176 ,\n", 131 | " 0.05108797, 0.16594553, 0.60731689, 0.25100448, 0.60585635,\n", 132 | " 0.18616766, 0.35069704, 0.88805346, 0.07440514, 0.91451748,\n", 133 | " 0.14379966, 0.30309109, 0.6960067 , 0.5342824 , 0.90332074,\n", 134 | " 0.95252648, 0.74792365, 0.09408956, 0.44708244, 0.75941983,\n", 135 | " 0.49066048, 0.6441386 , 0.63012995, 0.98668287, 0.60327176,\n", 136 | " 0.62091147, 0.72095104, 0.59418854, 0.84847164, 0.18768005,\n", 137 | " 0.50059785, 0.83818945, 0.58380742, 0.79375409, 0.38927237,\n", 138 | " 0.30760971, 0.90913216, 0.75463438, 0.32010021, 0.71195468]]), array([[0.77078444, 0.08654989, 0.53784898, 0.27799896, 0.8002419 ,\n", 139 | " 0.3718589 , 0.79333253, 0.93137511, 0.62069102, 0.24775348,\n", 140 | " 0.4272542 , 0.98292296, 0.8470418 , 0.52637634, 0.80102169,\n", 141 | " 0.9440776 , 0.68466439, 0.86571266, 0.27882657, 0.71765609,\n", 142 | " 0.75582515, 0.94039075, 0.20125464, 0.64221553, 0.83386962,\n", 143 | " 0.73863021, 0.93711781, 0.4794255 , 0.48793874, 0.86123148,\n", 144 | " 0.92146684, 0.81077912, 0.50759451, 0.18685755, 0.98628992,\n", 145 | " 0.97781323, 0.38057898, 0.87204991, 0.30697755, 0.49756831,\n", 146 | " 0.21495361, 0.38075046, 0.30418495, 0.02362606, 0.96563469,\n", 147 | " 0.80356302, 0.60805212, 0.30632618, 0.59294981, 0.08821319,\n", 148 | " 0.19220448, 0.17337163, 0.93316608, 0.46086827, 0.17464549,\n", 149 | " 0.16804619, 0.05238805, 0.95753146, 0.87498728, 0.5543621 ,\n", 150 | " 0.727997 , 0.54389248, 0.38181964, 0.47099822, 0.59929861,\n", 151 | " 0.35783896, 0.27050514, 0.98113893, 0.49137662, 0.05494532]])]\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "a=np.random.random((1, 70))\n", 157 | "b=np.random.random((1, 70))\n", 158 | "c=np.random.random((1, 70))\n", 159 | "# Repeat the array to generate a 1x70 array\n", 160 | "print(a)\n", 161 | "print(b)\n", 162 | "dataset=[a,b,c]\n", 163 | "print(dataset)\n" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 
| "execution_count": 22, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "array([[-0.98219519, -1.00134517, 0.23751343]])" 175 | ] 176 | }, 177 | "execution_count": 22, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "# calculate the dot product for each of these\n", 184 | "a_dot = np.dot(a, plane_norms.T)\n", 185 | "b_dot = np.dot(b, plane_norms.T)\n", 186 | "c_dot = np.dot(c, plane_norms.T)\n", 187 | "a_dot" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 23, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "array([[False, False, True]])" 199 | ] 200 | }, 201 | "execution_count": 23, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | " #! Dataset\n", 208 | "# we know that a positive dot product == +ve side of hyperplane\n", 209 | "# and negative dot product == -ve side of hyperplane\n", 210 | "a_dot = a_dot > 0\n", 211 | "b_dot = b_dot > 0\n", 212 | "c_dot = c_dot > 0\n", 213 | "a_dot" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 24, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "text/plain": [ 224 | "array([0, 0, 1])" 225 | ] 226 | }, 227 | "execution_count": 24, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "# convert our boolean arrays to int arrays to make bucketing\n", 234 | "# easier (although is okay to use boolean for Hamming distance)\n", 235 | "a_dot = a_dot.astype(int)[0]\n", 236 | "b_dot = b_dot.astype(int)[0]\n", 237 | "c_dot = c_dot.astype(int)[0]\n", 238 | "a_dot" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 25, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "001\n", 251 | "000\n", 252 | "000\n", 253 | "{'001': [0], '000': [1, 2]}\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "vectors = [a_dot, b_dot, c_dot]\n", 259 | "buckets = {}\n", 260 | "i = 0\n", 261 | "\n", 262 | "for i in range(len(vectors)):\n", 263 | " # convert from array to string\n", 264 | " hash_str = ''.join(vectors[i].astype(str))\n", 265 | " print(hash_str)\n", 266 | " # create bucket if it doesn't exist\n", 267 | " if hash_str not in buckets.keys():\n", 268 | " buckets[hash_str] = []\n", 269 | " # add vector position to bucket\n", 270 | " buckets[hash_str].append(i)\n", 271 | "\n", 272 | "print(buckets)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 26, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | "[[0.98217599 0.0740474 0.60518903 0.61501807 0.37022865 0.59778156\n", 285 | " 0.37753566 0.44307262 0.00959564 0.43188369 0.77591898 0.40505507\n", 286 | " 0.40793131 0.30983161 0.20962565 0.47749091 0.64407774 0.07951662\n", 287 | " 0.21950209 0.5617165 0.27452938 0.73727983 0.10188798 0.15470781\n", 288 | " 0.9668675 0.63906846 0.76027852 0.59622264 0.90857345 0.49078738\n", 289 | " 0.17966669 0.29728534 0.76605938 0.89432476 0.17330878 0.91669595\n", 290 | " 0.327371 0.89819097 0.98084002 0.44954373 0.04567631 0.18179054\n", 291 | " 0.97601994 0.11805953 0.42048999 0.19162843 0.69235512 0.48898102\n", 292 | " 0.89062367 0.27993955 0.90024359 0.15118375 0.81479348 0.42161774\n", 293 | " 0.78538813 0.84025532 0.42421928 0.63838074 0.74416435 0.76683117\n", 
294 | " 0.63133131 0.36330796 0.24528589 0.42388808 0.53415259 0.58918953\n", 295 | " 0.50997522 0.88050857 0.5938881 0.76094727]]\n", 296 | "Query belongs to bucket: [1, 2]\n" 297 | ] 298 | } 299 | ], 300 | "source": [ 301 | " #! for tesing input query\n", 302 | "\n", 303 | "query=np.random.random((1, 70))\n", 304 | "print(query)\n", 305 | "\n", 306 | "query_dot = np.dot(query, plane_norms.T)\n", 307 | "query_dot = query_dot > 0\n", 308 | "query_dot = query_dot.astype(int)[0]\n", 309 | "\n", 310 | "# Convert the query array to a string\n", 311 | "query_hash_str = ''.join(query_dot.astype(str))\n", 312 | "\n", 313 | "# Check which bucket the query belongs to\n", 314 | "if query_hash_str in buckets.keys():\n", 315 | " bucket_containing_query = buckets[query_hash_str]\n", 316 | " print(\"Query belongs to bucket:\", bucket_containing_query)\n", 317 | "else:\n", 318 | " print(\"Query doesn't match any existing buckets\")\n", 319 | " print(0)\n", 320 | "\n" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 27, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "[[0.24346879 0.20847865 0.09572784 0.55319155 0.07226476 0.87621505\n", 333 | " 0.58913737 0.46246208 0.3333929 0.96904595 0.89600564 0.59480015\n", 334 | " 0.77746555 0.54283576 0.67569602 0.23497124 0.07950847 0.31373656\n", 335 | " 0.14113477 0.408343 0.81078156 0.57261091 0.35708278 0.41438797\n", 336 | " 0.44477819 0.65966904 0.12290202 0.22656549 0.07834719 0.83130173\n", 337 | " 0.4011682 0.72348172 0.58621532 0.97192664 0.12448728 0.19814639\n", 338 | " 0.84652828 0.19786014 0.70758209 0.91906712 0.93421752 0.70310207\n", 339 | " 0.27726954 0.96761661 0.06944151 0.32237123 0.95214921 0.55453475\n", 340 | " 0.22046089 0.75767774 0.76303426 0.11226826 0.96778584 0.18364252\n", 341 | " 0.86959947 0.35425893 0.05710316 0.67524358 0.84838244 0.84003201\n", 342 | " 0.71003473 0.74628258 0.48265117 0.48526062 0.41682251 0.24458015\n", 343 | " 0.56040784 0.47847743 0.11747069 0.2841326 ]]\n", 344 | "[[0.14634457 0.20457996 0.24741999 0.92396483 0.02756294 0.55294575\n", 345 | " 0.53786271 0.25648871 0.87692497 0.0504492 0.78309208 0.78032385\n", 346 | " 0.60510748 0.50301495 0.33663068 0.48482831 0.31022081 0.22420917\n", 347 | " 0.03656832 0.0576642 0.02176892 0.41431776 0.97839866 0.36627294\n", 348 | " 0.90726783 0.942912 0.10056186 0.27157551 0.35269652 0.65476284\n", 349 | " 0.3802201 0.10174655 0.59332797 0.41950798 0.51371538 0.66586081\n", 350 | " 0.86951989 0.30015139 0.39627816 0.90903098 0.71667395 0.27805884\n", 351 | " 0.43209455 0.17558953 0.55876165 0.44371013 0.54468845 0.95573234\n", 352 | " 0.8846391 0.69004526 0.78832344 0.97466743 0.01227129 0.26130364\n", 353 | " 0.46660631 0.64762417 0.39275623 0.7934695 0.29527433 0.66989054\n", 354 | " 0.73093381 0.26445753 0.17682595 0.24664441 0.41129241 0.83235075\n", 355 | " 0.72809304 0.64075518 0.98233083 0.85216486]]\n", 356 | "[[18.0160595]]\n", 357 | "0\n", 358 | "Query belongs to bucket: [0]\n" 359 | ] 360 | } 361 | ], 362 | "source": [ 363 | "query=np.random.random((1, 70))\n", 364 | "# print(query)\n", 365 | "\n", 366 | "# Assuming 'query' is your query vector\n", 367 | "query_dot = np.dot(query, plane_norms.T)\n", 368 | "query_dot = query_dot > 0\n", 369 | "query_dot = query_dot.astype(int)[0]\n", 370 | "\n", 371 | "# Convert the query array to a string\n", 372 | "query_hash_str = ''.join(query_dot.astype(str))\n", 373 | "\n", 374 | "# Check which bucket the query 
belongs to\n", 375 | "if query_hash_str in buckets.keys():\n", 376 | " bucket_containing_query = buckets[query_hash_str]\n", 377 | " min_dist=100\n", 378 | " index=-1\n", 379 | " for vec in bucket_containing_query:\n", 380 | " print(dataset[vec])\n", 381 | " print(query)\n", 382 | " dot_res=np.dot(query,dataset[vec].T)\n", 383 | " print(dot_res)\n", 384 | " res = dot_res / (np.linalg.norm(query) * np.linalg.norm(dataset[vec].T))\n", 385 | " if res list2[j]: 28 | j += 1 29 | else: 30 | intersection.append(list1[i]) 31 | i += 1 32 | j += 1 33 | 34 | return intersection 35 | 36 | 37 | def split_on_sign(data:[[float]],split_on)->int: 38 | ''' 39 | @param data: Data to categorize 40 | @split_on: The max count to split on 41 | 42 | @return dictionary of the data splitted by sign +ve and -ve 43 | ''' 44 | if(split_on is None or split_on>np.shape(data)[1]): 45 | #split on the whole size 46 | # split_on=np.shape(data)[1] 47 | split_on=10 48 | 49 | regions = [] 50 | for col in data[:,:split_on].T: # Transpose the matrix to iterate over columns 51 | positive_region = (col >= 0) 52 | negative_region = (col < 0) 53 | regions.append(np.where(positive_region)[0]) 54 | regions.append(np.where(negative_region)[0]) 55 | return regions 56 | 57 | def search_on_sign(q:[float],regions:[[int]]): 58 | # O(m * n), where m is the average length of the input lists, and n is the number of input lists. 59 | # Check on sign of the feature 60 | intersect=None 61 | split_on=np.shape(regions)[0]//2 62 | for ind,feature in enumerate(q[:split_on]): 63 | if(ind==0): 64 | intersect=regions[0] if feature>=0 else regions[1] 65 | continue 66 | if(feature>=0): 67 | # positive 68 | intersect=sorted_list_intersection(intersect, regions[2*ind]) 69 | else: 70 | #negative 71 | intersect=sorted_list_intersection(intersect, regions[2*ind+1]) 72 | return intersect --------------------------------------------------------------------------------