├── baseline ├── IP-DiskANN │ ├── test.sh │ ├── help │ │ ├── git_push.sh │ │ └── improve.md │ ├── ip-greator.xlsx │ ├── include │ │ ├── threadpool.h │ │ ├── v2 │ │ │ ├── aux_dist.h │ │ │ ├── fs_allocator.h │ │ │ ├── delete_set.h │ │ │ ├── graph_delta.h │ │ │ └── lock.h │ │ ├── file_content.h │ │ ├── windows_customizations.h │ │ ├── exceptions.h │ │ ├── common_includes.h │ │ ├── logger.h │ │ ├── timer.h │ │ ├── content_buf.h │ │ ├── Neighbor_Tag.h │ │ ├── memory_mapper.h │ │ ├── ann_exception.h │ │ ├── tsl │ │ │ └── LICENSE │ │ ├── linux_aligned_file_reader.h │ │ ├── windows_aligned_file_reader.h │ │ ├── cosine_similarity.h │ │ └── percentile_stats.h │ ├── dependencies │ │ └── windows │ │ │ ├── dll │ │ │ ├── zlib1.dll │ │ │ ├── LIBEAY32.dll │ │ │ ├── SSLEAY32.dll │ │ │ ├── cpprest_2_10.dll │ │ │ └── boost_date_time-vc141-mt-x64-1_70.dll │ │ │ └── tcmalloc │ │ │ ├── libtcmalloc_minimal.dll │ │ │ ├── libtcmalloc_minimal.lib │ │ │ └── v140 │ │ │ ├── libtcmalloc_minimal.dll │ │ │ └── libtcmalloc_minimal.lib │ ├── Dockerfile │ ├── src │ │ ├── dll │ │ │ └── dllmain.cpp │ │ ├── CMakeLists.txt │ │ ├── ann_exception.cpp │ │ └── v2 │ │ │ └── fs_allocator.cpp │ ├── .gitattributes │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.MD │ ├── tests │ │ ├── utils │ │ │ ├── int8_to_float.cpp │ │ │ ├── uint8_to_float.cpp │ │ │ ├── uint32_to_uint8.cpp │ │ │ ├── test_pq_from_pivots.cpp │ │ │ ├── test_pq.cpp │ │ │ ├── prep_index_merger.sh │ │ │ ├── merge_shards.cpp │ │ │ ├── test_partitioning.cpp │ │ │ ├── gen_random_slice.cpp │ │ │ ├── partition_data.cpp │ │ │ ├── partition_with_ram_budget.cpp │ │ │ ├── create_disk_layout.cpp │ │ │ ├── tsv_to_bin.cpp │ │ │ ├── calculate_recall.cpp │ │ │ ├── ivecs_to_bin.cpp │ │ │ ├── update_metadata.cpp │ │ │ ├── float_bin_to_int8.cpp │ │ │ └── fvecs_to_bin.cpp │ │ ├── align_query_file.cpp │ │ └── build_disk_index.cpp │ ├── LICENSE │ ├── NOTICE.txt │ ├── CompilerOptions.cmake │ └── study │ │ └── process.md └── LM-DiskANN │ ├── include │ ├── threadpool.h │ ├── 
v2 │ │ ├── aux_dist.h │ │ ├── fs_allocator.h │ │ ├── delete_set.h │ │ └── graph_delta.h │ ├── file_content.h │ ├── windows_customizations.h │ ├── exceptions.h │ ├── common_includes.h │ ├── logger.h │ ├── timer.h │ ├── content_buf.h │ ├── Neighbor_Tag.h │ ├── memory_mapper.h │ ├── ann_exception.h │ ├── linux_aligned_file_reader.h │ ├── tsl │ │ └── LICENSE │ ├── windows_aligned_file_reader.h │ ├── cosine_similarity.h │ └── percentile_stats.h │ ├── dependencies │ └── windows │ │ ├── dll │ │ ├── zlib1.dll │ │ ├── LIBEAY32.dll │ │ ├── SSLEAY32.dll │ │ ├── cpprest_2_10.dll │ │ └── boost_date_time-vc141-mt-x64-1_70.dll │ │ └── tcmalloc │ │ ├── libtcmalloc_minimal.dll │ │ ├── libtcmalloc_minimal.lib │ │ └── v140 │ │ ├── libtcmalloc_minimal.dll │ │ └── libtcmalloc_minimal.lib │ ├── Dockerfile │ ├── src │ ├── dll │ │ └── dllmain.cpp │ ├── CMakeLists.txt │ ├── ann_exception.cpp │ └── v2 │ │ └── fs_allocator.cpp │ ├── .gitattributes │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.MD │ ├── tests │ ├── utils │ │ ├── int8_to_float.cpp │ │ ├── uint8_to_float.cpp │ │ ├── uint32_to_uint8.cpp │ │ ├── test_pq_from_pivots.cpp │ │ ├── test_pq.cpp │ │ ├── prep_index_merger.sh │ │ ├── merge_shards.cpp │ │ ├── test_partitioning.cpp │ │ ├── gen_random_slice.cpp │ │ ├── partition_data.cpp │ │ ├── partition_with_ram_budget.cpp │ │ ├── create_disk_layout.cpp │ │ ├── tsv_to_bin.cpp │ │ ├── calculate_recall.cpp │ │ ├── ivecs_to_bin.cpp │ │ ├── update_metadata.cpp │ │ ├── float_bin_to_int8.cpp │ │ └── fvecs_to_bin.cpp │ ├── align_query_file.cpp │ └── build_disk_index.cpp │ ├── LICENSE │ ├── NOTICE.txt │ └── CompilerOptions.cmake ├── include ├── threadpool.h ├── v2 │ ├── aux_dist.h │ ├── fs_allocator.h │ ├── delete_set.h │ ├── graph_delta.h │ └── lock.h ├── file_content.h ├── windows_customizations.h ├── exceptions.h ├── common_includes.h ├── logger.h ├── timer.h ├── content_buf.h ├── Neighbor_Tag.h ├── memory_mapper.h ├── ann_exception.h ├── tsl │ └── LICENSE ├── linux_aligned_file_reader.h ├── 
windows_aligned_file_reader.h ├── cosine_similarity.h └── percentile_stats.h ├── dependencies └── windows │ ├── dll │ ├── zlib1.dll │ ├── LIBEAY32.dll │ ├── SSLEAY32.dll │ ├── cpprest_2_10.dll │ └── boost_date_time-vc141-mt-x64-1_70.dll │ └── tcmalloc │ ├── libtcmalloc_minimal.dll │ ├── libtcmalloc_minimal.lib │ ├── libtcmalloc_minimal.pdb │ └── v140 │ ├── libtcmalloc_minimal.dll │ ├── libtcmalloc_minimal.lib │ └── libtcmalloc_minimal.pdb ├── scripts ├── pre_dataset.sh ├── pre_dataset │ ├── make_tags.sh │ ├── extract_base.sh │ ├── make_trace.sh │ ├── topology_extraction.sh │ ├── compute_knn.sh │ ├── process_index_for_diffR.sh │ └── readme.md ├── build_indices.sh ├── overall_performance.sh └── readme.md ├── Dockerfile ├── src ├── dll │ └── dllmain.cpp ├── CMakeLists.txt ├── ann_exception.cpp └── v2 │ └── fs_allocator.cpp ├── .gitattributes ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.MD ├── tests ├── utils │ ├── int8_to_float.cpp │ ├── uint8_to_float.cpp │ ├── uint32_to_uint8.cpp │ ├── test_pq_from_pivots.cpp │ ├── test_pq.cpp │ ├── prep_index_merger.sh │ ├── merge_shards.cpp │ ├── test_partitioning.cpp │ ├── gen_random_slice.cpp │ ├── partition_data.cpp │ ├── partition_with_ram_budget.cpp │ ├── create_disk_layout.cpp │ ├── tsv_to_bin.cpp │ ├── calculate_recall.cpp │ ├── ivecs_to_bin.cpp │ ├── update_metadata.cpp │ ├── float_bin_to_int8.cpp │ ├── fvecs_to_bin.cpp │ └── bin_to_tsv.cpp ├── align_query_file.cpp └── build_disk_index.cpp ├── LICENSE ├── NOTICE.txt └── CompilerOptions.cmake /baseline/IP-DiskANN/test.sh: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /include/threadpool.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/include/threadpool.h -------------------------------------------------------------------------------- /baseline/IP-DiskANN/help/git_push.sh: 
-------------------------------------------------------------------------------- 1 | git add . 2 | git commit -m "myfirst" 3 | git push -u origin main -------------------------------------------------------------------------------- /include/v2/aux_dist.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace diskann { 4 | class FloatFloat 5 | } // namespace diskann -------------------------------------------------------------------------------- /baseline/IP-DiskANN/ip-greator.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/IP-DiskANN/ip-greator.xlsx -------------------------------------------------------------------------------- /dependencies/windows/dll/zlib1.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/dependencies/windows/dll/zlib1.dll -------------------------------------------------------------------------------- /dependencies/windows/dll/LIBEAY32.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/dependencies/windows/dll/LIBEAY32.dll -------------------------------------------------------------------------------- /dependencies/windows/dll/SSLEAY32.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/dependencies/windows/dll/SSLEAY32.dll -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/threadpool.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/IP-DiskANN/include/threadpool.h -------------------------------------------------------------------------------- 
/baseline/IP-DiskANN/include/v2/aux_dist.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace diskann { 4 | class FloatFloat 5 | } // namespace diskann -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/threadpool.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/LM-DiskANN/include/threadpool.h -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/v2/aux_dist.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace diskann { 4 | class FloatFloat 5 | } // namespace diskann -------------------------------------------------------------------------------- /dependencies/windows/dll/cpprest_2_10.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/dependencies/windows/dll/cpprest_2_10.dll -------------------------------------------------------------------------------- /baseline/IP-DiskANN/help/improve.md: -------------------------------------------------------------------------------- 1 | # 优化过程 2 | ## 1. vec 3 | ### A. First_Scan 4 | ### B. Third_SCan 5 | ## 2. 
Graph -------------------------------------------------------------------------------- /dependencies/windows/tcmalloc/libtcmalloc_minimal.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/dependencies/windows/tcmalloc/libtcmalloc_minimal.dll -------------------------------------------------------------------------------- /dependencies/windows/tcmalloc/libtcmalloc_minimal.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib -------------------------------------------------------------------------------- /dependencies/windows/tcmalloc/libtcmalloc_minimal.pdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/dependencies/windows/tcmalloc/libtcmalloc_minimal.pdb -------------------------------------------------------------------------------- /baseline/IP-DiskANN/dependencies/windows/dll/zlib1.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/IP-DiskANN/dependencies/windows/dll/zlib1.dll -------------------------------------------------------------------------------- /baseline/LM-DiskANN/dependencies/windows/dll/zlib1.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/LM-DiskANN/dependencies/windows/dll/zlib1.dll -------------------------------------------------------------------------------- /baseline/IP-DiskANN/dependencies/windows/dll/LIBEAY32.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/IP-DiskANN/dependencies/windows/dll/LIBEAY32.dll 
-------------------------------------------------------------------------------- /baseline/IP-DiskANN/dependencies/windows/dll/SSLEAY32.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/IP-DiskANN/dependencies/windows/dll/SSLEAY32.dll -------------------------------------------------------------------------------- /baseline/LM-DiskANN/dependencies/windows/dll/LIBEAY32.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/LM-DiskANN/dependencies/windows/dll/LIBEAY32.dll -------------------------------------------------------------------------------- /baseline/LM-DiskANN/dependencies/windows/dll/SSLEAY32.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/LM-DiskANN/dependencies/windows/dll/SSLEAY32.dll -------------------------------------------------------------------------------- /dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.dll -------------------------------------------------------------------------------- /dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.lib -------------------------------------------------------------------------------- /dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.pdb: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.pdb -------------------------------------------------------------------------------- /baseline/IP-DiskANN/dependencies/windows/dll/cpprest_2_10.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/IP-DiskANN/dependencies/windows/dll/cpprest_2_10.dll -------------------------------------------------------------------------------- /baseline/LM-DiskANN/dependencies/windows/dll/cpprest_2_10.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/LM-DiskANN/dependencies/windows/dll/cpprest_2_10.dll -------------------------------------------------------------------------------- /dependencies/windows/dll/boost_date_time-vc141-mt-x64-1_70.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/dependencies/windows/dll/boost_date_time-vc141-mt-x64-1_70.dll -------------------------------------------------------------------------------- /scripts/pre_dataset.sh: -------------------------------------------------------------------------------- 1 | mkdir ./dataset 2 | mkdir ./indices 3 | mkdir ./trace 4 | cd ./pre_dataset 5 | 6 | bash make_trace.sh 7 | 8 | bash extract_base.sh 9 | 10 | bash compute_knn.sh -------------------------------------------------------------------------------- /baseline/IP-DiskANN/dependencies/windows/tcmalloc/libtcmalloc_minimal.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/IP-DiskANN/dependencies/windows/tcmalloc/libtcmalloc_minimal.dll -------------------------------------------------------------------------------- 
/baseline/IP-DiskANN/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/IP-DiskANN/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib -------------------------------------------------------------------------------- /baseline/LM-DiskANN/dependencies/windows/tcmalloc/libtcmalloc_minimal.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/LM-DiskANN/dependencies/windows/tcmalloc/libtcmalloc_minimal.dll -------------------------------------------------------------------------------- /baseline/LM-DiskANN/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/LM-DiskANN/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib -------------------------------------------------------------------------------- /baseline/IP-DiskANN/dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/IP-DiskANN/dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.dll -------------------------------------------------------------------------------- /baseline/IP-DiskANN/dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/IP-DiskANN/dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.lib -------------------------------------------------------------------------------- /baseline/LM-DiskANN/dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.dll: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/LM-DiskANN/dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.dll -------------------------------------------------------------------------------- /baseline/LM-DiskANN/dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/LM-DiskANN/dependencies/windows/tcmalloc/v140/libtcmalloc_minimal.lib -------------------------------------------------------------------------------- /baseline/IP-DiskANN/dependencies/windows/dll/boost_date_time-vc141-mt-x64-1_70.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/IP-DiskANN/dependencies/windows/dll/boost_date_time-vc141-mt-x64-1_70.dll -------------------------------------------------------------------------------- /baseline/LM-DiskANN/dependencies/windows/dll/boost_date_time-vc141-mt-x64-1_70.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iDC-NEU/Greator/HEAD/baseline/LM-DiskANN/dependencies/windows/dll/boost_date_time-vc141-mt-x64-1_70.dll -------------------------------------------------------------------------------- /scripts/pre_dataset/make_tags.sh: -------------------------------------------------------------------------------- 1 | g++ make_tags.cpp -o mt 2 | 3 | dataset=sift 4 | indices_prefix=/data/linsy/Greator/scripts/indices/"$dataset"_R32 5 | mkdir $indices_prefix 6 | fname="$indices_prefix"/disk_init/_index_disk.index.tags 7 | dnum=950000 8 | ./mt $fname $dnum 9 | -------------------------------------------------------------------------------- /scripts/pre_dataset/extract_base.sh: 
-------------------------------------------------------------------------------- 1 | g++ extract_base.cpp -o eb 2 | dataset=sift 3 | datanum=1000000 4 | input_f=/data/linsy/Greator/scripts/dataset/"$dataset"/"$dataset"_base.fbin 5 | output_f=/data/linsy/Greator/scripts/dataset/"$dataset"/"$dataset"_base_95.fbin 6 | ./eb "$input_f" "$output_f" "$datanum" 7 | 8 | -------------------------------------------------------------------------------- /include/file_content.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef EXEC_ENV_OLS 4 | namespace diskann { 5 | struct FileContent { 6 | public: 7 | FileContent(void* content, size_t size) : _content(content), _size(size) { 8 | } 9 | 10 | void* _content; 11 | size_t _size; 12 | }; 13 | } // namespace diskann 14 | #endif 15 | -------------------------------------------------------------------------------- /scripts/pre_dataset/make_trace.sh: -------------------------------------------------------------------------------- 1 | g++ make_trace.cpp -o mtrace 2 | 3 | dataset=sift 4 | npts=950000 5 | delta=1000 #num_points_per_update 6 | filename=/data/linsy/Greator/scripts/trace/"$dataset"_trace_0.001 7 | update_iteration=5 8 | mkdir $filename 9 | 10 | ./mtrace $filename/_trace $npts $delta $update_iteration 11 | 12 | 13 | -------------------------------------------------------------------------------- /scripts/pre_dataset/topology_extraction.sh: -------------------------------------------------------------------------------- 1 | g++ topology_extraction.cpp -o te 2 | 3 | sector_len=4096 4 | new_prefix=/data/linsy/Greator/scripts/indices/"$dataset"_R"$new_R" 5 | ./te /data/linsy/Cout/dynamic/disk_glove_R34/disk_init/_index_disk.index /data/linsy/Cout/dynamic/disk_glove_R34/disk_init/_index_disk.index_with_only_nbrs "$sector_len" 6 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/file_content.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef EXEC_ENV_OLS 4 | namespace diskann { 5 | struct FileContent { 6 | public: 7 | FileContent(void* content, size_t size) : _content(content), _size(size) { 8 | } 9 | 10 | void* _content; 11 | size_t _size; 12 | }; 13 | } // namespace diskann 14 | #endif 15 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/file_content.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef EXEC_ENV_OLS 4 | namespace diskann { 5 | struct FileContent { 6 | public: 7 | FileContent(void* content, size_t size) : _content(content), _size(size) { 8 | } 9 | 10 | void* _content; 11 | size_t _size; 12 | }; 13 | } // namespace diskann 14 | #endif 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | MAINTAINER Changxu Wang 3 | 4 | RUN apt-get update -y 5 | RUN apt-get install -y g++ cmake libboost-dev libgoogle-perftools-dev 6 | 7 | COPY . /opt/nsg 8 | 9 | WORKDIR /opt/nsg 10 | 11 | RUN mkdir -p build && cd build && \ 12 | cmake -DCMAKE_BUILD_TYPE=Release .. && \ 13 | make -j $(nproc) 14 | -------------------------------------------------------------------------------- /include/windows_customizations.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #ifdef _WINDOWS 7 | #define DISKANN_DLLEXPORT __declspec(dllexport) 8 | #define DISKANN_DLLIMPORT __declspec(dllimport) 9 | #else 10 | #define DISKANN_DLLEXPORT 11 | #define DISKANN_DLLIMPORT 12 | #endif 13 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | MAINTAINER Changxu Wang 3 | 4 | RUN apt-get update -y 5 | RUN apt-get install -y g++ cmake libboost-dev libgoogle-perftools-dev 6 | 7 | COPY . /opt/nsg 8 | 9 | WORKDIR /opt/nsg 10 | 11 | RUN mkdir -p build && cd build && \ 12 | cmake -DCMAKE_BUILD_TYPE=Release .. && \ 13 | make -j $(nproc) 14 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | MAINTAINER Changxu Wang 3 | 4 | RUN apt-get update -y 5 | RUN apt-get install -y g++ cmake libboost-dev libgoogle-perftools-dev 6 | 7 | COPY . /opt/nsg 8 | 9 | WORKDIR /opt/nsg 10 | 11 | RUN mkdir -p build && cd build && \ 12 | cmake -DCMAKE_BUILD_TYPE=Release .. && \ 13 | make -j $(nproc) 14 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/windows_customizations.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #ifdef _WINDOWS 7 | #define DISKANN_DLLEXPORT __declspec(dllexport) 8 | #define DISKANN_DLLIMPORT __declspec(dllimport) 9 | #else 10 | #define DISKANN_DLLEXPORT 11 | #define DISKANN_DLLIMPORT 12 | #endif 13 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/windows_customizations.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #ifdef _WINDOWS 7 | #define DISKANN_DLLEXPORT __declspec(dllexport) 8 | #define DISKANN_DLLIMPORT __declspec(dllimport) 9 | #else 10 | #define DISKANN_DLLEXPORT 11 | #define DISKANN_DLLIMPORT 12 | #endif 13 | -------------------------------------------------------------------------------- /include/exceptions.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | #include 6 | 7 | namespace diskann { 8 | 9 | class NotImplementedException : public std::logic_error { 10 | public: 11 | NotImplementedException() 12 | : std::logic_error("Function not yet implemented.") { 13 | } 14 | }; 15 | } // namespace diskann 16 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/exceptions.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | #include 6 | 7 | namespace diskann { 8 | 9 | class NotImplementedException : public std::logic_error { 10 | public: 11 | NotImplementedException() 12 | : std::logic_error("Function not yet implemented.") { 13 | } 14 | }; 15 | } // namespace diskann 16 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/exceptions.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | #include 6 | 7 | namespace diskann { 8 | 9 | class NotImplementedException : public std::logic_error { 10 | public: 11 | NotImplementedException() 12 | : std::logic_error("Function not yet implemented.") { 13 | } 14 | }; 15 | } // namespace diskann 16 | -------------------------------------------------------------------------------- /src/dll/dllmain.cpp: -------------------------------------------------------------------------------- 1 | // dllmain.cpp : Defines the entry point for the DLL application. 2 | #include 3 | 4 | BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, 5 | LPVOID lpReserved) { 6 | switch (ul_reason_for_call) { 7 | case DLL_PROCESS_ATTACH: 8 | case DLL_THREAD_ATTACH: 9 | case DLL_THREAD_DETACH: 10 | case DLL_PROCESS_DETACH: 11 | break; 12 | } 13 | return TRUE; 14 | } 15 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/src/dll/dllmain.cpp: -------------------------------------------------------------------------------- 1 | // dllmain.cpp : Defines the entry point for the DLL application. 
2 | #include 3 | 4 | BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, 5 | LPVOID lpReserved) { 6 | switch (ul_reason_for_call) { 7 | case DLL_PROCESS_ATTACH: 8 | case DLL_THREAD_ATTACH: 9 | case DLL_THREAD_DETACH: 10 | case DLL_PROCESS_DETACH: 11 | break; 12 | } 13 | return TRUE; 14 | } 15 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/src/dll/dllmain.cpp: -------------------------------------------------------------------------------- 1 | // dllmain.cpp : Defines the entry point for the DLL application. 2 | #include 3 | 4 | BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, 5 | LPVOID lpReserved) { 6 | switch (ul_reason_for_call) { 7 | case DLL_PROCESS_ATTACH: 8 | case DLL_THREAD_ATTACH: 9 | case DLL_THREAD_DETACH: 10 | case DLL_PROCESS_DETACH: 11 | break; 12 | } 13 | return TRUE; 14 | } 15 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.c text 7 | *.h text 8 | 9 | # Declare files that will always have CRLF line endings on checkout. 10 | *.sln text eol=crlf 11 | 12 | # Denote all files that are truly binary and should not be modified. 13 | *.png binary 14 | *.jpg binary 15 | -------------------------------------------------------------------------------- /include/common_includes.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.c text 7 | *.h text 8 | 9 | # Declare files that will always have CRLF line endings on checkout. 10 | *.sln text eol=crlf 11 | 12 | # Denote all files that are truly binary and should not be modified. 13 | *.png binary 14 | *.jpg binary 15 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 
6 | *.c text 7 | *.h text 8 | 9 | # Declare files that will always have CRLF line endings on checkout. 10 | *.sln text eol=crlf 11 | 12 | # Denote all files that are truly binary and should not be modified. 13 | *.png binary 14 | *.jpg binary 15 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/common_includes.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/common_includes.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /include/logger.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "windows_customizations.h" 6 | 7 | namespace diskann { 8 | #if defined(DISKANN_DLL) 9 | extern std::basic_ostream cout; 10 | extern std::basic_ostream cerr; 11 | #else 12 | DISKANN_DLLIMPORT extern std::basic_ostream cout; 13 | DISKANN_DLLIMPORT extern std::basic_ostream cerr; 14 | #endif 15 | 16 | } // namespace diskann 17 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/logger.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "windows_customizations.h" 6 | 7 | namespace diskann { 8 | #if defined(DISKANN_DLL) 9 | extern std::basic_ostream cout; 10 | extern std::basic_ostream cerr; 11 | #else 12 | DISKANN_DLLIMPORT extern std::basic_ostream cout; 13 | DISKANN_DLLIMPORT extern std::basic_ostream cerr; 14 | #endif 15 | 16 | } // namespace diskann 17 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/logger.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "windows_customizations.h" 6 | 7 | namespace diskann { 8 | #if defined(DISKANN_DLL) 9 | extern std::basic_ostream cout; 10 | extern std::basic_ostream cerr; 11 | #else 12 | DISKANN_DLLIMPORT extern std::basic_ostream cout; 13 | DISKANN_DLLIMPORT extern std::basic_ostream cerr; 14 | #endif 15 | 16 | } // namespace diskann 17 | -------------------------------------------------------------------------------- /CONTRIBUTING.MD: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 6 | 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 
10 | -------------------------------------------------------------------------------- /scripts/pre_dataset/compute_knn.sh: -------------------------------------------------------------------------------- 1 | g++ compute_knn.cpp -fopenmp -o ck 2 | 3 | dataset=sift 4 | basefile=/data/linsy/Greator/scripts/dataset/"$dataset"/"$dataset"_base.fbin 5 | queryfile=/data/linsy/Greator/scripts/dataset/"$dataset"/"$dataset"_query.fbin 6 | gt_prefix=/data/linsy/Greator/scripts/dataset/"$dataset"/gt 7 | mkdir $gt_prefix 8 | # Loop over the numbers from 0 to 50 9 | for i in {0..50} 10 | do 11 | echo "当前轮次是 $i" 12 | 13 | #0~949999 14 | startid=$((0 + i * 1000)) 15 | endid=$((949999 + i * 1000)) 16 | gtfile=$gt_prefix/"$dataset"_gt_K10_"$i".fbin 17 | ./ck "$basefile" "$queryfile" "$gtfile" "$startid" "$endid" 18 | done 19 | 20 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/CONTRIBUTING.MD: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 6 | 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/CONTRIBUTING.MD: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions.
Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 6 | 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | -------------------------------------------------------------------------------- /tests/utils/int8_to_float.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char** argv) { 8 | if (argc != 3) { 9 | diskann::cout << argv[0] << " input_int8_bin output_float_bin" << std::endl; 10 | exit(-1); 11 | } 12 | 13 | int8_t* input; 14 | size_t npts, nd; 15 | diskann::load_bin(argv[1], input, npts, nd); 16 | float* output = new float[npts * nd]; 17 | diskann::convert_types(input, output, npts, nd); 18 | diskann::save_bin(argv[2], output, npts, nd); 19 | delete[] output; 20 | delete[] input; 21 | } 22 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 
3 | 4 | set(CMAKE_CXX_STANDARD 14) 5 | 6 | if(MSVC) 7 | add_subdirectory(dll) 8 | else() 9 | #file(GLOB CPP_SOURCES *.cpp) 10 | set(CPP_SOURCES ann_exception.cpp aux_utils.cpp index.cpp 11 | linux_aligned_file_reader.cpp math_utils.cpp memory_mapper.cpp 12 | partition_and_pq.cpp pq_flash_index.cpp logger.cpp distance.cpp 13 | utils.cpp v2/graph_delta.cpp v2/index_merger.cpp 14 | v2/merge_insert.cpp) 15 | add_library(${PROJECT_NAME} ${CPP_SOURCES}) 16 | add_library(${PROJECT_NAME}_s STATIC ${CPP_SOURCES}) 17 | endif() 18 | install() 19 | -------------------------------------------------------------------------------- /include/timer.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | 6 | namespace diskann { 7 | class Timer { 8 | typedef std::chrono::high_resolution_clock _clock; 9 | std::chrono::time_point<_clock> check_point; 10 | 11 | public: 12 | Timer() : check_point(_clock::now()) { 13 | } 14 | 15 | void reset() { 16 | check_point = _clock::now(); 17 | } 18 | 19 | long long elapsed() const { 20 | return std::chrono::duration_cast( 21 | _clock::now() - check_point) 22 | .count(); 23 | } 24 | }; 25 | } // namespace diskann 26 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/int8_to_float.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char** argv) { 8 | if (argc != 3) { 9 | diskann::cout << argv[0] << " input_int8_bin output_float_bin" << std::endl; 10 | exit(-1); 11 | } 12 | 13 | int8_t* input; 14 | size_t npts, nd; 15 | diskann::load_bin(argv[1], input, npts, nd); 16 | float* output = new float[npts * nd]; 17 | diskann::convert_types(input, output, npts, nd); 18 | diskann::save_bin(argv[2], output, npts, nd); 19 | delete[] output; 20 | delete[] input; 21 | } 22 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/int8_to_float.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char** argv) { 8 | if (argc != 3) { 9 | diskann::cout << argv[0] << " input_int8_bin output_float_bin" << std::endl; 10 | exit(-1); 11 | } 12 | 13 | int8_t* input; 14 | size_t npts, nd; 15 | diskann::load_bin(argv[1], input, npts, nd); 16 | float* output = new float[npts * nd]; 17 | diskann::convert_types(input, output, npts, nd); 18 | diskann::save_bin(argv[2], output, npts, nd); 19 | delete[] output; 20 | delete[] input; 21 | } 22 | -------------------------------------------------------------------------------- /tests/utils/uint8_to_float.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char** argv) { 8 | if (argc != 3) { 9 | diskann::cout << argv[0] << " input_uint8_bin output_float_bin" 10 | << std::endl; 11 | exit(-1); 12 | } 13 | 14 | uint8_t* input; 15 | size_t npts, nd; 16 | diskann::load_bin(argv[1], input, npts, nd); 17 | float* output = new float[npts * nd]; 18 | diskann::convert_types(input, output, npts, nd); 19 | diskann::save_bin(argv[2], output, npts, nd); 20 | delete[] output; 21 | delete[] input; 22 | } 23 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/timer.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | 6 | namespace diskann { 7 | class Timer { 8 | typedef std::chrono::high_resolution_clock _clock; 9 | std::chrono::time_point<_clock> check_point; 10 | 11 | public: 12 | Timer() : check_point(_clock::now()) { 13 | } 14 | 15 | void reset() { 16 | check_point = _clock::now(); 17 | } 18 | 19 | long long elapsed() const { 20 | return std::chrono::duration_cast( 21 | _clock::now() - check_point) 22 | .count(); 23 | } 24 | }; 25 | } // namespace diskann 26 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/timer.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | 6 | namespace diskann { 7 | class Timer { 8 | typedef std::chrono::high_resolution_clock _clock; 9 | std::chrono::time_point<_clock> check_point; 10 | 11 | public: 12 | Timer() : check_point(_clock::now()) { 13 | } 14 | 15 | void reset() { 16 | check_point = _clock::now(); 17 | } 18 | 19 | long long elapsed() const { 20 | return std::chrono::duration_cast( 21 | _clock::now() - check_point) 22 | .count(); 23 | } 24 | }; 25 | } // namespace diskann 26 | -------------------------------------------------------------------------------- /tests/utils/uint32_to_uint8.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char** argv) { 8 | if (argc != 3) { 9 | diskann::cout << argv[0] << " input_uint32_bin output_int8_bin" 10 | << std::endl; 11 | exit(-1); 12 | } 13 | 14 | uint32_t* input; 15 | size_t npts, nd; 16 | diskann::load_bin(argv[1], input, npts, nd); 17 | uint8_t* output = new uint8_t[npts * nd]; 18 | diskann::convert_types(input, output, npts, nd); 19 | diskann::save_bin(argv[2], output, npts, nd); 20 | delete[] output; 21 | delete[] input; 22 | } 23 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/uint8_to_float.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char** argv) { 8 | if (argc != 3) { 9 | diskann::cout << argv[0] << " input_uint8_bin output_float_bin" 10 | << std::endl; 11 | exit(-1); 12 | } 13 | 14 | uint8_t* input; 15 | size_t npts, nd; 16 | diskann::load_bin(argv[1], input, npts, nd); 17 | float* output = new float[npts * nd]; 18 | diskann::convert_types(input, output, npts, nd); 19 | diskann::save_bin(argv[2], output, npts, nd); 20 | delete[] output; 21 | delete[] input; 22 | } 23 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/uint8_to_float.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char** argv) { 8 | if (argc != 3) { 9 | diskann::cout << argv[0] << " input_uint8_bin output_float_bin" 10 | << std::endl; 11 | exit(-1); 12 | } 13 | 14 | uint8_t* input; 15 | size_t npts, nd; 16 | diskann::load_bin(argv[1], input, npts, nd); 17 | float* output = new float[npts * nd]; 18 | diskann::convert_types(input, output, npts, nd); 19 | diskann::save_bin(argv[2], output, npts, nd); 20 | delete[] output; 21 | delete[] input; 22 | } 23 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/uint32_to_uint8.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char** argv) { 8 | if (argc != 3) { 9 | diskann::cout << argv[0] << " input_uint32_bin output_int8_bin" 10 | << std::endl; 11 | exit(-1); 12 | } 13 | 14 | uint32_t* input; 15 | size_t npts, nd; 16 | diskann::load_bin(argv[1], input, npts, nd); 17 | uint8_t* output = new uint8_t[npts * nd]; 18 | diskann::convert_types(input, output, npts, nd); 19 | diskann::save_bin(argv[2], output, npts, nd); 20 | delete[] output; 21 | delete[] input; 22 | } 23 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/uint32_to_uint8.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char** argv) { 8 | if (argc != 3) { 9 | diskann::cout << argv[0] << " input_uint32_bin output_int8_bin" 10 | << std::endl; 11 | exit(-1); 12 | } 13 | 14 | uint32_t* input; 15 | size_t npts, nd; 16 | diskann::load_bin(argv[1], input, npts, nd); 17 | uint8_t* output = new uint8_t[npts * nd]; 18 | diskann::convert_types(input, output, npts, nd); 19 | diskann::save_bin(argv[2], output, npts, nd); 20 | delete[] output; 21 | delete[] input; 22 | } 23 | -------------------------------------------------------------------------------- /include/content_buf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef EXEC_ENV_OLS 4 | 5 | namespace diskann { 6 | class ContentBuf : public std::basic_streambuf { 7 | public: 8 | ContentBuf(char* p, size_t n) { 9 | setg(p, p, p + n); 10 | } 11 | 12 | virtual pos_type seekoff( 13 | off_type off, std::ios_base::seekdir dir, 14 | std::ios_base::openmode which = std::ios_base::in) { 15 | if (dir == std::ios_base::cur) 16 | gbump((int) off); 17 | else if (dir == std::ios_base::end) 18 | 
setg(eback(), egptr() + off, egptr()); 19 | else if (dir == std::ios_base::beg) 20 | setg(eback(), eback() + off, egptr()); 21 | return gptr() - eback(); 22 | } 23 | }; 24 | } // namespace diskann 25 | 26 | #endif -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/content_buf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef EXEC_ENV_OLS 4 | 5 | namespace diskann { 6 | class ContentBuf : public std::basic_streambuf { 7 | public: 8 | ContentBuf(char* p, size_t n) { 9 | setg(p, p, p + n); 10 | } 11 | 12 | virtual pos_type seekoff( 13 | off_type off, std::ios_base::seekdir dir, 14 | std::ios_base::openmode which = std::ios_base::in) { 15 | if (dir == std::ios_base::cur) 16 | gbump((int) off); 17 | else if (dir == std::ios_base::end) 18 | setg(eback(), egptr() + off, egptr()); 19 | else if (dir == std::ios_base::beg) 20 | setg(eback(), eback() + off, egptr()); 21 | return gptr() - eback(); 22 | } 23 | }; 24 | } // namespace diskann 25 | 26 | #endif -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/content_buf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef EXEC_ENV_OLS 4 | 5 | namespace diskann { 6 | class ContentBuf : public std::basic_streambuf { 7 | public: 8 | ContentBuf(char* p, size_t n) { 9 | setg(p, p, p + n); 10 | } 11 | 12 | virtual pos_type seekoff( 13 | off_type off, std::ios_base::seekdir dir, 14 | std::ios_base::openmode which = std::ios_base::in) { 15 | if (dir == std::ios_base::cur) 16 | gbump((int) off); 17 | else if (dir == std::ios_base::end) 18 | setg(eback(), egptr() + off, egptr()); 19 | else if (dir == std::ios_base::beg) 20 | setg(eback(), eback() + off, egptr()); 21 | return gptr() - eback(); 22 | } 23 | }; 24 | } // namespace diskann 25 | 26 | #endif 
-------------------------------------------------------------------------------- /scripts/build_indices.sh: -------------------------------------------------------------------------------- 1 | project_dir=/data/linsy/Greator 2 | # Part 1: build_disk_index_R32 for FreshDiskANN 3 | 4 | dataset=sift 5 | cindir="$project_dir"/scripts/dataset/"$dataset"/"$dataset"_base_95.fbin 6 | coutdir="$project_dir"/scripts/indices/"$dataset"_R32/disk_init 7 | mkdir $coutdir 8 | cd "$project_dir"/build && make -j 9 | cd "$project_dir"/run 10 | 11 | "$project_dir"/build/tests/build_disk_index float "$cindir" "$coutdir"/_index 32 128 98 98 64 l2 0 12 | 13 | # Part 2: make_tags for disk_index_R32 and disk_index_R34 14 | 15 | cd "$project_dir"/scripts/pre_dataset 16 | bash make_tags.sh 17 | 18 | # Part 3: process disk_index_R32 as disk_index_R34 for Greator (i.e., 33/34) 19 | 20 | cd "$project_dir"/scripts/pre_dataset 21 | bash process_index_for_diffR.sh 22 | 23 | 24 | -------------------------------------------------------------------------------- /scripts/pre_dataset/process_index_for_diffR.sh: -------------------------------------------------------------------------------- 1 | g++ process_index_for_diffR.cpp -o pifdr 2 | 3 | 4 | dataset=sift 5 | old_R=32 6 | new_R=34 7 | old_prefix=/data/linsy/Greator/scripts/indices/"$dataset"_R"$old_R" 8 | new_prefix=/data/linsy/Greator/scripts/indices/"$dataset"_R"$new_R" 9 | 10 | mkdir $new_prefix 11 | mkdir $new_prefix/disk_init/ 12 | 13 | cp -r "$old_prefix"/disk_init/ "$new_prefix"/ 14 | rm "$new_prefix"/disk_init/_index_disk.index 15 | 16 | 17 | ./pifdr "$old_prefix"/disk_init/_index_disk.index "$new_prefix"/disk_init/_index_disk.index "$new_R" float 18 | 19 | 20 | # Extract topology file 21 | 22 | g++ topology_extraction.cpp -o te 23 | sector_len=4096 24 | 25 | ./te "$new_prefix"/disk_init/_index_disk.index "$new_prefix"/disk_init/_index_disk.index.index_with_only_nbrs "$sector_len" 26 | 
-------------------------------------------------------------------------------- /include/Neighbor_Tag.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "tsl/robin_set.h" 12 | 13 | #include "parameters.h" 14 | 15 | namespace diskann { 16 | 17 | template 18 | struct Neighbor_Tag { 19 | TagT tag; 20 | float dist; 21 | 22 | Neighbor_Tag() = default; 23 | 24 | Neighbor_Tag(TagT tag, float dist) : tag{tag}, dist{dist} { 25 | } 26 | inline bool operator<(const Neighbor_Tag &other) const { 27 | return (dist < other.dist); 28 | } 29 | inline bool operator==(const Neighbor_Tag &other) const { 30 | return (tag == other.tag); 31 | } 32 | }; 33 | } // namespace diskann 34 | -------------------------------------------------------------------------------- /tests/utils/test_pq_from_pivots.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | 7 | #include "partition_and_pq.h" 8 | #include "util.h" 9 | 10 | // DEPRECATED: NEED TO REPROGRAM 11 | 12 | int main(int argc, char** argv) { 13 | if (argc != 5) { 14 | diskann::cout << argv[0] 15 | << "format: base_set " 16 | "num_clusters_per_chunk number_chunks " 17 | "prefix_for_working_file " 18 | << std::endl; 19 | exit(-1); 20 | } 21 | 22 | size_t num_centers = (size_t) strtol(argv[2], NULL, 10); 23 | size_t num_chunks = (size_t) strtol(argv[3], NULL, 10); 24 | generate_pq_data_from_pivots(argv[1], num_centers, num_chunks, argv[4]); 25 | } 26 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/Neighbor_Tag.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "tsl/robin_set.h" 12 | 13 | #include "parameters.h" 14 | 15 | namespace diskann { 16 | 17 | template 18 | struct Neighbor_Tag { 19 | TagT tag; 20 | float dist; 21 | 22 | Neighbor_Tag() = default; 23 | 24 | Neighbor_Tag(TagT tag, float dist) : tag{tag}, dist{dist} { 25 | } 26 | inline bool operator<(const Neighbor_Tag &other) const { 27 | return (dist < other.dist); 28 | } 29 | inline bool operator==(const Neighbor_Tag &other) const { 30 | return (tag == other.tag); 31 | } 32 | }; 33 | } // namespace diskann 34 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/Neighbor_Tag.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "tsl/robin_set.h" 12 | 13 | #include "parameters.h" 14 | 15 | namespace diskann { 16 | 17 | template 18 | struct Neighbor_Tag { 19 | TagT tag; 20 | float dist; 21 | 22 | Neighbor_Tag() = default; 
23 | 24 | Neighbor_Tag(TagT tag, float dist) : tag{tag}, dist{dist} { 25 | } 26 | inline bool operator<(const Neighbor_Tag &other) const { 27 | return (dist < other.dist); 28 | } 29 | inline bool operator==(const Neighbor_Tag &other) const { 30 | return (tag == other.tag); 31 | } 32 | }; 33 | } // namespace diskann 34 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/test_pq_from_pivots.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | 7 | #include "partition_and_pq.h" 8 | #include "util.h" 9 | 10 | // DEPRECATED: NEED TO REPROGRAM 11 | 12 | int main(int argc, char** argv) { 13 | if (argc != 5) { 14 | diskann::cout << argv[0] 15 | << "format: base_set " 16 | "num_clusters_per_chunk number_chunks " 17 | "prefix_for_working_file " 18 | << std::endl; 19 | exit(-1); 20 | } 21 | 22 | size_t num_centers = (size_t) strtol(argv[2], NULL, 10); 23 | size_t num_chunks = (size_t) strtol(argv[3], NULL, 10); 24 | generate_pq_data_from_pivots(argv[1], num_centers, num_chunks, argv[4]); 25 | } 26 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/test_pq_from_pivots.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | 7 | #include "partition_and_pq.h" 8 | #include "util.h" 9 | 10 | // DEPRECATED: NEED TO REPROGRAM 11 | 12 | int main(int argc, char** argv) { 13 | if (argc != 5) { 14 | diskann::cout << argv[0] 15 | << "format: base_set " 16 | "num_clusters_per_chunk number_chunks " 17 | "prefix_for_working_file " 18 | << std::endl; 19 | exit(-1); 20 | } 21 | 22 | size_t num_centers = (size_t) strtol(argv[2], NULL, 10); 23 | size_t num_chunks = (size_t) strtol(argv[3], NULL, 10); 24 | generate_pq_data_from_pivots(argv[1], num_centers, num_chunks, argv[4]); 25 | } 26 | -------------------------------------------------------------------------------- /include/memory_mapper.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #ifndef _WINDOWS 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #else 14 | #include 15 | #endif 16 | #include 17 | 18 | namespace diskann { 19 | class MemoryMapper { 20 | private: 21 | #ifndef _WINDOWS 22 | int _fd; 23 | #else 24 | HANDLE _bareFile; 25 | HANDLE _fd; 26 | 27 | #endif 28 | char* _buf; 29 | size_t _fileSize; 30 | const char* _fileName; 31 | 32 | public: 33 | MemoryMapper(const char* filename); 34 | MemoryMapper(const std::string& filename); 35 | 36 | char* getBuf(); 37 | size_t getFileSize(); 38 | 39 | ~MemoryMapper(); 40 | }; 41 | } // namespace diskann 42 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 
3 | 4 | set(CMAKE_CXX_STANDARD 14) 5 | 6 | find_package(TBB REQUIRED) 7 | 8 | 9 | include_directories(${TBB_INCLUDE_DIRS}) 10 | if(MSVC) 11 | add_subdirectory(dll) 12 | else() 13 | #file(GLOB CPP_SOURCES *.cpp) 14 | set(CPP_SOURCES ann_exception.cpp aux_utils.cpp index.cpp 15 | linux_aligned_file_reader.cpp math_utils.cpp memory_mapper.cpp 16 | partition_and_pq.cpp pq_flash_index.cpp logger.cpp distance.cpp 17 | utils.cpp v2/graph_delta.cpp v2/index_merger.cpp 18 | v2/merge_insert.cpp) 19 | add_library(${PROJECT_NAME} ${CPP_SOURCES}) 20 | add_library(${PROJECT_NAME}_s STATIC ${CPP_SOURCES}) 21 | 22 | target_link_libraries(${PROJECT_NAME} ${TBB_LIBRARIES}) 23 | target_link_libraries(${PROJECT_NAME}_s ${TBB_LIBRARIES}) 24 | endif() 25 | install() 26 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/memory_mapper.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #ifndef _WINDOWS 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #else 14 | #include 15 | #endif 16 | #include 17 | 18 | namespace diskann { 19 | class MemoryMapper { 20 | private: 21 | #ifndef _WINDOWS 22 | int _fd; 23 | #else 24 | HANDLE _bareFile; 25 | HANDLE _fd; 26 | 27 | #endif 28 | char* _buf; 29 | size_t _fileSize; 30 | const char* _fileName; 31 | 32 | public: 33 | MemoryMapper(const char* filename); 34 | MemoryMapper(const std::string& filename); 35 | 36 | char* getBuf(); 37 | size_t getFileSize(); 38 | 39 | ~MemoryMapper(); 40 | }; 41 | } // namespace diskann 42 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/memory_mapper.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 
2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #ifndef _WINDOWS 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #else 14 | #include 15 | #endif 16 | #include 17 | 18 | namespace diskann { 19 | class MemoryMapper { 20 | private: 21 | #ifndef _WINDOWS 22 | int _fd; 23 | #else 24 | HANDLE _bareFile; 25 | HANDLE _fd; 26 | 27 | #endif 28 | char* _buf; 29 | size_t _fileSize; 30 | const char* _fileName; 31 | 32 | public: 33 | MemoryMapper(const char* filename); 34 | MemoryMapper(const std::string& filename); 35 | 36 | char* getBuf(); 37 | size_t getFileSize(); 38 | 39 | ~MemoryMapper(); 40 | }; 41 | } // namespace diskann 42 | -------------------------------------------------------------------------------- /tests/utils/test_pq.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | #include "partition_and_pq.h" 7 | #include "util.h" 8 | 9 | // DEPRECATED: NEED TO REPROGRAM 10 | 11 | int main(int argc, char** argv) { 12 | if (argc != 6) { 13 | diskann::cout << argv[0] 14 | << " format: train_set " 15 | "num_clusters_per_chunk number_chunks " 16 | "max_reps prefix_for_working_directory " 17 | << std::endl; 18 | exit(-1); 19 | } 20 | 21 | size_t num_centers = (size_t) strtol(argv[2], NULL, 10); 22 | size_t num_chunks = (size_t) strtol(argv[3], NULL, 10); 23 | size_t max_reps = (size_t) strtol(argv[4], NULL, 10); 24 | 25 | generate_pq_pivots(argv[1], num_centers, num_chunks, max_reps, argv[5]); 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/test_pq.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include "partition_and_pq.h" 7 | #include "util.h" 8 | 9 | // DEPRECATED: NEED TO REPROGRAM 10 | 11 | int main(int argc, char** argv) { 12 | if (argc != 6) { 13 | diskann::cout << argv[0] 14 | << " format: train_set " 15 | "num_clusters_per_chunk number_chunks " 16 | "max_reps prefix_for_working_directory " 17 | << std::endl; 18 | exit(-1); 19 | } 20 | 21 | size_t num_centers = (size_t) strtol(argv[2], NULL, 10); 22 | size_t num_chunks = (size_t) strtol(argv[3], NULL, 10); 23 | size_t max_reps = (size_t) strtol(argv[4], NULL, 10); 24 | 25 | generate_pq_pivots(argv[1], num_centers, num_chunks, max_reps, argv[5]); 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/test_pq.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | #include "partition_and_pq.h" 7 | #include "util.h" 8 | 9 | // DEPRECATED: NEED TO REPROGRAM 10 | 11 | int main(int argc, char** argv) { 12 | if (argc != 6) { 13 | diskann::cout << argv[0] 14 | << " format: train_set " 15 | "num_clusters_per_chunk number_chunks " 16 | "max_reps prefix_for_working_directory " 17 | << std::endl; 18 | exit(-1); 19 | } 20 | 21 | size_t num_centers = (size_t) strtol(argv[2], NULL, 10); 22 | size_t num_chunks = (size_t) strtol(argv[3], NULL, 10); 23 | size_t max_reps = (size_t) strtol(argv[4], NULL, 10); 24 | 25 | generate_pq_pivots(argv[1], num_centers, num_chunks, max_reps, argv[5]); 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /include/v2/fs_allocator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "tsl/robin_map.h" 4 | #include "tsl/robin_set.h" 5 | #include 6 | #include 7 | #include 8 | 
#include 9 | #include 10 | 11 | namespace diskann { 12 | // cached allocator for fast aligned mallocs 13 | template 14 | class FixedSizeAlignedAllocator { 15 | public: 16 | // create aligned buffer with at least max_count * ndims elements 17 | FixedSizeAlignedAllocator(const uint32_t ndims, const uint32_t max_count); 18 | // destruct allocator, free mem 19 | ~FixedSizeAlignedAllocator(); 20 | // allocate ndims buffer 21 | T* allocate(); 22 | // deallocate ndims elements 23 | void deallocate(T* ptr); 24 | private: 25 | std::mutex lock; 26 | T* buf = nullptr; 27 | tsl::robin_set free_set; 28 | uint32_t count; 29 | }; 30 | } // namespace diskann -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/v2/fs_allocator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "tsl/robin_map.h" 4 | #include "tsl/robin_set.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace diskann { 12 | // cached allocator for fast aligned mallocs 13 | template 14 | class FixedSizeAlignedAllocator { 15 | public: 16 | // create aligned buffer with at least max_count * ndims elements 17 | FixedSizeAlignedAllocator(const uint32_t ndims, const uint32_t max_count); 18 | // destruct allocator, free mem 19 | ~FixedSizeAlignedAllocator(); 20 | // allocate ndims buffer 21 | T* allocate(); 22 | // deallocate ndims elements 23 | void deallocate(T* ptr); 24 | private: 25 | std::mutex lock; 26 | T* buf = nullptr; 27 | tsl::robin_set free_set; 28 | uint32_t count; 29 | }; 30 | } // namespace diskann -------------------------------------------------------------------------------- /baseline/IP-DiskANN/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 
3 | 4 | set(CMAKE_CXX_STANDARD 14) 5 | # 查找 TBB 库 6 | find_package(TBB REQUIRED) 7 | 8 | # 包含 TBB 头文件目录 9 | include_directories(${TBB_INCLUDE_DIRS}) 10 | if(MSVC) 11 | add_subdirectory(dll) 12 | else() 13 | #file(GLOB CPP_SOURCES *.cpp) 14 | set(CPP_SOURCES ann_exception.cpp aux_utils.cpp index.cpp 15 | linux_aligned_file_reader.cpp math_utils.cpp memory_mapper.cpp 16 | partition_and_pq.cpp pq_flash_index.cpp logger.cpp distance.cpp 17 | utils.cpp v2/graph_delta.cpp v2/index_merger.cpp 18 | v2/merge_insert.cpp) 19 | add_library(${PROJECT_NAME} ${CPP_SOURCES}) 20 | add_library(${PROJECT_NAME}_s STATIC ${CPP_SOURCES}) 21 | # 链接 TBB 库到动态库和静态库 22 | target_link_libraries(${PROJECT_NAME} ${TBB_LIBRARIES}) 23 | target_link_libraries(${PROJECT_NAME}_s ${TBB_LIBRARIES}) 24 | endif() 25 | install() 26 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/v2/fs_allocator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "tsl/robin_map.h" 4 | #include "tsl/robin_set.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace diskann { 12 | // cached allocator for fast aligned mallocs 13 | template 14 | class FixedSizeAlignedAllocator { 15 | public: 16 | // create aligned buffer with at least max_count * ndims elements 17 | FixedSizeAlignedAllocator(const uint32_t ndims, const uint32_t max_count); 18 | // destruct allocator, free mem 19 | ~FixedSizeAlignedAllocator(); 20 | // allocate ndims buffer 21 | T* allocate(); 22 | // deallocate ndims elements 23 | void deallocate(T* ptr); 24 | private: 25 | std::mutex lock; 26 | T* buf = nullptr; 27 | tsl::robin_set free_set; 28 | uint32_t count; 29 | }; 30 | } // namespace diskann -------------------------------------------------------------------------------- /include/ann_exception.h: -------------------------------------------------------------------------------- 1 | // Copyright 
(c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | #include 6 | #include "windows_customizations.h" 7 | 8 | #ifndef _WINDOWS 9 | #define __FUNCSIG__ __PRETTY_FUNCTION__ 10 | #endif 11 | 12 | namespace diskann { 13 | class ANNException { 14 | public: 15 | DISKANN_DLLEXPORT ANNException(const std::string& message, int errorCode); 16 | DISKANN_DLLEXPORT ANNException(const std::string& message, int errorCode, 17 | const std::string& funcSig, 18 | const std::string& fileName, 19 | unsigned int lineNum); 20 | 21 | DISKANN_DLLEXPORT std::string message() const; 22 | DISKANN_DLLEXPORT int errorCode() const; 23 | 24 | private: 25 | int _errorCode; 26 | std::string _message; 27 | std::string _funcSig; 28 | std::string _fileName; 29 | unsigned int _lineNum; 30 | }; 31 | } // namespace diskann 32 | -------------------------------------------------------------------------------- /tests/utils/prep_index_merger.sh: -------------------------------------------------------------------------------- 1 | BASE_PREFIX="/dev/shm/test/sample_base" 2 | MEM_PREFIX="/dev/shm/test/sample_mem" 3 | DELETE_LIST="/dev/shm/sample_deleted.tags" 4 | ONESHOT_PREFIX="/dev/shm/test/sample_oneshot" 5 | MERGED_PREFIX="/dev/shm/test/sample_merged" 6 | NUM_MEM_INDICES=5 7 | # copy tags from base -> base_index 8 | cp ${BASE_PREFIX}.tags ${BASE_PREFIX}_index_disk.index.tags 9 | cp ${ONESHOT_PREFIX}.tags ${ONESHOT_PREFIX}_index_disk.index.tags 10 | 11 | # copy tags file for mem indices 12 | for i in $(seq 1 $NUM_MEM_INDICES) 13 | do 14 | cp ${MEM_PREFIX}_${i}.tags ${MEM_PREFIX}_${i}_index.tags 15 | done 16 | 17 | # copy PQ stuff for merged from base 18 | cp ${BASE_PREFIX}_index_pq_pivots.bin ${MERGED_PREFIX}_index_pq_pivots.bin 19 | cp ${BASE_PREFIX}_index_pq_pivots.bin_centroid.bin ${MERGED_PREFIX}_index_pq_pivots.bin_centroid.bin 20 | cp ${BASE_PREFIX}_index_pq_pivots.bin_chunk_offsets.bin ${MERGED_PREFIX}_index_pq_pivots.bin_chunk_offsets.bin 21 | 
cp ${BASE_PREFIX}_index_pq_pivots.bin_rearrangement_perm.bin ${MERGED_PREFIX}_index_pq_pivots.bin_rearrangement_perm.bin 22 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/ann_exception.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | #include 6 | #include "windows_customizations.h" 7 | 8 | #ifndef _WINDOWS 9 | #define __FUNCSIG__ __PRETTY_FUNCTION__ 10 | #endif 11 | 12 | namespace diskann { 13 | class ANNException { 14 | public: 15 | DISKANN_DLLEXPORT ANNException(const std::string& message, int errorCode); 16 | DISKANN_DLLEXPORT ANNException(const std::string& message, int errorCode, 17 | const std::string& funcSig, 18 | const std::string& fileName, 19 | unsigned int lineNum); 20 | 21 | DISKANN_DLLEXPORT std::string message() const; 22 | DISKANN_DLLEXPORT int errorCode() const; 23 | 24 | private: 25 | int _errorCode; 26 | std::string _message; 27 | std::string _funcSig; 28 | std::string _fileName; 29 | unsigned int _lineNum; 30 | }; 31 | } // namespace diskann 32 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/ann_exception.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | #include 6 | #include "windows_customizations.h" 7 | 8 | #ifndef _WINDOWS 9 | #define __FUNCSIG__ __PRETTY_FUNCTION__ 10 | #endif 11 | 12 | namespace diskann { 13 | class ANNException { 14 | public: 15 | DISKANN_DLLEXPORT ANNException(const std::string& message, int errorCode); 16 | DISKANN_DLLEXPORT ANNException(const std::string& message, int errorCode, 17 | const std::string& funcSig, 18 | const std::string& fileName, 19 | unsigned int lineNum); 20 | 21 | DISKANN_DLLEXPORT std::string message() const; 22 | DISKANN_DLLEXPORT int errorCode() const; 23 | 24 | private: 25 | int _errorCode; 26 | std::string _message; 27 | std::string _funcSig; 28 | std::string _fileName; 29 | unsigned int _lineNum; 30 | }; 31 | } // namespace diskann 32 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/prep_index_merger.sh: -------------------------------------------------------------------------------- 1 | BASE_PREFIX="/dev/shm/test/sample_base" 2 | MEM_PREFIX="/dev/shm/test/sample_mem" 3 | DELETE_LIST="/dev/shm/sample_deleted.tags" 4 | ONESHOT_PREFIX="/dev/shm/test/sample_oneshot" 5 | MERGED_PREFIX="/dev/shm/test/sample_merged" 6 | NUM_MEM_INDICES=5 7 | # copy tags from base -> base_index 8 | cp ${BASE_PREFIX}.tags ${BASE_PREFIX}_index_disk.index.tags 9 | cp ${ONESHOT_PREFIX}.tags ${ONESHOT_PREFIX}_index_disk.index.tags 10 | 11 | # copy tags file for mem indices 12 | for i in $(seq 1 $NUM_MEM_INDICES) 13 | do 14 | cp ${MEM_PREFIX}_${i}.tags ${MEM_PREFIX}_${i}_index.tags 15 | done 16 | 17 | # copy PQ stuff for merged from base 18 | cp ${BASE_PREFIX}_index_pq_pivots.bin ${MERGED_PREFIX}_index_pq_pivots.bin 19 | cp ${BASE_PREFIX}_index_pq_pivots.bin_centroid.bin ${MERGED_PREFIX}_index_pq_pivots.bin_centroid.bin 20 | cp ${BASE_PREFIX}_index_pq_pivots.bin_chunk_offsets.bin ${MERGED_PREFIX}_index_pq_pivots.bin_chunk_offsets.bin 21 | cp ${BASE_PREFIX}_index_pq_pivots.bin_rearrangement_perm.bin 
${MERGED_PREFIX}_index_pq_pivots.bin_rearrangement_perm.bin 22 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/prep_index_merger.sh: -------------------------------------------------------------------------------- 1 | BASE_PREFIX="/dev/shm/test/sample_base" 2 | MEM_PREFIX="/dev/shm/test/sample_mem" 3 | DELETE_LIST="/dev/shm/sample_deleted.tags" 4 | ONESHOT_PREFIX="/dev/shm/test/sample_oneshot" 5 | MERGED_PREFIX="/dev/shm/test/sample_merged" 6 | NUM_MEM_INDICES=5 7 | # copy tags from base -> base_index 8 | cp ${BASE_PREFIX}.tags ${BASE_PREFIX}_index_disk.index.tags 9 | cp ${ONESHOT_PREFIX}.tags ${ONESHOT_PREFIX}_index_disk.index.tags 10 | 11 | # copy tags file for mem indices 12 | for i in $(seq 1 $NUM_MEM_INDICES) 13 | do 14 | cp ${MEM_PREFIX}_${i}.tags ${MEM_PREFIX}_${i}_index.tags 15 | done 16 | 17 | # copy PQ stuff for merged from base 18 | cp ${BASE_PREFIX}_index_pq_pivots.bin ${MERGED_PREFIX}_index_pq_pivots.bin 19 | cp ${BASE_PREFIX}_index_pq_pivots.bin_centroid.bin ${MERGED_PREFIX}_index_pq_pivots.bin_centroid.bin 20 | cp ${BASE_PREFIX}_index_pq_pivots.bin_chunk_offsets.bin ${MERGED_PREFIX}_index_pq_pivots.bin_chunk_offsets.bin 21 | cp ${BASE_PREFIX}_index_pq_pivots.bin_rearrangement_perm.bin ${MERGED_PREFIX}_index_pq_pivots.bin_rearrangement_perm.bin 22 | -------------------------------------------------------------------------------- /include/v2/delete_set.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "v2/graph_delta.h" 4 | #include "tsl/robin_map.h" 5 | #include "tsl/robin_set.h" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace diskann { 13 | class DeleteSet { 14 | public: 15 | // max # track_merge calls before `id` becomes free 16 | DeleteSet(uint32_t max_merges); 17 | ~DeleteSet(); 18 | 19 | // adds `id` to deleted set 20 | void add_delete(uint32_t id); 21 | 22 | // checks if 
`id` is in delete set 23 | bool is_dead(uint32_t id); 24 | void batch_is_dead(const uint32_t *ids, bool* dead, const uint32_t count); 25 | 26 | // track merge + release merged nodes 27 | void merge_start(); 28 | 29 | // returns nodes 30 | std::vector track_merge(); 31 | private: 32 | tsl::robin_map *primary = nullptr; 33 | tsl::robin_map *secondary = nullptr; 34 | uint32_t max_merges; 35 | std::mutex lock; 36 | }; 37 | } // namespace diskann -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/v2/delete_set.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "v2/graph_delta.h" 4 | #include "tsl/robin_map.h" 5 | #include "tsl/robin_set.h" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace diskann { 13 | class DeleteSet { 14 | public: 15 | // max # track_merge calls before `id` becomes free 16 | DeleteSet(uint32_t max_merges); 17 | ~DeleteSet(); 18 | 19 | // adds `id` to deleted set 20 | void add_delete(uint32_t id); 21 | 22 | // checks if `id` is in delete set 23 | bool is_dead(uint32_t id); 24 | void batch_is_dead(const uint32_t *ids, bool* dead, const uint32_t count); 25 | 26 | // track merge + release merged nodes 27 | void merge_start(); 28 | 29 | // returns nodes 30 | std::vector track_merge(); 31 | private: 32 | tsl::robin_map *primary = nullptr; 33 | tsl::robin_map *secondary = nullptr; 34 | uint32_t max_merges; 35 | std::mutex lock; 36 | }; 37 | } // namespace diskann -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/v2/delete_set.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "v2/graph_delta.h" 4 | #include "tsl/robin_map.h" 5 | #include "tsl/robin_set.h" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace diskann { 13 | class DeleteSet { 
14 | public: 15 | // max # track_merge calls before `id` becomes free 16 | DeleteSet(uint32_t max_merges); 17 | ~DeleteSet(); 18 | 19 | // adds `id` to deleted set 20 | void add_delete(uint32_t id); 21 | 22 | // checks if `id` is in delete set 23 | bool is_dead(uint32_t id); 24 | void batch_is_dead(const uint32_t *ids, bool* dead, const uint32_t count); 25 | 26 | // track merge + release merged nodes 27 | void merge_start(); 28 | 29 | // returns nodes 30 | std::vector track_merge(); 31 | private: 32 | tsl::robin_map *primary = nullptr; 33 | tsl::robin_map *secondary = nullptr; 34 | uint32_t max_merges; 35 | std::mutex lock; 36 | }; 37 | } // namespace diskann -------------------------------------------------------------------------------- /include/tsl/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Thibaut Goetghebuer-Planchon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/linux_aligned_file_reader.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | #ifndef _WINDOWS 6 | 7 | #include "aligned_file_reader.h" 8 | 9 | class LinuxAlignedFileReader : public AlignedFileReader { 10 | private: 11 | uint64_t file_sz; 12 | FileHandle file_desc; 13 | io_context_t bad_ctx = (io_context_t) -1; 14 | 15 | public: 16 | LinuxAlignedFileReader(); 17 | ~LinuxAlignedFileReader(); 18 | 19 | IOContext &get_ctx(); 20 | 21 | // register thread-id for a context 22 | void register_thread(); 23 | 24 | // de-register thread-id for a context 25 | void deregister_thread(); 26 | 27 | void deregister_all_threads(); 28 | 29 | // Open & close ops 30 | // Blocking calls 31 | void open(const std::string &fname, bool enable_writes, bool enable_create); 32 | void close(); 33 | 34 | // process batch of aligned requests in parallel 35 | // NOTE :: blocking call 36 | void read(std::vector &read_reqs, IOContext &ctx, 37 | bool async = false); 38 | 39 | void sequential_write(AlignedRead &write_req, IOContext &ctx); 40 | }; 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/tsl/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Thibaut Goetghebuer-Planchon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and 
associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/tsl/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Thibaut Goetghebuer-Planchon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/align_query_file.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "aux_utils.h" 17 | #include "utils.h" 18 | 19 | #ifndef _WINDOWS 20 | #include 21 | #include 22 | #include 23 | #endif 24 | 25 | #include "memory_mapper.h" 26 | 27 | int main(int argc, char** argv) { 28 | if (argc < 3) { 29 | diskann::cout << "Correct usage : " << argv[0] 30 | << " " << std::endl; 31 | exit(-1); 32 | } 33 | 34 | std::string input(argv[1]); 35 | std::string output(argv[2]); 36 | 37 | float* data = nullptr; 38 | size_t in_num, in_dim, in_aligned_dim; 39 | diskann::load_aligned_bin(input.c_str(), data, in_num, in_dim, 40 | in_aligned_dim); 41 | diskann::save_bin(output + ".bin", data, in_num, in_aligned_dim); 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/align_query_file.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "aux_utils.h" 17 | #include "utils.h" 18 | 19 | #ifndef _WINDOWS 20 | #include 21 | #include 22 | #include 23 | #endif 24 | 25 | #include "memory_mapper.h" 26 | 27 | int main(int argc, char** argv) { 28 | if (argc < 3) { 29 | diskann::cout << "Correct usage : " << argv[0] 30 | << " " << std::endl; 31 | exit(-1); 32 | } 33 | 34 | std::string input(argv[1]); 35 | std::string output(argv[2]); 36 | 37 | float* data = nullptr; 38 | size_t in_num, in_dim, in_aligned_dim; 39 | diskann::load_aligned_bin(input.c_str(), data, in_num, in_dim, 40 | in_aligned_dim); 41 | diskann::save_bin(output + ".bin", data, in_num, in_aligned_dim); 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/align_query_file.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "aux_utils.h" 17 | #include "utils.h" 18 | 19 | #ifndef _WINDOWS 20 | #include 21 | #include 22 | #include 23 | #endif 24 | 25 | #include "memory_mapper.h" 26 | 27 | int main(int argc, char** argv) { 28 | if (argc < 3) { 29 | diskann::cout << "Correct usage : " << argv[0] 30 | << " " << std::endl; 31 | exit(-1); 32 | } 33 | 34 | std::string input(argv[1]); 35 | std::string output(argv[2]); 36 | 37 | float* data = nullptr; 38 | size_t in_num, in_dim, in_aligned_dim; 39 | diskann::load_aligned_bin(input.c_str(), data, in_num, in_dim, 40 | in_aligned_dim); 41 | diskann::save_bin(output + ".bin", data, in_num, in_aligned_dim); 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /include/linux_aligned_file_reader.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | #ifndef _WINDOWS 6 | 7 | #include "aligned_file_reader.h" 8 | 9 | class LinuxAlignedFileReader : public AlignedFileReader { 10 | private: 11 | uint64_t file_sz; 12 | FileHandle file_desc; 13 | io_context_t bad_ctx = (io_context_t) -1; 14 | 15 | public: 16 | LinuxAlignedFileReader(); 17 | ~LinuxAlignedFileReader(); 18 | 19 | IOContext &get_ctx(); 20 | 21 | // register thread-id for a context 22 | void register_thread(); 23 | 24 | // de-register thread-id for a context 25 | void deregister_thread(); 26 | 27 | void deregister_all_threads(); 28 | 29 | // Open & close ops 30 | // Blocking calls 31 | void open(const std::string &fname, bool enable_writes, bool enable_create); 32 | void close(); 33 | 34 | // process batch of aligned requests in parallel 35 | // NOTE :: blocking call 36 | void read(std::vector &read_reqs, IOContext &ctx, 37 | bool async = false); 38 | 39 | void sequential_write(AlignedRead &write_req, IOContext &ctx); 40 | void write(std::vector &write_reqs, IOContext &ctx); 41 | }; 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | DiskANN 3 | 4 | MIT License 5 | 6 | Copyright (c) Microsoft Corporation. 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 
17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE 25 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/linux_aligned_file_reader.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | #ifndef _WINDOWS 6 | 7 | #include "aligned_file_reader.h" 8 | 9 | class LinuxAlignedFileReader : public AlignedFileReader { 10 | private: 11 | uint64_t file_sz; 12 | FileHandle file_desc; 13 | io_context_t bad_ctx = (io_context_t) -1; 14 | 15 | public: 16 | LinuxAlignedFileReader(); 17 | ~LinuxAlignedFileReader(); 18 | 19 | IOContext &get_ctx(); 20 | 21 | // register thread-id for a context 22 | void register_thread(); 23 | 24 | // de-register thread-id for a context 25 | void deregister_thread(); 26 | 27 | void deregister_all_threads(); 28 | 29 | // Open & close ops 30 | // Blocking calls 31 | void open(const std::string &fname, bool enable_writes, bool enable_create); 32 | void close(); 33 | 34 | // process batch of aligned requests in parallel 35 | // NOTE :: blocking call 36 | void read(std::vector &read_reqs, IOContext &ctx, 37 | bool async = false); 38 | 39 | void sequential_write(AlignedRead &write_req, IOContext &ctx); 40 | void write(std::vector &write_reqs, IOContext &ctx); 41 | }; 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/LICENSE: 
-------------------------------------------------------------------------------- 1 | 2 | DiskANN 3 | 4 | MIT License 5 | 6 | Copyright (c) Microsoft Corporation. 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE 25 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | DiskANN 3 | 4 | MIT License 5 | 6 | Copyright (c) Microsoft Corporation.
7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE 25 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | This algorithm builds upon [code for NSG](https://github.com/ZJULearning/nsg), commit: 335e8e, licensed under the following terms.
2 | 3 | MIT License 4 | 5 | Copyright (c) 2018 Cong Fu, Changxu Wang, Deng Cai 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/NOTICE.txt: -------------------------------------------------------------------------------- 1 | This algorithm builds upon [code for NSG](https://github.com/ZJULearning/nsg), commit: 335e8e, licensed under the following terms.
2 | 3 | MIT License 4 | 5 | Copyright (c) 2018 Cong Fu, Changxu Wang, Deng Cai 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/NOTICE.txt: -------------------------------------------------------------------------------- 1 | This algorithm builds upon [code for NSG](https://github.com/ZJULearning/nsg), commit: 335e8e, licensed under the following terms.
2 | 3 | MIT License 4 | 5 | Copyright (c) 2018 Cong Fu, Changxu Wang, Deng Cai 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /tests/utils/merge_shards.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "aux_utils.h" 14 | #include "cached_io.h" 15 | #include "utils.h" 16 | 17 | int main(int argc, char **argv) { 18 | if (argc != 9) { 19 | diskann::cout 20 | << argv[0] 21 | << " vamana_index_prefix[1] vamana_index_suffix[2] idmaps_prefix[3] " 22 | "idmaps_suffix[4] n_shards[5] max_degree[6] output_vamana_path[7] " 23 | "output_medoids_path[8]" 24 | << std::endl; 25 | exit(-1); 26 | } 27 | 28 | std::string vamana_prefix(argv[1]); 29 | std::string vamana_suffix(argv[2]); 30 | std::string idmaps_prefix(argv[3]); 31 | std::string idmaps_suffix(argv[4]); 32 | _u64 nshards = (_u64) std::atoi(argv[5]); 33 | _u32 max_degree = (_u64) std::atoi(argv[6]); 34 | std::string output_index(argv[7]); 35 | std::string output_medoids(argv[8]); 36 | 37 | return diskann::merge_shards(vamana_prefix, vamana_suffix, idmaps_prefix, 38 | idmaps_suffix, nshards, max_degree, output_index, 39 | output_medoids); 40 | } 41 | -------------------------------------------------------------------------------- /src/ann_exception.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 |
4 | #include "ann_exception.h"
5 | #include <sstream>
6 |
7 | namespace diskann {
8 |   // Creates an exception holding only a message and an error code; the
9 |   // function signature and file name start empty and the line number is 0.
10 |   ANNException::ANNException(const std::string& message, int errorCode)
11 |       : _errorCode(errorCode), _message(message), _funcSig(""), _fileName(""),
12 |         _lineNum(0) {
13 |   }
14 |
15 |   // Creates an exception that additionally records the function signature,
16 |   // source file name and line number passed by the caller.
17 |   ANNException::ANNException(const std::string& message, int errorCode,
18 |                              const std::string& funcSig,
19 |                              const std::string& fileName, unsigned lineNum)
20 |       : ANNException(message, errorCode) {
21 |     _funcSig = funcSig;
22 |     _fileName = fileName;
23 |     _lineNum = lineNum;
24 |   }
25 |
26 |   // Formats the exception as one human-readable string:
27 |   //   "Exception: <message>"
28 |   //   + ". occurred at: <funcSig>"                         if funcSig is set
29 |   //   + " defined in file: <fileName> at line: <lineNum>"  if both are set
30 |   //   + ". OS error code: <errorCode, hexadecimal>"        unless it is -1
31 |   std::string ANNException::message() const {
32 |     std::stringstream sstream;
33 |
34 |     sstream << "Exception: " << _message;
35 |     if (_funcSig != "")
36 |       sstream << ". occurred at: " << _funcSig;
37 |     if (_fileName != "" && _lineNum != 0)
38 |       sstream << " defined in file: " << _fileName << " at line: " << _lineNum;
39 |     if (_errorCode != -1)
40 |       sstream << ". OS error code: " << std::hex << _errorCode;
41 |
42 |     return sstream.str();
43 |   }
44 |
45 |   // Returns the stored error code.
46 |   int ANNException::errorCode() const {
47 |     return _errorCode;
48 |   }
49 |
50 | }  // namespace diskann
51 |
--------------------------------------------------------------------------------
/baseline/IP-DiskANN/tests/utils/merge_shards.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT license.
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "aux_utils.h" 14 | #include "cached_io.h" 15 | #include "utils.h" 16 | 17 | int main(int argc, char **argv) { 18 | if (argc != 9) { 19 | diskann::cout 20 | << argv[0] 21 | << " vamana_index_prefix[1] vamana_index_suffix[2] idmaps_prefix[3] " 22 | "idmaps_suffix[4] n_shards[5] max_degree[6] output_vamana_path[7] " 23 | "output_medoids_path[8]" 24 | << std::endl; 25 | exit(-1); 26 | } 27 | 28 | std::string vamana_prefix(argv[1]); 29 | std::string vamana_suffix(argv[2]); 30 | std::string idmaps_prefix(argv[3]); 31 | std::string idmaps_suffix(argv[4]); 32 | _u64 nshards = (_u64) std::atoi(argv[5]); 33 | _u32 max_degree = (_u64) std::atoi(argv[6]); 34 | std::string output_index(argv[7]); 35 | std::string output_medoids(argv[8]); 36 | 37 | return diskann::merge_shards(vamana_prefix, vamana_suffix, idmaps_prefix, 38 | idmaps_suffix, nshards, max_degree, output_index, 39 | output_medoids); 40 | } 41 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/merge_shards.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "aux_utils.h" 14 | #include "cached_io.h" 15 | #include "utils.h" 16 | 17 | int main(int argc, char **argv) { 18 | if (argc != 9) { 19 | diskann::cout 20 | << argv[0] 21 | << " vamana_index_prefix[1] vamana_index_suffix[2] idmaps_prefix[3] " 22 | "idmaps_suffix[4] n_shards[5] max_degree[6] output_vamana_path[7] " 23 | "output_medoids_path[8]" 24 | << std::endl; 25 | exit(-1); 26 | } 27 | 28 | std::string vamana_prefix(argv[1]); 29 | std::string vamana_suffix(argv[2]); 30 | std::string idmaps_prefix(argv[3]); 31 | std::string idmaps_suffix(argv[4]); 32 | _u64 nshards = (_u64) std::atoi(argv[5]); 33 | _u32 max_degree = (_u64) std::atoi(argv[6]); 34 | std::string output_index(argv[7]); 35 | std::string output_medoids(argv[8]); 36 | 37 | return diskann::merge_shards(vamana_prefix, vamana_suffix, idmaps_prefix, 38 | idmaps_suffix, nshards, max_degree, output_index, 39 | output_medoids); 40 | } 41 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/src/ann_exception.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 |
4 | #include "ann_exception.h"
5 | #include <sstream>
6 |
7 | namespace diskann {
8 |   // Creates an exception holding only a message and an error code; the
9 |   // function signature and file name start empty and the line number is 0.
10 |   ANNException::ANNException(const std::string& message, int errorCode)
11 |       : _errorCode(errorCode), _message(message), _funcSig(""), _fileName(""),
12 |         _lineNum(0) {
13 |   }
14 |
15 |   // Creates an exception that additionally records the function signature,
16 |   // source file name and line number passed by the caller.
17 |   ANNException::ANNException(const std::string& message, int errorCode,
18 |                              const std::string& funcSig,
19 |                              const std::string& fileName, unsigned lineNum)
20 |       : ANNException(message, errorCode) {
21 |     _funcSig = funcSig;
22 |     _fileName = fileName;
23 |     _lineNum = lineNum;
24 |   }
25 |
26 |   // Formats the exception as one human-readable string:
27 |   //   "Exception: <message>"
28 |   //   + ". occurred at: <funcSig>"                         if funcSig is set
29 |   //   + " defined in file: <fileName> at line: <lineNum>"  if both are set
30 |   //   + ". OS error code: <errorCode, hexadecimal>"        unless it is -1
31 |   std::string ANNException::message() const {
32 |     std::stringstream sstream;
33 |
34 |     sstream << "Exception: " << _message;
35 |     if (_funcSig != "")
36 |       sstream << ". occurred at: " << _funcSig;
37 |     if (_fileName != "" && _lineNum != 0)
38 |       sstream << " defined in file: " << _fileName << " at line: " << _lineNum;
39 |     if (_errorCode != -1)
40 |       sstream << ". OS error code: " << std::hex << _errorCode;
41 |
42 |     return sstream.str();
43 |   }
44 |
45 |   // Returns the stored error code.
46 |   int ANNException::errorCode() const {
47 |     return _errorCode;
48 |   }
49 |
50 | }  // namespace diskann
51 |
--------------------------------------------------------------------------------
/baseline/LM-DiskANN/src/ann_exception.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT license.
3 |
4 | #include "ann_exception.h"
5 | #include <sstream>
6 |
7 | namespace diskann {
8 |   // Creates an exception holding only a message and an error code; the
9 |   // function signature and file name start empty and the line number is 0.
10 |   ANNException::ANNException(const std::string& message, int errorCode)
11 |       : _errorCode(errorCode), _message(message), _funcSig(""), _fileName(""),
12 |         _lineNum(0) {
13 |   }
14 |
15 |   // Creates an exception that additionally records the function signature,
16 |   // source file name and line number passed by the caller.
17 |   ANNException::ANNException(const std::string& message, int errorCode,
18 |                              const std::string& funcSig,
19 |                              const std::string& fileName, unsigned lineNum)
20 |       : ANNException(message, errorCode) {
21 |     _funcSig = funcSig;
22 |     _fileName = fileName;
23 |     _lineNum = lineNum;
24 |   }
25 |
26 |   // Formats the exception as one human-readable string:
27 |   //   "Exception: <message>"
28 |   //   + ". occurred at: <funcSig>"                         if funcSig is set
29 |   //   + " defined in file: <fileName> at line: <lineNum>"  if both are set
30 |   //   + ". OS error code: <errorCode, hexadecimal>"        unless it is -1
31 |   std::string ANNException::message() const {
32 |     std::stringstream sstream;
33 |
34 |     sstream << "Exception: " << _message;
35 |     if (_funcSig != "")
36 |       sstream << ". occurred at: " << _funcSig;
37 |     if (_fileName != "" && _lineNum != 0)
38 |       sstream << " defined in file: " << _fileName << " at line: " << _lineNum;
39 |     if (_errorCode != -1)
40 |       sstream << ". OS error code: " << std::hex << _errorCode;
41 |
42 |     return sstream.str();
43 |   }
44 |
45 |   // Returns the stored error code.
46 |   int ANNException::errorCode() const {
47 |     return _errorCode;
48 |   }
49 |
50 | }  // namespace diskann
51 |
--------------------------------------------------------------------------------
/tests/utils/test_partitioning.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT license.
3 | 4 | #include 5 | #include 6 | #include "partition_and_pq.h" 7 | 8 | // DEPRECATED: NEED TO REPROGRAM 9 | 10 | int main(int argc, char** argv) { 11 | auto s = std::chrono::high_resolution_clock::now(); 12 | 13 | if (argc != 8) { 14 | diskann::cout << argv[0] 15 | << " format: data type base_set train_set " 16 | "num_clusters " 17 | "max_reps prefix_for_working_directory k_base " 18 | << std::endl; 19 | exit(-1); 20 | } 21 | size_t num_clusters = std::atoi(argv[4]); 22 | size_t max_reps = std::atoi(argv[5]); 23 | size_t k_base = std::atoi(argv[7]); 24 | if (std::string(argv[1]) == std::string("float")) 25 | partition(argv[2], argv[3], num_clusters, max_reps, argv[6], k_base); 26 | else if (std::string(argv[1]) == std::string("int8")) 27 | partition(argv[2], argv[3], num_clusters, max_reps, argv[6], 28 | k_base); 29 | else if (std::string(argv[1]) == std::string("uint8")) 30 | partition(argv[2], argv[3], num_clusters, max_reps, argv[6], 31 | k_base); 32 | else 33 | diskann::cout << "unsupported data format. 
use float/int8/uint8" 34 | << std::endl; 35 | } 36 | -------------------------------------------------------------------------------- /include/v2/graph_delta.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace diskann { 10 | class GraphDelta { 11 | public: 12 | GraphDelta(const uint32_t offset, const uint32_t max_nodes); 13 | // inserts node `id` into graph with `nhood` as neighbors 14 | // SUCCEEDS ONLY IF `id` belongs to the range [offset, offset + max_nodes] 15 | void insert_vector(const uint32_t id, const uint32_t*nhood, const uint32_t nnbrs); 16 | 17 | // adds required back-edges from `srcs` to `dest` 18 | void inter_insert(const uint32_t dest, const uint32_t* srcs, const uint32_t src_count); 19 | 20 | // get nhood for single ID 21 | const std::vector get_nhood(const uint32_t id); 22 | 23 | void rename_edges(const tsl::robin_map& rename_map); 24 | void rename_edges(const std::function &rename_func); 25 | std::vector> graph; 26 | private: 27 | bool is_relevant(const uint32_t id); 28 | // in-memory graph 29 | 30 | // locks to access nodes in graph 31 | std::unique_ptr locks; 32 | // max nodes 33 | uint32_t offset; 34 | uint32_t max_nodes; 35 | // id 'n' nhood located at graph[n - offset] if offset <= n <= offset + max_nodes 36 | }; 37 | }; 38 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/v2/graph_delta.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace diskann { 10 | class GraphDelta { 11 | public: 12 | GraphDelta(const uint32_t offset, const uint32_t max_nodes); 13 | // inserts node `id` into graph with `nhood` as neighbors 14 | // SUCCEEDS ONLY IF `id` belongs to the range [offset, offset + max_nodes] 15 | void 
insert_vector(const uint32_t id, const uint32_t*nhood, const uint32_t nnbrs); 16 | 17 | // adds required back-edges from `srcs` to `dest` 18 | void inter_insert(const uint32_t dest, const uint32_t* srcs, const uint32_t src_count); 19 | 20 | // get nhood for single ID 21 | const std::vector get_nhood(const uint32_t id); 22 | 23 | void rename_edges(const tsl::robin_map& rename_map); 24 | void rename_edges(const std::function &rename_func); 25 | private: 26 | bool is_relevant(const uint32_t id); 27 | // in-memory graph 28 | std::vector> graph; 29 | // locks to access nodes in graph 30 | std::unique_ptr locks; 31 | // max nodes 32 | uint32_t offset; 33 | uint32_t max_nodes; 34 | // id 'n' nhood located at graph[n - offset] if offset <= n <= offset + max_nodes 35 | }; 36 | }; 37 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/test_partitioning.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 |
4 | #include
5 | #include
6 | #include "partition_and_pq.h"
7 |
8 | // DEPRECATED: NEED TO REPROGRAM
9 |
10 | // Command-line driver for partition<T>() (declared outside this file).
11 | // argv: [1] data type (float/int8/uint8), [2] base_set, [3] train_set,
12 | //       [4] num_clusters, [5] max_reps, [6] working-directory prefix,
13 | //       [7] k_base.
14 | int main(int argc, char** argv) {
15 |   if (argc != 8) {
16 |     diskann::cout << argv[0]
17 |                   << " format: data type base_set train_set "
18 |                      "num_clusters "
19 |                      "max_reps prefix_for_working_directory k_base "
20 |                   << std::endl;
21 |     exit(-1);
22 |   }
23 |   size_t num_clusters = std::atoi(argv[4]);
24 |   size_t max_reps = std::atoi(argv[5]);
25 |   size_t k_base = std::atoi(argv[7]);
26 |   // The data-type string selects the element type passed to partition.
27 |   if (std::string(argv[1]) == std::string("float"))
28 |     partition<float>(argv[2], argv[3], num_clusters, max_reps, argv[6], k_base);
29 |   else if (std::string(argv[1]) == std::string("int8"))
30 |     partition<int8_t>(argv[2], argv[3], num_clusters, max_reps, argv[6],
31 |                       k_base);
32 |   else if (std::string(argv[1]) == std::string("uint8"))
33 |     partition<uint8_t>(argv[2], argv[3], num_clusters, max_reps, argv[6],
34 |                        k_base);
35 |   else
36 |     diskann::cout << "unsupported data format. use float/int8/uint8"
37 |                   << std::endl;
38 | }
39 |
--------------------------------------------------------------------------------
/baseline/LM-DiskANN/tests/utils/test_partitioning.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT license.
3 | 4 | #include 5 | #include 6 | #include "partition_and_pq.h" 7 | 8 | // DEPRECATED: NEED TO REPROGRAM 9 | 10 | int main(int argc, char** argv) { 11 | auto s = std::chrono::high_resolution_clock::now(); 12 | 13 | if (argc != 8) { 14 | diskann::cout << argv[0] 15 | << " format: data type base_set train_set " 16 | "num_clusters " 17 | "max_reps prefix_for_working_directory k_base " 18 | << std::endl; 19 | exit(-1); 20 | } 21 | size_t num_clusters = std::atoi(argv[4]); 22 | size_t max_reps = std::atoi(argv[5]); 23 | size_t k_base = std::atoi(argv[7]); 24 | if (std::string(argv[1]) == std::string("float")) 25 | partition(argv[2], argv[3], num_clusters, max_reps, argv[6], k_base); 26 | else if (std::string(argv[1]) == std::string("int8")) 27 | partition(argv[2], argv[3], num_clusters, max_reps, argv[6], 28 | k_base); 29 | else if (std::string(argv[1]) == std::string("uint8")) 30 | partition(argv[2], argv[3], num_clusters, max_reps, argv[6], 31 | k_base); 32 | else 33 | diskann::cout << "unsupported data format. 
use float/int8/uint8" 34 | << std::endl; 35 | } 36 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/v2/graph_delta.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace diskann { 10 | class GraphDelta { 11 | public: 12 | GraphDelta(const uint32_t offset, const uint32_t max_nodes); 13 | // inserts node `id` into graph with `nhood` as neighbors 14 | // SUCCEEDS ONLY IF `id` belongs to the range [offset, offset + max_nodes] 15 | void insert_vector(const uint32_t id, const uint32_t*nhood, const uint32_t nnbrs); 16 | 17 | // adds required back-edges from `srcs` to `dest` 18 | void inter_insert(const uint32_t dest, const uint32_t* srcs, const uint32_t src_count); 19 | 20 | // get nhood for single ID 21 | const std::vector get_nhood(const uint32_t id); 22 | 23 | void rename_edges(const tsl::robin_map& rename_map); 24 | void rename_edges(const std::function &rename_func); 25 | std::vector> graph; 26 | private: 27 | bool is_relevant(const uint32_t id); 28 | // in-memory graph 29 | 30 | // locks to access nodes in graph 31 | std::unique_ptr locks; 32 | // max nodes 33 | uint32_t offset; 34 | uint32_t max_nodes; 35 | // id 'n' nhood located at graph[n - offset] if offset <= n <= offset + max_nodes 36 | }; 37 | }; 38 | -------------------------------------------------------------------------------- /include/windows_aligned_file_reader.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 |
4 | #pragma once
5 | #ifdef _WINDOWS
6 | #ifndef USE_BING_INFRA
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #include
13 | #include
14 | #include
15 | #include "aligned_file_reader.h"
16 | #include "tsl/robin_map.h"
17 | #include "utils.h"
18 | #include "windows_customizations.h"
19 |
20 | // AlignedFileReader subclass for Windows builds. The class is compiled only
21 | // when _WINDOWS is defined and USE_BING_INFRA is not. Apart from the empty
22 | // constructor, destructor and deregister_thread(), members are only declared
23 | // in this header.
24 | class WindowsAlignedFileReader : public AlignedFileReader {
25 |  private:
26 |   // File name held as a wide string (presumably filled in by open() --
27 |   // TODO confirm against the implementation file).
28 |   std::wstring m_filename;
29 |
30 |  protected:
31 |   // virtual IOContext createContext();
32 |
33 |  public:
34 |   DISKANN_DLLEXPORT WindowsAlignedFileReader(){};
35 |   DISKANN_DLLEXPORT virtual ~WindowsAlignedFileReader(){};
36 |
37 |   // Open & close ops
38 |   // Blocking calls
39 |   DISKANN_DLLEXPORT virtual void open(const std::string &fname);
40 |   DISKANN_DLLEXPORT virtual void close();
41 |
42 |   // Thread registration hooks; deregister_thread() has an empty body here.
43 |   DISKANN_DLLEXPORT virtual void register_thread();
44 |   DISKANN_DLLEXPORT virtual void deregister_thread() {
45 |   }
46 |   // Returns a reference to an IOContext (presumably the one belonging to the
47 |   // calling thread -- TODO confirm against the implementation file).
48 |   DISKANN_DLLEXPORT virtual IOContext &get_ctx();
49 |
50 |   // Processes a batch of aligned read requests in parallel.
51 |   // NOTE: blocks the calling thread; the original note describes the call as
52 |   // thread-safe.
53 |   DISKANN_DLLEXPORT virtual void read(std::vector &read_reqs,
54 |                                       IOContext &ctx, bool async);
55 | };
56 | #endif  // USE_BING_INFRA
57 | #endif  //_WINDOWS
58 |
--------------------------------------------------------------------------------
/baseline/IP-DiskANN/include/windows_aligned_file_reader.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT license.
3 |
4 | #pragma once
5 | #ifdef _WINDOWS
6 | #ifndef USE_BING_INFRA
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #include
13 | #include
14 | #include
15 | #include "aligned_file_reader.h"
16 | #include "tsl/robin_map.h"
17 | #include "utils.h"
18 | #include "windows_customizations.h"
19 |
20 | // AlignedFileReader subclass for Windows builds. The class is compiled only
21 | // when _WINDOWS is defined and USE_BING_INFRA is not. Apart from the empty
22 | // constructor, destructor and deregister_thread(), members are only declared
23 | // in this header.
24 | class WindowsAlignedFileReader : public AlignedFileReader {
25 |  private:
26 |   // File name held as a wide string (presumably filled in by open() --
27 |   // TODO confirm against the implementation file).
28 |   std::wstring m_filename;
29 |
30 |  protected:
31 |   // virtual IOContext createContext();
32 |
33 |  public:
34 |   DISKANN_DLLEXPORT WindowsAlignedFileReader(){};
35 |   DISKANN_DLLEXPORT virtual ~WindowsAlignedFileReader(){};
36 |
37 |   // Open & close ops
38 |   // Blocking calls
39 |   DISKANN_DLLEXPORT virtual void open(const std::string &fname);
40 |   DISKANN_DLLEXPORT virtual void close();
41 |
42 |   // Thread registration hooks; deregister_thread() has an empty body here.
43 |   DISKANN_DLLEXPORT virtual void register_thread();
44 |   DISKANN_DLLEXPORT virtual void deregister_thread() {
45 |   }
46 |   // Returns a reference to an IOContext (presumably the one belonging to the
47 |   // calling thread -- TODO confirm against the implementation file).
48 |   DISKANN_DLLEXPORT virtual IOContext &get_ctx();
49 |
50 |   // Processes a batch of aligned read requests in parallel.
51 |   // NOTE: blocks the calling thread; the original note describes the call as
52 |   // thread-safe.
53 |   DISKANN_DLLEXPORT virtual void read(std::vector &read_reqs,
54 |                                       IOContext &ctx, bool async);
55 | };
56 | #endif  // USE_BING_INFRA
57 | #endif  //_WINDOWS
58 |
--------------------------------------------------------------------------------
/baseline/LM-DiskANN/include/windows_aligned_file_reader.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT license.
3 |
4 | #pragma once
5 | #ifdef _WINDOWS
6 | #ifndef USE_BING_INFRA
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #include
13 | #include
14 | #include
15 | #include "aligned_file_reader.h"
16 | #include "tsl/robin_map.h"
17 | #include "utils.h"
18 | #include "windows_customizations.h"
19 |
20 | // AlignedFileReader subclass for Windows builds. The class is compiled only
21 | // when _WINDOWS is defined and USE_BING_INFRA is not. Apart from the empty
22 | // constructor, destructor and deregister_thread(), members are only declared
23 | // in this header.
24 | class WindowsAlignedFileReader : public AlignedFileReader {
25 |  private:
26 |   // File name held as a wide string (presumably filled in by open() --
27 |   // TODO confirm against the implementation file).
28 |   std::wstring m_filename;
29 |
30 |  protected:
31 |   // virtual IOContext createContext();
32 |
33 |  public:
34 |   DISKANN_DLLEXPORT WindowsAlignedFileReader(){};
35 |   DISKANN_DLLEXPORT virtual ~WindowsAlignedFileReader(){};
36 |
37 |   // Open & close ops
38 |   // Blocking calls
39 |   DISKANN_DLLEXPORT virtual void open(const std::string &fname);
40 |   DISKANN_DLLEXPORT virtual void close();
41 |
42 |   // Thread registration hooks; deregister_thread() has an empty body here.
43 |   DISKANN_DLLEXPORT virtual void register_thread();
44 |   DISKANN_DLLEXPORT virtual void deregister_thread() {
45 |   }
46 |   // Returns a reference to an IOContext (presumably the one belonging to the
47 |   // calling thread -- TODO confirm against the implementation file).
48 |   DISKANN_DLLEXPORT virtual IOContext &get_ctx();
49 |
50 |   // Processes a batch of aligned read requests in parallel.
51 |   // NOTE: blocks the calling thread; the original note describes the call as
52 |   // thread-safe.
53 |   DISKANN_DLLEXPORT virtual void read(std::vector &read_reqs,
54 |                                       IOContext &ctx, bool async);
55 | };
56 | #endif  // USE_BING_INFRA
57 | #endif  //_WINDOWS
58 |
--------------------------------------------------------------------------------
/tests/utils/gen_random_slice.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT license.
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "partition_and_pq.h" 16 | #include "utils.h" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | template 24 | int aux_main(char** argv) { 25 | std::string base_file(argv[2]); 26 | std::string output_prefix(argv[3]); 27 | float sampling_rate = (float) (std::atof(argv[4])); 28 | gen_random_slice(base_file, output_prefix, sampling_rate); 29 | return 0; 30 | } 31 | 32 | int main(int argc, char** argv) { 33 | if (argc != 5) { 34 | diskann::cout << argv[0] 35 | << " data_type [float/int8/uint8] base_bin_file " 36 | "sample_output_prefix sampling_probability" 37 | << std::endl; 38 | exit(-1); 39 | } 40 | 41 | if (std::string(argv[1]) == std::string("float")) { 42 | aux_main(argv); 43 | } else if (std::string(argv[1]) == std::string("int8")) { 44 | aux_main(argv); 45 | } else if (std::string(argv[1]) == std::string("uint8")) { 46 | aux_main(argv); 47 | } else 48 | diskann::cout << "Unsupported type. Use float/int8/uint8." << std::endl; 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/gen_random_slice.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "partition_and_pq.h" 16 | #include "utils.h" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | template 24 | int aux_main(char** argv) { 25 | std::string base_file(argv[2]); 26 | std::string output_prefix(argv[3]); 27 | float sampling_rate = (float) (std::atof(argv[4])); 28 | gen_random_slice(base_file, output_prefix, sampling_rate); 29 | return 0; 30 | } 31 | 32 | int main(int argc, char** argv) { 33 | if (argc != 5) { 34 | diskann::cout << argv[0] 35 | << " data_type [float/int8/uint8] base_bin_file " 36 | "sample_output_prefix sampling_probability" 37 | << std::endl; 38 | exit(-1); 39 | } 40 | 41 | if (std::string(argv[1]) == std::string("float")) { 42 | aux_main(argv); 43 | } else if (std::string(argv[1]) == std::string("int8")) { 44 | aux_main(argv); 45 | } else if (std::string(argv[1]) == std::string("uint8")) { 46 | aux_main(argv); 47 | } else 48 | diskann::cout << "Unsupported type. Use float/int8/uint8." << std::endl; 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/gen_random_slice.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "partition_and_pq.h" 16 | #include "utils.h" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | template 24 | int aux_main(char** argv) { 25 | std::string base_file(argv[2]); 26 | std::string output_prefix(argv[3]); 27 | float sampling_rate = (float) (std::atof(argv[4])); 28 | gen_random_slice(base_file, output_prefix, sampling_rate); 29 | return 0; 30 | } 31 | 32 | int main(int argc, char** argv) { 33 | if (argc != 5) { 34 | diskann::cout << argv[0] 35 | << " data_type [float/int8/uint8] base_bin_file " 36 | "sample_output_prefix sampling_probability" 37 | << std::endl; 38 | exit(-1); 39 | } 40 | 41 | if (std::string(argv[1]) == std::string("float")) { 42 | aux_main(argv); 43 | } else if (std::string(argv[1]) == std::string("int8")) { 44 | aux_main(argv); 45 | } else if (std::string(argv[1]) == std::string("uint8")) { 46 | aux_main(argv); 47 | } else 48 | diskann::cout << "Unsupported type. Use float/int8/uint8." << std::endl; 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /scripts/overall_performance.sh: -------------------------------------------------------------------------------- 1 | 2 | clear 3 | project_dir=/data/linsy/Greator 4 | id_map=2 5 | delete_dir="$project_dir/scripts/indices/sift_R34" 6 | batchsize=0.001 7 | find "$delete_dir" -mindepth 1 ! 
-path "$delete_dir/disk_init*" -exec rm -rf {} + 8 | 9 | cp "$delete_dir/disk_init"/* "$delete_dir"/ 10 | 11 | rm -r "$delete_dir"/_index_temp 12 | mkdir "$delete_dir"/_index_temp 13 | 14 | 15 | cd /data/linsy/Greator/build && make -j 16 | cd /data/linsy/Greator/run 17 | 18 | name=sift 19 | mydir="/data/linsy/Greator/scripts" 20 | index_type="float" 21 | base_data_file="$mydir"/dataset/"$name"/"$name"_base_95.fbin 22 | L_mem=75 23 | R_mem=34 24 | alpha_mem=1.2 25 | L_disk=75 26 | R_disk=34 27 | alpha_disk=1.2 28 | num_start=0 29 | num_shards=100 30 | num_pq_chunks=100 31 | num_nodes_to_cache=10000 32 | save_graph_file="$delete_dir"/_index 33 | update=true 34 | build=false 35 | full_data_bin="$mydir"/dataset/"$name"/"$name"_base.fbin 36 | query_bin="$mydir"/dataset/"$name"/"$name"_query.fbin 37 | truthset="$mydir"/dataset/"$name"/gt/"$name"_gt_K10_ 38 | recall_k=10 39 | search_L1=120 40 | beamwidth=2 41 | trace_file_prefix="$mydir"/trace/"$name"_trace_"$batchsize"/_trace 42 | step=5 43 | C=160 44 | 45 | 46 | "$project_dir"/build/tests/overall_performance "$index_type" "$base_data_file" "$L_mem" "$R_mem" "$alpha_mem" "$L_disk" "$R_disk" "$alpha_disk" "$num_start" "$num_shards" "$num_pq_chunks" "$num_nodes_to_cache" "$save_graph_file" "$update" "$build" "$full_data_bin" "$query_bin" "$truthset" "$recall_k" "$search_L1" "$beamwidth" "$trace_file_prefix" "$step" "$id_map" 47 | 48 | -------------------------------------------------------------------------------- /CompilerOptions.cmake: -------------------------------------------------------------------------------- 1 | if(MSVC) 2 | #changing default target to X64 3 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_EXE_LINKER_FLAGS_INIT "${CMAKE_EXE_LINKER_FLAGS_INIT}") 4 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_MODULE_LINKER_FLAGS_INIT "${CMAKE_MODULE_LINKER_FLAGS_INIT}") 5 | string(REGEX REPLACE 
"/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_SHARED_LINKER_FLAGS_INIT "${CMAKE_SHARED_LINKER_FLAGS_INIT}") 6 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_STATIC_LINKER_FLAGS_INIT "${CMAKE_STATIC_LINKER_FLAGS_INIT}") 7 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_EXE_LINKER_FLAGS_INIT "${CMAKE_EXE_LINKER_FLAGS_INIT}") 8 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_MODULE_LINKER_FLAGS_INIT "${CMAKE_MODULE_LINKER_FLAGS_INIT}") 9 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_SHARED_LINKER_FLAGS_INIT "${CMAKE_SHARED_LINKER_FLAGS_INIT}") 10 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_STATIC_LINKER_FLAGS_INIT "${CMAKE_STATIC_LINKER_FLAGS_INIT}") 11 | string(REGEX REPLACE "Debug" "Release" CMAKE_BUILD_TYPE_INIT "${CMAKE_BUILD_TYPE_INIT}") 12 | endif() 13 | 14 | 15 | get_cmake_property(_varNames VARIABLES) 16 | list (REMOVE_DUPLICATES _varNames) 17 | list (SORT _varNames) 18 | foreach (_varName ${_varNames}) 19 | message(STATUS "${_varName}=${${_varName}}") 20 | endforeach() 21 | 22 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/CompilerOptions.cmake: -------------------------------------------------------------------------------- 1 | if(MSVC) 2 | #changing default target to X64 3 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_EXE_LINKER_FLAGS_INIT "${CMAKE_EXE_LINKER_FLAGS_INIT}") 4 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_MODULE_LINKER_FLAGS_INIT "${CMAKE_MODULE_LINKER_FLAGS_INIT}") 5 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_SHARED_LINKER_FLAGS_INIT "${CMAKE_SHARED_LINKER_FLAGS_INIT}") 6 | string(REGEX REPLACE 
"/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_STATIC_LINKER_FLAGS_INIT "${CMAKE_STATIC_LINKER_FLAGS_INIT}") 7 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_EXE_LINKER_FLAGS_INIT "${CMAKE_EXE_LINKER_FLAGS_INIT}") 8 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_MODULE_LINKER_FLAGS_INIT "${CMAKE_MODULE_LINKER_FLAGS_INIT}") 9 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_SHARED_LINKER_FLAGS_INIT "${CMAKE_SHARED_LINKER_FLAGS_INIT}") 10 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_STATIC_LINKER_FLAGS_INIT "${CMAKE_STATIC_LINKER_FLAGS_INIT}") 11 | string(REGEX REPLACE "Debug" "Release" CMAKE_BUILD_TYPE_INIT "${CMAKE_BUILD_TYPE_INIT}") 12 | endif() 13 | 14 | 15 | get_cmake_property(_varNames VARIABLES) 16 | list (REMOVE_DUPLICATES _varNames) 17 | list (SORT _varNames) 18 | foreach (_varName ${_varNames}) 19 | message(STATUS "${_varName}=${${_varName}}") 20 | endforeach() 21 | 22 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/CompilerOptions.cmake: -------------------------------------------------------------------------------- 1 | if(MSVC) 2 | #changing default target to X64 3 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_EXE_LINKER_FLAGS_INIT "${CMAKE_EXE_LINKER_FLAGS_INIT}") 4 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_MODULE_LINKER_FLAGS_INIT "${CMAKE_MODULE_LINKER_FLAGS_INIT}") 5 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_SHARED_LINKER_FLAGS_INIT "${CMAKE_SHARED_LINKER_FLAGS_INIT}") 6 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_STATIC_LINKER_FLAGS_INIT "${CMAKE_STATIC_LINKER_FLAGS_INIT}") 7 | string(REGEX REPLACE 
"/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_EXE_LINKER_FLAGS_INIT "${CMAKE_EXE_LINKER_FLAGS_INIT}") 8 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_MODULE_LINKER_FLAGS_INIT "${CMAKE_MODULE_LINKER_FLAGS_INIT}") 9 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_SHARED_LINKER_FLAGS_INIT "${CMAKE_SHARED_LINKER_FLAGS_INIT}") 10 | string(REGEX REPLACE "/[M|m][A|a][C|c][H|h][I|i][N|n][E|e]:[X|x]86" "/MACHINE:X64" CMAKE_STATIC_LINKER_FLAGS_INIT "${CMAKE_STATIC_LINKER_FLAGS_INIT}") 11 | string(REGEX REPLACE "Debug" "Release" CMAKE_BUILD_TYPE_INIT "${CMAKE_BUILD_TYPE_INIT}") 12 | endif() 13 | 14 | 15 | get_cmake_property(_varNames VARIABLES) 16 | list (REMOVE_DUPLICATES _varNames) 17 | list (SORT _varNames) 18 | foreach (_varName ${_varNames}) 19 | message(STATUS "${_varName}=${${_varName}}") 20 | endforeach() 21 | 22 | -------------------------------------------------------------------------------- /tests/utils/partition_data.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include "cached_io.h" 7 | #include "partition_and_pq.h" 8 | 9 | // DEPRECATED: NEED TO REPROGRAM 10 | 11 | /* Entry point. Requires exactly six arguments after the program name (argc == 7); otherwise prints the usage text and exits with -1. argv[1] names the element type (float/int8/uint8), argv[2] is data_path, argv[3] prefix_path, argv[4] sampling_rate, argv[5] num_partitions and argv[6] k_index; max_reps is fixed at 15. NOTE(review): the three partition calls below carry no explicit template argument in this text and so read identically - presumably the per-type arguments were lost in extraction; confirm against the original file. */ int main(int argc, char** argv) { 12 | if (argc != 7) { 13 | diskann::cout << "Usage:\n" 14 | << argv[0] 15 | << " datatype " 16 | " " 17 | " " 18 | << std::endl; 19 | exit(-1); 20 | } 21 | 22 | const std::string data_path(argv[2]); 23 | const std::string prefix_path(argv[3]); 24 | const float sampling_rate = (float) atof(argv[4]); 25 | const size_t num_partitions = (size_t) std::atoi(argv[5]); 26 | const size_t max_reps = 15; 27 | const size_t k_index = (size_t) std::atoi(argv[6]); 28 | 29 | if (std::string(argv[1]) == std::string("float")) 30 | partition(data_path, sampling_rate, num_partitions, max_reps, 31 | prefix_path, k_index); 32 | else if (std::string(argv[1]) == std::string("int8")) 33 | partition(data_path, sampling_rate, num_partitions, max_reps, 34 | prefix_path, k_index); 35 | else if (std::string(argv[1]) == std::string("uint8")) 36 | partition(data_path, sampling_rate, num_partitions, max_reps, 37 | prefix_path, k_index); 38 | /* An unrecognised type only prints this message; control then reaches the end of main, so the process still exits with status 0. */ else 39 | diskann::cout << "unsupported data format. use float/int8/uint8" 40 | << std::endl; 41 | } 42 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/partition_data.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include "cached_io.h" 7 | #include "partition_and_pq.h" 8 | 9 | // DEPRECATED: NEED TO REPROGRAM 10 | 11 | /* Entry point. Requires exactly six arguments after the program name (argc == 7); otherwise prints the usage text and exits with -1. argv[1] names the element type (float/int8/uint8), argv[2] is data_path, argv[3] prefix_path, argv[4] sampling_rate, argv[5] num_partitions and argv[6] k_index; max_reps is fixed at 15. NOTE(review): the three partition calls below carry no explicit template argument in this text and so read identically - presumably the per-type arguments were lost in extraction; confirm against the original file. */ int main(int argc, char** argv) { 12 | if (argc != 7) { 13 | diskann::cout << "Usage:\n" 14 | << argv[0] 15 | << " datatype " 16 | " " 17 | " " 18 | << std::endl; 19 | exit(-1); 20 | } 21 | 22 | const std::string data_path(argv[2]); 23 | const std::string prefix_path(argv[3]); 24 | const float sampling_rate = (float) atof(argv[4]); 25 | const size_t num_partitions = (size_t) std::atoi(argv[5]); 26 | const size_t max_reps = 15; 27 | const size_t k_index = (size_t) std::atoi(argv[6]); 28 | 29 | if (std::string(argv[1]) == std::string("float")) 30 | partition(data_path, sampling_rate, num_partitions, max_reps, 31 | prefix_path, k_index); 32 | else if (std::string(argv[1]) == std::string("int8")) 33 | partition(data_path, sampling_rate, num_partitions, max_reps, 34 | prefix_path, k_index); 35 | else if (std::string(argv[1]) == std::string("uint8")) 36 | partition(data_path, sampling_rate, num_partitions, max_reps, 37 | prefix_path, k_index); 38 | /* An unrecognised type only prints this message; control then reaches the end of main, so the process still exits with status 0. */ else 39 | diskann::cout << "unsupported data format. use float/int8/uint8" 40 | << std::endl; 41 | } 42 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/partition_data.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include "cached_io.h" 7 | #include "partition_and_pq.h" 8 | 9 | // DEPRECATED: NEED TO REPROGRAM 10 | 11 | /* Entry point. Requires exactly six arguments after the program name (argc == 7); otherwise prints the usage text and exits with -1. argv[1] names the element type (float/int8/uint8), argv[2] is data_path, argv[3] prefix_path, argv[4] sampling_rate, argv[5] num_partitions and argv[6] k_index; max_reps is fixed at 15. NOTE(review): the three partition calls below carry no explicit template argument in this text and so read identically - presumably the per-type arguments were lost in extraction; confirm against the original file. */ int main(int argc, char** argv) { 12 | if (argc != 7) { 13 | diskann::cout << "Usage:\n" 14 | << argv[0] 15 | << " datatype " 16 | " " 17 | " " 18 | << std::endl; 19 | exit(-1); 20 | } 21 | 22 | const std::string data_path(argv[2]); 23 | const std::string prefix_path(argv[3]); 24 | const float sampling_rate = (float) atof(argv[4]); 25 | const size_t num_partitions = (size_t) std::atoi(argv[5]); 26 | const size_t max_reps = 15; 27 | const size_t k_index = (size_t) std::atoi(argv[6]); 28 | 29 | if (std::string(argv[1]) == std::string("float")) 30 | partition(data_path, sampling_rate, num_partitions, max_reps, 31 | prefix_path, k_index); 32 | else if (std::string(argv[1]) == std::string("int8")) 33 | partition(data_path, sampling_rate, num_partitions, max_reps, 34 | prefix_path, k_index); 35 | else if (std::string(argv[1]) == std::string("uint8")) 36 | partition(data_path, sampling_rate, num_partitions, max_reps, 37 | prefix_path, k_index); 38 | /* An unrecognised type only prints this message; control then reaches the end of main, so the process still exits with status 0. */ else 39 | diskann::cout << "unsupported data format. use float/int8/uint8" 40 | << std::endl; 41 | } 42 | -------------------------------------------------------------------------------- /tests/utils/partition_with_ram_budget.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include "cached_io.h" 7 | #include "partition_and_pq.h" 8 | 9 | // DEPRECATED: NEED TO REPROGRAM 10 | 11 | /* Entry point. Requires exactly seven arguments after the program name (argc == 8); otherwise prints the usage text and exits with -1. argv[1] names the element type (float/int8/uint8), argv[2] is data_path, argv[3] prefix_path, argv[4] sampling_rate, argv[5] ram_budget (parsed as a double), argv[6] graph_degree and argv[7] k_index. NOTE(review): the three partition_with_ram_budget calls below carry no explicit template argument in this text and so read identically - presumably the per-type arguments were lost in extraction; confirm against the original file. */ int main(int argc, char** argv) { 12 | if (argc != 8) { 13 | diskann::cout << "Usage:\n" 14 | << argv[0] 15 | << " datatype " 16 | " " 17 | " " 18 | << std::endl; 19 | exit(-1); 20 | } 21 | 22 | const std::string data_path(argv[2]); 23 | const std::string prefix_path(argv[3]); 24 | const float sampling_rate = (float) atof(argv[4]); 25 | const double ram_budget = (double) std::atof(argv[5]); 26 | const size_t graph_degree = (size_t) std::atoi(argv[6]); 27 | const size_t k_index = (size_t) std::atoi(argv[7]); 28 | 29 | if (std::string(argv[1]) == std::string("float")) 30 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, 31 | graph_degree, prefix_path, k_index); 32 | else if (std::string(argv[1]) == std::string("int8")) 33 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, 34 | graph_degree, prefix_path, k_index); 35 | else if (std::string(argv[1]) == std::string("uint8")) 36 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, 37 | graph_degree, prefix_path, k_index); 38 | /* An unrecognised type only prints this message; control then reaches the end of main, so the process still exits with status 0. */ else 39 | diskann::cout << "unsupported data format. use float/int8/uint8" 40 | << std::endl; 41 | } 42 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/partition_with_ram_budget.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include "cached_io.h" 7 | #include "partition_and_pq.h" 8 | 9 | // DEPRECATED: NEED TO REPROGRAM 10 | 11 | /* Entry point. Requires exactly seven arguments after the program name (argc == 8); otherwise prints the usage text and exits with -1. argv[1] names the element type (float/int8/uint8), argv[2] is data_path, argv[3] prefix_path, argv[4] sampling_rate, argv[5] ram_budget (parsed as a double), argv[6] graph_degree and argv[7] k_index. NOTE(review): the three partition_with_ram_budget calls below carry no explicit template argument in this text and so read identically - presumably the per-type arguments were lost in extraction; confirm against the original file. */ int main(int argc, char** argv) { 12 | if (argc != 8) { 13 | diskann::cout << "Usage:\n" 14 | << argv[0] 15 | << " datatype " 16 | " " 17 | " " 18 | << std::endl; 19 | exit(-1); 20 | } 21 | 22 | const std::string data_path(argv[2]); 23 | const std::string prefix_path(argv[3]); 24 | const float sampling_rate = (float) atof(argv[4]); 25 | const double ram_budget = (double) std::atof(argv[5]); 26 | const size_t graph_degree = (size_t) std::atoi(argv[6]); 27 | const size_t k_index = (size_t) std::atoi(argv[7]); 28 | 29 | if (std::string(argv[1]) == std::string("float")) 30 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, 31 | graph_degree, prefix_path, k_index); 32 | else if (std::string(argv[1]) == std::string("int8")) 33 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, 34 | graph_degree, prefix_path, k_index); 35 | else if (std::string(argv[1]) == std::string("uint8")) 36 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, 37 | graph_degree, prefix_path, k_index); 38 | /* An unrecognised type only prints this message; control then reaches the end of main, so the process still exits with status 0. */ else 39 | diskann::cout << "unsupported data format. use float/int8/uint8" 40 | << std::endl; 41 | } 42 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/partition_with_ram_budget.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include "cached_io.h" 7 | #include "partition_and_pq.h" 8 | 9 | // DEPRECATED: NEED TO REPROGRAM 10 | 11 | /* Entry point. Requires exactly seven arguments after the program name (argc == 8); otherwise prints the usage text and exits with -1. argv[1] names the element type (float/int8/uint8), argv[2] is data_path, argv[3] prefix_path, argv[4] sampling_rate, argv[5] ram_budget (parsed as a double), argv[6] graph_degree and argv[7] k_index. NOTE(review): the three partition_with_ram_budget calls below carry no explicit template argument in this text and so read identically - presumably the per-type arguments were lost in extraction; confirm against the original file. */ int main(int argc, char** argv) { 12 | if (argc != 8) { 13 | diskann::cout << "Usage:\n" 14 | << argv[0] 15 | << " datatype " 16 | " " 17 | " " 18 | << std::endl; 19 | exit(-1); 20 | } 21 | 22 | const std::string data_path(argv[2]); 23 | const std::string prefix_path(argv[3]); 24 | const float sampling_rate = (float) atof(argv[4]); 25 | const double ram_budget = (double) std::atof(argv[5]); 26 | const size_t graph_degree = (size_t) std::atoi(argv[6]); 27 | const size_t k_index = (size_t) std::atoi(argv[7]); 28 | 29 | if (std::string(argv[1]) == std::string("float")) 30 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, 31 | graph_degree, prefix_path, k_index); 32 | else if (std::string(argv[1]) == std::string("int8")) 33 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, 34 | graph_degree, prefix_path, k_index); 35 | else if (std::string(argv[1]) == std::string("uint8")) 36 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, 37 | graph_degree, prefix_path, k_index); 38 | /* An unrecognised type only prints this message; control then reaches the end of main, so the process still exits with status 0. */ else 39 | diskann::cout << "unsupported data format. use float/int8/uint8" 40 | << std::endl; 41 | } 42 | -------------------------------------------------------------------------------- /include/cosine_similarity.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "simd_utils.h" 18 | 19 | extern bool Avx2SupportedCPU; 20 | 21 | namespace diskann { 22 | /* Returns the L2 norm of the first ndims elements of vector: accumulates vector[i] * vector[i] into a float and takes std::sqrt of the sum. NOTE(review): the template header shows no parameter list in this text although T is used below - presumably lost in extraction; confirm against the original file. */ template 23 | inline float compute_l2_norm(const T* vector, uint64_t ndims) { 24 | float norm = 0.0f; 25 | for (uint64_t i = 0; i < ndims; i++) { 26 | norm += (float) (vector[i] * vector[i]); 27 | } 28 | return std::sqrt(norm); 29 | } 30 | 31 | /* Returns the dot product of left and right over ndims elements divided by the product of their L2 norms. There is no check for a zero norm, so the divisor can be 0. */ template 32 | inline float compute_cosine_similarity(const T* left, const T* right, 33 | uint64_t ndims) { 34 | float left_norm = compute_l2_norm(left, ndims); 35 | float right_norm = compute_l2_norm(right, ndims); 36 | float dot = 0.0f; 37 | for (uint64_t i = 0; i < ndims; i++) { 38 | dot += (float) (left[i] * right[i]); 39 | } 40 | float cos_sim = dot / (left_norm * right_norm); 41 | return cos_sim; 42 | } 43 | 44 | /* For each of the npts entries of indices, in order, takes the ndims floats of all_data starting at offset indices[i] * ndims and appends their cosine similarity with query to the returned vector. */ inline std::vector compute_cosine_similarity_batch( 45 | const float* query, const unsigned* indices, const float* all_data, 46 | const unsigned ndims, const unsigned npts) { 47 | std::vector cos_dists; 48 | cos_dists.reserve(npts); 49 | 50 | for (size_t i = 0; i < npts; i++) { 51 | /* Offset computed in size_t, so the multiplication is not done in unsigned. */ const float* point = all_data + (size_t) (indices[i]) * (size_t) (ndims); 52 | cos_dists.push_back( 53 | compute_cosine_similarity(point, query, ndims)); 54 | } 55 | return cos_dists; 56 | } 57 | } // namespace diskann 58 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/cosine_similarity.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "simd_utils.h" 18 | 19 | extern bool Avx2SupportedCPU; 20 | 21 | namespace diskann { 22 | /* Returns the L2 norm of the first ndims elements of vector: accumulates vector[i] * vector[i] into a float and takes std::sqrt of the sum. NOTE(review): the template header shows no parameter list in this text although T is used below - presumably lost in extraction; confirm against the original file. */ template 23 | inline float compute_l2_norm(const T* vector, uint64_t ndims) { 24 | float norm = 0.0f; 25 | for (uint64_t i = 0; i < ndims; i++) { 26 | norm += (float) (vector[i] * vector[i]); 27 | } 28 | return std::sqrt(norm); 29 | } 30 | 31 | /* Returns the dot product of left and right over ndims elements divided by the product of their L2 norms. There is no check for a zero norm, so the divisor can be 0. */ template 32 | inline float compute_cosine_similarity(const T* left, const T* right, 33 | uint64_t ndims) { 34 | float left_norm = compute_l2_norm(left, ndims); 35 | float right_norm = compute_l2_norm(right, ndims); 36 | float dot = 0.0f; 37 | for (uint64_t i = 0; i < ndims; i++) { 38 | dot += (float) (left[i] * right[i]); 39 | } 40 | float cos_sim = dot / (left_norm * right_norm); 41 | return cos_sim; 42 | } 43 | 44 | /* For each of the npts entries of indices, in order, takes the ndims floats of all_data starting at offset indices[i] * ndims and appends their cosine similarity with query to the returned vector. */ inline std::vector compute_cosine_similarity_batch( 45 | const float* query, const unsigned* indices, const float* all_data, 46 | const unsigned ndims, const unsigned npts) { 47 | std::vector cos_dists; 48 | cos_dists.reserve(npts); 49 | 50 | for (size_t i = 0; i < npts; i++) { 51 | /* Offset computed in size_t, so the multiplication is not done in unsigned. */ const float* point = all_data + (size_t) (indices[i]) * (size_t) (ndims); 52 | cos_dists.push_back( 53 | compute_cosine_similarity(point, query, ndims)); 54 | } 55 | return cos_dists; 56 | } 57 | } // namespace diskann 58 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/include/cosine_similarity.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "simd_utils.h" 18 | 19 | extern bool Avx2SupportedCPU; 20 | 21 | namespace diskann { 22 | /* Returns the L2 norm of the first ndims elements of vector: accumulates vector[i] * vector[i] into a float and takes std::sqrt of the sum. NOTE(review): the template header shows no parameter list in this text although T is used below - presumably lost in extraction; confirm against the original file. */ template 23 | inline float compute_l2_norm(const T* vector, uint64_t ndims) { 24 | float norm = 0.0f; 25 | for (uint64_t i = 0; i < ndims; i++) { 26 | norm += (float) (vector[i] * vector[i]); 27 | } 28 | return std::sqrt(norm); 29 | } 30 | 31 | /* Returns the dot product of left and right over ndims elements divided by the product of their L2 norms. There is no check for a zero norm, so the divisor can be 0. */ template 32 | inline float compute_cosine_similarity(const T* left, const T* right, 33 | uint64_t ndims) { 34 | float left_norm = compute_l2_norm(left, ndims); 35 | float right_norm = compute_l2_norm(right, ndims); 36 | float dot = 0.0f; 37 | for (uint64_t i = 0; i < ndims; i++) { 38 | dot += (float) (left[i] * right[i]); 39 | } 40 | float cos_sim = dot / (left_norm * right_norm); 41 | return cos_sim; 42 | } 43 | 44 | /* For each of the npts entries of indices, in order, takes the ndims floats of all_data starting at offset indices[i] * ndims and appends their cosine similarity with query to the returned vector. */ inline std::vector compute_cosine_similarity_batch( 45 | const float* query, const unsigned* indices, const float* all_data, 46 | const unsigned ndims, const unsigned npts) { 47 | std::vector cos_dists; 48 | cos_dists.reserve(npts); 49 | 50 | for (size_t i = 0; i < npts; i++) { 51 | /* Offset computed in size_t, so the multiplication is not done in unsigned. */ const float* point = all_data + (size_t) (indices[i]) * (size_t) (ndims); 52 | cos_dists.push_back( 53 | compute_cosine_similarity(point, query, ndims)); 54 | } 55 | return cos_dists; 56 | } 57 | } // namespace diskann 58 | -------------------------------------------------------------------------------- /tests/utils/create_disk_layout.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "aux_utils.h" 12 | #include "cached_io.h" 13 | #include "utils.h" 14 | 15 | /* Reads six paths from argv[2] through argv[7] and forwards them to diskann::create_disk_layout. single_index_flag is set when base_file equals the literal string null. tags_file (argv[4]) is read but not passed on; an empty string is passed as the third argument instead. Always returns 0. NOTE(review): the template header shows no parameter list in this text - presumably lost in extraction; confirm against the original file. */ template 16 | int create_disk_layout(char **argv) { 17 | std::string vamana_file(argv[2]); 18 | std::string base_file(argv[3]); 19 | std::string tags_file(argv[4]); 20 | std::string pq_pivots_file(argv[5]); 21 | std::string pq_vectors_file(argv[6]); 22 | std::string output_file(argv[7]); 23 | bool single_index_flag = false; 24 | if (base_file == "null") 25 | single_index_flag = true; 26 | diskann::create_disk_layout(vamana_file, base_file, "", 27 | pq_pivots_file, pq_vectors_file, 28 | single_index_flag, output_file); 29 | return 0; 30 | } 31 | 32 | /* Entry point. Requires argc == 8, otherwise prints the usage text and exits with -1. Dispatches on argv[1] (float/int8/uint8) and returns the helper's result, or -2 for an unsupported type. */ int main(int argc, char **argv) { 33 | if (argc != 8) { 34 | diskann::cout << argv[0] 35 | << " data_type vamana_index_file " 36 | " data_file tags_bin pq_pivots_file pq_vectors_file " 37 | "output_diskann_file" 38 | << std::endl; 39 | exit(-1); 40 | } 41 | int ret_val = -1; 42 | if (std::string(argv[1]) == std::string("float")) 43 | ret_val = create_disk_layout(argv); 44 | else if (std::string(argv[1]) == std::string("int8")) 45 | ret_val = create_disk_layout(argv); 46 | else if (std::string(argv[1]) == std::string("uint8")) 47 | ret_val = create_disk_layout(argv); 48 | else { 49 | diskann::cout << "unsupported type. use int8/uint8/float " << std::endl; 50 | ret_val = -2; 51 | } 52 | return ret_val; 53 | } 54 | -------------------------------------------------------------------------------- /tests/utils/tsv_to_bin.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | /* Reads npts * ndims values from reader with operator>> into read_buf, starting at its first element, then writes that many floats to writer as raw bytes. The stream state is not checked after each extraction. */ void block_convert(std::ifstream& reader, std::ofstream& writer, 8 | float* read_buf, _u64 npts, _u64 ndims) { 9 | auto cursor = read_buf; 10 | float val; 11 | 12 | for (_u64 i = 0; i < npts; i++) { 13 | for (_u64 d = 0; d < ndims; ++d) { 14 | reader >> val; 15 | *cursor = val; 16 | cursor++; 17 | } 18 | } 19 | writer.write((char*) read_buf, npts * ndims * sizeof(float)); 20 | } 21 | 22 | /* Entry point. Requires argc == 5, otherwise prints the usage text and exits with -1. argv[1] is the input path, argv[2] the output path, argv[3] the dimension count and argv[4] the point count (both parsed with atoi). Writes npts and ndims as two _s32 values, then converts the points in blocks of at most 131072. */ int main(int argc, char** argv) { 23 | if (argc != 5) { 24 | diskann::cout << argv[0] 25 | << " input_filename.tsv output_filename.bin dim num_pts>" 26 | << std::endl; 27 | exit(-1); 28 | } 29 | 30 | _u64 ndims = atoi(argv[3]); 31 | _u64 npts = atoi(argv[4]); 32 | 33 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 34 | // _u64 fsize = reader.tellg(); 35 | /* The stream was opened with std::ios::ate, so seek back to the beginning before reading; the second seekg repeats the first. */ reader.seekg(0, std::ios::beg); 36 | reader.seekg(0, std::ios::beg); 37 | 38 | _u64 blk_size = 131072; 39 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 40 | diskann::cout << "# blks: " << nblks << std::endl; 41 | std::ofstream writer(argv[2], std::ios::binary); 42 | int npts_s32 = (_s32) npts; 43 | int ndims_s32 = (_s32) ndims; 44 | writer.write((char*) &npts_s32, sizeof(_s32)); 45 | writer.write((char*) &ndims_s32, sizeof(_s32)); 46 | /* Scratch buffer reused by every block_convert call. NOTE(review): it is sized from npts, while each call fills at most blk_size * ndims floats - confirm whether the larger size is intended. */ float* read_buf = new float[npts * (ndims + 1)]; 47 | for (_u64 i = 0; i < nblks; i++) { 48 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 49 | block_convert(reader, writer, read_buf, cblk_size, ndims); 50 | diskann::cout << "Block #" << i << " written" << std::endl; 51 | } 52 | 53 | delete[] read_buf; 54 | 55 | reader.close(); 56 | writer.close(); 57 | } 58 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/create_disk_layout.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "aux_utils.h" 12 | #include "cached_io.h" 13 | #include "utils.h" 14 | 15 | /* Reads six paths from argv[2] through argv[7] and forwards them to diskann::create_disk_layout. single_index_flag is set when base_file equals the literal string null. tags_file (argv[4]) is read but not passed on; an empty string is passed as the third argument instead. Always returns 0. NOTE(review): the template header shows no parameter list in this text - presumably lost in extraction; confirm against the original file. */ template 16 | int create_disk_layout(char **argv) { 17 | std::string vamana_file(argv[2]); 18 | std::string base_file(argv[3]); 19 | std::string tags_file(argv[4]); 20 | std::string pq_pivots_file(argv[5]); 21 | std::string pq_vectors_file(argv[6]); 22 | std::string output_file(argv[7]); 23 | bool single_index_flag = false; 24 | if (base_file == "null") 25 | single_index_flag = true; 26 | diskann::create_disk_layout(vamana_file, base_file, "", 27 | pq_pivots_file, pq_vectors_file, 28 | single_index_flag, output_file); 29 | return 0; 30 | } 31 | 32 | /* Entry point. Requires argc == 8, otherwise prints the usage text and exits with -1. Dispatches on argv[1] (float/int8/uint8) and returns the helper's result, or -2 for an unsupported type. */ int main(int argc, char **argv) { 33 | if (argc != 8) { 34 | diskann::cout << argv[0] 35 | << " data_type vamana_index_file " 36 | " data_file tags_bin pq_pivots_file pq_vectors_file " 37 | "output_diskann_file" 38 | << std::endl; 39 | exit(-1); 40 | } 41 | int ret_val = -1; 42 | if (std::string(argv[1]) == std::string("float")) 43 | ret_val = create_disk_layout(argv); 44 | else if (std::string(argv[1]) == std::string("int8")) 45 | ret_val = create_disk_layout(argv); 46 | else if (std::string(argv[1]) == std::string("uint8")) 47 | ret_val = create_disk_layout(argv); 48 | else { 49 | diskann::cout << "unsupported type. use int8/uint8/float " << std::endl; 50 | ret_val = -2; 51 | } 52 | return ret_val; 53 | } 54 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/create_disk_layout.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "aux_utils.h" 12 | #include "cached_io.h" 13 | #include "utils.h" 14 | 15 | /* Reads six paths from argv[2] through argv[7] and forwards them to diskann::create_disk_layout. single_index_flag is set when base_file equals the literal string null. tags_file (argv[4]) is read but not passed on; an empty string is passed as the third argument instead. Always returns 0. NOTE(review): the template header shows no parameter list in this text - presumably lost in extraction; confirm against the original file. */ template 16 | int create_disk_layout(char **argv) { 17 | std::string vamana_file(argv[2]); 18 | std::string base_file(argv[3]); 19 | std::string tags_file(argv[4]); 20 | std::string pq_pivots_file(argv[5]); 21 | std::string pq_vectors_file(argv[6]); 22 | std::string output_file(argv[7]); 23 | bool single_index_flag = false; 24 | if (base_file == "null") 25 | single_index_flag = true; 26 | diskann::create_disk_layout(vamana_file, base_file, "", 27 | pq_pivots_file, pq_vectors_file, 28 | single_index_flag, output_file); 29 | return 0; 30 | } 31 | 32 | /* Entry point. Requires argc == 8, otherwise prints the usage text and exits with -1. Dispatches on argv[1] (float/int8/uint8) and returns the helper's result, or -2 for an unsupported type. */ int main(int argc, char **argv) { 33 | if (argc != 8) { 34 | diskann::cout << argv[0] 35 | << " data_type vamana_index_file " 36 | " data_file tags_bin pq_pivots_file pq_vectors_file " 37 | "output_diskann_file" 38 | << std::endl; 39 | exit(-1); 40 | } 41 | int ret_val = -1; 42 | if (std::string(argv[1]) == std::string("float")) 43 | ret_val = create_disk_layout(argv); 44 | else if (std::string(argv[1]) == std::string("int8")) 45 | ret_val = create_disk_layout(argv); 46 | else if (std::string(argv[1]) == std::string("uint8")) 47 | ret_val = create_disk_layout(argv); 48 | else { 49 | diskann::cout << "unsupported type. use int8/uint8/float " << std::endl; 50 | ret_val = -2; 51 | } 52 | return ret_val; 53 | } 54 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/tsv_to_bin.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | /* Reads npts * ndims values from reader with operator>> into read_buf, starting at its first element, then writes that many floats to writer as raw bytes. The stream state is not checked after each extraction. */ void block_convert(std::ifstream& reader, std::ofstream& writer, 8 | float* read_buf, _u64 npts, _u64 ndims) { 9 | auto cursor = read_buf; 10 | float val; 11 | 12 | for (_u64 i = 0; i < npts; i++) { 13 | for (_u64 d = 0; d < ndims; ++d) { 14 | reader >> val; 15 | *cursor = val; 16 | cursor++; 17 | } 18 | } 19 | writer.write((char*) read_buf, npts * ndims * sizeof(float)); 20 | } 21 | 22 | /* Entry point. Requires argc == 5, otherwise prints the usage text and exits with -1. argv[1] is the input path, argv[2] the output path, argv[3] the dimension count and argv[4] the point count (both parsed with atoi). Writes npts and ndims as two _s32 values, then converts the points in blocks of at most 131072. */ int main(int argc, char** argv) { 23 | if (argc != 5) { 24 | diskann::cout << argv[0] 25 | << " input_filename.tsv output_filename.bin dim num_pts>" 26 | << std::endl; 27 | exit(-1); 28 | } 29 | 30 | _u64 ndims = atoi(argv[3]); 31 | _u64 npts = atoi(argv[4]); 32 | 33 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 34 | // _u64 fsize = reader.tellg(); 35 | /* The stream was opened with std::ios::ate, so seek back to the beginning before reading; the second seekg repeats the first. */ reader.seekg(0, std::ios::beg); 36 | reader.seekg(0, std::ios::beg); 37 | 38 | _u64 blk_size = 131072; 39 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 40 | diskann::cout << "# blks: " << nblks << std::endl; 41 | std::ofstream writer(argv[2], std::ios::binary); 42 | int npts_s32 = (_s32) npts; 43 | int ndims_s32 = (_s32) ndims; 44 | writer.write((char*) &npts_s32, sizeof(_s32)); 45 | writer.write((char*) &ndims_s32, sizeof(_s32)); 46 | /* Scratch buffer reused by every block_convert call. NOTE(review): it is sized from npts, while each call fills at most blk_size * ndims floats - confirm whether the larger size is intended. */ float* read_buf = new float[npts * (ndims + 1)]; 47 | for (_u64 i = 0; i < nblks; i++) { 48 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 49 | block_convert(reader, writer, read_buf, cblk_size, ndims); 50 | diskann::cout << "Block #" << i << " written" << std::endl; 51 | } 52 | 53 | delete[] read_buf; 54 | 55 | reader.close(); 56 | writer.close(); 57 | } 58 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/tsv_to_bin.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | /* Reads npts * ndims values from reader with operator>> into read_buf, starting at its first element, then writes that many floats to writer as raw bytes. The stream state is not checked after each extraction. */ void block_convert(std::ifstream& reader, std::ofstream& writer, 8 | float* read_buf, _u64 npts, _u64 ndims) { 9 | auto cursor = read_buf; 10 | float val; 11 | 12 | for (_u64 i = 0; i < npts; i++) { 13 | for (_u64 d = 0; d < ndims; ++d) { 14 | reader >> val; 15 | *cursor = val; 16 | cursor++; 17 | } 18 | } 19 | writer.write((char*) read_buf, npts * ndims * sizeof(float)); 20 | } 21 | 22 | /* Entry point. Requires argc == 5, otherwise prints the usage text and exits with -1. argv[1] is the input path, argv[2] the output path, argv[3] the dimension count and argv[4] the point count (both parsed with atoi). Writes npts and ndims as two _s32 values, then converts the points in blocks of at most 131072. */ int main(int argc, char** argv) { 23 | if (argc != 5) { 24 | diskann::cout << argv[0] 25 | << " input_filename.tsv output_filename.bin dim num_pts>" 26 | << std::endl; 27 | exit(-1); 28 | } 29 | 30 | _u64 ndims = atoi(argv[3]); 31 | _u64 npts = atoi(argv[4]); 32 | 33 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 34 | // _u64 fsize = reader.tellg(); 35 | /* The stream was opened with std::ios::ate, so seek back to the beginning before reading; the second seekg repeats the first. */ reader.seekg(0, std::ios::beg); 36 | reader.seekg(0, std::ios::beg); 37 | 38 | _u64 blk_size = 131072; 39 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 40 | diskann::cout << "# blks: " << nblks << std::endl; 41 | std::ofstream writer(argv[2], std::ios::binary); 42 | int npts_s32 = (_s32) npts; 43 | int ndims_s32 = (_s32) ndims; 44 | writer.write((char*) &npts_s32, sizeof(_s32)); 45 | writer.write((char*) &ndims_s32, sizeof(_s32)); 46 | /* Scratch buffer reused by every block_convert call. NOTE(review): it is sized from npts, while each call fills at most blk_size * ndims floats - confirm whether the larger size is intended. */ float* read_buf = new float[npts * (ndims + 1)]; 47 | for (_u64 i = 0; i < nblks; i++) { 48 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 49 | block_convert(reader, writer, read_buf, cblk_size, ndims); 50 | diskann::cout << "Block #" << i << " written" << std::endl; 51 | } 52 | 53 | delete[] read_buf; 54 | 55 | reader.close(); 56 | writer.close(); 57 | } 58 | -------------------------------------------------------------------------------- /tests/utils/calculate_recall.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "aux_utils.h" 13 | #include "utils.h" 14 | 15 | /* Entry point. Requires argc == 4. argv[1] and argv[2] are both read with diskann::load_truthset (into the gold_std and our_results buffers respectively) and argv[3] is recall_at. Returns -1 on a wrong argument count, when the two files report different point counts, or when recall_at exceeds dim_or or dim_gs; otherwise prints the value returned by diskann::calculate_recall. NOTE(review): the pointers filled by load_truthset are not released anywhere in main - confirm whether they need freeing. */ int main(int argc, char** argv) { 16 | if (argc != 4) { 17 | diskann::cout << argv[0] << " " 18 | << std::endl; 19 | return -1; 20 | } 21 | unsigned* gold_std = NULL; 22 | float* gs_dist = nullptr; 23 | unsigned* our_results = NULL; 24 | float* or_dist = nullptr; 25 | size_t points_num, points_num_gs, points_num_or; 26 | size_t dim_gs; 27 | size_t dim_or; 28 | diskann::load_truthset(argv[1], gold_std, gs_dist, points_num_gs, dim_gs); 29 | diskann::load_truthset(argv[2], our_results, or_dist, points_num_or, dim_or); 30 | 31 | if (points_num_gs != points_num_or) { 32 | diskann::cout 33 | << "Error. Number of queries mismatch in ground truth and our results" 34 | << std::endl; 35 | return -1; 36 | } 37 | points_num = points_num_gs; 38 | 39 | /* The int returned by atoi is stored in a uint32_t, so a negative argument becomes a large unsigned value. */ uint32_t recall_at = std::atoi(argv[3]); 40 | 41 | if ((dim_or < recall_at) || (recall_at > dim_gs)) { 42 | diskann::cout << "ground truth has size " << dim_gs << "; our set has " 43 | << dim_or << " points. Asking for recall " << recall_at 44 | << std::endl; 45 | return -1; 46 | } 47 | diskann::cout << "Calculating recall@" << recall_at << std::endl; 48 | float recall_val = (float) diskann::calculate_recall( 49 | (_u32) points_num, gold_std, gs_dist, (_u32) dim_gs, our_results, 50 | (_u32) dim_or, recall_at); 51 | 52 | // double avg_recall = (recall*1.0)/(points_num*1.0); 53 | diskann::cout << "Avg. recall@" << recall_at << " is " << recall_val << "\n"; 54 | } 55 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/calculate_recall.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "aux_utils.h" 13 | #include "utils.h" 14 | 15 | /* Entry point. Requires argc == 4. argv[1] and argv[2] are both read with diskann::load_truthset (into the gold_std and our_results buffers respectively) and argv[3] is recall_at. Returns -1 on a wrong argument count, when the two files report different point counts, or when recall_at exceeds dim_or or dim_gs; otherwise prints the value returned by diskann::calculate_recall. NOTE(review): the pointers filled by load_truthset are not released anywhere in main - confirm whether they need freeing. */ int main(int argc, char** argv) { 16 | if (argc != 4) { 17 | diskann::cout << argv[0] << " " 18 | << std::endl; 19 | return -1; 20 | } 21 | unsigned* gold_std = NULL; 22 | float* gs_dist = nullptr; 23 | unsigned* our_results = NULL; 24 | float* or_dist = nullptr; 25 | size_t points_num, points_num_gs, points_num_or; 26 | size_t dim_gs; 27 | size_t dim_or; 28 | diskann::load_truthset(argv[1], gold_std, gs_dist, points_num_gs, dim_gs); 29 | diskann::load_truthset(argv[2], our_results, or_dist, points_num_or, dim_or); 30 | 31 | if (points_num_gs != points_num_or) { 32 | diskann::cout 33 | << "Error. Number of queries mismatch in ground truth and our results" 34 | << std::endl; 35 | return -1; 36 | } 37 | points_num = points_num_gs; 38 | 39 | /* The int returned by atoi is stored in a uint32_t, so a negative argument becomes a large unsigned value. */ uint32_t recall_at = std::atoi(argv[3]); 40 | 41 | if ((dim_or < recall_at) || (recall_at > dim_gs)) { 42 | diskann::cout << "ground truth has size " << dim_gs << "; our set has " 43 | << dim_or << " points. Asking for recall " << recall_at 44 | << std::endl; 45 | return -1; 46 | } 47 | diskann::cout << "Calculating recall@" << recall_at << std::endl; 48 | float recall_val = (float) diskann::calculate_recall( 49 | (_u32) points_num, gold_std, gs_dist, (_u32) dim_gs, our_results, 50 | (_u32) dim_or, recall_at); 51 | 52 | // double avg_recall = (recall*1.0)/(points_num*1.0); 53 | diskann::cout << "Avg. recall@" << recall_at << " is " << recall_val << "\n"; 54 | } 55 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/calculate_recall.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "aux_utils.h" 13 | #include "utils.h" 14 | 15 | /* Entry point. Requires argc == 4. argv[1] and argv[2] are both read with diskann::load_truthset (into the gold_std and our_results buffers respectively) and argv[3] is recall_at. Returns -1 on a wrong argument count, when the two files report different point counts, or when recall_at exceeds dim_or or dim_gs; otherwise prints the value returned by diskann::calculate_recall. NOTE(review): the pointers filled by load_truthset are not released anywhere in main - confirm whether they need freeing. */ int main(int argc, char** argv) { 16 | if (argc != 4) { 17 | diskann::cout << argv[0] << " " 18 | << std::endl; 19 | return -1; 20 | } 21 | unsigned* gold_std = NULL; 22 | float* gs_dist = nullptr; 23 | unsigned* our_results = NULL; 24 | float* or_dist = nullptr; 25 | size_t points_num, points_num_gs, points_num_or; 26 | size_t dim_gs; 27 | size_t dim_or; 28 | diskann::load_truthset(argv[1], gold_std, gs_dist, points_num_gs, dim_gs); 29 | diskann::load_truthset(argv[2], our_results, or_dist, points_num_or, dim_or); 30 | 31 | if (points_num_gs != points_num_or) { 32 | diskann::cout 33 | << "Error. Number of queries mismatch in ground truth and our results" 34 | << std::endl; 35 | return -1; 36 | } 37 | points_num = points_num_gs; 38 | 39 | /* The int returned by atoi is stored in a uint32_t, so a negative argument becomes a large unsigned value. */ uint32_t recall_at = std::atoi(argv[3]); 40 | 41 | if ((dim_or < recall_at) || (recall_at > dim_gs)) { 42 | diskann::cout << "ground truth has size " << dim_gs << "; our set has " 43 | << dim_or << " points. Asking for recall " << recall_at 44 | << std::endl; 45 | return -1; 46 | } 47 | diskann::cout << "Calculating recall@" << recall_at << std::endl; 48 | float recall_val = (float) diskann::calculate_recall( 49 | (_u32) points_num, gold_std, gs_dist, (_u32) dim_gs, our_results, 50 | (_u32) dim_or, recall_at); 51 | 52 | // double avg_recall = (recall*1.0)/(points_num*1.0); 53 | diskann::cout << "Avg. recall@" << recall_at << " is " << recall_val << "\n"; 54 | } 55 | -------------------------------------------------------------------------------- /scripts/readme.md: -------------------------------------------------------------------------------- 1 | ### Step 1: Run `pre_dataset.sh` 2 | 3 | The example in this paper uses the SIFT1M dataset. Please download and prepare the files `sift_base.fbin` and `sift_query.fbin` before running the scripts. 
4 | 5 | This shell script consists of multiple smaller scripts, each serving a specific purpose. For detailed information about each script, please refer to `./scripts/pre_dataset/readme.md`. 6 | 7 | In this step, we generate the update trace for each iteration, the base file containing the first 95% of the vectors, and the ground truth files for each round. 8 | - The directory `./scripts/dataset/sift/gt` contains `num_iterations` ground truth files. 9 | - The directory `./scripts/dataset/sift` contains `sift_base_95.fbin`, `sift_base.fbin`, and `sift_query.fbin`. 10 | - The directory `./scripts/trace/sift_trace_0.001` contains `num_iterations` trace files. 11 | 12 | --- 13 | 14 | 15 | ### Step 2: Run `build_indices.sh` 16 | 17 | In this step, we build the `index_R32` and `index_R34` (i.e., 33/34) index files. 18 | - The directory `./scripts/indices/sift_R32/disk_init` contains 6 files. 19 | - The directory `./scripts/indices/sift_R34/disk_init` contains 7 files (including the topology file): `_index_disk.index`, `_index_disk.index.tags`, `_index_disk.index_with_only_nbrs`, `_index_pq_compressed.bin`, `_index_pq_pivots.bin`, `_index_sample_data.bin`, `_index_sample_ids.bin`. 20 | 21 | --- 22 | 23 | ### Step 3: Run `overall_performance.sh` 24 | 25 | In this step, we evaluate the overall performance of the `index_R34` (i.e., 33/34) index. For each update round, we calculate the recall and update time. 26 | We focus on the output values of **Recall** and the update time indicated by messages like: 27 | `Merge_kernel use 6.09201 s.` 28 | 29 | It is worth noting that Greator’s update algorithms support asynchronous updates. However, to more accurately evaluate the update algorithm’s performance and the recall after updates, we adopt a synchronous update approach. Using asynchronous updates makes it difficult to precisely assess the update throughput and the quality of the updated index. 
30 | 31 | -------------------------------------------------------------------------------- /scripts/pre_dataset/readme.md: -------------------------------------------------------------------------------- 1 | ### Streaming Update Trace Generator (make_trace.cpp) 2 | 3 | This script is designed to generate trace files for **streaming update scenarios** in dynamic vector indexing experiments. 4 | 5 | To ensure **standardization** and **reproducibility** of results, we adopt the following experimental setup: 6 | 7 | - **Only the first 95% of the vectors** (i.e., IDs from `0` to `0.95×N − 1`) are used to construct the initial index. 8 | - In each update round: 9 | - The earliest **0.1%** of vectors in the current index are **deleted**. 10 | - A new **0.1%** of vectors are **inserted**. 11 | - This simulates a typical **sliding-window update pattern** and supports 50 rounds of streaming updates. 12 | 13 | #### Example 14 | 15 | Suppose the total number of vectors is **1,000,000**: 16 | 17 | - The initial index is built from vectors with IDs ranging from `0` to `949,999`. 18 | - In the **first update**: 19 | - The system **deletes** vectors with IDs from `0` to `999`. 20 | - Then **inserts** vectors with IDs from `950,000` to `950,999`. 21 | 22 | ### Base_95 Generator (extract_base.cpp) 23 | 24 | To accurately reproduce the experimental results, no randomization is introduced during the preprocessing stage. Therefore, the extract_base.cpp file simply extracts the first 95% of vectors from the dataset (assuming 50 update rounds with 0.1% updates per round). 25 | 26 | ### Ground Truth File Generator for Each Update Round (compute_knn.cpp) 27 | 28 | To accurately calculate the recall after each update round, the ground truth files for each round need to be generated in advance. 29 | 30 | ### Tags File Generator (make_tags.cpp) 31 | 32 | This script is designed to generate tags files for **streaming update scenarios** in dynamic vector indexing experiments. 
33 | 34 | ### Process Index_R32 as Index_R34 (process_index_for_diffR.cpp) 35 | 36 | This script is designed to process the index_R32 file to generate index_R34 file. 37 | 38 | ### Extract Topology File from Index_R34 (topology_extraction.cpp) 39 | 40 | This script is designed to extract the topology file from the index_R34 file. -------------------------------------------------------------------------------- /src/v2/fs_allocator.cpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "v2/fs_allocator.h" 4 | #include "tsl/robin_set.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "utils.h" 11 | #include "logger.h" 12 | 13 | namespace diskann { 14 | 
  /**
   * Builds a pool of fixed-size slots and marks every slot free.
   *
   * @param ndims     number of T elements per slot; asserted to be a multiple
   *                  of 32.
   * @param max_count requested slot count; rounded up to a multiple of 32.
   *
   * allocate() returns `buf + id * ndims`, so the buffer has to hold
   * `count * ndims` elements and the object has to remember `ndims`.
   */
  template<typename T>
  FixedSizeAlignedAllocator<T>::FixedSizeAlignedAllocator(
      const uint32_t ndims, const uint32_t max_count) {
    assert(IS_ALIGNED(ndims, 32));
    // The parameter shadows the member of the same name, which allocate() and
    // deallocate() read; it was previously never stored.
    // NOTE(review): the class declaration is not visible here; this assumes
    // `ndims` is a plain non-const data member -- confirm in
    // v2/fs_allocator.h.
    this->ndims = ndims;
    this->count = ROUND_UP(max_count, 32);
    // Size the buffer for every slot (count * ndims elements), in 64-bit
    // arithmetic so large pools do not wrap.
    alloc_aligned((void**) &this->buf,
                  static_cast<uint64_t>(this->count) * ndims * sizeof(T), 32);
    for (uint32_t slot = 0; slot < this->count; ++slot) {
      this->free_set.insert(slot);
    }
  }

  /**
   * Releases the backing buffer. Asserts that every slot has been handed
   * back first.
   */
  template<typename T>
  FixedSizeAlignedAllocator<T>::~FixedSizeAlignedAllocator() {
    std::lock_guard<decltype(this->lock)> lk(this->lock);
    assert(this->free_set.size() == this->count);
    aligned_free(this->buf);
  }

  /**
   * Hands out one free slot.
   *
   * @return pointer to the first element of the slot, or nullptr (after
   *         logging to std::cerr) when no slot is free.
   */
  template<typename T>
  T* FixedSizeAlignedAllocator<T>::allocate() {
    std::lock_guard<decltype(this->lock)> lk(this->lock);
    if (this->free_set.empty()) {
      std::cerr << "UNABLE TO ALLOCATE MEMORY" << std::endl;
      return nullptr;
    }
    const uint32_t id = *(this->free_set.begin());
    this->free_set.erase(id);
    // 64-bit offset: id * ndims can exceed 32 bits for large pools.
    return this->buf + (static_cast<uint64_t>(id) * this->ndims);
  }

  /**
   * Returns a slot obtained from allocate() to the pool.
   *
   * @param ptr slot pointer; nullptr (allocate()'s failure value) is ignored.
   */
  template<typename T>
  void FixedSizeAlignedAllocator<T>::deallocate(T* ptr) {
    if (ptr == nullptr) {
      return;
    }
    assert(IS_ALIGNED(ptr, 32));
    assert(ptr >= this->buf);
    // Divide before narrowing so element offsets above 2^32 map to the right
    // slot id.
    const uint32_t id = static_cast<uint32_t>(
        static_cast<uint64_t>(ptr - this->buf) / this->ndims);
    assert(id < this->count);
    std::lock_guard<decltype(this->lock)> lk(this->lock);
    this->free_set.insert(id);
  }

58 | 59 | // vectors 60 | template class FixedSizeAlignedAllocator; 61 | template class FixedSizeAlignedAllocator; 62 | template class FixedSizeAlignedAllocator; 63 | // nhoods 64 | template class FixedSizeAlignedAllocator; 65 | } // namespace diskann 66 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/study/process.md: -------------------------------------------------------------------------------- 1 | # 加油吧!!! 2 | # Overall_performance 运行过程 3 | ## 1. 设置参数 4 | ## 2. Build 5 | > build(learn.bin,L_mem, R_mem, alpha_mem, L_disk, R_disk,alpha_disk, num_start, num_shards, num_pq_chunks,nodes_to_cache, save_path) 6 | ### 2.1 读取data_learn.bin -> data_load 7 | ### 2.2 创建tags(num=num_start,value=0),并保存到save_path+"_disk.index.tags" 8 | ## 3. update 9 | > update(full_data_path, L_mem, R_mem, alpha_mem, L_disk,R_disk, alpha_disk, step, num_start,num_pq_chunks, nodes_to_cache, save_path,query_file, truthset, recall_at, Lsearch,beam_width, trace_file_prefix, &dist_cmp) 10 | ### 3.1 定义MergeInsert类型变量sync_index,并读取disk/_index所有索引文件 11 | > (paras, dim, save_path + "_mem",save_path, save_path + "_merge",dist_cmp, metric, false, save_path) 12 | ### 3.2 读取query文件 #pts = 10000, #dims = 128, aligned_dim = 128 13 | ### 3.3 获取当前的truthfile_name(不计算recall就不用读) 14 | ### 3.4 搜索设置每个查询点的邻居——sync_search_kernel(),并保存到result_overall_spacev_diskann{cur_time}.bin中 15 | ### 3.5 批次处理迭代——for batch 16 | ### 3.5.1 通过_trace_i文件获得update_size、insert_ids、delete_ids,然后通过learn.bin获得ids对应的向量 17 | ### 3.5.2 异步执行 insertion_kernel() 和 deletion_kernel() 18 | #### 3.5.2.1 insertion_kernel() 19 | > 1.遍历insert_vec里的每个元素,执行insert操作。 20 | >> * 等到写锁消失,即可以insert(写) 21 | >> * 上写锁 22 | >> * 寻找可以用的mem_index(0/1) 23 | >> * 执行insert_point 操作: 24 | >>> A. 对于重复插入的点,先删除(delete_set.insert+location_to_tag.erase+tag_to_location.erase) 25 | >>>> B. 
获取插入的locations,更新tag_to_location和tag_to_location_size 26 | >>>> C. 获得插入点的邻居节点,剪枝处理 27 | >>>> D. 将剪枝后的邻居加到graph[location]中,并且添加反向边。 28 | #### 3.5.2.2 delete_kernel() 29 | > 1.遍历delete_vec里的每个元素,执行delete操作。 30 | >> * 上删锁 31 | >> * 寻找可以用的mem_index(0/1) 32 | >> * 执行lazy_delete 操作:添加到_deletion_set_1/0中,和mem_index_0/1的_delete_set 33 | 34 | 35 | 36 | 37 | save_del_set():将merge_insert的_deletion_set_1/0 保存到_deleted_tags_vector,保存前先清空。 38 | switch_index()中的save()是将mem_index的_delete_set 保存到内存文件,但不清空delete_set. 39 | 故deleted_tags存的是内存中 40 | 41 | mem_data存的实际上就是插入的点。 42 | 43 | 44 | process_deletes() 问题: 45 | 1. #586 对于检查medoid,仅检查了第0个。且对于含删除标记的medoid的处理欠妥,代码的做法是从medoid[0]的邻居中选择第0个作为新的medoid。这里直接选择邻居是没问题的,因为已经去除了所有点的含删除标记的邻居。 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/src/v2/fs_allocator.cpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "v2/fs_allocator.h" 4 | #include "tsl/robin_set.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "utils.h" 11 | #include "logger.h" 12 | 13 | namespace diskann { 14 | 15 | template 16 | FixedSizeAlignedAllocator::FixedSizeAlignedAllocator(const uint32_t ndims, const uint32_t max_count) { 17 | assert(IS_ALIGNED(ndims, 32)); 18 | this->count = ROUND_UP(max_count, 32); 19 | alloc_aligned( (void**)&this->buf, this->count * sizeof(T), 32); 20 | std::vector ids(this->count); 21 | std::iota(ids.begin(), ids.end(), 0); 22 | this->free_set.insert(ids.begin(), ids.end()); 23 | ids.clear(); 24 | } 25 | 26 | template 27 | FixedSizeAlignedAllocator::~FixedSizeAlignedAllocator() { 28 | std::lock_guard lk(this->lock); 29 | assert(this->free_set.size() == this->count); 30 | aligned_free(this->buf); 31 | } 32 | 33 | template 34 | T* FixedSizeAlignedAllocator::allocate(){ 35 | std::lock_guard lk(this->lock); 36 | uint32_t id = std::numeric_limits::max(); 37 | for(auto &v : this->free_set) { 38 
| id = v; 39 | break; 40 | } 41 | 42 | if(id == std::numeric_limits::max()) { 43 | std::cerr << "UNABLE TO ALLOCATE MEMORY" << std::endl; 44 | return nullptr; 45 | } else{ 46 | this->free_set.erase(id); 47 | } 48 | return this->buf + (id * ndims); 49 | } 50 | 51 | template 52 | void FixedSizeAlignedAllocator::deallocate(T* ptr) { 53 | assert(IS_ALIGNED(ptr, 32)); 54 | uint32_t id = (uint32_t) (ptr - this->buf) / ndims; 55 | std::lock_guard lk(this->lock); 56 | this->free_set.insert(id); 57 | } 58 | 59 | // vectors 60 | template class FixedSizeAlignedAllocator; 61 | template class FixedSizeAlignedAllocator; 62 | template class FixedSizeAlignedAllocator; 63 | // nhoods 64 | template class FixedSizeAlignedAllocator; 65 | } // namespace diskann 66 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/src/v2/fs_allocator.cpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "v2/fs_allocator.h" 4 | #include "tsl/robin_set.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "utils.h" 11 | #include "logger.h" 12 | 13 | namespace diskann { 14 | 15 | template 16 | FixedSizeAlignedAllocator::FixedSizeAlignedAllocator(const uint32_t ndims, const uint32_t max_count) { 17 | assert(IS_ALIGNED(ndims, 32)); 18 | this->count = ROUND_UP(max_count, 32); 19 | alloc_aligned( (void**)&this->buf, this->count * sizeof(T), 32); 20 | std::vector ids(this->count); 21 | std::iota(ids.begin(), ids.end(), 0); 22 | this->free_set.insert(ids.begin(), ids.end()); 23 | ids.clear(); 24 | } 25 | 26 | template 27 | FixedSizeAlignedAllocator::~FixedSizeAlignedAllocator() { 28 | std::lock_guard lk(this->lock); 29 | assert(this->free_set.size() == this->count); 30 | aligned_free(this->buf); 31 | } 32 | 33 | template 34 | T* FixedSizeAlignedAllocator::allocate(){ 35 | std::lock_guard lk(this->lock); 36 | uint32_t id = std::numeric_limits::max(); 37 | for(auto &v : 
this->free_set) { 38 | id = v; 39 | break; 40 | } 41 | 42 | if(id == std::numeric_limits::max()) { 43 | std::cerr << "UNABLE TO ALLOCATE MEMORY" << std::endl; 44 | return nullptr; 45 | } else{ 46 | this->free_set.erase(id); 47 | } 48 | return this->buf + (id * ndims); 49 | } 50 | 51 | template 52 | void FixedSizeAlignedAllocator::deallocate(T* ptr) { 53 | assert(IS_ALIGNED(ptr, 32)); 54 | uint32_t id = (uint32_t) (ptr - this->buf) / ndims; 55 | std::lock_guard lk(this->lock); 56 | this->free_set.insert(id); 57 | } 58 | 59 | // vectors 60 | template class FixedSizeAlignedAllocator; 61 | template class FixedSizeAlignedAllocator; 62 | template class FixedSizeAlignedAllocator; 63 | // nhoods 64 | template class FixedSizeAlignedAllocator; 65 | } // namespace diskann 66 | -------------------------------------------------------------------------------- /include/percentile_stats.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
namespace diskann {
  /// Counters recorded for one query; every field starts at zero.
  struct QueryStats {
    double total_us = 0;        // total time to process query in micros
    double n_4k = 0;            // # of 4kB reads
    double n_8k = 0;            // # of 8kB reads
    double n_12k = 0;           // # of 12kB reads
    double n_ios = 0;           // total # of IOs issued
    double read_size = 0;       // total # of bytes read
    double io_us = 0;           // total time spent in IO
    double cpu_us = 0;          // total time spent in CPU
    double n_cmps_saved = 0;    // # cmps saved
    double n_cmps = 0;          // # cmps
    double n_cache_hits = 0;    // # cache_hits
    double n_hops = 0;          // # search hops
    double n_current_used = 0;  // # force return for latency limit
  };

  /**
   * Returns the value at rank floor(percentile * len) among the `len` values
   * that `member_fn` extracts from `stats`, after sorting them ascending.
   *
   * @param stats      array of at least `len` entries.
   * @param len        number of entries to consider.
   * @param percentile fraction in [0, 1]; the rank is clamped to the valid
   *                   range, so values >= 1 select the maximum and values
   *                   <= 0 (or NaN) select the minimum.
   * @param member_fn  extracts the statistic of interest from one entry.
   * @return the selected value, or 0 when `len` is 0.
   */
  inline double get_percentile_stats(
      QueryStats *stats, uint64_t len, float percentile,
      const std::function<double(const QueryStats &)> &member_fn) {
    if (len == 0) {
      return 0.0;  // nothing to rank; previously an out-of-bounds read
    }

    std::vector<double> vals(len);
    for (uint64_t i = 0; i < len; i++) {
      vals[i] = member_fn(stats[i]);
    }
    std::sort(vals.begin(), vals.end());

    // Same rank formula as before for in-range input; the clamp keeps
    // percentile == 1.0 (rank == len) and negative input inside the vector.
    const float scaled = percentile * static_cast<float>(len);
    uint64_t    rank = 0;
    if (scaled >= static_cast<float>(len)) {
      rank = len - 1;
    } else if (scaled > 0.0f) {
      rank = static_cast<uint64_t>(scaled);
    }
    if (rank >= len) {
      rank = len - 1;
    }
    return vals[rank];
  }

  /**
   * Returns the arithmetic mean of the values `member_fn` extracts from the
   * first `len` entries of `stats`. With `len` == 0 the result is 0.0 / 0.0.
   */
  inline double get_mean_stats(
      QueryStats *stats, uint64_t len,
      const std::function<double(const QueryStats &)> &member_fn) {
    double total = 0;
    for (uint64_t i = 0; i < len; i++) {
      total += member_fn(stats[i]);
    }
    return total / static_cast<double>(len);
  }
}  // namespace diskann
namespace diskann {
  /// Counters recorded for one query; every field starts at zero.
  struct QueryStats {
    double total_us = 0;        // total time to process query in micros
    double n_4k = 0;            // # of 4kB reads
    double n_8k = 0;            // # of 8kB reads
    double n_12k = 0;           // # of 12kB reads
    double n_ios = 0;           // total # of IOs issued
    double read_size = 0;       // total # of bytes read
    double io_us = 0;           // total time spent in IO
    double cpu_us = 0;          // total time spent in CPU
    double n_cmps_saved = 0;    // # cmps saved
    double n_cmps = 0;          // # cmps
    double n_cache_hits = 0;    // # cache_hits
    double n_hops = 0;          // # search hops
    double n_current_used = 0;  // # force return for latency limit
  };

  /**
   * Returns the value at rank floor(percentile * len) among the `len` values
   * that `member_fn` extracts from `stats`, after sorting them ascending.
   *
   * @param stats      array of at least `len` entries.
   * @param len        number of entries to consider.
   * @param percentile fraction in [0, 1]; the rank is clamped to the valid
   *                   range, so values >= 1 select the maximum and values
   *                   <= 0 (or NaN) select the minimum.
   * @param member_fn  extracts the statistic of interest from one entry.
   * @return the selected value, or 0 when `len` is 0.
   */
  inline double get_percentile_stats(
      QueryStats *stats, uint64_t len, float percentile,
      const std::function<double(const QueryStats &)> &member_fn) {
    if (len == 0) {
      return 0.0;  // nothing to rank; previously an out-of-bounds read
    }

    std::vector<double> vals(len);
    for (uint64_t i = 0; i < len; i++) {
      vals[i] = member_fn(stats[i]);
    }
    std::sort(vals.begin(), vals.end());

    // Same rank formula as before for in-range input; the clamp keeps
    // percentile == 1.0 (rank == len) and negative input inside the vector.
    const float scaled = percentile * static_cast<float>(len);
    uint64_t    rank = 0;
    if (scaled >= static_cast<float>(len)) {
      rank = len - 1;
    } else if (scaled > 0.0f) {
      rank = static_cast<uint64_t>(scaled);
    }
    if (rank >= len) {
      rank = len - 1;
    }
    return vals[rank];
  }

  /**
   * Returns the arithmetic mean of the values `member_fn` extracts from the
   * first `len` entries of `stats`. With `len` == 0 the result is 0.0 / 0.0.
   */
  inline double get_mean_stats(
      QueryStats *stats, uint64_t len,
      const std::function<double(const QueryStats &)> &member_fn) {
    double total = 0;
    for (uint64_t i = 0; i < len; i++) {
      total += member_fn(stats[i]);
    }
    return total / static_cast<double>(len);
  }
}  // namespace diskann
namespace diskann {
  /// Counters recorded for one query; every field starts at zero.
  struct QueryStats {
    double total_us = 0;        // total time to process query in micros
    double n_4k = 0;            // # of 4kB reads
    double n_8k = 0;            // # of 8kB reads
    double n_12k = 0;           // # of 12kB reads
    double n_ios = 0;           // total # of IOs issued
    double read_size = 0;       // total # of bytes read
    double io_us = 0;           // total time spent in IO
    double cpu_us = 0;          // total time spent in CPU
    double n_cmps_saved = 0;    // # cmps saved
    double n_cmps = 0;          // # cmps
    double n_cache_hits = 0;    // # cache_hits
    double n_hops = 0;          // # search hops
    double n_current_used = 0;  // # force return for latency limit
  };

  /**
   * Returns the value at rank floor(percentile * len) among the `len` values
   * that `member_fn` extracts from `stats`, after sorting them ascending.
   *
   * @param stats      array of at least `len` entries.
   * @param len        number of entries to consider.
   * @param percentile fraction in [0, 1]; the rank is clamped to the valid
   *                   range, so values >= 1 select the maximum and values
   *                   <= 0 (or NaN) select the minimum.
   * @param member_fn  extracts the statistic of interest from one entry.
   * @return the selected value, or 0 when `len` is 0.
   */
  inline double get_percentile_stats(
      QueryStats *stats, uint64_t len, float percentile,
      const std::function<double(const QueryStats &)> &member_fn) {
    if (len == 0) {
      return 0.0;  // nothing to rank; previously an out-of-bounds read
    }

    std::vector<double> vals(len);
    for (uint64_t i = 0; i < len; i++) {
      vals[i] = member_fn(stats[i]);
    }
    std::sort(vals.begin(), vals.end());

    // Same rank formula as before for in-range input; the clamp keeps
    // percentile == 1.0 (rank == len) and negative input inside the vector.
    const float scaled = percentile * static_cast<float>(len);
    uint64_t    rank = 0;
    if (scaled >= static_cast<float>(len)) {
      rank = len - 1;
    } else if (scaled > 0.0f) {
      rank = static_cast<uint64_t>(scaled);
    }
    if (rank >= len) {
      rank = len - 1;
    }
    return vals[rank];
  }

  /**
   * Returns the arithmetic mean of the values `member_fn` extracts from the
   * first `len` entries of `stats`. With `len` == 0 the result is 0.0 / 0.0.
   */
  inline double get_mean_stats(
      QueryStats *stats, uint64_t len,
      const std::function<double(const QueryStats &)> &member_fn) {
    double total = 0;
    for (uint64_t i = 0; i < len; i++) {
      total += member_fn(stats[i]);
    }
    return total / static_cast<double>(len);
  }
}  // namespace diskann
All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ifstream& reader, std::ofstream& writer, _u32* read_buf, 8 | _u32* write_buf, _u64 npts, _u64 ndims) { 9 | reader.read((char*) read_buf, 10 | npts * (ndims * sizeof(_u32) + sizeof(unsigned))); 11 | for (_u64 i = 0; i < npts; i++) { 12 | memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, 13 | ndims * sizeof(_u32)); 14 | } 15 | writer.write((char*) write_buf, npts * ndims * sizeof(_u32)); 16 | } 17 | 18 | int main(int argc, char** argv) { 19 | if (argc != 3) { 20 | diskann::cout << argv[0] << " input_ivecs output_bin" << std::endl; 21 | exit(-1); 22 | } 23 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 24 | _u64 fsize = reader.tellg(); 25 | reader.seekg(0, std::ios::beg); 26 | 27 | unsigned ndims_u32; 28 | reader.read((char*) &ndims_u32, sizeof(unsigned)); 29 | reader.seekg(0, std::ios::beg); 30 | _u64 ndims = (_u64) ndims_u32; 31 | _u64 npts = fsize / ((ndims + 1) * sizeof(_u32)); 32 | diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 33 | << std::endl; 34 | 35 | _u64 blk_size = 131072; 36 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 37 | diskann::cout << "# blks: " << nblks << std::endl; 38 | std::ofstream writer(argv[2], std::ios::binary); 39 | int npts_s32 = (_s32) npts; 40 | int ndims_s32 = (_s32) ndims; 41 | writer.write((char*) &npts_s32, sizeof(_s32)); 42 | writer.write((char*) &ndims_s32, sizeof(_s32)); 43 | _u32* read_buf = new _u32[npts * (ndims + 1)]; 44 | _u32* write_buf = new _u32[npts * ndims]; 45 | for (_u64 i = 0; i < nblks; i++) { 46 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 47 | block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims); 48 | diskann::cout << "Block #" << i << " written" << std::endl; 49 | } 50 | 51 | delete[] read_buf; 52 | delete[] write_buf; 53 | 54 | reader.close(); 55 | writer.close(); 56 | } 57 | 
-------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/ivecs_to_bin.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ifstream& reader, std::ofstream& writer, _u32* read_buf, 8 | _u32* write_buf, _u64 npts, _u64 ndims) { 9 | reader.read((char*) read_buf, 10 | npts * (ndims * sizeof(_u32) + sizeof(unsigned))); 11 | for (_u64 i = 0; i < npts; i++) { 12 | memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, 13 | ndims * sizeof(_u32)); 14 | } 15 | writer.write((char*) write_buf, npts * ndims * sizeof(_u32)); 16 | } 17 | 18 | int main(int argc, char** argv) { 19 | if (argc != 3) { 20 | diskann::cout << argv[0] << " input_ivecs output_bin" << std::endl; 21 | exit(-1); 22 | } 23 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 24 | _u64 fsize = reader.tellg(); 25 | reader.seekg(0, std::ios::beg); 26 | 27 | unsigned ndims_u32; 28 | reader.read((char*) &ndims_u32, sizeof(unsigned)); 29 | reader.seekg(0, std::ios::beg); 30 | _u64 ndims = (_u64) ndims_u32; 31 | _u64 npts = fsize / ((ndims + 1) * sizeof(_u32)); 32 | diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 33 | << std::endl; 34 | 35 | _u64 blk_size = 131072; 36 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 37 | diskann::cout << "# blks: " << nblks << std::endl; 38 | std::ofstream writer(argv[2], std::ios::binary); 39 | int npts_s32 = (_s32) npts; 40 | int ndims_s32 = (_s32) ndims; 41 | writer.write((char*) &npts_s32, sizeof(_s32)); 42 | writer.write((char*) &ndims_s32, sizeof(_s32)); 43 | _u32* read_buf = new _u32[npts * (ndims + 1)]; 44 | _u32* write_buf = new _u32[npts * ndims]; 45 | for (_u64 i = 0; i < nblks; i++) { 46 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 47 | 
block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims); 48 | diskann::cout << "Block #" << i << " written" << std::endl; 49 | } 50 | 51 | delete[] read_buf; 52 | delete[] write_buf; 53 | 54 | reader.close(); 55 | writer.close(); 56 | } 57 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/ivecs_to_bin.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ifstream& reader, std::ofstream& writer, _u32* read_buf, 8 | _u32* write_buf, _u64 npts, _u64 ndims) { 9 | reader.read((char*) read_buf, 10 | npts * (ndims * sizeof(_u32) + sizeof(unsigned))); 11 | for (_u64 i = 0; i < npts; i++) { 12 | memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, 13 | ndims * sizeof(_u32)); 14 | } 15 | writer.write((char*) write_buf, npts * ndims * sizeof(_u32)); 16 | } 17 | 18 | int main(int argc, char** argv) { 19 | if (argc != 3) { 20 | diskann::cout << argv[0] << " input_ivecs output_bin" << std::endl; 21 | exit(-1); 22 | } 23 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 24 | _u64 fsize = reader.tellg(); 25 | reader.seekg(0, std::ios::beg); 26 | 27 | unsigned ndims_u32; 28 | reader.read((char*) &ndims_u32, sizeof(unsigned)); 29 | reader.seekg(0, std::ios::beg); 30 | _u64 ndims = (_u64) ndims_u32; 31 | _u64 npts = fsize / ((ndims + 1) * sizeof(_u32)); 32 | diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 33 | << std::endl; 34 | 35 | _u64 blk_size = 131072; 36 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 37 | diskann::cout << "# blks: " << nblks << std::endl; 38 | std::ofstream writer(argv[2], std::ios::binary); 39 | int npts_s32 = (_s32) npts; 40 | int ndims_s32 = (_s32) ndims; 41 | writer.write((char*) &npts_s32, sizeof(_s32)); 42 | 
writer.write((char*) &ndims_s32, sizeof(_s32)); 43 | _u32* read_buf = new _u32[npts * (ndims + 1)]; 44 | _u32* write_buf = new _u32[npts * ndims]; 45 | for (_u64 i = 0; i < nblks; i++) { 46 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 47 | block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims); 48 | diskann::cout << "Block #" << i << " written" << std::endl; 49 | } 50 | 51 | delete[] read_buf; 52 | delete[] write_buf; 53 | 54 | reader.close(); 55 | writer.close(); 56 | } 57 | -------------------------------------------------------------------------------- /tests/utils/update_metadata.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ifstream& reader, std::ofstream& writer, 8 | float* read_buf, uint8_t* write_buf, _u64 npts, _u64 ndims) { 9 | reader.read((char*) read_buf, 10 | npts * (ndims * sizeof(float) + sizeof(unsigned))); 11 | for (_u64 i = 0; i < npts; i++) { 12 | memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, 13 | ndims * sizeof(float)); 14 | } 15 | writer.write((char*) write_buf, npts * ndims * sizeof(float)); 16 | } 17 | 18 | int main(int argc, char** argv) { 19 | if (argc != 3) { 20 | diskann::cout << argv[0] << " input_fvecs output_bin" << std::endl; 21 | exit(-1); 22 | } 23 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 24 | _u64 fsize = reader.tellg(); 25 | reader.seekg(0, std::ios::beg); 26 | 27 | unsigned ndims_u32; 28 | reader.read((char*) &ndims_u32, sizeof(unsigned)); 29 | reader.seekg(0, std::ios::beg); 30 | _u64 ndims = (_u64) ndims_u32; 31 | _u64 npts = fsize / ((ndims + 1) * sizeof(float)); 32 | diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 33 | << std::endl; 34 | 35 | _u64 blk_size = 131072; 36 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 
37 | diskann::cout << "# blks: " << nblks << std::endl; 38 | std::ofstream writer(argv[2], std::ios::binary); 39 | int npts_s32 = (_s32) npts; 40 | int ndims_s32 = (_s32) ndims; 41 | writer.write((char*) &npts_s32, sizeof(_s32)); 42 | writer.write((char*) &ndims_s32, sizeof(_s32)); 43 | float* read_buf = new float[npts * (ndims + 1)]; 44 | uint8_t* write_buf = new uint8_t[npts * ndims]; 45 | for (_u64 i = 0; i < nblks; i++) { 46 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 47 | block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims); 48 | diskann::cout << "Block #" << i << " written" << std::endl; 49 | } 50 | 51 | delete[] read_buf; 52 | delete[] write_buf; 53 | 54 | reader.close(); 55 | writer.close(); 56 | } 57 | -------------------------------------------------------------------------------- /tests/utils/float_bin_to_int8.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ofstream& writer, int8_t* write_buf, 8 | std::ifstream& reader, float* read_buf, _u64 npts, 9 | _u64 ndims, float bias, float scale) { 10 | reader.read((char*) read_buf, npts * ndims * sizeof(float)); 11 | 12 | for (_u64 i = 0; i < npts; i++) { 13 | for (_u64 d = 0; d < ndims; d++) { 14 | write_buf[d + i * ndims] = 15 | (int8_t) ((read_buf[d + i * ndims] - bias) * (256.0 / scale)); 16 | } 17 | } 18 | writer.write((char*) write_buf, npts * ndims); 19 | } 20 | 21 | int main(int argc, char** argv) { 22 | if (argc != 5) { 23 | std::cout << "Usage: " << argv[0] << " input_bin output_tsv bias scale" 24 | << std::endl; 25 | exit(-1); 26 | } 27 | 28 | std::ifstream reader(argv[1], std::ios::binary); 29 | _u32 npts_u32; 30 | _u32 ndims_u32; 31 | reader.read((char*) &npts_u32, sizeof(_s32)); 32 | reader.read((char*) &ndims_u32, sizeof(_s32)); 33 | size_t npts = npts_u32; 34 | size_t ndims = ndims_u32; 35 | std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 36 | << std::endl; 37 | 38 | _u64 blk_size = 131072; 39 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 40 | 41 | std::ofstream writer(argv[2], std::ios::binary); 42 | auto read_buf = new float[blk_size * ndims]; 43 | auto write_buf = new int8_t[blk_size * ndims]; 44 | float bias = atof(argv[3]); 45 | float scale = atof(argv[4]); 46 | 47 | writer.write((char*) (&npts_u32), sizeof(_u32)); 48 | writer.write((char*) (&ndims_u32), sizeof(_u32)); 49 | 50 | for (_u64 i = 0; i < nblks; i++) { 51 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 52 | block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias, 53 | scale); 54 | std::cout << "Block #" << i << " written" << std::endl; 55 | } 56 | 57 | delete[] read_buf; 58 | delete[] write_buf; 59 | 60 | writer.close(); 61 | reader.close(); 62 | } 63 | -------------------------------------------------------------------------------- /tests/utils/fvecs_to_bin.cpp: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | #include "logger.h" 7 | 8 | void block_convert(std::ifstream& reader, std::ofstream& writer, 9 | float* read_buf, float* write_buf, _u64 npts, _u64 ndims) { 10 | reader.read((char*) read_buf, 11 | npts * (ndims * sizeof(float) + sizeof(unsigned))); 12 | for (_u64 i = 0; i < npts; i++) { 13 | memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, 14 | ndims * sizeof(float)); 15 | } 16 | writer.write((char*) write_buf, npts * ndims * sizeof(float)); 17 | } 18 | 19 | int main(int argc, char** argv) { 20 | if (argc != 3) { 21 | diskann::cout << argv[0] << " input_fvecs output_bin" << std::endl; 22 | exit(-1); 23 | } 24 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 25 | _u64 fsize = reader.tellg(); 26 | reader.seekg(0, std::ios::beg); 27 | 28 | unsigned ndims_u32; 29 | reader.read((char*) &ndims_u32, sizeof(unsigned)); 30 | reader.seekg(0, std::ios::beg); 31 | _u64 ndims = (_u64) ndims_u32; 32 | _u64 npts = fsize / ((ndims + 1) * sizeof(float)); 33 | diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 34 | << std::endl; 35 | 36 | _u64 blk_size = 131072; 37 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 38 | diskann::cout << "# blks: " << nblks << std::endl; 39 | std::ofstream writer(argv[2], std::ios::binary); 40 | int npts_s32 = (_s32) npts; 41 | int ndims_s32 = (_s32) ndims; 42 | writer.write((char*) &npts_s32, sizeof(_s32)); 43 | writer.write((char*) &ndims_s32, sizeof(_s32)); 44 | float* read_buf = new float[npts * (ndims + 1)]; 45 | float* write_buf = new float[npts * ndims]; 46 | for (_u64 i = 0; i < nblks; i++) { 47 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 48 | block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims); 49 | diskann::cout << "Block #" << i 
<< " written" << std::endl; 50 | } 51 | 52 | delete[] read_buf; 53 | delete[] write_buf; 54 | 55 | reader.close(); 56 | writer.close(); 57 | } 58 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/update_metadata.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ifstream& reader, std::ofstream& writer, 8 | float* read_buf, uint8_t* write_buf, _u64 npts, _u64 ndims) { 9 | reader.read((char*) read_buf, 10 | npts * (ndims * sizeof(float) + sizeof(unsigned))); 11 | for (_u64 i = 0; i < npts; i++) { 12 | memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, 13 | ndims * sizeof(float)); 14 | } 15 | writer.write((char*) write_buf, npts * ndims * sizeof(float)); 16 | } 17 | 18 | int main(int argc, char** argv) { 19 | if (argc != 3) { 20 | diskann::cout << argv[0] << " input_fvecs output_bin" << std::endl; 21 | exit(-1); 22 | } 23 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 24 | _u64 fsize = reader.tellg(); 25 | reader.seekg(0, std::ios::beg); 26 | 27 | unsigned ndims_u32; 28 | reader.read((char*) &ndims_u32, sizeof(unsigned)); 29 | reader.seekg(0, std::ios::beg); 30 | _u64 ndims = (_u64) ndims_u32; 31 | _u64 npts = fsize / ((ndims + 1) * sizeof(float)); 32 | diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 33 | << std::endl; 34 | 35 | _u64 blk_size = 131072; 36 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 37 | diskann::cout << "# blks: " << nblks << std::endl; 38 | std::ofstream writer(argv[2], std::ios::binary); 39 | int npts_s32 = (_s32) npts; 40 | int ndims_s32 = (_s32) ndims; 41 | writer.write((char*) &npts_s32, sizeof(_s32)); 42 | writer.write((char*) &ndims_s32, sizeof(_s32)); 43 | float* read_buf = new float[npts * (ndims + 1)]; 44 | 
uint8_t* write_buf = new uint8_t[npts * ndims]; 45 | for (_u64 i = 0; i < nblks; i++) { 46 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 47 | block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims); 48 | diskann::cout << "Block #" << i << " written" << std::endl; 49 | } 50 | 51 | delete[] read_buf; 52 | delete[] write_buf; 53 | 54 | reader.close(); 55 | writer.close(); 56 | } 57 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/update_metadata.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ifstream& reader, std::ofstream& writer, 8 | float* read_buf, uint8_t* write_buf, _u64 npts, _u64 ndims) { 9 | reader.read((char*) read_buf, 10 | npts * (ndims * sizeof(float) + sizeof(unsigned))); 11 | for (_u64 i = 0; i < npts; i++) { 12 | memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, 13 | ndims * sizeof(float)); 14 | } 15 | writer.write((char*) write_buf, npts * ndims * sizeof(float)); 16 | } 17 | 18 | int main(int argc, char** argv) { 19 | if (argc != 3) { 20 | diskann::cout << argv[0] << " input_fvecs output_bin" << std::endl; 21 | exit(-1); 22 | } 23 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 24 | _u64 fsize = reader.tellg(); 25 | reader.seekg(0, std::ios::beg); 26 | 27 | unsigned ndims_u32; 28 | reader.read((char*) &ndims_u32, sizeof(unsigned)); 29 | reader.seekg(0, std::ios::beg); 30 | _u64 ndims = (_u64) ndims_u32; 31 | _u64 npts = fsize / ((ndims + 1) * sizeof(float)); 32 | diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 33 | << std::endl; 34 | 35 | _u64 blk_size = 131072; 36 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 37 | diskann::cout << "# blks: " << nblks << std::endl; 38 | std::ofstream 
writer(argv[2], std::ios::binary); 39 | int npts_s32 = (_s32) npts; 40 | int ndims_s32 = (_s32) ndims; 41 | writer.write((char*) &npts_s32, sizeof(_s32)); 42 | writer.write((char*) &ndims_s32, sizeof(_s32)); 43 | float* read_buf = new float[npts * (ndims + 1)]; 44 | uint8_t* write_buf = new uint8_t[npts * ndims]; 45 | for (_u64 i = 0; i < nblks; i++) { 46 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 47 | block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims); 48 | diskann::cout << "Block #" << i << " written" << std::endl; 49 | } 50 | 51 | delete[] read_buf; 52 | delete[] write_buf; 53 | 54 | reader.close(); 55 | writer.close(); 56 | } 57 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/utils/float_bin_to_int8.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ofstream& writer, int8_t* write_buf, 8 | std::ifstream& reader, float* read_buf, _u64 npts, 9 | _u64 ndims, float bias, float scale) { 10 | reader.read((char*) read_buf, npts * ndims * sizeof(float)); 11 | 12 | for (_u64 i = 0; i < npts; i++) { 13 | for (_u64 d = 0; d < ndims; d++) { 14 | write_buf[d + i * ndims] = 15 | (int8_t) ((read_buf[d + i * ndims] - bias) * (256.0 / scale)); 16 | } 17 | } 18 | writer.write((char*) write_buf, npts * ndims); 19 | } 20 | 21 | int main(int argc, char** argv) { 22 | if (argc != 5) { 23 | std::cout << "Usage: " << argv[0] << " input_bin output_tsv bias scale" 24 | << std::endl; 25 | exit(-1); 26 | } 27 | 28 | std::ifstream reader(argv[1], std::ios::binary); 29 | _u32 npts_u32; 30 | _u32 ndims_u32; 31 | reader.read((char*) &npts_u32, sizeof(_s32)); 32 | reader.read((char*) &ndims_u32, sizeof(_s32)); 33 | size_t npts = npts_u32; 34 | size_t ndims = ndims_u32; 35 | std::cout << 
"Dataset: #pts = " << npts << ", # dims = " << ndims 36 | << std::endl; 37 | 38 | _u64 blk_size = 131072; 39 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 40 | 41 | std::ofstream writer(argv[2], std::ios::binary); 42 | auto read_buf = new float[blk_size * ndims]; 43 | auto write_buf = new int8_t[blk_size * ndims]; 44 | float bias = atof(argv[3]); 45 | float scale = atof(argv[4]); 46 | 47 | writer.write((char*) (&npts_u32), sizeof(_u32)); 48 | writer.write((char*) (&ndims_u32), sizeof(_u32)); 49 | 50 | for (_u64 i = 0; i < nblks; i++) { 51 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 52 | block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias, 53 | scale); 54 | std::cout << "Block #" << i << " written" << std::endl; 55 | } 56 | 57 | delete[] read_buf; 58 | delete[] write_buf; 59 | 60 | writer.close(); 61 | reader.close(); 62 | } 63 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/float_bin_to_int8.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ofstream& writer, int8_t* write_buf, 8 | std::ifstream& reader, float* read_buf, _u64 npts, 9 | _u64 ndims, float bias, float scale) { 10 | reader.read((char*) read_buf, npts * ndims * sizeof(float)); 11 | 12 | for (_u64 i = 0; i < npts; i++) { 13 | for (_u64 d = 0; d < ndims; d++) { 14 | write_buf[d + i * ndims] = 15 | (int8_t) ((read_buf[d + i * ndims] - bias) * (256.0 / scale)); 16 | } 17 | } 18 | writer.write((char*) write_buf, npts * ndims); 19 | } 20 | 21 | int main(int argc, char** argv) { 22 | if (argc != 5) { 23 | std::cout << "Usage: " << argv[0] << " input_bin output_tsv bias scale" 24 | << std::endl; 25 | exit(-1); 26 | } 27 | 28 | std::ifstream reader(argv[1], std::ios::binary); 29 | _u32 npts_u32; 30 | _u32 ndims_u32; 31 | reader.read((char*) &npts_u32, sizeof(_s32)); 32 | reader.read((char*) &ndims_u32, sizeof(_s32)); 33 | size_t npts = npts_u32; 34 | size_t ndims = ndims_u32; 35 | std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 36 | << std::endl; 37 | 38 | _u64 blk_size = 131072; 39 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 40 | 41 | std::ofstream writer(argv[2], std::ios::binary); 42 | auto read_buf = new float[blk_size * ndims]; 43 | auto write_buf = new int8_t[blk_size * ndims]; 44 | float bias = atof(argv[3]); 45 | float scale = atof(argv[4]); 46 | 47 | writer.write((char*) (&npts_u32), sizeof(_u32)); 48 | writer.write((char*) (&ndims_u32), sizeof(_u32)); 49 | 50 | for (_u64 i = 0; i < nblks; i++) { 51 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 52 | block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias, 53 | scale); 54 | std::cout << "Block #" << i << " written" << std::endl; 55 | } 56 | 57 | delete[] read_buf; 58 | delete[] write_buf; 59 | 60 | writer.close(); 61 | reader.close(); 62 | } 63 | -------------------------------------------------------------------------------- 
/baseline/IP-DiskANN/tests/utils/fvecs_to_bin.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | #include "logger.h" 7 | 8 | void block_convert(std::ifstream& reader, std::ofstream& writer, 9 | float* read_buf, float* write_buf, _u64 npts, _u64 ndims) { 10 | reader.read((char*) read_buf, 11 | npts * (ndims * sizeof(float) + sizeof(unsigned))); 12 | for (_u64 i = 0; i < npts; i++) { 13 | memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, 14 | ndims * sizeof(float)); 15 | } 16 | writer.write((char*) write_buf, npts * ndims * sizeof(float)); 17 | } 18 | 19 | int main(int argc, char** argv) { 20 | if (argc != 3) { 21 | diskann::cout << argv[0] << " input_fvecs output_bin" << std::endl; 22 | exit(-1); 23 | } 24 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 25 | _u64 fsize = reader.tellg(); 26 | reader.seekg(0, std::ios::beg); 27 | 28 | unsigned ndims_u32; 29 | reader.read((char*) &ndims_u32, sizeof(unsigned)); 30 | reader.seekg(0, std::ios::beg); 31 | _u64 ndims = (_u64) ndims_u32; 32 | _u64 npts = fsize / ((ndims + 1) * sizeof(float)); 33 | diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 34 | << std::endl; 35 | 36 | _u64 blk_size = 131072; 37 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 38 | diskann::cout << "# blks: " << nblks << std::endl; 39 | std::ofstream writer(argv[2], std::ios::binary); 40 | int npts_s32 = (_s32) npts; 41 | int ndims_s32 = (_s32) ndims; 42 | writer.write((char*) &npts_s32, sizeof(_s32)); 43 | writer.write((char*) &ndims_s32, sizeof(_s32)); 44 | float* read_buf = new float[npts * (ndims + 1)]; 45 | float* write_buf = new float[npts * ndims]; 46 | for (_u64 i = 0; i < nblks; i++) { 47 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 48 | block_convert(reader, writer, read_buf, write_buf, 
cblk_size, ndims); 49 | diskann::cout << "Block #" << i << " written" << std::endl; 50 | } 51 | 52 | delete[] read_buf; 53 | delete[] write_buf; 54 | 55 | reader.close(); 56 | writer.close(); 57 | } 58 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/utils/fvecs_to_bin.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | #include "logger.h" 7 | 8 | void block_convert(std::ifstream& reader, std::ofstream& writer, 9 | float* read_buf, float* write_buf, _u64 npts, _u64 ndims) { 10 | reader.read((char*) read_buf, 11 | npts * (ndims * sizeof(float) + sizeof(unsigned))); 12 | for (_u64 i = 0; i < npts; i++) { 13 | memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, 14 | ndims * sizeof(float)); 15 | } 16 | writer.write((char*) write_buf, npts * ndims * sizeof(float)); 17 | } 18 | 19 | int main(int argc, char** argv) { 20 | if (argc != 3) { 21 | diskann::cout << argv[0] << " input_fvecs output_bin" << std::endl; 22 | exit(-1); 23 | } 24 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 25 | _u64 fsize = reader.tellg(); 26 | reader.seekg(0, std::ios::beg); 27 | 28 | unsigned ndims_u32; 29 | reader.read((char*) &ndims_u32, sizeof(unsigned)); 30 | reader.seekg(0, std::ios::beg); 31 | _u64 ndims = (_u64) ndims_u32; 32 | _u64 npts = fsize / ((ndims + 1) * sizeof(float)); 33 | diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 34 | << std::endl; 35 | 36 | _u64 blk_size = 131072; 37 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 38 | diskann::cout << "# blks: " << nblks << std::endl; 39 | std::ofstream writer(argv[2], std::ios::binary); 40 | int npts_s32 = (_s32) npts; 41 | int ndims_s32 = (_s32) ndims; 42 | writer.write((char*) &npts_s32, sizeof(_s32)); 43 | writer.write((char*) 
&ndims_s32, sizeof(_s32)); 44 | float* read_buf = new float[npts * (ndims + 1)]; 45 | float* write_buf = new float[npts * ndims]; 46 | for (_u64 i = 0; i < nblks; i++) { 47 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 48 | block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims); 49 | diskann::cout << "Block #" << i << " written" << std::endl; 50 | } 51 | 52 | delete[] read_buf; 53 | delete[] write_buf; 54 | 55 | reader.close(); 56 | writer.close(); 57 | } 58 | -------------------------------------------------------------------------------- /include/v2/lock.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | class CASRWLock { 12 | public: 13 | CASRWLock() : lock(0) { 14 | } 15 | void ReadLock() { 16 | uint64_t i, n; 17 | uint64_t old_readers; 18 | for (;;) { 19 | old_readers = lock; 20 | if (old_readers != WLOCK && 21 | __sync_bool_compare_and_swap(&lock, old_readers, old_readers + 1)) { 22 | return; 23 | } 24 | for (n = 1; n < SPIN; n <<= 1) { 25 | for (i = 0; i < n; i++) { 26 | __asm__("pause"); 27 | } 28 | old_readers = lock; 29 | if (old_readers != WLOCK && 30 | __sync_bool_compare_and_swap(&lock, old_readers, old_readers + 1)) { 31 | return; 32 | } 33 | } 34 | sched_yield(); 35 | } 36 | } 37 | void WriteLock() { 38 | uint64_t i, n; 39 | for (;;) { 40 | if (lock == 0 && __sync_bool_compare_and_swap(&lock, 0, WLOCK)) { 41 | return; 42 | } 43 | for (n = 1; n < SPIN; n <<= 1) { 44 | for (i = 0; i < n; i++) { 45 | std::cout << "lock" << std::endl; 46 | __asm__("pause"); 47 | } 48 | if (lock == 0 && __sync_bool_compare_and_swap(&lock, 0, WLOCK)) { 49 | return; 50 | } 51 | } 52 | sched_yield(); 53 | } 54 | } 55 | void ReadUnLock() { 56 | uint64_t old_readers; 57 | old_readers = lock; 58 | if (old_readers == WLOCK) { 59 | lock = 0; 60 | return; 61 | } 62 | for (;;) { 63 | if 
(__sync_bool_compare_and_swap(&lock, old_readers, old_readers - 1)) { 64 | return; 65 | } 66 | old_readers = lock; 67 | } 68 | } 69 | void WriteUnlock() { 70 | uint64_t old_readers; 71 | old_readers = lock; 72 | if (old_readers == WLOCK) { 73 | lock = 0; 74 | return; 75 | } 76 | for (;;) { 77 | if (__sync_bool_compare_and_swap(&lock, old_readers, old_readers - 1)) { 78 | return; 79 | } 80 | old_readers = lock; 81 | } 82 | } 83 | 84 | private: 85 | static const uint64_t SPIN = 2048; 86 | static const uint64_t WLOCK = ((unsigned long) -1); 87 | uint64_t lock; 88 | }; 89 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/include/v2/lock.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | class CASRWLock { 12 | public: 13 | CASRWLock() : lock(0) { 14 | } 15 | void ReadLock() { 16 | uint64_t i, n; 17 | uint64_t old_readers; 18 | for (;;) { 19 | old_readers = lock; 20 | if (old_readers != WLOCK && 21 | __sync_bool_compare_and_swap(&lock, old_readers, old_readers + 1)) { 22 | return; 23 | } 24 | for (n = 1; n < SPIN; n <<= 1) { 25 | for (i = 0; i < n; i++) { 26 | __asm__("pause"); 27 | } 28 | old_readers = lock; 29 | if (old_readers != WLOCK && 30 | __sync_bool_compare_and_swap(&lock, old_readers, old_readers + 1)) { 31 | return; 32 | } 33 | } 34 | sched_yield(); 35 | } 36 | } 37 | void WriteLock() { 38 | uint64_t i, n; 39 | for (;;) { 40 | if (lock == 0 && __sync_bool_compare_and_swap(&lock, 0, WLOCK)) { 41 | return; 42 | } 43 | for (n = 1; n < SPIN; n <<= 1) { 44 | for (i = 0; i < n; i++) { 45 | std::cout << "lock" << std::endl; 46 | __asm__("pause"); 47 | } 48 | if (lock == 0 && __sync_bool_compare_and_swap(&lock, 0, WLOCK)) { 49 | return; 50 | } 51 | } 52 | sched_yield(); 53 | } 54 | } 55 | void ReadUnLock() { 56 | uint64_t old_readers; 57 | old_readers = 
lock; 58 | if (old_readers == WLOCK) { 59 | lock = 0; 60 | return; 61 | } 62 | for (;;) { 63 | if (__sync_bool_compare_and_swap(&lock, old_readers, old_readers - 1)) { 64 | return; 65 | } 66 | old_readers = lock; 67 | } 68 | } 69 | void WriteUnlock() { 70 | uint64_t old_readers; 71 | old_readers = lock; 72 | if (old_readers == WLOCK) { 73 | lock = 0; 74 | return; 75 | } 76 | for (;;) { 77 | if (__sync_bool_compare_and_swap(&lock, old_readers, old_readers - 1)) { 78 | return; 79 | } 80 | old_readers = lock; 81 | } 82 | } 83 | 84 | private: 85 | static const uint64_t SPIN = 2048; 86 | static const uint64_t WLOCK = ((unsigned long) -1); 87 | uint64_t lock; 88 | }; 89 | -------------------------------------------------------------------------------- /tests/build_disk_index.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include "omp.h" 5 | 6 | #include "aux_utils.h" 7 | #include "index.h" 8 | #include "math_utils.h" 9 | #include "partition_and_pq.h" 10 | #include "utils.h" 11 | 12 | template 13 | bool build_index(const char* dataFilePath, const char* indexFilePath, 14 | const char* indexBuildParameters, diskann::Metric m, 15 | bool singleFile) { 16 | return diskann::build_disk_index(dataFilePath, indexFilePath, 17 | indexBuildParameters, m, singleFile); 18 | } 19 | 20 | int main(int argc, char** argv) { 21 | if (argc != 11) { 22 | diskann::cout << "Usage: " << argv[0] 23 | << " " 24 | " " 25 | " ." 26 | " " 27 | " See README for more information on parameters." 28 | << std::endl; 29 | } else { 30 | std::string params = std::string(argv[4]) + " " + std::string(argv[5]) + 31 | " " + std::string(argv[6]) + " " + 32 | std::string(argv[7]) + " " + std::string(argv[8]); 33 | std::string dist_metric(argv[9]); 34 | bool single_file_index = std::atoi(argv[10]) != 0; 35 | 36 | diskann::Metric m = 37 | dist_metric == "cosine" ? 
diskann::Metric::COSINE : diskann::Metric::L2; 38 | if (dist_metric != "l2" && m == diskann::Metric::L2) { 39 | diskann::cout << "Metric " << dist_metric << " is not supported. Using L2" 40 | << std::endl; 41 | } 42 | if (std::string(argv[1]) == std::string("float")) 43 | build_index(argv[2], argv[3], params.c_str(), m, 44 | single_file_index); 45 | else if (std::string(argv[1]) == std::string("int8")) 46 | build_index(argv[2], argv[3], params.c_str(), m, 47 | single_file_index); 48 | else if (std::string(argv[1]) == std::string("uint8")) 49 | build_index(argv[2], argv[3], params.c_str(), m, 50 | single_file_index); 51 | else 52 | diskann::cout << "Error. wrong file type" << std::endl; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /baseline/IP-DiskANN/tests/build_disk_index.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include "omp.h" 5 | 6 | #include "aux_utils.h" 7 | #include "index.h" 8 | #include "math_utils.h" 9 | #include "partition_and_pq.h" 10 | #include "utils.h" 11 | 12 | template 13 | bool build_index(const char* dataFilePath, const char* indexFilePath, 14 | const char* indexBuildParameters, diskann::Metric m, 15 | bool singleFile) { 16 | return diskann::build_disk_index(dataFilePath, indexFilePath, 17 | indexBuildParameters, m, singleFile); 18 | } 19 | 20 | int main(int argc, char** argv) { 21 | if (argc != 11) { 22 | diskann::cout << "Usage: " << argv[0] 23 | << " " 24 | " " 25 | " ." 26 | " " 27 | " See README for more information on parameters." 
28 | << std::endl; 29 | } else { 30 | std::string params = std::string(argv[4]) + " " + std::string(argv[5]) + 31 | " " + std::string(argv[6]) + " " + 32 | std::string(argv[7]) + " " + std::string(argv[8]); 33 | std::string dist_metric(argv[9]); 34 | bool single_file_index = std::atoi(argv[10]) != 0; 35 | 36 | diskann::Metric m = 37 | dist_metric == "cosine" ? diskann::Metric::COSINE : diskann::Metric::L2; 38 | if (dist_metric != "l2" && m == diskann::Metric::L2) { 39 | diskann::cout << "Metric " << dist_metric << " is not supported. Using L2" 40 | << std::endl; 41 | } 42 | if (std::string(argv[1]) == std::string("float")) 43 | build_index(argv[2], argv[3], params.c_str(), m, 44 | single_file_index); 45 | else if (std::string(argv[1]) == std::string("int8")) 46 | build_index(argv[2], argv[3], params.c_str(), m, 47 | single_file_index); 48 | else if (std::string(argv[1]) == std::string("uint8")) 49 | build_index(argv[2], argv[3], params.c_str(), m, 50 | single_file_index); 51 | else 52 | diskann::cout << "Error. wrong file type" << std::endl; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /baseline/LM-DiskANN/tests/build_disk_index.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include "omp.h" 5 | 6 | #include "aux_utils.h" 7 | #include "index.h" 8 | #include "math_utils.h" 9 | #include "partition_and_pq.h" 10 | #include "utils.h" 11 | 12 | template 13 | bool build_index(const char* dataFilePath, const char* indexFilePath, 14 | const char* indexBuildParameters, diskann::Metric m, 15 | bool singleFile) { 16 | return diskann::build_disk_index(dataFilePath, indexFilePath, 17 | indexBuildParameters, m, singleFile); 18 | } 19 | 20 | int main(int argc, char** argv) { 21 | if (argc != 11) { 22 | diskann::cout << "Usage: " << argv[0] 23 | << " " 24 | " " 25 | " ." 
26 | " " 27 | " See README for more information on parameters." 28 | << std::endl; 29 | } else { 30 | std::string params = std::string(argv[4]) + " " + std::string(argv[5]) + 31 | " " + std::string(argv[6]) + " " + 32 | std::string(argv[7]) + " " + std::string(argv[8]); 33 | std::string dist_metric(argv[9]); 34 | bool single_file_index = std::atoi(argv[10]) != 0; 35 | 36 | diskann::Metric m = 37 | dist_metric == "cosine" ? diskann::Metric::COSINE : diskann::Metric::L2; 38 | if (dist_metric != "l2" && m == diskann::Metric::L2) { 39 | diskann::cout << "Metric " << dist_metric << " is not supported. Using L2" 40 | << std::endl; 41 | } 42 | if (std::string(argv[1]) == std::string("float")) 43 | build_index(argv[2], argv[3], params.c_str(), m, 44 | single_file_index); 45 | else if (std::string(argv[1]) == std::string("int8")) 46 | build_index(argv[2], argv[3], params.c_str(), m, 47 | single_file_index); 48 | else if (std::string(argv[1]) == std::string("uint8")) 49 | build_index(argv[2], argv[3], params.c_str(), m, 50 | single_file_index); 51 | else 52 | diskann::cout << "Error. wrong file type" << std::endl; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tests/utils/bin_to_tsv.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | template 8 | void block_convert(std::ofstream& writer, std::ifstream& reader, T* read_buf, 9 | _u64 npts, _u64 ndims) { 10 | reader.read((char*) read_buf, npts * ndims * sizeof(float)); 11 | 12 | for (_u64 i = 0; i < npts; i++) { 13 | for (_u64 d = 0; d < ndims; d++) { 14 | writer << read_buf[d + i * ndims]; 15 | if (d < ndims - 1) 16 | writer << "\t"; 17 | else 18 | writer << "\n"; 19 | } 20 | } 21 | } 22 | 23 | int main(int argc, char** argv) { 24 | if (argc != 4) { 25 | std::cout << argv[0] << " input_bin output_tsv" 26 | << std::endl; 27 | exit(-1); 28 | } 29 | std::string type_string(argv[1]); 30 | if ((type_string != std::string("float")) && 31 | (type_string != std::string("int8")) && 32 | (type_string != std::string("uin8"))) { 33 | std::cerr << "Error: type not supported. Use float/int8/uint8" << std::endl; 34 | } 35 | 36 | std::ifstream reader(argv[2], std::ios::binary); 37 | _u32 npts_u32; 38 | _u32 ndims_u32; 39 | reader.read((char*) &npts_u32, sizeof(_s32)); 40 | reader.read((char*) &ndims_u32, sizeof(_s32)); 41 | size_t npts = npts_u32; 42 | size_t ndims = ndims_u32; 43 | std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims 44 | << std::endl; 45 | 46 | _u64 blk_size = 131072; 47 | _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; 48 | 49 | std::ofstream writer(argv[3]); 50 | char* read_buf = new char[blk_size * ndims * 4]; 51 | for (_u64 i = 0; i < nblks; i++) { 52 | _u64 cblk_size = std::min(npts - i * blk_size, blk_size); 53 | if (type_string == std::string("float")) 54 | block_convert(writer, reader, (float*) read_buf, cblk_size, ndims); 55 | else if (type_string == std::string("int8")) 56 | block_convert(writer, reader, (int8_t*) read_buf, cblk_size, 57 | ndims); 58 | else if (type_string == std::string("uint8")) 59 | block_convert(writer, reader, (uint8_t*) read_buf, cblk_size, 60 | ndims); 61 | std::cout << "Block #" << i << " written" << std::endl; 62 | } 63 | 64 | delete[] read_buf; 
65 | 66 | writer.close(); 67 | reader.close(); 68 | } 69 | --------------------------------------------------------------------------------