├── .clang-format ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ ├── feature_request.md │ └── usage-question.md ├── PULL_REQUEST_TEMPLATE.md ├── actions │ ├── build │ │ └── action.yml │ ├── format-check │ │ └── action.yml │ ├── generate-high-dim-random │ │ └── action.yml │ ├── generate-random │ │ └── action.yml │ └── python-wheel │ │ └── action.yml └── workflows │ ├── build-python-pdoc.yml │ ├── build-python.yml │ ├── common.yml │ ├── disk-pq.yml │ ├── dynamic-labels.yml │ ├── dynamic.yml │ ├── in-mem-no-pq.yml │ ├── in-mem-pq.yml │ ├── labels.yml │ ├── multi-sector-disk-pq.yml │ ├── perf.yml │ ├── pr-test.yml │ ├── push-test.yml │ ├── python-release.yml │ └── unit-tests.yml ├── .gitignore ├── .gitmodules ├── AnyBuildLogs └── latest.txt ├── CMakeLists.txt ├── CMakeSettings.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── DockerfileDev ├── LICENSE ├── MANIFEST.in ├── NOTICE.txt ├── README.md ├── SECURITY.md ├── apps ├── CMakeLists.txt ├── build_disk_index.cpp ├── build_memory_index.cpp ├── build_stitched_index.cpp ├── python │ ├── README.md │ ├── requirements.txt │ └── restapi │ │ ├── __init__.py │ │ ├── disk_ann_util.py │ │ └── test_ssd_rest_api.py ├── range_search_disk_index.cpp ├── restapi │ ├── CMakeLists.txt │ ├── client.cpp │ ├── inmem_server.cpp │ ├── main.cpp │ ├── multiple_ssdindex_server.cpp │ └── ssd_server.cpp ├── search_disk_index.cpp ├── search_memory_index.cpp ├── test_insert_deletes_consolidate.cpp ├── test_streaming_scenario.cpp └── utils │ ├── CMakeLists.txt │ ├── bin_to_fvecs.cpp │ ├── bin_to_tsv.cpp │ ├── calculate_recall.cpp │ ├── compute_groundtruth.cpp │ ├── compute_groundtruth_for_filters.cpp │ ├── count_bfs_levels.cpp │ ├── create_disk_layout.cpp │ ├── float_bin_to_int8.cpp │ ├── fvecs_to_bin.cpp │ ├── fvecs_to_bvecs.cpp │ ├── gen_random_slice.cpp │ ├── generate_pq.cpp │ ├── generate_synthetic_labels.cpp │ ├── int8_to_float.cpp │ ├── int8_to_float_scale.cpp │ ├── ivecs_to_bin.cpp │ 
├── merge_shards.cpp │ ├── partition_data.cpp │ ├── partition_with_ram_budget.cpp │ ├── rand_data_gen.cpp │ ├── simulate_aggregate_recall.cpp │ ├── stats_label_data.cpp │ ├── tsv_to_bin.cpp │ ├── uint32_to_uint8.cpp │ ├── uint8_to_float.cpp │ └── vector_analysis.cpp ├── clang-format.cmake ├── include ├── abstract_data_store.h ├── abstract_graph_store.h ├── abstract_index.h ├── abstract_scratch.h ├── aligned_file_reader.h ├── ann_exception.h ├── any_wrappers.h ├── boost_dynamic_bitset_fwd.h ├── cached_io.h ├── common_includes.h ├── concurrent_queue.h ├── cosine_similarity.h ├── defaults.h ├── disk_utils.h ├── distance.h ├── exceptions.h ├── filter_utils.h ├── in_mem_data_store.h ├── in_mem_graph_store.h ├── index.h ├── index_build_params.h ├── index_config.h ├── index_factory.h ├── linux_aligned_file_reader.h ├── locking.h ├── logger.h ├── logger_impl.h ├── math_utils.h ├── memory_mapper.h ├── natural_number_map.h ├── natural_number_set.h ├── neighbor.h ├── parameters.h ├── partition.h ├── percentile_stats.h ├── pq.h ├── pq_common.h ├── pq_data_store.h ├── pq_flash_index.h ├── pq_l2_distance.h ├── pq_scratch.h ├── program_options_utils.hpp ├── quantized_distance.h ├── restapi │ ├── common.h │ ├── search_wrapper.h │ └── server.h ├── scratch.h ├── simd_utils.h ├── tag_uint128.h ├── timer.h ├── tsl │ ├── .clang-format │ ├── robin_growth_policy.h │ ├── robin_hash.h │ ├── robin_map.h │ ├── robin_set.h │ ├── sparse_growth_policy.h │ ├── sparse_hash.h │ ├── sparse_map.h │ └── sparse_set.h ├── types.h ├── utils.h ├── windows_aligned_file_reader.h ├── windows_customizations.h └── windows_slim_lock.h ├── pyproject.toml ├── python ├── CMakeLists.txt ├── README.md ├── apps │ ├── cli │ │ └── __main__.py │ ├── cluster.py │ ├── in-mem-dynamic.py │ ├── in-mem-static.py │ ├── insert-in-clustered-order.py │ ├── requirements.txt │ └── utils.py ├── include │ ├── builder.h │ ├── common.h │ ├── dynamic_memory_index.h │ ├── static_disk_index.h │ └── static_memory_index.h ├── src │ ├── 
__init__.py │ ├── _builder.py │ ├── _builder.pyi │ ├── _common.py │ ├── _dynamic_memory_index.py │ ├── _files.py │ ├── _static_disk_index.py │ ├── _static_memory_index.py │ ├── builder.cpp │ ├── defaults.py │ ├── dynamic_memory_index.cpp │ ├── module.cpp │ ├── py.typed │ ├── static_disk_index.cpp │ └── static_memory_index.cpp └── tests │ ├── fixtures │ ├── __init__.py │ ├── build_memory_index.py │ ├── create_test_data.py │ └── recall.py │ ├── test_builder.py │ ├── test_dynamic_memory_index.py │ ├── test_files.py │ ├── test_static_disk_index.py │ └── test_static_memory_index.py ├── rust ├── Cargo.lock ├── Cargo.toml ├── cmd_drivers │ ├── build_and_insert_delete_memory_index │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ ├── build_and_insert_memory_index │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ ├── build_disk_index │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ ├── build_memory_index │ │ ├── Cargo.toml │ │ └── src │ │ │ ├── args.rs │ │ │ └── main.rs │ ├── convert_f32_to_bf16 │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ ├── load_and_insert_memory_index │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ └── search_memory_index │ │ ├── Cargo.toml │ │ └── src │ │ ├── main.rs │ │ └── search_index_utils.rs ├── diskann │ ├── Cargo.toml │ ├── benches │ │ ├── distance_bench.rs │ │ ├── kmeans_bench.rs │ │ └── neighbor_bench.rs │ ├── src │ │ ├── algorithm │ │ │ ├── mod.rs │ │ │ ├── prune │ │ │ │ ├── mod.rs │ │ │ │ └── prune.rs │ │ │ └── search │ │ │ │ ├── mod.rs │ │ │ │ └── search.rs │ │ ├── common │ │ │ ├── aligned_allocator.rs │ │ │ ├── ann_result.rs │ │ │ └── mod.rs │ │ ├── index │ │ │ ├── disk_index │ │ │ │ ├── ann_disk_index.rs │ │ │ │ ├── disk_index.rs │ │ │ │ └── mod.rs │ │ │ ├── inmem_index │ │ │ │ ├── ann_inmem_index.rs │ │ │ │ ├── inmem_index.rs │ │ │ │ ├── inmem_index_storage.rs │ │ │ │ └── mod.rs │ │ │ └── mod.rs │ │ ├── instrumentation │ │ │ ├── disk_index_build_logger.rs │ │ │ ├── index_logger.rs │ │ │ └── mod.rs │ │ ├── lib.rs │ │ ├── 
model │ │ │ ├── configuration │ │ │ │ ├── disk_index_build_parameter.rs │ │ │ │ ├── index_configuration.rs │ │ │ │ ├── index_write_parameters.rs │ │ │ │ └── mod.rs │ │ │ ├── data_store │ │ │ │ ├── disk_scratch_dataset.rs │ │ │ │ ├── inmem_dataset.rs │ │ │ │ └── mod.rs │ │ │ ├── graph │ │ │ │ ├── adjacency_list.rs │ │ │ │ ├── disk_graph.rs │ │ │ │ ├── inmem_graph.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── sector_graph.rs │ │ │ │ └── vertex_and_neighbors.rs │ │ │ ├── mod.rs │ │ │ ├── neighbor │ │ │ │ ├── mod.rs │ │ │ │ ├── neighbor.rs │ │ │ │ ├── neighbor_priority_queue.rs │ │ │ │ └── sorted_neighbor_vector.rs │ │ │ ├── pq │ │ │ │ ├── fixed_chunk_pq_table.rs │ │ │ │ ├── mod.rs │ │ │ │ └── pq_construction.rs │ │ │ ├── scratch │ │ │ │ ├── concurrent_queue.rs │ │ │ │ ├── inmem_query_scratch.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── pq_scratch.rs │ │ │ │ ├── scratch_store_manager.rs │ │ │ │ ├── scratch_traits.rs │ │ │ │ ├── ssd_io_context.rs │ │ │ │ ├── ssd_query_scratch.rs │ │ │ │ └── ssd_thread_data.rs │ │ │ ├── vertex │ │ │ │ ├── dimension.rs │ │ │ │ ├── mod.rs │ │ │ │ └── vertex.rs │ │ │ └── windows_aligned_file_reader │ │ │ │ ├── mod.rs │ │ │ │ └── windows_aligned_file_reader.rs │ │ ├── storage │ │ │ ├── disk_graph_storage.rs │ │ │ ├── disk_index_storage.rs │ │ │ ├── mod.rs │ │ │ └── pq_storage.rs │ │ ├── test_utils │ │ │ ├── inmem_index_initialization.rs │ │ │ └── mod.rs │ │ └── utils │ │ │ ├── bit_vec_extension.rs │ │ │ ├── cached_reader.rs │ │ │ ├── cached_writer.rs │ │ │ ├── file_util.rs │ │ │ ├── hashset_u32.rs │ │ │ ├── kmeans.rs │ │ │ ├── math_util.rs │ │ │ ├── mod.rs │ │ │ ├── partition.rs │ │ │ ├── rayon_util.rs │ │ │ ├── timer.rs │ │ │ └── utils.rs │ └── tests │ │ └── data │ │ ├── delete_set_50pts.bin │ │ ├── disk_index_node_data_aligned_reader_truth.bin │ │ ├── disk_index_siftsmall_learn_256pts_R4_L50_A1.2_alligned_reader_test.index │ │ ├── disk_index_siftsmall_learn_256pts_R4_L50_A1.2_disk.index │ │ ├── disk_index_siftsmall_learn_256pts_R4_L50_A1.2_mem.index │ │ ├── 
siftsmall_learn.bin │ │ ├── siftsmall_learn.bin_pq_compressed.bin │ │ ├── siftsmall_learn.bin_pq_pivots.bin │ │ ├── siftsmall_learn_256pts.fbin │ │ ├── siftsmall_learn_256pts_2.fbin │ │ ├── truth_disk_index_siftsmall_learn_256pts_R4_L50_A1.2_disk.index │ │ ├── truth_index_siftsmall_learn_256pts_1+2_R4_L50_A1.2 │ │ ├── truth_index_siftsmall_learn_256pts_1+2_saturated_R4_L50_A1.2 │ │ ├── truth_index_siftsmall_learn_256pts_R4_L50_A1.2 │ │ └── truth_index_siftsmall_learn_256pts_R4_L50_A1.2.data ├── logger │ ├── Cargo.toml │ ├── build.rs │ └── src │ │ ├── error_logger.rs │ │ ├── examples │ │ └── trace_example.rs │ │ ├── indexlog.proto │ │ ├── lib.rs │ │ ├── log_error.rs │ │ ├── message_handler.rs │ │ └── trace_logger.rs ├── platform │ ├── Cargo.toml │ └── src │ │ ├── file_handle.rs │ │ ├── file_io.rs │ │ ├── io_completion_port.rs │ │ ├── lib.rs │ │ └── perf.rs ├── project.code-workspace ├── readme.md ├── rust-toolchain.toml ├── vector │ ├── Cargo.toml │ ├── build.rs │ ├── distance.c │ └── src │ │ ├── distance.rs │ │ ├── distance_test.rs │ │ ├── half.rs │ │ ├── l2_float_distance.rs │ │ ├── lib.rs │ │ ├── metric.rs │ │ ├── test_util.rs │ │ └── utils.rs └── vector_base64 │ ├── Cargo.toml │ └── src │ └── main.rs ├── scripts ├── IndexParser │ ├── BinFileParser.py │ ├── DiskANNIndexParser.py │ ├── parse_common.py │ ├── parse_disk_index.py │ └── parse_pq.py ├── dev │ └── install-dev-deps-ubuntu.bash └── perf │ ├── Dockerfile │ ├── README.md │ └── perf_test.sh ├── setup.py ├── src ├── CMakeLists.txt ├── abstract_data_store.cpp ├── abstract_index.cpp ├── ann_exception.cpp ├── disk_utils.cpp ├── distance.cpp ├── dll │ ├── CMakeLists.txt │ └── dllmain.cpp ├── filter_utils.cpp ├── in_mem_data_store.cpp ├── in_mem_graph_store.cpp ├── index.cpp ├── index_factory.cpp ├── linux_aligned_file_reader.cpp ├── logger.cpp ├── math_utils.cpp ├── memory_mapper.cpp ├── natural_number_map.cpp ├── natural_number_set.cpp ├── partition.cpp ├── pq.cpp ├── pq_data_store.cpp ├── pq_flash_index.cpp ├── 
pq_l2_distance.cpp ├── restapi │ ├── search_wrapper.cpp │ └── server.cpp ├── scratch.cpp ├── utils.cpp └── windows_aligned_file_reader.cpp ├── tests ├── CMakeLists.txt ├── README.md ├── index_write_parameters_builder_tests.cpp └── main.cpp ├── unit_tester.sh ├── windows ├── packages.config.in └── packages_restapi.config.in └── workflows ├── SSD_index.md ├── dynamic_index.md ├── filtered_in_memory.md ├── filtered_ssd_index.md ├── in_memory_index.md ├── python.md └── rest_api.md /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Microsoft 3 | --- 4 | Language: Cpp 5 | SortIncludes: false 6 | ... 7 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.c text 7 | *.h text 8 | 9 | # Declare files that will always have CRLF line endings on checkout. 10 | *.sln text eol=crlf 11 | 12 | # Denote all files that are truly binary and should not be modified. 13 | *.png binary 14 | *.jpg binary 15 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Bug reports help us improve! Thanks for submitting yours! 
4 | title: "[BUG] " 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Expected Behavior 11 | Tell us what should happen 12 | 13 | ## Actual Behavior 14 | Tell us what happens instead 15 | 16 | ## Example Code 17 | Please see [How to create a Minimal, Reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) for some guidance on creating the best possible example of the problem 18 | ```bash 19 | 20 | ``` 21 | 22 | ## Dataset Description 23 | Please tell us about the shape and datatype of your data, (e.g. 128 dimensions, 12.3 billion points, floats) 24 | - Dimensions: 25 | - Number of Points: 26 | - Data type: 27 | 28 | ## Error 29 | ``` 30 | Paste the full error, with any sensitive information minimally redacted and marked $$REDACTED$$ 31 | 32 | ``` 33 | 34 | ## Your Environment 35 | * Operating system (e.g. Windows 11 Pro, Ubuntu 22.04.1 LTS) 36 | * DiskANN version (or commit built from) 37 | 38 | ## Additional Details 39 | Any other contextual information you might feel is important. 40 | 41 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Is your feature request related to a problem? Please describe. 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | ## Describe the solution you'd like 14 | A clear and concise description of what you want to happen. 
15 | 16 | ## Describe alternatives you've considered 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | ## Provide references (if applicable) 20 | If your feature request is related to a published algorithm/idea, please provide links to 21 | any relevant articles or webpages. 22 | 23 | ## Additional context 24 | Add any other context or screenshots about the feature request here. 25 | 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/usage-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Usage Question 3 | about: Ask us a question about DiskANN! 4 | title: "[Question]" 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | This is our forum for asking whatever DiskANN question you'd like! No need to feel shy - we're happy to talk about use cases and optimal tuning strategies! 11 | 12 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 5 | - [ ] Does this PR have a descriptive title that could go in our release notes? 6 | - [ ] Does this PR add any new dependencies? 7 | - [ ] Does this PR modify any existing APIs? 8 | - [ ] Is the change to the API backwards compatible? 9 | - [ ] Should this result in any changes to our documentation, either updating existing docs or adding new ones? 10 | 11 | #### Reference Issues/PRs 12 | 18 | 19 | #### What does this implement/fix? Briefly explain your changes. 20 | 21 | #### Any other comments? 
22 | 23 | -------------------------------------------------------------------------------- /.github/actions/build/action.yml: -------------------------------------------------------------------------------- 1 | name: 'DiskANN Build Bootstrap' 2 | description: 'Prepares DiskANN build environment and executes build' 3 | runs: 4 | using: "composite" 5 | steps: 6 | # ------------ Linux Build --------------- 7 | - name: Prepare and Execute Build 8 | if: ${{ runner.os == 'Linux' }} 9 | run: | 10 | sudo scripts/dev/install-dev-deps-ubuntu.bash 11 | cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DUNIT_TEST=True 12 | cmake --build build -- -j 13 | cmake --install build --prefix="dist" 14 | shell: bash 15 | # ------------ End Linux Build --------------- 16 | # ------------ Windows Build --------------- 17 | - name: Add VisualStudio command line tools into path 18 | if: runner.os == 'Windows' 19 | uses: ilammy/msvc-dev-cmd@v1 20 | - name: Run configure and build for Windows 21 | if: runner.os == 'Windows' 22 | run: | 23 | mkdir build && cd build && cmake .. -DUNIT_TEST=True && msbuild diskann.sln /m /nologo /t:Build /p:Configuration="Release" /property:Platform="x64" -consoleloggerparameters:"ErrorsOnly;Summary" 24 | cd .. 25 | mkdir dist 26 | mklink /j .\dist\bin .\x64\Release\ 27 | shell: cmd 28 | # ------------ End Windows Build --------------- 29 | # ------------ Windows Build With EXEC_ENV_OLS and USE_BING_INFRA --------------- 30 | - name: Add VisualStudio command line tools into path 31 | if: runner.os == 'Windows' 32 | uses: ilammy/msvc-dev-cmd@v1 33 | - name: Run configure and build for Windows with Bing feature flags 34 | if: runner.os == 'Windows' 35 | run: | 36 | mkdir build_bing && cd build_bing && cmake .. -DEXEC_ENV_OLS=1 -DUSE_BING_INFRA=1 -DUNIT_TEST=True && msbuild diskann.sln /m /nologo /t:Build /p:Configuration="Release" /property:Platform="x64" -consoleloggerparameters:"ErrorsOnly;Summary" 37 | cd .. 
38 | shell: cmd 39 | # ------------ End Windows Build --------------- 40 | -------------------------------------------------------------------------------- /.github/actions/format-check/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Checking code formatting...' 2 | description: 'Ensures code complies with code formatting rules' 3 | runs: 4 | using: "composite" 5 | steps: 6 | - name: Checking code formatting... 7 | run: | 8 | sudo apt install clang-format 9 | find include -name '*.h' -type f -print0 | xargs -0 -P 16 /usr/bin/clang-format --Werror --dry-run 10 | find src -name '*.cpp' -type f -print0 | xargs -0 -P 16 /usr/bin/clang-format --Werror --dry-run 11 | find apps -name '*.cpp' -type f -print0 | xargs -0 -P 16 /usr/bin/clang-format --Werror --dry-run 12 | find python -name '*.cpp' -type f -print0 | xargs -0 -P 16 /usr/bin/clang-format --Werror --dry-run 13 | shell: bash 14 | -------------------------------------------------------------------------------- /.github/actions/generate-high-dim-random/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Generating Random Data (Basic)' 2 | description: 'Generates the random data files used in acceptance tests' 3 | runs: 4 | using: "composite" 5 | steps: 6 | - name: Generate Random Data (Basic) 7 | run: | 8 | mkdir data 9 | 10 | echo "Generating random 1020,1024,1536D float and 4096D int8 vectors for index" 11 | dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1020D_5K_norm1.0.bin -D 1020 -N 5000 --norm 1.0 12 | #dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1024D_5K_norm1.0.bin -D 1024 -N 5000 --norm 1.0 13 | dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1536D_5K_norm1.0.bin -D 1536 -N 5000 --norm 1.0 14 | dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_4096D_5K_norm1.0.bin -D 4096 -N 5000 --norm 1.0 15 | 16 | echo 
"Generating random 1020,1024,1536D float and 4096D int8 vectors for query" 17 | dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1020D_1K_norm1.0.bin -D 1020 -N 1000 --norm 1.0 18 | #dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1024D_1K_norm1.0.bin -D 1024 -N 1000 --norm 1.0 19 | dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1536D_1K_norm1.0.bin -D 1536 -N 1000 --norm 1.0 20 | dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_4096D_1K_norm1.0.bin -D 4096 -N 1000 --norm 1.0 21 | 22 | echo "Computing ground truth for 1020,1024,1536D float and 4096D int8 vectors for query" 23 | dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1020D_5K_norm1.0.bin --query_file data/rand_float_1020D_1K_norm1.0.bin --gt_file data/l2_rand_float_1020D_5K_norm1.0_1020D_1K_norm1.0_gt100 --K 100 24 | #dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1024D_5K_norm1.0.bin --query_file data/rand_float_1024D_1K_norm1.0.bin --gt_file data/l2_rand_float_1024D_5K_norm1.0_1024D_1K_norm1.0_gt100 --K 100 25 | dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1536D_5K_norm1.0.bin --query_file data/rand_float_1536D_1K_norm1.0.bin --gt_file data/l2_rand_float_1536D_5K_norm1.0_1536D_1K_norm1.0_gt100 --K 100 26 | dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/rand_int8_4096D_5K_norm1.0.bin --query_file data/rand_int8_4096D_1K_norm1.0.bin --gt_file data/l2_rand_int8_4096D_5K_norm1.0_4096D_1K_norm1.0_gt100 --K 100 27 | 28 | shell: bash 29 | -------------------------------------------------------------------------------- /.github/actions/python-wheel/action.yml: -------------------------------------------------------------------------------- 1 | name: Build Python Wheel 2 | description: Builds a python wheel with cibuildwheel 3 | inputs: 4 | cibw-identifier: 5 | description: 
"CI build wheel identifier to build" 6 | required: true 7 | runs: 8 | using: "composite" 9 | steps: 10 | - uses: actions/setup-python@v3 11 | - name: Install cibuildwheel 12 | run: python -m pip install cibuildwheel==2.11.3 13 | shell: bash 14 | - name: Building Python ${{inputs.cibw-identifier}} Wheel 15 | run: python -m cibuildwheel --output-dir dist 16 | env: 17 | CIBW_BUILD: ${{inputs.cibw-identifier}} 18 | shell: bash 19 | - uses: actions/upload-artifact@v3 20 | with: 21 | name: wheels 22 | path: ./dist/*.whl 23 | -------------------------------------------------------------------------------- /.github/workflows/build-python.yml: -------------------------------------------------------------------------------- 1 | name: DiskANN Build Python Wheel 2 | on: [workflow_call] 3 | jobs: 4 | linux-build: 5 | name: Python - Ubuntu - ${{matrix.cibw-identifier}} 6 | strategy: 7 | fail-fast: false 8 | matrix: 9 | cibw-identifier: ["cp39-manylinux_x86_64", "cp310-manylinux_x86_64", "cp311-manylinux_x86_64"] 10 | runs-on: ubuntu-latest 11 | defaults: 12 | run: 13 | shell: bash 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v3 17 | with: 18 | fetch-depth: 1 19 | - name: Building python wheel ${{matrix.cibw-identifier}} 20 | uses: ./.github/actions/python-wheel 21 | with: 22 | cibw-identifier: ${{matrix.cibw-identifier}} 23 | windows-build: 24 | name: Python - Windows - ${{matrix.cibw-identifier}} 25 | strategy: 26 | fail-fast: false 27 | matrix: 28 | cibw-identifier: ["cp39-win_amd64", "cp310-win_amd64", "cp311-win_amd64"] 29 | runs-on: windows-latest 30 | defaults: 31 | run: 32 | shell: bash 33 | steps: 34 | - name: Checkout repository 35 | uses: actions/checkout@v3 36 | with: 37 | submodules: true 38 | fetch-depth: 1 39 | - name: Building python wheel ${{matrix.cibw-identifier}} 40 | uses: ./.github/actions/python-wheel 41 | with: 42 | cibw-identifier: ${{matrix.cibw-identifier}} 43 | 
-------------------------------------------------------------------------------- /.github/workflows/common.yml: -------------------------------------------------------------------------------- 1 | name: DiskANN Common Checks 2 | # common means common to both pr-test and push-test 3 | on: [workflow_call] 4 | jobs: 5 | formatting-check: 6 | strategy: 7 | fail-fast: true 8 | name: Code Formatting Test 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout repository 12 | uses: actions/checkout@v3 13 | with: 14 | fetch-depth: 1 15 | - name: Checking code formatting... 16 | uses: ./.github/actions/format-check 17 | docker-container-build: 18 | name: Docker Container Build 19 | needs: [formatting-check] 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: Checkout repository 23 | uses: actions/checkout@v3 24 | with: 25 | fetch-depth: 1 26 | - name: Docker build 27 | run: | 28 | docker build . -------------------------------------------------------------------------------- /.github/workflows/perf.yml: -------------------------------------------------------------------------------- 1 | name: DiskANN Nightly Performance Metrics 2 | on: 3 | schedule: 4 | - cron: "41 14 * * *" # 14:41 UTC, 7:41 PDT, 6:41 PST, 20:11 IST 5 | jobs: 6 | perf-test: 7 | name: Run Perf Test from main 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v3 12 | with: 13 | fetch-depth: 1 14 | - name: Build Perf Container 15 | run: | 16 | docker build --build-arg GIT_COMMIT_ISH="$GITHUB_SHA" -t perf -f scripts/perf/Dockerfile scripts 17 | - name: Performance Tests 18 | run: | 19 | mkdir metrics 20 | docker run -v ./metrics:/app/logs perf &> ./metrics/combined_stdouterr.log 21 | - name: Upload Metrics Logs 22 | uses: actions/upload-artifact@v4 23 | with: 24 | name: metrics-${{matrix.os}} 25 | path: | 26 | ./metrics/** 27 | -------------------------------------------------------------------------------- /.github/workflows/pr-test.yml: 
-------------------------------------------------------------------------------- 1 | name: DiskANN Pull Request Build and Test 2 | on: [pull_request] 3 | jobs: 4 | common: 5 | strategy: 6 | fail-fast: true 7 | name: DiskANN Common Build Checks 8 | uses: ./.github/workflows/common.yml 9 | unit-tests: 10 | name: Unit tests 11 | uses: ./.github/workflows/unit-tests.yml 12 | in-mem-pq: 13 | name: In-Memory with PQ 14 | uses: ./.github/workflows/in-mem-pq.yml 15 | in-mem-no-pq: 16 | name: In-Memory without PQ 17 | uses: ./.github/workflows/in-mem-no-pq.yml 18 | disk-pq: 19 | name: Disk with PQ 20 | uses: ./.github/workflows/disk-pq.yml 21 | multi-sector-disk-pq: 22 | name: Multi-sector Disk with PQ 23 | uses: ./.github/workflows/multi-sector-disk-pq.yml 24 | labels: 25 | name: Labels 26 | uses: ./.github/workflows/labels.yml 27 | dynamic: 28 | name: Dynamic 29 | uses: ./.github/workflows/dynamic.yml 30 | dynamic-labels: 31 | name: Dynamic Labels 32 | uses: ./.github/workflows/dynamic-labels.yml 33 | python: 34 | name: Python 35 | uses: ./.github/workflows/build-python.yml 36 | -------------------------------------------------------------------------------- /.github/workflows/push-test.yml: -------------------------------------------------------------------------------- 1 | name: DiskANN Push Build 2 | on: [push] 3 | jobs: 4 | common: 5 | strategy: 6 | fail-fast: true 7 | name: DiskANN Common Build Checks 8 | uses: ./.github/workflows/common.yml 9 | build-documentation: 10 | permissions: 11 | contents: write 12 | strategy: 13 | fail-fast: true 14 | name: DiskANN Build Documentation 15 | uses: ./.github/workflows/build-python-pdoc.yml 16 | build: 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | os: [ ubuntu-latest, windows-2019, windows-latest ] 21 | name: Build for ${{matrix.os}} 22 | runs-on: ${{matrix.os}} 23 | defaults: 24 | run: 25 | shell: bash 26 | steps: 27 | - name: Checkout repository 28 | if: ${{ runner.os == 'Linux' }} 29 | uses: actions/checkout@v3 30 | 
with: 31 | fetch-depth: 1 32 | - name: Checkout repository 33 | if: ${{ runner.os == 'Windows' }} 34 | uses: actions/checkout@v3 35 | with: 36 | fetch-depth: 1 37 | submodules: true 38 | - name: Build diskannpy dependency tree 39 | run: | 40 | pip install diskannpy pipdeptree 41 | echo "dependencies" > dependencies_${{ matrix.os }}.txt 42 | pipdeptree >> dependencies_${{ matrix.os }}.txt 43 | - name: Archive diskannpy dependencies artifact 44 | uses: actions/upload-artifact@v4 45 | with: 46 | name: dependencies_${{ matrix.os }} 47 | path: | 48 | dependencies_${{ matrix.os }}.txt 49 | - name: DiskANN Build CLI Applications 50 | uses: ./.github/actions/build 51 | -------------------------------------------------------------------------------- /.github/workflows/python-release.yml: -------------------------------------------------------------------------------- 1 | name: Build and Release Python Wheels 2 | on: 3 | release: 4 | types: [published] 5 | jobs: 6 | python-release-wheels: 7 | name: Python 8 | uses: ./.github/workflows/build-python.yml 9 | build-documentation: 10 | strategy: 11 | fail-fast: true 12 | name: DiskANN Build Documentation 13 | uses: ./.github/workflows/build-python-pdoc.yml 14 | release: 15 | permissions: 16 | contents: write 17 | runs-on: ubuntu-latest 18 | needs: python-release-wheels 19 | steps: 20 | - uses: actions/download-artifact@v3 21 | with: 22 | name: wheels 23 | path: dist/ 24 | - name: Generate SHA256 files for each wheel 25 | run: | 26 | sha256sum dist/*.whl > checksums.txt 27 | cat checksums.txt 28 | - uses: actions/setup-python@v3 29 | - name: Install twine 30 | run: python -m pip install twine 31 | - name: Publish with twine 32 | env: 33 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 34 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 35 | run: | 36 | twine upload dist/*.whl 37 | - name: Update release with SHA256 and Artifacts 38 | uses: softprops/action-gh-release@v1 39 | with: 40 | token: ${{ secrets.GITHUB_TOKEN }} 41 | files: | 
42 | dist/*.whl 43 | checksums.txt 44 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | on: [workflow_call] 3 | jobs: 4 | acceptance-tests-labels: 5 | name: Unit Tests 6 | strategy: 7 | fail-fast: false 8 | matrix: 9 | os: [ubuntu-latest, windows-2019, windows-latest] 10 | runs-on: ${{matrix.os}} 11 | defaults: 12 | run: 13 | shell: bash 14 | steps: 15 | - name: Checkout repository 16 | if: ${{ runner.os == 'Linux' }} 17 | uses: actions/checkout@v3 18 | with: 19 | fetch-depth: 1 20 | - name: Checkout repository 21 | if: ${{ runner.os == 'Windows' }} 22 | uses: actions/checkout@v3 23 | with: 24 | fetch-depth: 1 25 | submodules: true 26 | - name: DiskANN Build CLI Applications 27 | uses: ./.github/actions/build 28 | 29 | - name: Run Unit Tests 30 | run: | 31 | cd build 32 | ctest -C Release -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "gperftools"] 2 | path = gperftools 3 | url = https://github.com/gperftools/gperftools.git 4 | -------------------------------------------------------------------------------- /AnyBuildLogs/latest.txt: -------------------------------------------------------------------------------- 1 | 20231019-111207-d314f8bf -------------------------------------------------------------------------------- /CMakeSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "x64-Release", 5 | "generator": "Ninja", 6 | "configurationType": "Release", 7 | "inheritEnvironments": [ "msvc_x64" ], 8 | "buildRoot": "${projectDir}\\out\\build\\${name}", 9 | "installRoot": "${projectDir}\\out\\install\\${name}", 10 | "cmakeCommandArgs": "", 11 | "buildCommandArgs": "", 
12 | "ctestCommandArgs": "" 13 | }, 14 | { 15 | "name": "WSL-GCC-Release", 16 | "generator": "Ninja", 17 | "configurationType": "RelWithDebInfo", 18 | "buildRoot": "${projectDir}\\out\\build\\${name}", 19 | "installRoot": "${projectDir}\\out\\install\\${name}", 20 | "cmakeExecutable": "cmake", 21 | "cmakeCommandArgs": "", 22 | "buildCommandArgs": "", 23 | "ctestCommandArgs": "", 24 | "inheritEnvironments": [ "linux_x64" ], 25 | "wslPath": "${defaultWSLPath}" 26 | } 27 | ] 28 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 6 | 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 
10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | #Copyright(c) Microsoft Corporation.All rights reserved. 2 | #Licensed under the MIT license. 3 | 4 | FROM ubuntu:jammy 5 | 6 | RUN apt update 7 | RUN apt install -y software-properties-common 8 | RUN add-apt-repository -y ppa:git-core/ppa 9 | RUN apt update 10 | RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev libcpprest-dev python3.10 11 | 12 | WORKDIR /app 13 | RUN git clone https://github.com/microsoft/DiskANN.git 14 | WORKDIR /app/DiskANN 15 | RUN mkdir build 16 | RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release 17 | RUN cmake --build build -- -j 18 | -------------------------------------------------------------------------------- /DockerfileDev: -------------------------------------------------------------------------------- 1 | #Copyright(c) Microsoft Corporation.All rights reserved. 2 | #Licensed under the MIT license. 3 | 4 | FROM ubuntu:jammy 5 | 6 | RUN apt update 7 | RUN apt install -y software-properties-common 8 | RUN add-apt-repository -y ppa:git-core/ppa 9 | RUN apt update 10 | RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libboost-test-dev libmkl-full-dev libcpprest-dev python3.10 11 | 12 | WORKDIR /app 13 | RUN git clone https://github.com/microsoft/DiskANN.git 14 | WORKDIR /app/DiskANN 15 | RUN mkdir build 16 | RUN cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DUNIT_TEST=True 17 | RUN cmake --build build -- -j 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | DiskANN 2 | 3 | MIT License 4 | 5 | Copyright (c) Microsoft Corporation. 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE 24 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include MANIFEST.in 2 | include *.txt 3 | include *.md 4 | include setup.py 5 | include pyproject.toml 6 | include *.cmake 7 | recursive-include gperftools * 8 | recursive-include include * 9 | recursive-include python * 10 | recursive-include windows * 11 | prune python/tests 12 | recursive-include src * 13 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | This algorithm builds upon [code for NSG](https://github.com/ZJULearning/nsg), commit: 335e8e, licensed under the following terms. 2 | 3 | MIT License 4 | 5 | Copyright (c) 2018 Cong Fu, Changxu Wang, Deng Cai 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /apps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_COMPILE_WARNING_AS_ERROR ON) 6 | 7 | add_executable(build_memory_index build_memory_index.cpp) 8 | target_link_libraries(build_memory_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) 9 | 10 | add_executable(build_stitched_index build_stitched_index.cpp) 11 | target_link_libraries(build_stitched_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) 12 | 13 | add_executable(search_memory_index search_memory_index.cpp) 14 | target_link_libraries(search_memory_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) 15 | 16 | add_executable(build_disk_index build_disk_index.cpp) 17 | target_link_libraries(build_disk_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB} Boost::program_options) 18 | 19 | add_executable(search_disk_index search_disk_index.cpp) 20 | target_link_libraries(search_disk_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) 21 | 22 | add_executable(range_search_disk_index range_search_disk_index.cpp) 23 | target_link_libraries(range_search_disk_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) 24 | 25 | add_executable(test_streaming_scenario test_streaming_scenario.cpp) 26 | 
target_link_libraries(test_streaming_scenario ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) 27 | 28 | add_executable(test_insert_deletes_consolidate test_insert_deletes_consolidate.cpp) 29 | target_link_libraries(test_insert_deletes_consolidate ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) 30 | 31 | if (NOT MSVC) 32 | install(TARGETS build_memory_index 33 | build_stitched_index 34 | search_memory_index 35 | build_disk_index 36 | search_disk_index 37 | range_search_disk_index 38 | test_streaming_scenario 39 | test_insert_deletes_consolidate 40 | RUNTIME 41 | ) 42 | endif() 43 | -------------------------------------------------------------------------------- /apps/python/README.md: -------------------------------------------------------------------------------- 1 | 3 | 4 | # Integration Tests 5 | The following tests use Python to prepare, run, verify, and tear down the rest api services. 6 | 7 | We do make use of the built-in `unittest` library, but only to take advantage of its test reporting. 8 | 9 | These are decidedly **not** _unit_ tests. These are end to end integration tests. 10 | 11 | ## Caveats 12 | This has only been tested or built for Linux, though we have written platform agnostic Python for the smoke test 13 | (e.g. using `os.path.join`) 14 | 15 | It has been tested on Python 3.9 and 3.10, but should work on Python 3.6+. 16 | 17 | ## How to Run 18 | 19 | First, build the DiskANN RestAPI code; see $REPOSITORY_ROOT/workflows/rest_api.md for detailed instructions. 20 | 21 | ```bash 22 | cd apps/python 23 | python3 -m venv venv 24 | source venv/bin/activate 25 | pip install -r requirements.txt 26 | 27 | export DISKANN_BUILD_DIR=/path/to/your/diskann/build 28 | python -m unittest 29 | ``` 30 | 31 | ## Smoke Test Failed, Now What? 32 | The smoke test takes advantage of temporary directories that are only valid during the 33 | lifetime of the test. 
The contents of these directories include: 34 | - Randomized vectors (first in tsv, then bin form) used to build the PQFlashIndex 35 | - The PQFlashIndex files 36 | 37 | It is useful to keep these around. By setting some environment variables, you can control whether an ephemeral, 38 | temporary directory is used (and deleted on test completion), or left as an exercise for the developer to 39 | clean up. 40 | 41 | The valid environment variables are: 42 | - `DISKANN_REST_TEST_WORKING_DIR` (example: `$USER/DiskANNRestTest`) 43 | - If this is specified, it **must exist** and **must be writeable**. Any existing files will be clobbered. 44 | - `DISKANN_REST_SERVER` (example: `http://127.0.0.1:10067`) 45 | - Note that if this is set, no data will be generated, nor will a server be started; it is presumed you have done 46 | all the work in creating and starting the rest server prior to running the test and just submits requests against it. 47 | -------------------------------------------------------------------------------- /apps/python/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | requests 3 | -------------------------------------------------------------------------------- /apps/python/restapi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/apps/python/restapi/__init__.py -------------------------------------------------------------------------------- /apps/restapi/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 
3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | 6 | add_executable(inmem_server inmem_server.cpp) 7 | if(MSVC) 8 | target_link_options(inmem_server PRIVATE /MACHINE:x64) 9 | target_link_libraries(inmem_server debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib Boost::program_options) 10 | target_link_libraries(inmem_server optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib Boost::program_options) 11 | else() 12 | target_link_libraries(inmem_server ${PROJECT_NAME} aio -ltcmalloc -lboost_system -lcrypto -lssl -lcpprest Boost::program_options) 13 | endif() 14 | 15 | add_executable(ssd_server ssd_server.cpp) 16 | if(MSVC) 17 | target_link_options(ssd_server PRIVATE /MACHINE:x64) 18 | target_link_libraries(ssd_server debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib Boost::program_options) 19 | target_link_libraries(ssd_server optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib Boost::program_options) 20 | else() 21 | target_link_libraries(ssd_server ${PROJECT_NAME} aio -ltcmalloc -lboost_system -lcrypto -lssl -lcpprest Boost::program_options) 22 | endif() 23 | 24 | add_executable(multiple_ssdindex_server multiple_ssdindex_server.cpp) 25 | if(MSVC) 26 | target_link_options(multiple_ssdindex_server PRIVATE /MACHINE:x64) 27 | target_link_libraries(multiple_ssdindex_server debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib Boost::program_options) 28 | target_link_libraries(multiple_ssdindex_server optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib Boost::program_options) 29 | else() 30 | target_link_libraries(multiple_ssdindex_server ${PROJECT_NAME} aio -ltcmalloc -lboost_system -lcrypto -lssl -lcpprest Boost::program_options) 31 | endif() 32 | 33 | add_executable(client client.cpp) 34 | if(MSVC) 35 | target_link_options(client PRIVATE /MACHINE:x64) 36 | target_link_libraries(client debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib Boost::program_options) 37 | target_link_libraries(client 
optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib Boost::program_options) 38 | else() 39 | target_link_libraries(client ${PROJECT_NAME} -lboost_system -lcrypto -lssl -lcpprest Boost::program_options) 40 | endif() -------------------------------------------------------------------------------- /apps/utils/bin_to_fvecs.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | // Converts one block of `npts` vectors from .bin layout (packed rows of `ndims` floats) to 8 | // fvecs layout (each row prefixed with its dimension stored as a 4-byte integer). 9 | void block_convert(std::ofstream &writr, std::ifstream &readr, float *read_buf, float *write_buf, uint64_t npts, 10 | uint64_t ndims) 11 | { 12 | readr.read((char *)read_buf, npts * ndims * sizeof(float)); 13 | const uint32_t ndims_u32 = (uint32_t)ndims; 14 | #pragma omp parallel for 15 | for (uint64_t i = 0; i < npts; i++) 16 | { 17 | float *row = write_buf + i * (ndims + 1); 18 | memcpy(row, &ndims_u32, sizeof(uint32_t)); 19 | memcpy(row + 1, read_buf + i * ndims, ndims * sizeof(float)); 20 | } 21 | writr.write((char *)write_buf, npts * (ndims + 1) * sizeof(float)); 22 | } 23 | 24 | int main(int argc, char **argv) 25 | { 26 | if (argc != 3) 27 | { 28 | std::cout << argv[0] << " input_bin output_fvecs" << std::endl; 29 | exit(-1); 30 | } 31 | std::ifstream readr(argv[1], std::ios::binary); 32 | int npts_s32; 33 | int ndims_s32; 34 | readr.read((char *)&npts_s32, sizeof(int32_t)); 35 | readr.read((char *)&ndims_s32, sizeof(int32_t)); 36 | // The stream now sits on the first vector; rewinding would re-read the header as data. 37 | uint64_t npts = (uint64_t)npts_s32; 38 | uint64_t ndims = (uint64_t)ndims_s32; 39 | std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl; 40 | 41 | uint64_t blk_size = 131072; 42 | uint64_t nblks = ROUND_UP(npts, blk_size) / 
blk_size; 43 | std::cout << "# blks: " << nblks << std::endl; 44 | 45 | std::ofstream writr(argv[2], std::ios::binary); 46 | // Buffers only ever hold one block; each output row carries one extra 4-byte slot for the dimension. 47 | float *read_buf = new float[blk_size * ndims]; 48 | float *write_buf = new float[blk_size * (ndims + 1)]; 49 | for (uint64_t i = 0; i < nblks; i++) 50 | { 51 | uint64_t cblk_size = std::min(npts - i * blk_size, blk_size); 52 | block_convert(writr, readr, read_buf, write_buf, cblk_size, ndims); 53 | std::cout << "Block #" << i << " written" << std::endl; 54 | } 55 | 56 | delete[] read_buf; 57 | delete[] write_buf; 58 | 59 | writr.close(); 60 | readr.close(); 61 | } 62 | -------------------------------------------------------------------------------- /apps/utils/bin_to_tsv.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | template 8 | void block_convert(std::ofstream &writer, std::ifstream &reader, T *read_buf, size_t npts, size_t ndims) 9 | { 10 | reader.read((char *)read_buf, npts * ndims * sizeof(T)); 11 | 12 | for (size_t i = 0; i < npts; i++) 13 | { 14 | for (size_t d = 0; d < ndims; d++) 15 | { 16 | writer << +read_buf[d + i * ndims]; // unary + promotes int8_t/uint8_t to int so they print as numbers, not raw characters 17 | if (d < ndims - 1) 18 | writer << "\t"; 19 | else 20 | writer << "\n"; 21 | } 22 | } 23 | } 24 | 25 | int main(int argc, char **argv) 26 | { 27 | if (argc != 4) 28 | { 29 | std::cout << argv[0] << " data_type[float/int8/uint8] input_bin output_tsv" << std::endl; 30 | exit(-1); 31 | } 32 | std::string type_string(argv[1]); 33 | if ((type_string != std::string("float")) && (type_string != std::string("int8")) && 34 | (type_string != std::string("uint8"))) 35 | { 36 | std::cerr << "Error: type not supported. 
Use float/int8/uint8" << std::endl; 37 | exit(-1); 38 | } 39 | 40 | std::ifstream reader(argv[2], std::ios::binary); 41 | uint32_t npts_u32; 42 | uint32_t ndims_u32; 43 | reader.read((char *)&npts_u32, sizeof(uint32_t)); 44 | reader.read((char *)&ndims_u32, sizeof(uint32_t)); 45 | size_t npts = npts_u32; 46 | size_t ndims = ndims_u32; 47 | std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl; 48 | 49 | size_t blk_size = 131072; 50 | size_t nblks = ROUND_UP(npts, blk_size) / blk_size; 51 | 52 | std::ofstream writer(argv[3]); 53 | char *read_buf = new char[blk_size * ndims * 4]; 54 | for (size_t i = 0; i < nblks; i++) 55 | { 56 | size_t cblk_size = std::min(npts - i * blk_size, blk_size); 57 | if (type_string == std::string("float")) 58 | block_convert(writer, reader, (float *)read_buf, cblk_size, ndims); 59 | else if (type_string == std::string("int8")) 60 | block_convert(writer, reader, (int8_t *)read_buf, cblk_size, ndims); 61 | else if (type_string == std::string("uint8")) 62 | block_convert(writer, reader, (uint8_t *)read_buf, cblk_size, ndims); 63 | std::cout << "Block #" << i << " written" << std::endl; 64 | } 65 | 66 | delete[] read_buf; 67 | 68 | writer.close(); 69 | reader.close(); 70 | } 71 | -------------------------------------------------------------------------------- /apps/utils/calculate_recall.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "utils.h" 13 | #include "disk_utils.h" 14 | 15 | int main(int argc, char **argv) 16 | { 17 | if (argc != 4) 18 | { 19 | std::cout << argv[0] << " " << std::endl; 20 | return -1; 21 | } 22 | uint32_t *gold_std = NULL; 23 | float *gs_dist = nullptr; 24 | uint32_t *our_results = NULL; 25 | float *or_dist = nullptr; 26 | size_t points_num, points_num_gs, points_num_or; 27 | size_t dim_gs; 28 | size_t dim_or; 29 | diskann::load_truthset(argv[1], gold_std, gs_dist, points_num_gs, dim_gs); 30 | diskann::load_truthset(argv[2], our_results, or_dist, points_num_or, dim_or); 31 | 32 | if (points_num_gs != points_num_or) 33 | { 34 | std::cout << "Error. Number of queries mismatch in ground truth and " 35 | "our results" 36 | << std::endl; 37 | return -1; 38 | } 39 | points_num = points_num_gs; 40 | 41 | uint32_t recall_at = std::atoi(argv[3]); 42 | 43 | if ((dim_or < recall_at) || (recall_at > dim_gs)) 44 | { 45 | std::cout << "ground truth has size " << dim_gs << "; our set has " << dim_or << " points. Asking for recall " 46 | << recall_at << std::endl; 47 | return -1; 48 | } 49 | std::cout << "Calculating recall@" << recall_at << std::endl; 50 | double recall_val = diskann::calculate_recall((uint32_t)points_num, gold_std, gs_dist, (uint32_t)dim_gs, 51 | our_results, (uint32_t)dim_or, (uint32_t)recall_at); 52 | 53 | // double avg_recall = (recall*1.0)/(points_num*1.0); 54 | std::cout << "Avg. recall@" << recall_at << " is " << recall_val << "\n"; 55 | } 56 | -------------------------------------------------------------------------------- /apps/utils/create_disk_layout.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "utils.h" 12 | #include "disk_utils.h" 13 | #include "cached_io.h" 14 | 15 | template int create_disk_layout(char **argv) 16 | { 17 | std::string base_file(argv[2]); 18 | std::string vamana_file(argv[3]); 19 | std::string output_file(argv[4]); 20 | diskann::create_disk_layout(base_file, vamana_file, output_file); 21 | return 0; 22 | } 23 | 24 | int main(int argc, char **argv) 25 | { 26 | if (argc != 5) 27 | { 28 | std::cout << argv[0] 29 | << " data_type data_bin " 30 | "vamana_index_file output_diskann_index_file" 31 | << std::endl; 32 | exit(-1); 33 | } 34 | 35 | int ret_val = -1; 36 | if (std::string(argv[1]) == std::string("float")) 37 | ret_val = create_disk_layout(argv); 38 | else if (std::string(argv[1]) == std::string("int8")) 39 | ret_val = create_disk_layout(argv); 40 | else if (std::string(argv[1]) == std::string("uint8")) 41 | ret_val = create_disk_layout(argv); 42 | else 43 | { 44 | std::cout << "unsupported type. use int8/uint8/float " << std::endl; 45 | ret_val = -2; 46 | } 47 | return ret_val; 48 | } 49 | -------------------------------------------------------------------------------- /apps/utils/float_bin_to_int8.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ofstream &writer, int8_t *write_buf, std::ifstream &reader, float *read_buf, size_t npts, 8 | size_t ndims, float bias, float scale) 9 | { 10 | reader.read((char *)read_buf, npts * ndims * sizeof(float)); 11 | 12 | for (size_t i = 0; i < npts; i++) 13 | { 14 | for (size_t d = 0; d < ndims; d++) 15 | { 16 | write_buf[d + i * ndims] = (int8_t)((read_buf[d + i * ndims] - bias) * (254.0 / scale)); 17 | } 18 | } 19 | writer.write((char *)write_buf, npts * ndims); 20 | } 21 | 22 | int main(int argc, char **argv) 23 | { 24 | if (argc != 5) 25 | { 26 | std::cout << "Usage: " << argv[0] << " input_bin output_tsv bias scale" << std::endl; 27 | exit(-1); 28 | } 29 | 30 | std::ifstream reader(argv[1], std::ios::binary); 31 | uint32_t npts_u32; 32 | uint32_t ndims_u32; 33 | reader.read((char *)&npts_u32, sizeof(uint32_t)); 34 | reader.read((char *)&ndims_u32, sizeof(uint32_t)); 35 | size_t npts = npts_u32; 36 | size_t ndims = ndims_u32; 37 | std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl; 38 | 39 | size_t blk_size = 131072; 40 | size_t nblks = ROUND_UP(npts, blk_size) / blk_size; 41 | 42 | std::ofstream writer(argv[2], std::ios::binary); 43 | auto read_buf = new float[blk_size * ndims]; 44 | auto write_buf = new int8_t[blk_size * ndims]; 45 | float bias = (float)atof(argv[3]); 46 | float scale = (float)atof(argv[4]); 47 | 48 | writer.write((char *)(&npts_u32), sizeof(uint32_t)); 49 | writer.write((char *)(&ndims_u32), sizeof(uint32_t)); 50 | 51 | for (size_t i = 0; i < nblks; i++) 52 | { 53 | size_t cblk_size = std::min(npts - i * blk_size, blk_size); 54 | block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias, scale); 55 | std::cout << "Block #" << i << " written" << std::endl; 56 | } 57 | 58 | delete[] read_buf; 59 | delete[] write_buf; 60 | 61 | writer.close(); 62 | reader.close(); 63 | } 64 | 
-------------------------------------------------------------------------------- /apps/utils/fvecs_to_bvecs.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ifstream &reader, std::ofstream &writer, float *read_buf, uint8_t *write_buf, size_t npts, 8 | size_t ndims) 9 | { 10 | reader.read((char *)read_buf, npts * (ndims * sizeof(float) + sizeof(uint32_t))); 11 | for (size_t i = 0; i < npts; i++) 12 | { 13 | memcpy(write_buf + i * (ndims + 4), read_buf + i * (ndims + 1), sizeof(uint32_t)); 14 | for (size_t d = 0; d < ndims; d++) 15 | write_buf[i * (ndims + 4) + 4 + d] = (uint8_t)read_buf[i * (ndims + 1) + 1 + d]; 16 | } 17 | writer.write((char *)write_buf, npts * (ndims * 1 + 4)); 18 | } 19 | 20 | int main(int argc, char **argv) 21 | { 22 | if (argc != 3) 23 | { 24 | std::cout << argv[0] << " input_fvecs output_bvecs(uint8)" << std::endl; 25 | exit(-1); 26 | } 27 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 28 | size_t fsize = reader.tellg(); 29 | reader.seekg(0, std::ios::beg); 30 | 31 | uint32_t ndims_u32; 32 | reader.read((char *)&ndims_u32, sizeof(uint32_t)); 33 | reader.seekg(0, std::ios::beg); 34 | size_t ndims = (size_t)ndims_u32; 35 | size_t npts = fsize / ((ndims + 1) * sizeof(float)); 36 | std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl; 37 | 38 | size_t blk_size = 131072; 39 | size_t nblks = ROUND_UP(npts, blk_size) / blk_size; 40 | std::cout << "# blks: " << nblks << std::endl; 41 | std::ofstream writer(argv[2], std::ios::binary); 42 | auto read_buf = new float[std::min(npts, blk_size) * (ndims + 1)]; // only one block is resident at a time 43 | auto write_buf = new uint8_t[std::min(npts, blk_size) * (ndims + 4)]; // sized per block, not per dataset 44 | for (size_t i = 0; i < nblks; i++) 45 | { 46 | size_t cblk_size = std::min(npts - i * blk_size, blk_size); 47 | block_convert(reader, writer, read_buf, 
write_buf, cblk_size, ndims); 48 | std::cout << "Block #" << i << " written" << std::endl; 49 | } 50 | 51 | delete[] read_buf; 52 | delete[] write_buf; 53 | 54 | reader.close(); 55 | writer.close(); 56 | } 57 | -------------------------------------------------------------------------------- /apps/utils/gen_random_slice.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "partition.h" 16 | #include "utils.h" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | template int aux_main(char **argv) 24 | { 25 | std::string base_file(argv[2]); 26 | std::string output_prefix(argv[3]); 27 | float sampling_rate = (float)(std::atof(argv[4])); 28 | gen_random_slice(base_file, output_prefix, sampling_rate); 29 | return 0; 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | if (argc != 5) 35 | { 36 | std::cout << argv[0] 37 | << " data_type [float/int8/uint8] base_bin_file " 38 | "sample_output_prefix sampling_probability" 39 | << std::endl; 40 | exit(-1); 41 | } 42 | 43 | if (std::string(argv[1]) == std::string("float")) 44 | { 45 | aux_main(argv); 46 | } 47 | else if (std::string(argv[1]) == std::string("int8")) 48 | { 49 | aux_main(argv); 50 | } 51 | else if (std::string(argv[1]) == std::string("uint8")) 52 | { 53 | aux_main(argv); 54 | } 55 | else 56 | std::cout << "Unsupported type. Use float/int8/uint8." << std::endl; 57 | return 0; 58 | } 59 | -------------------------------------------------------------------------------- /apps/utils/int8_to_float.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char **argv) 8 | { 9 | if (argc != 3) 10 | { 11 | std::cout << argv[0] << " input_int8_bin output_float_bin" << std::endl; 12 | exit(-1); 13 | } 14 | 15 | int8_t *input; 16 | size_t npts, nd; 17 | diskann::load_bin(argv[1], input, npts, nd); 18 | float *output = new float[npts * nd]; 19 | diskann::convert_types(input, output, npts, nd); 20 | diskann::save_bin(argv[2], output, npts, nd); 21 | delete[] output; 22 | delete[] input; 23 | } 24 | -------------------------------------------------------------------------------- /apps/utils/int8_to_float_scale.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ofstream &writer, float *write_buf, std::ifstream &reader, int8_t *read_buf, size_t npts, 8 | size_t ndims, float bias, float scale) 9 | { 10 | reader.read((char *)read_buf, npts * ndims * sizeof(int8_t)); 11 | 12 | for (size_t i = 0; i < npts; i++) 13 | { 14 | for (size_t d = 0; d < ndims; d++) 15 | { 16 | write_buf[d + i * ndims] = (((float)read_buf[d + i * ndims] - bias) * scale); 17 | } 18 | } 19 | writer.write((char *)write_buf, npts * ndims * sizeof(float)); 20 | } 21 | 22 | int main(int argc, char **argv) 23 | { 24 | if (argc != 5) 25 | { 26 | std::cout << "Usage: " << argv[0] << " input-int8.bin output-float.bin bias scale" << std::endl; 27 | exit(-1); 28 | } 29 | 30 | std::ifstream reader(argv[1], std::ios::binary); 31 | uint32_t npts_u32; 32 | uint32_t ndims_u32; 33 | reader.read((char *)&npts_u32, sizeof(uint32_t)); 34 | reader.read((char *)&ndims_u32, sizeof(uint32_t)); 35 | size_t npts = npts_u32; 36 | size_t ndims = ndims_u32; 37 | std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl; 38 | 39 | size_t blk_size = 131072; 40 | size_t nblks = ROUND_UP(npts, 
blk_size) / blk_size; 41 | 42 | std::ofstream writer(argv[2], std::ios::binary); 43 | auto read_buf = new int8_t[blk_size * ndims]; 44 | auto write_buf = new float[blk_size * ndims]; 45 | float bias = (float)atof(argv[3]); 46 | float scale = (float)atof(argv[4]); 47 | 48 | writer.write((char *)(&npts_u32), sizeof(uint32_t)); 49 | writer.write((char *)(&ndims_u32), sizeof(uint32_t)); 50 | 51 | for (size_t i = 0; i < nblks; i++) 52 | { 53 | size_t cblk_size = std::min(npts - i * blk_size, blk_size); 54 | block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias, scale); 55 | std::cout << "Block #" << i << " written" << std::endl; 56 | } 57 | 58 | delete[] read_buf; 59 | delete[] write_buf; 60 | 61 | writer.close(); 62 | reader.close(); 63 | } 64 | -------------------------------------------------------------------------------- /apps/utils/ivecs_to_bin.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | void block_convert(std::ifstream &reader, std::ofstream &writer, uint32_t *read_buf, uint32_t *write_buf, size_t npts, 8 | size_t ndims) 9 | { 10 | reader.read((char *)read_buf, npts * (ndims * sizeof(uint32_t) + sizeof(uint32_t))); 11 | for (size_t i = 0; i < npts; i++) 12 | { 13 | memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, ndims * sizeof(uint32_t)); 14 | } 15 | writer.write((char *)write_buf, npts * ndims * sizeof(uint32_t)); 16 | } 17 | 18 | int main(int argc, char **argv) 19 | { 20 | if (argc != 3) 21 | { 22 | std::cout << argv[0] << " input_ivecs output_bin" << std::endl; 23 | exit(-1); 24 | } 25 | std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); 26 | size_t fsize = reader.tellg(); 27 | reader.seekg(0, std::ios::beg); 28 | 29 | uint32_t ndims_u32; 30 | reader.read((char *)&ndims_u32, sizeof(uint32_t)); 31 | reader.seekg(0, std::ios::beg); 32 | size_t ndims = (size_t)ndims_u32; 33 | size_t npts = fsize / ((ndims + 1) * sizeof(uint32_t)); 34 | std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl; 35 | 36 | size_t blk_size = 131072; 37 | size_t nblks = ROUND_UP(npts, blk_size) / blk_size; 38 | std::cout << "# blks: " << nblks << std::endl; 39 | std::ofstream writer(argv[2], std::ios::binary); 40 | int npts_s32 = (int)npts; 41 | int ndims_s32 = (int)ndims; 42 | writer.write((char *)&npts_s32, sizeof(int)); 43 | writer.write((char *)&ndims_s32, sizeof(int)); 44 | uint32_t *read_buf = new uint32_t[std::min(npts, blk_size) * (ndims + 1)]; // only one block is resident at a time 45 | uint32_t *write_buf = new uint32_t[std::min(npts, blk_size) * ndims]; // sized per block, not per dataset 46 | for (size_t i = 0; i < nblks; i++) 47 | { 48 | size_t cblk_size = std::min(npts - i * blk_size, blk_size); 49 | block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims); 50 | std::cout << "Block #" << i << " written" << std::endl; 51 | } 52 | 53 | delete[] read_buf; 54 | delete[] write_buf; 55 | 56 | reader.close(); 57 | writer.close(); 58 | } 59 | 
-------------------------------------------------------------------------------- /apps/utils/merge_shards.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "disk_utils.h" 14 | #include "cached_io.h" 15 | #include "utils.h" 16 | 17 | int main(int argc, char **argv) 18 | { 19 | if (argc != 9) 20 | { 21 | std::cout << argv[0] 22 | << " vamana_index_prefix[1] vamana_index_suffix[2] " 23 | "idmaps_prefix[3] " 24 | "idmaps_suffix[4] n_shards[5] max_degree[6] " 25 | "output_vamana_path[7] " 26 | "output_medoids_path[8]" 27 | << std::endl; 28 | exit(-1); 29 | } 30 | 31 | std::string vamana_prefix(argv[1]); 32 | std::string vamana_suffix(argv[2]); 33 | std::string idmaps_prefix(argv[3]); 34 | std::string idmaps_suffix(argv[4]); 35 | uint64_t nshards = (uint64_t)std::atoi(argv[5]); 36 | uint32_t max_degree = (uint64_t)std::atoi(argv[6]); 37 | std::string output_index(argv[7]); 38 | std::string output_medoids(argv[8]); 39 | 40 | return diskann::merge_shards(vamana_prefix, vamana_suffix, idmaps_prefix, idmaps_suffix, nshards, max_degree, 41 | output_index, output_medoids); 42 | } 43 | -------------------------------------------------------------------------------- /apps/utils/partition_data.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include "cached_io.h" 7 | #include "partition.h" 8 | 9 | // DEPRECATED: NEED TO REPROGRAM 10 | 11 | int main(int argc, char **argv) 12 | { 13 | if (argc != 7) 14 | { 15 | std::cout << "Usage:\n" 16 | << argv[0] 17 | << " datatype " 18 | " " 19 | " " 20 | << std::endl; 21 | exit(-1); 22 | } 23 | 24 | const std::string data_path(argv[2]); 25 | const std::string prefix_path(argv[3]); 26 | const float sampling_rate = (float)atof(argv[4]); 27 | const size_t num_partitions = (size_t)std::atoi(argv[5]); 28 | const size_t max_reps = 15; 29 | const size_t k_index = (size_t)std::atoi(argv[6]); 30 | 31 | if (std::string(argv[1]) == std::string("float")) 32 | partition(data_path, sampling_rate, num_partitions, max_reps, prefix_path, k_index); 33 | else if (std::string(argv[1]) == std::string("int8")) 34 | partition(data_path, sampling_rate, num_partitions, max_reps, prefix_path, k_index); 35 | else if (std::string(argv[1]) == std::string("uint8")) 36 | partition(data_path, sampling_rate, num_partitions, max_reps, prefix_path, k_index); 37 | else 38 | std::cout << "unsupported data format. use float/int8/uint8" << std::endl; 39 | } 40 | -------------------------------------------------------------------------------- /apps/utils/partition_with_ram_budget.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | #include "cached_io.h" 7 | #include "partition.h" 8 | 9 | // DEPRECATED: NEED TO REPROGRAM 10 | 11 | int main(int argc, char **argv) 12 | { 13 | if (argc != 8) 14 | { 15 | std::cout << "Usage:\n" 16 | << argv[0] 17 | << " datatype " 18 | " " 19 | " " 20 | << std::endl; 21 | exit(-1); 22 | } 23 | 24 | const std::string data_path(argv[2]); 25 | const std::string prefix_path(argv[3]); 26 | const float sampling_rate = (float)atof(argv[4]); 27 | const double ram_budget = (double)std::atof(argv[5]); 28 | const size_t graph_degree = (size_t)std::atoi(argv[6]); 29 | const size_t k_index = (size_t)std::atoi(argv[7]); 30 | 31 | if (std::string(argv[1]) == std::string("float")) 32 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, graph_degree, prefix_path, k_index); 33 | else if (std::string(argv[1]) == std::string("int8")) 34 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, graph_degree, prefix_path, k_index); 35 | else if (std::string(argv[1]) == std::string("uint8")) 36 | partition_with_ram_budget(data_path, sampling_rate, ram_budget, graph_degree, prefix_path, k_index); 37 | else 38 | std::cout << "unsupported data format. use float/int8/uint8" << std::endl; 39 | } 40 | -------------------------------------------------------------------------------- /apps/utils/uint32_to_uint8.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char **argv) 8 | { 9 | if (argc != 3) 10 | { 11 | std::cout << argv[0] << " input_uint32_bin output_int8_bin" << std::endl; 12 | exit(-1); 13 | } 14 | 15 | uint32_t *input; 16 | size_t npts, nd; 17 | diskann::load_bin(argv[1], input, npts, nd); 18 | uint8_t *output = new uint8_t[npts * nd]; 19 | diskann::convert_types(input, output, npts, nd); 20 | diskann::save_bin(argv[2], output, npts, nd); 21 | delete[] output; 22 | delete[] input; 23 | } 24 | -------------------------------------------------------------------------------- /apps/utils/uint8_to_float.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #include 5 | #include "utils.h" 6 | 7 | int main(int argc, char **argv) 8 | { 9 | if (argc != 3) 10 | { 11 | std::cout << argv[0] << " input_uint8_bin output_float_bin" << std::endl; 12 | exit(-1); 13 | } 14 | 15 | uint8_t *input; 16 | size_t npts, nd; 17 | diskann::load_bin(argv[1], input, npts, nd); 18 | float *output = new float[npts * nd]; 19 | diskann::convert_types(input, output, npts, nd); 20 | diskann::save_bin(argv[2], output, npts, nd); 21 | delete[] output; 22 | delete[] input; 23 | } 24 | -------------------------------------------------------------------------------- /clang-format.cmake: -------------------------------------------------------------------------------- 1 | if (NOT MSVC) 2 | message(STATUS "Setting up `make format` and `make checkformat`") 3 | # additional target to perform clang-format run, requires clang-format 4 | # get all project files 5 | file(GLOB_RECURSE ALL_SOURCE_FILES include/*.h include/*.hpp python/src/*.cpp src/*.cpp src/*.hpp apps/*.cpp apps/*.hpp) 6 | 7 | message(status ${ALL_SOURCE_FILES}) 8 | 9 | add_custom_target( 10 | format 11 | COMMAND /usr/bin/clang-format 12 | -i 13 | ${ALL_SOURCE_FILES} 14 | ) 15 | 
add_custom_target( 16 | checkformat 17 | COMMAND /usr/bin/clang-format 18 | --Werror 19 | --dry-run 20 | ${ALL_SOURCE_FILES} 21 | ) 22 | endif() 23 | -------------------------------------------------------------------------------- /include/abstract_graph_store.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include "types.h" 9 | 10 | namespace diskann 11 | { 12 | 13 | class AbstractGraphStore 14 | { 15 | public: 16 | AbstractGraphStore(const size_t total_pts, const size_t reserve_graph_degree) 17 | : _capacity(total_pts), _reserve_graph_degree(reserve_graph_degree) 18 | { 19 | } 20 | 21 | virtual ~AbstractGraphStore() = default; 22 | 23 | // returns tuple of <nodes_read, start, num_frozen_points> 24 | virtual std::tuple load(const std::string &index_path_prefix, 25 | const size_t num_points) = 0; 26 | virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_fz_points, 27 | const uint32_t start) = 0; 28 | 29 | // Not synchronised; callers should take a lock when necessary. 
30 | virtual const std::vector &get_neighbours(const location_t i) const = 0; 31 | virtual void add_neighbour(const location_t i, location_t neighbour_id) = 0; 32 | virtual void clear_neighbours(const location_t i) = 0; 33 | virtual void swap_neighbours(const location_t a, location_t b) = 0; 34 | 35 | virtual void set_neighbours(const location_t i, std::vector &neighbours) = 0; 36 | 37 | virtual size_t resize_graph(const size_t new_size) = 0; 38 | virtual void clear_graph() = 0; 39 | 40 | virtual uint32_t get_max_observed_degree() = 0; 41 | 42 | // set during load 43 | virtual size_t get_max_range_of_graph() = 0; 44 | 45 | // Total internal points _max_points + _num_frozen_points 46 | size_t get_total_points() 47 | { 48 | return _capacity; 49 | } 50 | 51 | protected: 52 | // Internal function, changes total points when resize_graph is called. 53 | void set_total_points(size_t new_capacity) 54 | { 55 | _capacity = new_capacity; 56 | } 57 | 58 | size_t get_reserve_graph_degree() 59 | { 60 | return _reserve_graph_degree; 61 | } 62 | 63 | private: 64 | size_t _capacity; 65 | size_t _reserve_graph_degree; 66 | }; 67 | 68 | } // namespace diskann -------------------------------------------------------------------------------- /include/abstract_scratch.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | namespace diskann 3 | { 4 | 5 | template class PQScratch; 6 | 7 | // By somewhat more than a coincidence, it seems that both InMemQueryScratch 8 | // and SSDQueryScratch have the aligned query and PQScratch objects. So we 9 | // can put them in a neat hierarchy and keep PQScratch as a standalone class. 10 | template class AbstractScratch 11 | { 12 | public: 13 | AbstractScratch() = default; 14 | // This class does not take any responsibility for memory management of 15 | // its members. It is the responsibility of the derived classes to do so. 
16 | virtual ~AbstractScratch() = default; 17 | 18 | // Scratch objects should not be copied 19 | AbstractScratch(const AbstractScratch &) = delete; 20 | AbstractScratch &operator=(const AbstractScratch &) = delete; 21 | 22 | data_t *aligned_query_T() 23 | { 24 | return _aligned_query_T; 25 | } 26 | PQScratch *pq_scratch() 27 | { 28 | return _pq_scratch; 29 | } 30 | 31 | protected: 32 | data_t *_aligned_query_T = nullptr; 33 | PQScratch *_pq_scratch = nullptr; 34 | }; 35 | } // namespace diskann 36 | -------------------------------------------------------------------------------- /include/ann_exception.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include "windows_customizations.h" 9 | 10 | #ifndef _WINDOWS 11 | #define __FUNCSIG__ __PRETTY_FUNCTION__ 12 | #endif 13 | 14 | namespace diskann 15 | { 16 | 17 | class ANNException : public std::runtime_error 18 | { 19 | public: 20 | DISKANN_DLLEXPORT ANNException(const std::string &message, int errorCode); 21 | DISKANN_DLLEXPORT ANNException(const std::string &message, int errorCode, const std::string &funcSig, 22 | const std::string &fileName, uint32_t lineNum); 23 | 24 | private: 25 | int _errorCode; 26 | }; 27 | 28 | class FileException : public ANNException 29 | { 30 | public: 31 | DISKANN_DLLEXPORT FileException(const std::string &filename, std::system_error &e, const std::string &funcSig, 32 | const std::string &fileName, uint32_t lineNum); 33 | }; 34 | } // namespace diskann 35 | -------------------------------------------------------------------------------- /include/any_wrappers.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "tsl/robin_set.h" 11 | 12 | namespace AnyWrapper 13 | { 14 | 15 | /* 16 | * Base struct that holds a reference to the data. 17 | * Note: no memory management is performed; the caller needs to keep the object alive. 18 | */ 19 | struct AnyReference 20 | { 21 | template AnyReference(Ty &reference) : _data(&reference) 22 | { 23 | } 24 | 25 | template Ty &get() 26 | { 27 | auto ptr = std::any_cast(_data); 28 | return *ptr; 29 | } 30 | 31 | private: 32 | std::any _data; 33 | }; 34 | struct AnyRobinSet : public AnyReference 35 | { 36 | template AnyRobinSet(const tsl::robin_set &robin_set) : AnyReference(robin_set) 37 | { 38 | } 39 | template AnyRobinSet(tsl::robin_set &robin_set) : AnyReference(robin_set) 40 | { 41 | } 42 | }; 43 | 44 | struct AnyVector : public AnyReference 45 | { 46 | template AnyVector(const std::vector &vector) : AnyReference(vector) 47 | { 48 | } 49 | template AnyVector(std::vector &vector) : AnyReference(vector) 50 | { 51 | } 52 | }; 53 | } // namespace AnyWrapper 54 | -------------------------------------------------------------------------------- /include/boost_dynamic_bitset_fwd.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | namespace boost 7 | { 8 | #ifndef BOOST_DYNAMIC_BITSET_FWD_HPP 9 | template > class dynamic_bitset; 10 | #endif 11 | } // namespace boost 12 | -------------------------------------------------------------------------------- /include/common_includes.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | -------------------------------------------------------------------------------- /include/defaults.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | #include 6 | 7 | namespace diskann 8 | { 9 | namespace defaults 10 | { 11 | const float ALPHA = 1.2f; 12 | const uint32_t NUM_THREADS = 0; 13 | const uint32_t MAX_OCCLUSION_SIZE = 750; 14 | const bool HAS_LABELS = false; 15 | const uint32_t FILTER_LIST_SIZE = 0; 16 | const uint32_t NUM_FROZEN_POINTS_STATIC = 0; 17 | const uint32_t NUM_FROZEN_POINTS_DYNAMIC = 1; 18 | 19 | // In-mem index related limits 20 | const float GRAPH_SLACK_FACTOR = 1.3f; 21 | 22 | // SSD Index related limits 23 | const uint64_t MAX_GRAPH_DEGREE = 512; 24 | const uint64_t SECTOR_LEN = 4096; 25 | const uint64_t MAX_N_SECTOR_READS = 128; 26 | 27 | // following constants should always be specified, but are useful as a 28 | // sensible default at cli / python boundaries 29 | const uint32_t MAX_DEGREE = 64; 30 | const uint32_t BUILD_LIST_SIZE = 100; 31 | const uint32_t SATURATE_GRAPH = false; 32 | const uint32_t SEARCH_LIST_SIZE = 100; 33 | } // namespace defaults 34 | } // namespace diskann 35 | -------------------------------------------------------------------------------- /include/exceptions.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | #include 6 | 7 | namespace diskann 8 | { 9 | 10 | class NotImplementedException : public std::logic_error 11 | { 12 | public: 13 | NotImplementedException() : std::logic_error("Function not yet implemented.") 14 | { 15 | } 16 | }; 17 | } // namespace diskann 18 | -------------------------------------------------------------------------------- /include/in_mem_graph_store.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #include "abstract_graph_store.h" 7 | 8 | namespace diskann 9 | { 10 | 11 | class InMemGraphStore : public AbstractGraphStore 12 | { 13 | public: 14 | InMemGraphStore(const size_t total_pts, const size_t reserve_graph_degree); 15 | 16 | // returns tuple of 17 | virtual std::tuple load(const std::string &index_path_prefix, 18 | const size_t num_points) override; 19 | virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_frozen_points, 20 | const uint32_t start) override; 21 | 22 | virtual const std::vector &get_neighbours(const location_t i) const override; 23 | virtual void add_neighbour(const location_t i, location_t neighbour_id) override; 24 | virtual void clear_neighbours(const location_t i) override; 25 | virtual void swap_neighbours(const location_t a, location_t b) override; 26 | 27 | virtual void set_neighbours(const location_t i, std::vector &neighbors) override; 28 | 29 | virtual size_t resize_graph(const size_t new_size) override; 30 | virtual void clear_graph() override; 31 | 32 | virtual size_t get_max_range_of_graph() override; 33 | virtual uint32_t get_max_observed_degree() override; 34 | 35 | protected: 36 | virtual std::tuple load_impl(const std::string &filename, size_t expected_num_points); 37 | #ifdef EXEC_ENV_OLS 38 | virtual std::tuple load_impl(AlignedFileReader &reader, size_t 
expected_num_points); 39 | #endif 40 | 41 | int save_graph(const std::string &index_path_prefix, const size_t active_points, const size_t num_frozen_points, 42 | const uint32_t start); 43 | 44 | private: 45 | size_t _max_range_of_graph = 0; 46 | uint32_t _max_observed_degree = 0; 47 | 48 | std::vector> _graph; 49 | }; 50 | 51 | } // namespace diskann 52 | -------------------------------------------------------------------------------- /include/index_build_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common_includes.h" 4 | #include "parameters.h" 5 | 6 | namespace diskann 7 | { 8 | struct IndexFilterParams 9 | { 10 | public: 11 | std::string save_path_prefix; 12 | std::string label_file; 13 | std::string tags_file; 14 | std::string universal_label; 15 | uint32_t filter_threshold = 0; 16 | 17 | private: 18 | IndexFilterParams(const std::string &save_path_prefix, const std::string &label_file, 19 | const std::string &universal_label, uint32_t filter_threshold) 20 | : save_path_prefix(save_path_prefix), label_file(label_file), universal_label(universal_label), 21 | filter_threshold(filter_threshold) 22 | { 23 | } 24 | 25 | friend class IndexFilterParamsBuilder; 26 | }; 27 | class IndexFilterParamsBuilder 28 | { 29 | public: 30 | IndexFilterParamsBuilder() = default; 31 | 32 | IndexFilterParamsBuilder &with_save_path_prefix(const std::string &save_path_prefix) 33 | { 34 | if (save_path_prefix.empty() || save_path_prefix == "") 35 | throw ANNException("Error: save_path_prefix can't be empty", -1); 36 | this->_save_path_prefix = save_path_prefix; 37 | return *this; 38 | } 39 | 40 | IndexFilterParamsBuilder &with_label_file(const std::string &label_file) 41 | { 42 | this->_label_file = label_file; 43 | return *this; 44 | } 45 | 46 | IndexFilterParamsBuilder &with_universal_label(const std::string &univeral_label) 47 | { 48 | this->_universal_label = univeral_label; 49 | return *this; 50 | } 51 
| 52 | IndexFilterParamsBuilder &with_filter_threshold(const std::uint32_t &filter_threshold) 53 | { 54 | this->_filter_threshold = filter_threshold; 55 | return *this; 56 | } 57 | 58 | IndexFilterParams build() 59 | { 60 | return IndexFilterParams(_save_path_prefix, _label_file, _universal_label, _filter_threshold); 61 | } 62 | 63 | IndexFilterParamsBuilder(const IndexFilterParamsBuilder &) = delete; 64 | IndexFilterParamsBuilder &operator=(const IndexFilterParamsBuilder &) = delete; 65 | 66 | private: 67 | std::string _save_path_prefix; 68 | std::string _label_file; 69 | std::string _tags_file; 70 | std::string _universal_label; 71 | uint32_t _filter_threshold = 0; 72 | }; 73 | } // namespace diskann 74 | -------------------------------------------------------------------------------- /include/index_factory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "index.h" 4 | #include "abstract_graph_store.h" 5 | #include "in_mem_graph_store.h" 6 | #include "pq_data_store.h" 7 | 8 | namespace diskann 9 | { 10 | class IndexFactory 11 | { 12 | public: 13 | DISKANN_DLLEXPORT explicit IndexFactory(const IndexConfig &config); 14 | DISKANN_DLLEXPORT std::unique_ptr create_instance(); 15 | 16 | DISKANN_DLLEXPORT static std::unique_ptr construct_graphstore( 17 | const GraphStoreStrategy stratagy, const size_t size, const size_t reserve_graph_degree); 18 | 19 | template 20 | DISKANN_DLLEXPORT static std::shared_ptr> construct_datastore(DataStoreStrategy stratagy, 21 | size_t num_points, 22 | size_t dimension, Metric m); 23 | // For now PQDataStore incorporates within itself all variants of quantization that we support. In the 24 | // future it may be necessary to introduce an AbstractPQDataStore class to spearate various quantization 25 | // flavours. 
26 | template 27 | DISKANN_DLLEXPORT static std::shared_ptr> construct_pq_datastore(DataStoreStrategy strategy, 28 | size_t num_points, size_t dimension, 29 | Metric m, size_t num_pq_chunks, 30 | bool use_opq); 31 | template static Distance *construct_inmem_distance_fn(Metric m); 32 | 33 | private: 34 | void check_config(); 35 | 36 | template 37 | std::unique_ptr create_instance(); 38 | 39 | std::unique_ptr create_instance(const std::string &data_type, const std::string &tag_type, 40 | const std::string &label_type); 41 | 42 | template 43 | std::unique_ptr create_instance(const std::string &tag_type, const std::string &label_type); 44 | 45 | template 46 | std::unique_ptr create_instance(const std::string &label_type); 47 | 48 | std::unique_ptr _config; 49 | }; 50 | 51 | } // namespace diskann 52 | -------------------------------------------------------------------------------- /include/linux_aligned_file_reader.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | #ifndef _WINDOWS 6 | 7 | #include "aligned_file_reader.h" 8 | 9 | class LinuxAlignedFileReader : public AlignedFileReader 10 | { 11 | private: 12 | uint64_t file_sz; 13 | FileHandle file_desc; 14 | io_context_t bad_ctx = (io_context_t)-1; 15 | 16 | public: 17 | LinuxAlignedFileReader(); 18 | ~LinuxAlignedFileReader(); 19 | 20 | IOContext &get_ctx(); 21 | 22 | // register thread-id for a context 23 | void register_thread(); 24 | 25 | // de-register thread-id for a context 26 | void deregister_thread(); 27 | void deregister_all_threads(); 28 | 29 | // Open & close ops 30 | // Blocking calls 31 | void open(const std::string &fname); 32 | void close(); 33 | 34 | // process batch of aligned requests in parallel 35 | // NOTE :: blocking call 36 | void read(std::vector &read_reqs, IOContext &ctx, bool async = false); 37 | }; 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /include/locking.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | #pragma once 4 | 5 | #include 6 | 7 | #ifdef _WINDOWS 8 | #include "windows_slim_lock.h" 9 | #endif 10 | 11 | namespace diskann 12 | { 13 | #ifdef _WINDOWS 14 | using non_recursive_mutex = windows_exclusive_slim_lock; 15 | using LockGuard = windows_exclusive_slim_lock_guard; 16 | #else 17 | using non_recursive_mutex = std::mutex; 18 | using LockGuard = std::lock_guard; 19 | #endif 20 | } // namespace diskann 21 | -------------------------------------------------------------------------------- /include/logger.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | #pragma once 4 | 5 | #include 6 | #include 7 | #include "windows_customizations.h" 8 | 9 | #ifdef EXEC_ENV_OLS 10 | #ifndef ENABLE_CUSTOM_LOGGER 11 | #define ENABLE_CUSTOM_LOGGER 12 | #endif // !ENABLE_CUSTOM_LOGGER 13 | #endif // EXEC_ENV_OLS 14 | 15 | namespace diskann 16 | { 17 | #ifdef ENABLE_CUSTOM_LOGGER 18 | DISKANN_DLLEXPORT extern std::basic_ostream cout; 19 | DISKANN_DLLEXPORT extern std::basic_ostream cerr; 20 | #else 21 | using std::cerr; 22 | using std::cout; 23 | #endif 24 | 25 | enum class DISKANN_DLLEXPORT LogLevel 26 | { 27 | LL_Info = 0, 28 | LL_Error, 29 | LL_Count 30 | }; 31 | 32 | #ifdef ENABLE_CUSTOM_LOGGER 33 | DISKANN_DLLEXPORT void SetCustomLogger(std::function logger); 34 | #endif 35 | } // namespace diskann 36 | -------------------------------------------------------------------------------- /include/logger_impl.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #include "ann_exception.h" 10 | #include "logger.h" 11 | 12 | namespace diskann 13 | { 14 | #ifdef ENABLE_CUSTOM_LOGGER 15 | class ANNStreamBuf : public std::basic_streambuf 16 | { 17 | public: 18 | DISKANN_DLLEXPORT explicit ANNStreamBuf(FILE *fp); 19 | DISKANN_DLLEXPORT ~ANNStreamBuf(); 20 | 21 | DISKANN_DLLEXPORT bool is_open() const 22 | { 23 | return true; // because stdout and stderr are always open. 24 | } 25 | DISKANN_DLLEXPORT void close(); 26 | DISKANN_DLLEXPORT virtual int underflow(); 27 | DISKANN_DLLEXPORT virtual int overflow(int c); 28 | DISKANN_DLLEXPORT virtual int sync(); 29 | 30 | private: 31 | FILE *_fp; 32 | char *_buf; 33 | int _bufIndex; 34 | std::mutex _mutex; 35 | LogLevel _logLevel; 36 | 37 | int flush(); 38 | void logImpl(char *str, int numchars); 39 | 40 | // Why the two buffer-sizes? 
If we are running normally, we are basically 41 | // interacting with a character output system, so we short-circuit the 42 | // output process by keeping an empty buffer and writing each character 43 | // to stdout/stderr. But if we are running in OLS, we have to take all 44 | // the text that is written to diskann::cout/diskann:cerr, consolidate it 45 | // and push it out in one-shot, because the OLS infra does not give us 46 | // character based output. Therefore, we use a larger buffer that is large 47 | // enough to store the longest message, and continuously add characters 48 | // to it. When the calling code outputs a std::endl or std::flush, sync() 49 | // will be called and will output a log level, component name, and the text 50 | // that has been collected. (sync() is also called if the buffer is full, so 51 | // overflows/missing text are not a concern). 52 | // This implies calling code _must_ either print std::endl or std::flush 53 | // to ensure that the message is written immediately. 54 | 55 | static const int BUFFER_SIZE = 1024; 56 | 57 | ANNStreamBuf(const ANNStreamBuf &); 58 | ANNStreamBuf &operator=(const ANNStreamBuf &); 59 | }; 60 | #endif 61 | } // namespace diskann 62 | -------------------------------------------------------------------------------- /include/memory_mapper.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #ifndef _WINDOWS 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #else 14 | #include 15 | #endif 16 | #include 17 | 18 | namespace diskann 19 | { 20 | class MemoryMapper 21 | { 22 | private: 23 | #ifndef _WINDOWS 24 | int _fd; 25 | #else 26 | HANDLE _bareFile; 27 | HANDLE _fd; 28 | 29 | #endif 30 | char *_buf; 31 | size_t _fileSize; 32 | const char *_fileName; 33 | 34 | public: 35 | MemoryMapper(const char *filename); 36 | MemoryMapper(const std::string &filename); 37 | 38 | char *getBuf(); 39 | size_t getFileSize(); 40 | 41 | ~MemoryMapper(); 42 | }; 43 | } // namespace diskann 44 | -------------------------------------------------------------------------------- /include/natural_number_set.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #include "boost_dynamic_bitset_fwd.h" 10 | 11 | namespace diskann 12 | { 13 | // A set of natural numbers (from 0 onwards). Made for scenario where the 14 | // pool of numbers is consecutive from zero to some max value and very 15 | // efficient methods for "add to set", "get any value from set", "is in set" 16 | // are needed. The memory usage of the set is determined by the largest 17 | // number of inserted entries (uses a vector as a backing store) as well as 18 | // the largest value to be placed in it (uses bitset as well). 19 | // 20 | // Thread-safety: this class is not thread-safe in general. 21 | // Exception: multiple read-only operations (e.g. is_in_set, empty, size) are 22 | // safe on the object only if there are no writers to it in parallel. 
23 | template class natural_number_set 24 | { 25 | public: 26 | static_assert(std::is_trivial::value, "Identifier must be a trivial type"); 27 | 28 | natural_number_set(); 29 | 30 | bool is_empty() const; 31 | void reserve(size_t count); 32 | void insert(T id); 33 | T pop_any(); 34 | void clear(); 35 | size_t size() const; 36 | bool is_in_set(T id) const; 37 | 38 | private: 39 | // Values that are currently in set. 40 | std::vector _values_vector; 41 | 42 | // Values that are in the set have the corresponding bit index set 43 | // to 1. 44 | // 45 | // Use a pointer here to allow for forward declaration of dynamic_bitset 46 | // in public headers to avoid making boost a dependency for clients 47 | // of DiskANN. 48 | std::unique_ptr> _values_bitset; 49 | }; 50 | } // namespace diskann 51 | -------------------------------------------------------------------------------- /include/partition.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "neighbor.h" 12 | #include "parameters.h" 13 | #include "tsl/robin_set.h" 14 | #include "utils.h" 15 | 16 | #include "windows_customizations.h" 17 | 18 | template 19 | void gen_random_slice(const std::string base_file, const std::string output_prefix, double sampling_rate); 20 | 21 | template 22 | void gen_random_slice(const std::string data_file, double p_val, float *&sampled_data, size_t &slice_size, 23 | size_t &ndims); 24 | 25 | template 26 | void gen_random_slice(const T *inputdata, size_t npts, size_t ndims, double p_val, float *&sampled_data, 27 | size_t &slice_size); 28 | 29 | int estimate_cluster_sizes(float *test_data_float, size_t num_test, float *pivots, const size_t num_centers, 30 | const size_t dim, const size_t k_base, std::vector &cluster_sizes); 31 | 32 | template 33 | int shard_data_into_clusters(const std::string data_file, float *pivots, const size_t num_centers, const size_t dim, 34 | const size_t k_base, std::string prefix_path); 35 | 36 | template 37 | int shard_data_into_clusters_only_ids(const std::string data_file, float *pivots, const size_t num_centers, 38 | const size_t dim, const size_t k_base, std::string prefix_path); 39 | 40 | template 41 | int retrieve_shard_data_from_ids(const std::string data_file, std::string idmap_filename, std::string data_filename); 42 | 43 | template 44 | int partition(const std::string data_file, const float sampling_rate, size_t num_centers, size_t max_k_means_reps, 45 | const std::string prefix_path, size_t k_base); 46 | 47 | template 48 | int partition_with_ram_budget(const std::string data_file, const double sampling_rate, double ram_budget, 49 | size_t graph_degree, const std::string prefix_path, size_t k_base); 50 | -------------------------------------------------------------------------------- /include/percentile_stats.h: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #ifdef _WINDOWS 11 | #include 12 | #endif 13 | #include 14 | #include 15 | 16 | #include "distance.h" 17 | #include "parameters.h" 18 | 19 | namespace diskann 20 | { 21 | struct QueryStats 22 | { 23 | float total_us = 0; // total time to process query in micros 24 | float io_us = 0; // total time spent in IO 25 | float cpu_us = 0; // total time spent in CPU 26 | 27 | unsigned n_4k = 0; // # of 4kB reads 28 | unsigned n_8k = 0; // # of 8kB reads 29 | unsigned n_12k = 0; // # of 12kB reads 30 | unsigned n_ios = 0; // total # of IOs issued 31 | unsigned read_size = 0; // total # of bytes read 32 | unsigned n_cmps_saved = 0; // # cmps saved 33 | unsigned n_cmps = 0; // # cmps 34 | unsigned n_cache_hits = 0; // # cache_hits 35 | unsigned n_hops = 0; // # search hops 36 | }; 37 | 38 | template 39 | inline T get_percentile_stats(QueryStats *stats, uint64_t len, float percentile, 40 | const std::function &member_fn) 41 | { 42 | // Nothing to rank: return a value-initialized T instead of indexing an empty vector. 43 | if (len == 0) 44 | { 45 | return T(); 46 | } 47 | 48 | std::vector vals(len); 49 | for (uint64_t i = 0; i < len; i++) 50 | { 51 | vals[i] = member_fn(stats[i]); 52 | } 53 | 54 | std::sort(vals.begin(), vals.end(), [](const T &left, const T &right) { return left < right; }); 55 | 56 | // Clamp the rank so percentile == 1.0 (or float rounding) selects the largest value instead of reading past the end. 57 | uint64_t idx = (uint64_t)(percentile * len); 58 | if (idx >= len) 59 | { 60 | idx = len - 1; 61 | } 62 | auto retval = vals[idx]; 63 | vals.clear(); 64 | return retval; 65 | } 66 | 67 | template 68 | inline double get_mean_stats(QueryStats *stats, uint64_t len, const std::function &member_fn) 69 | { 70 | double avg = 0; 71 | for (uint64_t i = 0; i < len; i++) 72 | { 73 | avg += (double)member_fn(stats[i]); 74 | } 75 | return avg / len; 76 | } 77 | } // namespace diskann 78 | -------------------------------------------------------------------------------- /include/pq_common.h:
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #define NUM_PQ_BITS 8 7 | #define NUM_PQ_CENTROIDS (1 << NUM_PQ_BITS) 8 | #define MAX_OPQ_ITERS 20 9 | #define NUM_KMEANS_REPS_PQ 12 10 | #define MAX_PQ_TRAINING_SET_SIZE 256000 11 | #define MAX_PQ_CHUNKS 512 12 | 13 | namespace diskann 14 | { 15 | inline std::string get_quantized_vectors_filename(const std::string &prefix, bool use_opq, uint32_t num_chunks) 16 | { 17 | return prefix + (use_opq ? "_opq" : "pq") + std::to_string(num_chunks) + "_compressed.bin"; 18 | } 19 | 20 | inline std::string get_pivot_data_filename(const std::string &prefix, bool use_opq, uint32_t num_chunks) 21 | { 22 | return prefix + (use_opq ? "_opq" : "pq") + std::to_string(num_chunks) + "_pivots.bin"; 23 | } 24 | 25 | inline std::string get_rotation_matrix_suffix(const std::string &pivot_data_filename) 26 | { 27 | return pivot_data_filename + "_rotation_matrix.bin"; 28 | } 29 | 30 | } // namespace diskann 31 | -------------------------------------------------------------------------------- /include/pq_scratch.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "pq_common.h" 4 | #include "utils.h" 5 | 6 | namespace diskann 7 | { 8 | 9 | template class PQScratch 10 | { 11 | public: 12 | float *aligned_pqtable_dist_scratch = nullptr; // MUST BE AT LEAST [256 * NCHUNKS] 13 | float *aligned_dist_scratch = nullptr; // MUST BE AT LEAST diskann MAX_DEGREE 14 | uint8_t *aligned_pq_coord_scratch = nullptr; // AT LEAST [N_CHUNKS * MAX_DEGREE] 15 | float *rotated_query = nullptr; 16 | float *aligned_query_float = nullptr; 17 | 18 | PQScratch(size_t graph_degree, size_t aligned_dim); 19 | void initialize(size_t dim, const T *query, const float norm = 1.0f); 20 | virtual ~PQScratch(); 21 | }; 22 | 23 | } // namespace diskann 
-------------------------------------------------------------------------------- /include/restapi/common.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | namespace diskann 10 | { 11 | // Constants 12 | static const std::string VECTOR_KEY = "query", K_KEY = "k", INDICES_KEY = "indices", DISTANCES_KEY = "distances", 13 | TAGS_KEY = "tags", QUERY_ID_KEY = "query_id", ERROR_MESSAGE_KEY = "error", L_KEY = "Ls", 14 | TIME_TAKEN_KEY = "time_taken_in_us", PARTITION_KEY = "partition", 15 | UNKNOWN_ERROR = "unknown_error"; 16 | const unsigned int DEFAULT_L = 100; 17 | 18 | } // namespace diskann -------------------------------------------------------------------------------- /include/restapi/server.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | namespace diskann 10 | { 11 | class Server 12 | { 13 | public: 14 | Server(web::uri &url, std::vector> &multi_searcher, 15 | const std::string &typestring); 16 | virtual ~Server(); 17 | 18 | pplx::task open(); 19 | pplx::task close(); 20 | 21 | protected: 22 | template void handle_post(web::http::http_request message); 23 | 24 | template 25 | web::json::value toJsonArray(const std::vector &v, std::function valConverter); 26 | web::json::value prepareResponse(const int64_t &queryId, const int k); 27 | 28 | template 29 | void parseJson(const utility::string_t &body, unsigned int &k, int64_t &queryId, T *&queryVector, 30 | unsigned int &dimensions, unsigned &Ls); 31 | 32 | web::json::value idsToJsonArray(const diskann::SearchResult &result); 33 | web::json::value distancesToJsonArray(const diskann::SearchResult &result); 34 | web::json::value tagsToJsonArray(const diskann::SearchResult &result); 35 | web::json::value partitionsToJsonArray(const diskann::SearchResult &result); 36 | 37 | SearchResult aggregate_results(const unsigned K, const std::vector &results); 38 | 39 | private: 40 | bool _isDebug; 41 | std::unique_ptr _listener; 42 | const bool _multi_search; 43 | std::vector> _multi_searcher; 44 | }; 45 | } // namespace diskann 46 | -------------------------------------------------------------------------------- /include/tag_uint128.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace diskann 6 | { 7 | #pragma pack(push, 1) 8 | 9 | struct tag_uint128 10 | { 11 | std::uint64_t _data1 = 0; 12 | std::uint64_t _data2 = 0; 13 | 14 | bool operator==(const tag_uint128 &other) const 15 | { 16 | return _data1 == other._data1 && _data2 == other._data2; 17 | } 18 | 19 | bool operator==(std::uint64_t other) const 20 | { 21 | return _data1 == other && _data2 == 0; 22 | } 23 | 24 | tag_uint128 &operator=(const tag_uint128 &other) 
25 | { 26 | _data1 = other._data1; 27 | _data2 = other._data2; 28 | 29 | return *this; 30 | } 31 | 32 | tag_uint128 &operator=(std::uint64_t other) 33 | { 34 | _data1 = other; 35 | _data2 = 0; 36 | 37 | return *this; 38 | } 39 | }; 40 | 41 | #pragma pack(pop) 42 | } // namespace diskann 43 | 44 | namespace std 45 | { 46 | // Hash 128 input bits down to 64 bits of output. 47 | // This is intended to be a reasonably good hash function. 48 | inline std::uint64_t Hash128to64(const std::uint64_t &low, const std::uint64_t &high) 49 | { 50 | // Murmur-inspired hashing. 51 | const std::uint64_t kMul = 0x9ddfea08eb382d69ULL; 52 | std::uint64_t a = (low ^ high) * kMul; 53 | a ^= (a >> 47); 54 | std::uint64_t b = (high ^ a) * kMul; 55 | b ^= (b >> 47); 56 | b *= kMul; 57 | return b; 58 | } 59 | 60 | template <> struct hash 61 | { 62 | size_t operator()(const diskann::tag_uint128 &key) const noexcept 63 | { 64 | return Hash128to64(key._data1, key._data2); // map -0 to 0 65 | } 66 | }; 67 | 68 | } // namespace std -------------------------------------------------------------------------------- /include/timer.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | #pragma once 4 | 5 | #include 6 | 7 | namespace diskann 8 | { 9 | class Timer 10 | { 11 | typedef std::chrono::high_resolution_clock _clock; 12 | std::chrono::time_point<_clock> check_point; 13 | 14 | public: 15 | Timer() : check_point(_clock::now()) 16 | { 17 | } 18 | 19 | void reset() 20 | { 21 | check_point = _clock::now(); 22 | } 23 | 24 | long long elapsed() const 25 | { 26 | return std::chrono::duration_cast(_clock::now() - check_point).count(); 27 | } 28 | 29 | float elapsed_seconds() const 30 | { 31 | return (float)elapsed() / 1000000.0f; 32 | } 33 | 34 | std::string elapsed_seconds_for_step(const std::string &step) const 35 | { 36 | return std::string("Time for ") + step + std::string(": ") + std::to_string(elapsed_seconds()) + 37 | std::string(" seconds"); 38 | } 39 | }; 40 | } // namespace diskann 41 | -------------------------------------------------------------------------------- /include/tsl/.clang-format: -------------------------------------------------------------------------------- 1 | DisableFormat: true 2 | SortIncludes: false 3 | -------------------------------------------------------------------------------- /include/types.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include "any_wrappers.h" 10 | 11 | namespace diskann 12 | { 13 | typedef uint32_t location_t; 14 | 15 | using DataType = std::any; 16 | using TagType = std::any; 17 | using LabelType = std::any; 18 | using TagVector = AnyWrapper::AnyVector; 19 | using DataVector = AnyWrapper::AnyVector; 20 | using Labelvector = AnyWrapper::AnyVector; 21 | using TagRobinSet = AnyWrapper::AnyRobinSet; 22 | } // namespace diskann 23 | -------------------------------------------------------------------------------- /include/windows_aligned_file_reader.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | #ifdef _WINDOWS 6 | #ifndef USE_BING_INFRA 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | #include "aligned_file_reader.h" 16 | #include "tsl/robin_map.h" 17 | #include "utils.h" 18 | #include "windows_customizations.h" 19 | 20 | class WindowsAlignedFileReader : public AlignedFileReader 21 | { 22 | private: 23 | #ifdef UNICODE 24 | std::wstring m_filename; 25 | #else 26 | std::string m_filename; 27 | #endif 28 | 29 | protected: 30 | // virtual IOContext createContext(); 31 | 32 | public: 33 | DISKANN_DLLEXPORT WindowsAlignedFileReader(){}; 34 | DISKANN_DLLEXPORT virtual ~WindowsAlignedFileReader(){}; 35 | 36 | // Open & close ops 37 | // Blocking calls 38 | DISKANN_DLLEXPORT virtual void open(const std::string &fname) override; 39 | DISKANN_DLLEXPORT virtual void close() override; 40 | 41 | DISKANN_DLLEXPORT virtual void register_thread() override; 42 | DISKANN_DLLEXPORT virtual void deregister_thread() override 43 | { 44 | // TODO: Needs implementation. 45 | } 46 | DISKANN_DLLEXPORT virtual void deregister_all_threads() override 47 | { 48 | // TODO: Needs implementation. 
49 | } 50 | DISKANN_DLLEXPORT virtual IOContext &get_ctx() override; 51 | 52 | // process batch of aligned requests in parallel 53 | // NOTE :: blocking call for the calling thread, but can thread-safe 54 | DISKANN_DLLEXPORT virtual void read(std::vector &read_reqs, IOContext &ctx, bool async) override; 55 | }; 56 | #endif // USE_BING_INFRA 57 | #endif //_WINDOWS 58 | -------------------------------------------------------------------------------- /include/windows_customizations.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #ifdef _WINDOWS 7 | 8 | #ifdef _WINDLL 9 | #define DISKANN_DLLEXPORT __declspec(dllexport) 10 | #else 11 | #define DISKANN_DLLEXPORT __declspec(dllimport) 12 | #endif 13 | 14 | #else 15 | #define DISKANN_DLLEXPORT 16 | #endif 17 | -------------------------------------------------------------------------------- /include/windows_slim_lock.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | #pragma once 4 | 5 | #ifndef WIN32_LEAN_AND_MEAN 6 | #define WIN32_LEAN_AND_MEAN 7 | #endif 8 | #include "Windows.h" 9 | 10 | namespace diskann 11 | { 12 | // A thin C++ wrapper around Windows exclusive functionality of Windows 13 | // SlimReaderWriterLock. 14 | // 15 | // The SlimReaderWriterLock is simpler/more lightweight than std::mutex 16 | // (8 bytes vs 80 bytes), which is useful in the scenario where DiskANN has 17 | // one lock per vector in the index. It does not support recursive locking and 18 | // requires Windows Vista or later. 19 | // 20 | // Full documentation can be found at. 
21 | // https://msdn.microsoft.com/en-us/library/windows/desktop/aa904937(v=vs.85).aspx 22 | class windows_exclusive_slim_lock 23 | { 24 | public: 25 | windows_exclusive_slim_lock() : _lock(SRWLOCK_INIT) 26 | { 27 | } 28 | 29 | // The lock is non-copyable. This also disables move constructor/operator=. 30 | windows_exclusive_slim_lock(const windows_exclusive_slim_lock &) = delete; 31 | windows_exclusive_slim_lock &operator=(const windows_exclusive_slim_lock &) = delete; 32 | 33 | void lock() 34 | { 35 | return AcquireSRWLockExclusive(&_lock); 36 | } 37 | 38 | bool try_lock() 39 | { 40 | return TryAcquireSRWLockExclusive(&_lock) != FALSE; 41 | } 42 | 43 | void unlock() 44 | { 45 | return ReleaseSRWLockExclusive(&_lock); 46 | } 47 | 48 | private: 49 | SRWLOCK _lock; 50 | }; 51 | 52 | // An exclusive lock over a SlimReaderWriterLock. 53 | class windows_exclusive_slim_lock_guard 54 | { 55 | public: 56 | windows_exclusive_slim_lock_guard(windows_exclusive_slim_lock &p_lock) : _lock(p_lock) 57 | { 58 | _lock.lock(); 59 | } 60 | 61 | // The lock is non-copyable. This also disables move constructor/operator=. 62 | windows_exclusive_slim_lock_guard(const windows_exclusive_slim_lock_guard &) = delete; 63 | windows_exclusive_slim_lock_guard &operator=(const windows_exclusive_slim_lock_guard &) = delete; 64 | 65 | ~windows_exclusive_slim_lock_guard() 66 | { 67 | _lock.unlock(); 68 | } 69 | 70 | private: 71 | windows_exclusive_slim_lock &_lock; 72 | }; 73 | } // namespace diskann 74 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=59.6", 4 | "pybind11>=2.10.0", 5 | "cmake>=3.22", 6 | "numpy==1.25", # this is important to keep fixed. 
It also means anyone using something other than 1.25 won't be able to use this library 7 | "wheel", 8 | "ninja" 9 | ] 10 | build-backend = "setuptools.build_meta" 11 | 12 | [project] 13 | name = "diskannpy" 14 | version = "0.7.1" 15 | 16 | description = "DiskANN Python extension module" 17 | readme = "python/README.md" 18 | requires-python = ">=3.9" 19 | license = {text = "MIT License"} 20 | dependencies = [ 21 | "numpy==1.25" 22 | ] 23 | authors = [ 24 | {name = "Harsha Vardhan Simhadri", email = "harshasi@microsoft.com"}, 25 | {name = "Dax Pryce", email = "daxpryce@microsoft.com"} 26 | ] 27 | 28 | [project.optional-dependencies] 29 | dev = ["black", "isort", "mypy"] 30 | 31 | [tool.setuptools] 32 | package-dir = {"" = "python/src"} 33 | 34 | [tool.isort] 35 | profile = "black" 36 | multi_line_output = 3 37 | 38 | [tool.mypy] 39 | plugins = "numpy.typing.mypy_plugin" 40 | 41 | [tool.cibuildwheel] 42 | manylinux-x86_64-image = "manylinux_2_28" 43 | test-requires = ["scikit-learn~=1.2"] 44 | build-frontend = "build" 45 | skip = ["pp*", "*-win32", "*-manylinux_i686", "*-musllinux*"] 46 | test-command = "python -m unittest discover {project}/python/tests" 47 | 48 | [tool.cibuildwheel.linux] 49 | before-build = [ 50 | "rpm --import https://repo.almalinux.org/almalinux/RPM-GPG-KEY-AlmaLinux", 51 | "dnf makecache --refresh", 52 | "dnf upgrade -y almalinux-release", 53 | "dnf install -y epel-release", 54 | "dnf config-manager -y --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo", 55 | "rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB", 56 | "dnf makecache --refresh -y", 57 | "dnf install -y wget make cmake gcc-c++ libaio-devel gperftools-libs libunwind-devel clang-tools-extra boost-devel boost-program-options intel-mkl-2020.4-912" 58 | ] 59 | -------------------------------------------------------------------------------- /python/apps/cluster.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | import argparse 5 | import utils 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser( 10 | prog="cluster", description="kmeans cluster points in a file" 11 | ) 12 | 13 | parser.add_argument("-d", "--data_type", required=True) 14 | parser.add_argument("-i", "--indexdata_file", required=True) 15 | parser.add_argument("-k", "--num_clusters", type=int, required=True) 16 | args = parser.parse_args() 17 | 18 | npts, ndims = get_bin_metadata(indexdata_file) 19 | 20 | data = utils.bin_to_numpy(args.data_type, args.indexdata_file) 21 | 22 | offsets, permutation = utils.cluster_and_permute( 23 | args.data_type, npts, ndims, data, args.num_clusters 24 | ) 25 | 26 | permuted_data = data[permutation] 27 | 28 | utils.numpy_to_bin(permuted_data, args.indexdata_file + ".cluster") 29 | -------------------------------------------------------------------------------- /python/apps/requirements.txt: -------------------------------------------------------------------------------- 1 | diskannpy 2 | fire 3 | -------------------------------------------------------------------------------- /python/include/builder.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #include "common.h" 10 | #include "distance.h" 11 | 12 | namespace diskannpy 13 | { 14 | template 15 | void build_disk_index(diskann::Metric metric, const std::string &data_file_path, const std::string &index_prefix_path, 16 | uint32_t complexity, uint32_t graph_degree, double final_index_ram_limit, 17 | double indexing_ram_budget, uint32_t num_threads, uint32_t pq_disk_bytes); 18 | 19 | template 20 | void build_memory_index(diskann::Metric metric, const std::string &vector_bin_path, 21 | const std::string &index_output_path, uint32_t graph_degree, uint32_t complexity, 22 | float alpha, uint32_t num_threads, bool use_pq_build, 23 | size_t num_pq_bytes, bool use_opq, bool use_tags = false, 24 | const std::string& filter_labels_file = "", const std::string& universal_label = "", 25 | uint32_t filter_complexity = 0); 26 | 27 | } 28 | -------------------------------------------------------------------------------- /python/include/common.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | namespace py = pybind11; 13 | 14 | namespace diskannpy 15 | { 16 | 17 | typedef uint32_t filterT; 18 | 19 | typedef uint32_t StaticIdType; 20 | typedef uint32_t DynamicIdType; 21 | 22 | template using NeighborsAndDistances = std::pair, py::array_t>; 23 | 24 | }; // namespace diskannpy 25 | -------------------------------------------------------------------------------- /python/include/dynamic_memory_index.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | #include "common.h" 13 | #include "index.h" 14 | #include "parameters.h" 15 | 16 | namespace py = pybind11; 17 | 18 | namespace diskannpy 19 | { 20 | 21 | template 22 | class DynamicMemoryIndex 23 | { 24 | public: 25 | DynamicMemoryIndex(diskann::Metric m, size_t dimensions, size_t max_vectors, uint32_t complexity, 26 | uint32_t graph_degree, bool saturate_graph, uint32_t max_occlusion_size, float alpha, 27 | uint32_t num_threads, uint32_t filter_complexity, uint32_t num_frozen_points, 28 | uint32_t initial_search_complexity, uint32_t initial_search_threads, 29 | bool concurrent_consolidation); 30 | 31 | void load(const std::string &index_path); 32 | int insert(const py::array_t &vector, DynamicIdType id); 33 | py::array_t batch_insert(py::array_t &vectors, 34 | py::array_t &ids, int32_t num_inserts, 35 | int num_threads = 0); 36 | int mark_deleted(DynamicIdType id); 37 | void save(const std::string &save_path, bool compact_before_save = false); 38 | NeighborsAndDistances search(py::array_t &query, uint64_t knn, 39 | uint64_t complexity); 40 | NeighborsAndDistances batch_search(py::array_t &queries, 41 | uint64_t num_queries, uint64_t knn, uint64_t complexity, 42 | uint32_t num_threads); 43 | void consolidate_delete(); 44 | size_t num_points(); 45 | 46 | 47 | private: 48 | const uint32_t _initial_search_complexity; 49 | const diskann::IndexWriteParameters _write_parameters; 50 | diskann::Index _index; 51 | }; 52 | 53 | }; // namespace diskannpy -------------------------------------------------------------------------------- /python/include/static_disk_index.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | #ifdef _WINDOWS 13 | #include "windows_aligned_file_reader.h" 14 | #else 15 | #include "linux_aligned_file_reader.h" 16 | #endif 17 | 18 | #include "common.h" 19 | #include "pq_flash_index.h" 20 | 21 | namespace py = pybind11; 22 | 23 | namespace diskannpy 24 | { 25 | 26 | #ifdef _WINDOWS 27 | typedef WindowsAlignedFileReader PlatformSpecificAlignedFileReader; 28 | #else 29 | typedef LinuxAlignedFileReader PlatformSpecificAlignedFileReader; 30 | #endif 31 | 32 | template class StaticDiskIndex 33 | { 34 | public: 35 | StaticDiskIndex(diskann::Metric metric, const std::string &index_path_prefix, uint32_t num_threads, 36 | size_t num_nodes_to_cache, uint32_t cache_mechanism); 37 | 38 | void cache_bfs_levels(size_t num_nodes_to_cache); 39 | 40 | void cache_sample_paths(size_t num_nodes_to_cache, const std::string &warmup_query_file, uint32_t num_threads); 41 | 42 | NeighborsAndDistances search(py::array_t &query, 43 | uint64_t knn, uint64_t complexity, uint64_t beam_width); 44 | 45 | NeighborsAndDistances batch_search( 46 | py::array_t &queries, uint64_t num_queries, uint64_t knn, 47 | uint64_t complexity, uint64_t beam_width, uint32_t num_threads); 48 | 49 | private: 50 | std::shared_ptr _reader; 51 | diskann::PQFlashIndex
_index; 52 | }; 53 | } // namespace diskannpy 54 | -------------------------------------------------------------------------------- /python/include/static_memory_index.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | #include "common.h" 13 | #include "index.h" 14 | 15 | namespace py = pybind11; 16 | 17 | namespace diskannpy 18 | { 19 | 20 | template class StaticMemoryIndex 21 | { 22 | public: 23 | StaticMemoryIndex(diskann::Metric m, const std::string &index_prefix, size_t num_points, size_t dimensions, 24 | uint32_t num_threads, uint32_t initial_search_complexity); 25 | 26 | NeighborsAndDistances search(py::array_t &query, 27 | uint64_t knn, uint64_t complexity); 28 | 29 | NeighborsAndDistances search_with_filter( 30 | py::array_t &query, uint64_t knn, uint64_t complexity, 31 | filterT filter); 32 | 33 | NeighborsAndDistances batch_search( 34 | py::array_t &queries, uint64_t num_queries, uint64_t knn, 35 | uint64_t complexity, uint32_t num_threads); 36 | 37 | private: 38 | diskann::Index _index; 39 | }; 40 | } // namespace diskannpy -------------------------------------------------------------------------------- /python/src/_builder.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | from typing import BinaryIO, Optional, Union, overload 5 | 6 | import numpy as np 7 | 8 | from . import DistanceMetric, VectorDType, VectorIdentifierBatch, VectorLikeBatch 9 | 10 | def numpy_to_diskann_file(vectors: np.ndarray, file_handler: BinaryIO): ...
11 | @overload 12 | def build_disk_index( 13 | data: str, 14 | distance_metric: DistanceMetric, 15 | index_directory: str, 16 | complexity: int, 17 | graph_degree: int, 18 | search_memory_maximum: float, 19 | build_memory_maximum: float, 20 | num_threads: int, 21 | pq_disk_bytes: int, 22 | vector_dtype: VectorDType, 23 | index_prefix: str, 24 | ) -> None: ... 25 | @overload 26 | def build_disk_index( 27 | data: VectorLikeBatch, 28 | distance_metric: DistanceMetric, 29 | index_directory: str, 30 | complexity: int, 31 | graph_degree: int, 32 | search_memory_maximum: float, 33 | build_memory_maximum: float, 34 | num_threads: int, 35 | pq_disk_bytes: int, 36 | index_prefix: str, 37 | ) -> None: ... 38 | @overload 39 | def build_memory_index( 40 | data: VectorLikeBatch, 41 | distance_metric: DistanceMetric, 42 | index_directory: str, 43 | complexity: int, 44 | graph_degree: int, 45 | alpha: float, 46 | num_threads: int, 47 | use_pq_build: bool, 48 | num_pq_bytes: int, 49 | use_opq: bool, 50 | tags: Union[str, VectorIdentifierBatch], 51 | filter_labels: Optional[list[list[str]]], 52 | universal_label: str, 53 | filter_complexity: int, 54 | index_prefix: str 55 | ) -> None: ... 56 | @overload 57 | def build_memory_index( 58 | data: str, 59 | distance_metric: DistanceMetric, 60 | index_directory: str, 61 | complexity: int, 62 | graph_degree: int, 63 | alpha: float, 64 | num_threads: int, 65 | use_pq_build: bool, 66 | num_pq_bytes: int, 67 | use_opq: bool, 68 | vector_dtype: VectorDType, 69 | tags: Union[str, VectorIdentifierBatch], 70 | filter_labels_file: Optional[list[list[str]]], 71 | universal_label: str, 72 | filter_complexity: int, 73 | index_prefix: str 74 | ) -> None: ... 
75 | -------------------------------------------------------------------------------- /python/src/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/python/src/py.typed -------------------------------------------------------------------------------- /python/tests/fixtures/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | from .build_memory_index import build_random_vectors_and_memory_index 5 | from .create_test_data import random_vectors, vectors_as_temp_file, write_vectors 6 | from .recall import calculate_recall 7 | -------------------------------------------------------------------------------- /python/tests/fixtures/build_memory_index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 
3 | 4 | import os 5 | from tempfile import mkdtemp 6 | 7 | import diskannpy as dap 8 | import numpy as np 9 | 10 | from .create_test_data import random_vectors 11 | 12 | 13 | def build_random_vectors_and_memory_index( 14 | dtype, metric, with_tags: bool = False, index_prefix: str = "ann", seed: int = 12345 15 | ): 16 | query_vectors: np.ndarray = random_vectors(1000, 10, dtype=dtype, seed=seed) 17 | index_vectors: np.ndarray = random_vectors(10000, 10, dtype=dtype, seed=seed) 18 | ann_dir = mkdtemp() 19 | 20 | if with_tags: 21 | rng = np.random.default_rng(seed) 22 | tags = np.arange(start=1, stop=10001, dtype=np.uint32) 23 | rng.shuffle(tags) 24 | else: 25 | tags = "" 26 | 27 | dap.build_memory_index( 28 | data=index_vectors, 29 | distance_metric=metric, 30 | index_directory=ann_dir, 31 | graph_degree=16, 32 | complexity=32, 33 | alpha=1.2, 34 | num_threads=0, 35 | use_pq_build=False, 36 | num_pq_bytes=8, 37 | use_opq=False, 38 | filter_complexity=32, 39 | tags=tags, 40 | index_prefix=index_prefix, 41 | ) 42 | 43 | return ( 44 | metric, 45 | dtype, 46 | query_vectors, 47 | index_vectors, 48 | ann_dir, 49 | os.path.join(ann_dir, "vectors.bin"), 50 | tags, 51 | ) 52 | -------------------------------------------------------------------------------- /python/tests/fixtures/create_test_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 
3 | 4 | from contextlib import contextmanager 5 | from pathlib import Path 6 | from tempfile import NamedTemporaryFile 7 | from typing import BinaryIO 8 | 9 | import numpy as np 10 | 11 | 12 | def random_vectors(rows: int, dimensions: int, dtype, seed: int = 12345) -> np.ndarray: 13 | rng = np.random.default_rng(seed) 14 | if dtype == np.float32: 15 | vectors = rng.random((rows, dimensions), dtype=dtype) 16 | elif dtype == np.uint8: 17 | vectors = rng.integers( 18 | low=0, high=256, size=(rows, dimensions), dtype=dtype 19 | ) # low is inclusive, high is exclusive 20 | elif dtype == np.int8: 21 | vectors = rng.integers( 22 | low=-128, high=128, size=(rows, dimensions), dtype=dtype 23 | ) # low is inclusive, high is exclusive 24 | else: 25 | raise RuntimeError("Only np.float32, np.int8, and np.uint8 are supported") 26 | return vectors 27 | 28 | 29 | def write_vectors(file_handler: BinaryIO, vectors: np.ndarray): 30 | _ = file_handler.write(np.array(vectors.shape, dtype=np.int32).tobytes()) 31 | _ = file_handler.write(vectors.tobytes()) 32 | 33 | 34 | @contextmanager 35 | def vectors_as_temp_file(vectors: np.ndarray) -> str: 36 | temp = NamedTemporaryFile(mode="wb", delete=False) 37 | write_vectors(temp, vectors) 38 | temp.close() 39 | yield temp.name 40 | Path(temp.name).unlink() 41 | -------------------------------------------------------------------------------- /python/tests/fixtures/recall.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | import numpy as np 5 | 6 | 7 | def calculate_recall( 8 | result_set_indices: np.ndarray, truth_set_indices: np.ndarray, recall_at: int = 5 9 | ) -> float: 10 | """ 11 | result_set_indices and truth_set_indices correspond by row index. 
the columns in each row contain the indices of 12 | the nearest neighbors, with result_set_indices being the approximate nearest neighbor results and truth_set_indices 13 | being the brute force nearest neighbor calculation via sklearn's NearestNeighbor class. 14 | :param result_set_indices: 15 | :param truth_set_indices: 16 | :param recall_at: 17 | :return: 18 | """ 19 | found = 0 20 | for i in range(0, result_set_indices.shape[0]): 21 | result_set_set = set(result_set_indices[i][0:recall_at]) 22 | truth_set_set = set(truth_set_indices[i][0:recall_at]) 23 | found += len(result_set_set.intersection(truth_set_set)) 24 | return found / (result_set_indices.shape[0] * recall_at) 25 | -------------------------------------------------------------------------------- /python/tests/test_files.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | import atexit 5 | import unittest 6 | import shutil 7 | import tempfile 8 | 9 | from pathlib import Path 10 | 11 | import numpy as np 12 | 13 | from fixtures import random_vectors, vectors_as_temp_file 14 | 15 | import diskannpy as dap 16 | 17 | 18 | class TestVectorsFromFile(unittest.TestCase): 19 | def test_in_mem(self): 20 | expected = random_vectors(10_000, 100, dtype=np.float32) 21 | with vectors_as_temp_file(expected) as vecs_file: 22 | actual = dap.vectors_from_file(vecs_file, dtype=np.float32) 23 | self.assertTrue((expected == actual).all(), f"{expected == actual}\n{expected}\n{actual}") 24 | 25 | def test_memmap(self): 26 | expected = random_vectors(10_000, 100, dtype=np.float32) 27 | with vectors_as_temp_file(expected) as vecs_file: 28 | vecs_file_copy = tempfile.NamedTemporaryFile(delete=False) 29 | atexit.register(Path(vecs_file_copy.name).unlink) 30 | shutil.copyfile(vecs_file, vecs_file_copy.name) 31 | 32 | actual = dap.vectors_from_file( 33 | vecs_file, 34 | dtype=np.float32, 35 
| use_memmap=True 36 | ) 37 | self.assertTrue((expected == actual).all(), f"{expected == actual}\n{expected}\n{actual}") 38 | # windows refuses to allow 2 active handles via memmap to touch the same file 39 | # that's why we made a copy of the file itself and are using the copy here to test 40 | # the read+append(inmem) 41 | actual = dap.vectors_from_file( 42 | vecs_file_copy.name, 43 | dtype=np.float32, 44 | use_memmap=True, 45 | mode="r+" 46 | ) 47 | self.assertTrue((expected == actual).all(), f"{expected == actual}\n{expected}\n{actual}") 48 | 49 | 50 | if __name__ == '__main__': 51 | unittest.main() 52 | -------------------------------------------------------------------------------- /rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | [workspace] 5 | members = [ 6 | "cmd_drivers/build_memory_index", 7 | "cmd_drivers/build_and_insert_memory_index", 8 | "cmd_drivers/load_and_insert_memory_index", 9 | "cmd_drivers/convert_f32_to_bf16", 10 | "cmd_drivers/search_memory_index", 11 | "cmd_drivers/build_disk_index", 12 | "cmd_drivers/build_and_insert_delete_memory_index", 13 | "vector", 14 | "diskann", 15 | "platform", 16 | "logger", 17 | "vector_base64" 18 | ] 19 | resolver = "2" 20 | 21 | [profile.release] 22 | opt-level = 3 23 | codegen-units=1 24 | -------------------------------------------------------------------------------- /rust/cmd_drivers/build_and_insert_delete_memory_index/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 
3 | [package] 4 | name = "build_and_insert_delete_memory_index" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | diskann = { path = "../../diskann" } 12 | logger = { path = "../../logger" } 13 | vector = { path = "../../vector" } 14 | 15 | -------------------------------------------------------------------------------- /rust/cmd_drivers/build_and_insert_memory_index/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | [package] 4 | name = "build_and_insert_memory_index" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | diskann = { path = "../../diskann" } 12 | logger = { path = "../../logger" } 13 | vector = { path = "../../vector" } 14 | 15 | -------------------------------------------------------------------------------- /rust/cmd_drivers/build_disk_index/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 
3 | [package] 4 | name = "build_disk_index" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | diskann = { path = "../../diskann" } 12 | logger = { path = "../../logger" } 13 | vector = { path = "../../vector" } 14 | openblas-src = { version = "0.10.8", features = ["system", "static"] } 15 | -------------------------------------------------------------------------------- /rust/cmd_drivers/build_memory_index/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | [package] 4 | name = "build_memory_index" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | clap = { version = "4.3.8", features = ["derive"] } 12 | diskann = { path = "../../diskann" } 13 | logger = { path = "../../logger" } 14 | vector = { path = "../../vector" } 15 | 16 | -------------------------------------------------------------------------------- /rust/cmd_drivers/build_memory_index/src/args.rs: -------------------------------------------------------------------------------- 1 | use clap::{Parser, ValueEnum}; 2 | 3 | #[derive(Debug, Clone, ValueEnum)] 4 | enum DataType { 5 | /// Float data type. 6 | Float, 7 | 8 | /// Half data type. 9 | FP16, 10 | } 11 | 12 | #[derive(Debug, Clone, ValueEnum)] 13 | enum DistanceFunction { 14 | /// Euclidean distance. 15 | L2, 16 | 17 | /// Cosine distance. 18 | Cosine, 19 | } 20 | 21 | #[derive(Debug, Parser)] 22 | struct BuildMemoryIndexArgs { 23 | /// Data type of the vectors. 24 | #[clap(long, default_value = "float")] 25 | pub data_type: DataType, 26 | 27 | /// Distance function to use. 28 | #[clap(long, default_value = "l2")] 29 | pub dist_fn: DistanceFunction, 30 | 31 | /// Path to the data file.
The file should be in the format specified by the `data_type` argument. 32 | #[clap(long, short, required = true)] 33 | pub data_path: String, 34 | 35 | /// Path to the index file. The index will be saved to this prefixed name. 36 | #[clap(long, short, required = true)] 37 | pub index_path_prefix: String, 38 | 39 | /// Number of max out degree from a vertex. 40 | #[clap(long, default_value = "32")] 41 | pub max_degree: usize, 42 | 43 | /// Number of candidates to consider when building out edges 44 | #[clap(long, short, default_value = "50")] 45 | pub l_build: usize, 46 | 47 | /// Alpha to use to build diverse edges 48 | #[clap(long, short, default_value = "1.0")] 49 | pub alpha: f32, 50 | 51 | /// Number of threads to use. 52 | #[clap(long, short, default_value = "1")] 53 | pub num_threads: u8, 54 | 55 | /// Number of PQ bytes to use. 56 | #[clap(long, short, default_value = "8")] 57 | pub build_pq_bytes: usize, 58 | 59 | /// Use opq? 60 | #[clap(long, short, default_value = "false")] 61 | pub use_opq: bool, 62 | } 63 | -------------------------------------------------------------------------------- /rust/cmd_drivers/convert_f32_to_bf16/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | [package] 4 | name = "convert_f32_to_bf16" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | half = "2.2.1" 12 | -------------------------------------------------------------------------------- /rust/cmd_drivers/load_and_insert_memory_index/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license.
3 | [package] 4 | name = "load_and_insert_memory_index" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | diskann = { path = "../../diskann" } 12 | logger = { path = "../../logger" } 13 | vector = { path = "../../vector" } 14 | 15 | -------------------------------------------------------------------------------- /rust/cmd_drivers/search_memory_index/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | [package] 4 | name = "search_memory_index" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | bytemuck = "1.13.1" 12 | diskann = { path = "../../diskann" } 13 | num_cpus = "1.15.0" 14 | rayon = "1.7.0" 15 | vector = { path = "../../vector" } 16 | 17 | -------------------------------------------------------------------------------- /rust/diskann/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 
3 | [package] 4 | name = "diskann" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | bincode = "1.3.3" 12 | bit-vec = "0.6.3" 13 | byteorder = "1.4.3" 14 | cblas = "0.4.0" 15 | crossbeam = "0.8.2" 16 | half = "2.2.1" 17 | hashbrown = "0.13.2" 18 | num-traits = "0.2.15" 19 | once_cell = "1.17.1" 20 | openblas-src = { version = "0.10.8", features = ["system"] } 21 | rand = { version = "0.8.5", features = [ "small_rng" ] } 22 | rayon = "1.7.0" 23 | serde = { version = "1.0.130", features = ["derive"] } 24 | thiserror = "1.0.40" 25 | winapi = { version = "0.3.9", features = ["errhandlingapi", "fileapi", "ioapiset", "handleapi", "winnt", "minwindef", "basetsd", "winerror", "winbase"] } 26 | 27 | logger = { path = "../logger" } 28 | platform = { path = "../platform" } 29 | vector = { path = "../vector" } 30 | 31 | [build-dependencies] 32 | cc = "1.0.79" 33 | 34 | [dev-dependencies] 35 | approx = "0.5.1" 36 | criterion = "0.5.1" 37 | 38 | 39 | [[bench]] 40 | name = "distance_bench" 41 | harness = false 42 | 43 | [[bench]] 44 | name = "neighbor_bench" 45 | harness = false 46 | -------------------------------------------------------------------------------- /rust/diskann/benches/distance_bench.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 6 | 7 | use rand::{thread_rng, Rng}; 8 | use vector::{FullPrecisionDistance, Metric}; 9 | 10 | // make sure the vector is 256-bit (32 bytes) aligned required by _mm256_load_ps 11 | #[repr(C, align(32))] 12 | struct Vector32ByteAligned { 13 | v: [f32; 256], 14 | } 15 | 16 | fn benchmark_l2_distance_float_rust(c: &mut Criterion) { 17 | let (a, b) = prepare_random_aligned_vectors(); 18 | let mut group = c.benchmark_group("avx-computation"); 19 | group.sample_size(5000); 20 | 21 | group.bench_function("AVX Rust run", |f| { 22 | f.iter(|| { 23 | black_box(<[f32; 256]>::distance_compare( 24 | black_box(&a.v), 25 | black_box(&b.v), 26 | Metric::L2, 27 | )) 28 | }) 29 | }); 30 | } 31 | 32 | // make sure the vector is 256-bit (32 bytes) aligned required by _mm256_load_ps 33 | fn prepare_random_aligned_vectors() -> (Box<Vector32ByteAligned>, Box<Vector32ByteAligned>) { 34 | let a = Box::new(Vector32ByteAligned { 35 | v: [(); 256].map(|_| thread_rng().gen_range(0.0..100.0)), 36 | }); 37 | 38 | let b = Box::new(Vector32ByteAligned { 39 | v: [(); 256].map(|_| thread_rng().gen_range(0.0..100.0)), 40 | }); 41 | 42 | (a, b) 43 | } 44 | 45 | criterion_group!(benches, benchmark_l2_distance_float_rust,); 46 | criterion_main!(benches); 47 | 48 | -------------------------------------------------------------------------------- /rust/diskann/benches/kmeans_bench.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license.
4 | */ 5 | use criterion::{criterion_group, criterion_main, Criterion}; 6 | use diskann::utils::k_means_clustering; 7 | use rand::Rng; 8 | 9 | const NUM_POINTS: usize = 10000; 10 | const DIM: usize = 100; 11 | const NUM_CENTERS: usize = 256; 12 | const MAX_KMEANS_REPS: usize = 12; 13 | 14 | fn benchmark_kmeans_rust(c: &mut Criterion) { 15 | let mut rng = rand::thread_rng(); 16 | let data: Vec = (0..NUM_POINTS * DIM) 17 | .map(|_| rng.gen_range(-1.0..1.0)) 18 | .collect(); 19 | let centers: Vec = vec![0.0; NUM_CENTERS * DIM]; 20 | 21 | let mut group = c.benchmark_group("kmeans-computation"); 22 | group.sample_size(500); 23 | 24 | group.bench_function("K-Means Rust run", |f| { 25 | f.iter(|| { 26 | // let mut centers_copy = centers.clone(); 27 | let data_copy = data.clone(); 28 | let mut centers_copy = centers.clone(); 29 | k_means_clustering( 30 | &data_copy, 31 | NUM_POINTS, 32 | DIM, 33 | &mut centers_copy, 34 | NUM_CENTERS, 35 | MAX_KMEANS_REPS, 36 | ) 37 | }) 38 | }); 39 | } 40 | 41 | fn benchmark_kmeans_c(c: &mut Criterion) { 42 | let mut rng = rand::thread_rng(); 43 | let data: Vec = (0..NUM_POINTS * DIM) 44 | .map(|_| rng.gen_range(-1.0..1.0)) 45 | .collect(); 46 | let centers: Vec = vec![0.0; NUM_CENTERS * DIM]; 47 | 48 | let mut group = c.benchmark_group("kmeans-computation"); 49 | group.sample_size(500); 50 | 51 | group.bench_function("K-Means C++ Run", |f| { 52 | f.iter(|| { 53 | let data_copy = data.clone(); 54 | let mut centers_copy = centers.clone(); 55 | let _ = k_means_clustering( 56 | data_copy.as_slice(), 57 | NUM_POINTS, 58 | DIM, 59 | centers_copy.as_mut_slice(), 60 | NUM_CENTERS, 61 | MAX_KMEANS_REPS, 62 | ); 63 | }) 64 | }); 65 | } 66 | 67 | criterion_group!(benches, benchmark_kmeans_rust, benchmark_kmeans_c); 68 | 69 | criterion_main!(benches); 70 | 71 | -------------------------------------------------------------------------------- /rust/diskann/benches/neighbor_bench.rs: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | use std::time::Duration; 6 | 7 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 8 | 9 | use diskann::model::{Neighbor, NeighborPriorityQueue}; 10 | use rand::distributions::{Distribution, Uniform}; 11 | use rand::rngs::StdRng; 12 | use rand::SeedableRng; 13 | 14 | fn benchmark_priority_queue_insert(c: &mut Criterion) { 15 | let vec = generate_random_floats(); 16 | let mut group = c.benchmark_group("neighborqueue-insert"); 17 | group.measurement_time(Duration::from_secs(3)).sample_size(500); 18 | 19 | let mut queue = NeighborPriorityQueue::with_capacity(64_usize); 20 | group.bench_function("Neighbor Priority Queue Insert", |f| { 21 | f.iter(|| { 22 | queue.clear(); 23 | for n in vec.iter() { 24 | queue.insert(*n); 25 | } 26 | 27 | black_box(&1) 28 | }); 29 | }); 30 | } 31 | 32 | fn generate_random_floats() -> Vec<Neighbor> { 33 | let seed: [u8; 32] = [73; 32]; 34 | let mut rng: StdRng = SeedableRng::from_seed(seed); 35 | let range = Uniform::new(0.0, 1.0); 36 | let mut random_floats = Vec::with_capacity(100); 37 | 38 | for i in 0..100 { 39 | let random_float = range.sample(&mut rng) as f32; 40 | let n = Neighbor::new(i, random_float); 41 | random_floats.push(n); 42 | } 43 | 44 | random_floats 45 | } 46 | 47 | criterion_group!(benches, benchmark_priority_queue_insert); 48 | criterion_main!(benches); 49 | 50 | -------------------------------------------------------------------------------- /rust/diskann/src/algorithm/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license.
4 | */ 5 | pub mod search; 6 | 7 | pub mod prune; 8 | -------------------------------------------------------------------------------- /rust/diskann/src/algorithm/prune/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #[allow(clippy::module_inception)] 6 | pub mod prune; 7 | -------------------------------------------------------------------------------- /rust/diskann/src/algorithm/search/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #[allow(clippy::module_inception)] 6 | pub mod search; 7 | 8 | -------------------------------------------------------------------------------- /rust/diskann/src/common/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | mod aligned_allocator; 6 | pub use aligned_allocator::AlignedBoxWithSlice; 7 | 8 | mod ann_result; 9 | pub use ann_result::*; 10 | -------------------------------------------------------------------------------- /rust/diskann/src/index/disk_index/ann_disk_index.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #![warn(missing_docs)] 6 | 7 | //! 
ANN disk index abstraction 8 | 9 | use vector::FullPrecisionDistance; 10 | 11 | use crate::model::{IndexConfiguration, DiskIndexBuildParameters}; 12 | use crate::storage::DiskIndexStorage; 13 | use crate::model::vertex::{DIM_128, DIM_256, DIM_104}; 14 | 15 | use crate::common::{ANNResult, ANNError}; 16 | 17 | use super::DiskIndex; 18 | 19 | /// ANN disk index abstraction for custom 20 | pub trait ANNDiskIndex : Sync + Send 21 | where T : Default + Copy + Sync + Send + Into 22 | { 23 | /// Build index 24 | fn build(&mut self, codebook_prefix: &str) -> ANNResult<()>; 25 | } 26 | 27 | /// Create Index based on configuration 28 | pub fn create_disk_index<'a, T>( 29 | disk_build_param: Option, 30 | config: IndexConfiguration, 31 | storage: DiskIndexStorage, 32 | ) -> ANNResult + 'a>> 33 | where 34 | T: Default + Copy + Sync + Send + Into + 'a, 35 | [T; DIM_104]: FullPrecisionDistance, 36 | [T; DIM_128]: FullPrecisionDistance, 37 | [T; DIM_256]: FullPrecisionDistance, 38 | { 39 | match config.aligned_dim { 40 | DIM_104 => { 41 | let index = Box::new(DiskIndex::::new(disk_build_param, config, storage)); 42 | Ok(index as Box>) 43 | }, 44 | DIM_128 => { 45 | let index = Box::new(DiskIndex::::new(disk_build_param, config, storage)); 46 | Ok(index as Box>) 47 | }, 48 | DIM_256 => { 49 | let index = Box::new(DiskIndex::::new(disk_build_param, config, storage)); 50 | Ok(index as Box>) 51 | }, 52 | _ => Err(ANNError::log_index_error(format!("Invalid dimension: {}", config.aligned_dim))), 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /rust/diskann/src/index/disk_index/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | #[allow(clippy::module_inception)] 6 | mod disk_index; 7 | pub use disk_index::DiskIndex; 8 | 9 | pub mod ann_disk_index; 10 | -------------------------------------------------------------------------------- /rust/diskann/src/index/inmem_index/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #[allow(clippy::module_inception)] 6 | mod inmem_index; 7 | pub use inmem_index::InmemIndex; 8 | 9 | mod inmem_index_storage; 10 | 11 | pub mod ann_inmem_index; 12 | 13 | -------------------------------------------------------------------------------- /rust/diskann/src/index/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | mod inmem_index; 6 | pub use inmem_index::ann_inmem_index::*; 7 | pub use inmem_index::InmemIndex; 8 | 9 | mod disk_index; 10 | pub use disk_index::*; 11 | 12 | -------------------------------------------------------------------------------- /rust/diskann/src/instrumentation/disk_index_build_logger.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | use logger::logger::indexlog::DiskIndexConstructionCheckpoint; 6 | use logger::logger::indexlog::DiskIndexConstructionLog; 7 | use logger::logger::indexlog::Log; 8 | use logger::logger::indexlog::LogLevel; 9 | use logger::message_handler::send_log; 10 | 11 | use crate::{utils::Timer, common::ANNResult}; 12 | 13 | pub struct DiskIndexBuildLogger { 14 | timer: Timer, 15 | checkpoint: DiskIndexConstructionCheckpoint, 16 | } 17 | 18 | impl DiskIndexBuildLogger { 19 | pub fn new(checkpoint: DiskIndexConstructionCheckpoint) -> Self { 20 | Self { 21 | timer: Timer::new(), 22 | checkpoint, 23 | } 24 | } 25 | 26 | pub fn log_checkpoint(&mut self, next_checkpoint: DiskIndexConstructionCheckpoint) -> ANNResult<()> { 27 | if self.checkpoint == DiskIndexConstructionCheckpoint::None { 28 | return Ok(()); 29 | } 30 | 31 | let mut log = Log::default(); 32 | let disk_index_construction_log = DiskIndexConstructionLog { 33 | checkpoint: self.checkpoint as i32, 34 | time_spent_in_seconds: self.timer.elapsed().as_secs_f32(), 35 | g_cycles_spent: self.timer.elapsed_gcycles(), 36 | log_level: LogLevel::Info as i32, 37 | }; 38 | log.disk_index_construction_log = Some(disk_index_construction_log); 39 | 40 | send_log(log)?; 41 | self.checkpoint = next_checkpoint; 42 | self.timer.reset(); 43 | Ok(()) 44 | } 45 | } 46 | 47 | #[cfg(test)] 48 | mod dataset_test { 49 | use super::*; 50 | 51 | #[test] 52 | fn test_log() { 53 | let mut logger = DiskIndexBuildLogger::new(DiskIndexConstructionCheckpoint::PqConstruction); 54 | logger.log_checkpoint(DiskIndexConstructionCheckpoint::InmemIndexBuild).unwrap();logger.log_checkpoint(logger::logger::indexlog::DiskIndexConstructionCheckpoint::DiskLayout).unwrap(); 55 | } 56 | } 57 | 58 | -------------------------------------------------------------------------------- /rust/diskann/src/instrumentation/index_logger.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. 
All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | use std::sync::atomic::{AtomicUsize, Ordering}; 6 | 7 | use logger::logger::indexlog::IndexConstructionLog; 8 | use logger::logger::indexlog::Log; 9 | use logger::logger::indexlog::LogLevel; 10 | use logger::message_handler::send_log; 11 | 12 | use crate::common::ANNResult; 13 | use crate::utils::Timer; 14 | 15 | pub struct IndexLogger { 16 | items_processed: AtomicUsize, 17 | timer: Timer, 18 | range: usize, 19 | } 20 | 21 | impl IndexLogger { 22 | pub fn new(range: usize) -> Self { 23 | Self { 24 | items_processed: AtomicUsize::new(0), 25 | timer: Timer::new(), 26 | range, 27 | } 28 | } 29 | 30 | pub fn vertex_processed(&self) -> ANNResult<()> { 31 | let count = self.items_processed.fetch_add(1, Ordering::Relaxed); 32 | if count % 100_000 == 0 { 33 | let mut log = Log::default(); 34 | let index_construction_log = IndexConstructionLog { 35 | percentage_complete: (100_f32 * count as f32) / (self.range as f32), 36 | time_spent_in_seconds: self.timer.elapsed().as_secs_f32(), 37 | g_cycles_spent: self.timer.elapsed_gcycles(), 38 | log_level: LogLevel::Info as i32, 39 | }; 40 | log.index_construction_log = Some(index_construction_log); 41 | 42 | send_log(log)?; 43 | } 44 | 45 | Ok(()) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /rust/diskann/src/instrumentation/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | mod index_logger; 6 | pub use index_logger::IndexLogger; 7 | 8 | mod disk_index_build_logger; 9 | pub use disk_index_build_logger::DiskIndexBuildLogger; 10 | -------------------------------------------------------------------------------- /rust/diskann/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #![cfg_attr( 6 | not(test), 7 | warn(clippy::panic, clippy::unwrap_used, clippy::expect_used) 8 | )] 9 | #![cfg_attr(test, allow(clippy::unused_io_amount))] 10 | 11 | pub mod utils; 12 | 13 | pub mod algorithm; 14 | 15 | pub mod model; 16 | 17 | pub mod common; 18 | 19 | pub mod index; 20 | 21 | pub mod storage; 22 | 23 | pub mod instrumentation; 24 | 25 | #[cfg(test)] 26 | pub mod test_utils; 27 | -------------------------------------------------------------------------------- /rust/diskann/src/model/configuration/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | pub mod index_configuration; 6 | pub use index_configuration::IndexConfiguration; 7 | 8 | pub mod index_write_parameters; 9 | pub use index_write_parameters::*; 10 | 11 | pub mod disk_index_build_parameter; 12 | pub use disk_index_build_parameter::DiskIndexBuildParameters; 13 | -------------------------------------------------------------------------------- /rust/diskann/src/model/data_store/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | #[allow(clippy::module_inception)] 6 | mod inmem_dataset; 7 | pub use inmem_dataset::InmemDataset; 8 | pub use inmem_dataset::DatasetDto; 9 | 10 | mod disk_scratch_dataset; 11 | pub use disk_scratch_dataset::*; 12 | -------------------------------------------------------------------------------- /rust/diskann/src/model/graph/adjacency_list.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #![warn(missing_debug_implementations, missing_docs)] 6 | 7 | //! Adjacency List 8 | 9 | use std::ops::{Deref, DerefMut}; 10 | 11 | #[derive(Debug, Eq, PartialEq)] 12 | /// Represents the out neighbors of a vertex 13 | pub struct AdjacencyList { 14 | edges: Vec<u32>, 15 | } 16 | 17 | /// In-mem index related limits 18 | const GRAPH_SLACK_FACTOR: f32 = 1.3_f32; 19 | 20 | impl AdjacencyList { 21 | /// Create AdjacencyList with capacity slack for a range. 22 | pub fn for_range(range: usize) -> Self { 23 | let capacity = (range as f32 * GRAPH_SLACK_FACTOR).ceil() as usize; 24 | Self { 25 | edges: Vec::with_capacity(capacity), 26 | } 27 | } 28 | 29 | /// Push a node to the list of neighbors for the given node.
30 | pub fn push(&mut self, node_id: u32) { 31 | debug_assert!(self.edges.len() < self.edges.capacity()); 32 | self.edges.push(node_id); 33 | } 34 | } 35 | 36 | impl From<Vec<u32>> for AdjacencyList { 37 | fn from(edges: Vec<u32>) -> Self { 38 | Self { edges } 39 | } 40 | } 41 | 42 | impl Deref for AdjacencyList { 43 | type Target = Vec<u32>; 44 | 45 | fn deref(&self) -> &Self::Target { 46 | &self.edges 47 | } 48 | } 49 | 50 | impl DerefMut for AdjacencyList { 51 | fn deref_mut(&mut self) -> &mut Self::Target { 52 | &mut self.edges 53 | } 54 | } 55 | 56 | impl<'a> IntoIterator for &'a AdjacencyList { 57 | type Item = &'a u32; 58 | type IntoIter = std::slice::Iter<'a, u32>; 59 | 60 | fn into_iter(self) -> Self::IntoIter { 61 | self.edges.iter() 62 | } 63 | } 64 | 65 | -------------------------------------------------------------------------------- /rust/diskann/src/model/graph/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #[allow(clippy::module_inception)] 6 | mod inmem_graph; 7 | pub use inmem_graph::InMemoryGraph; 8 | 9 | pub mod vertex_and_neighbors; 10 | pub use vertex_and_neighbors::VertexAndNeighbors; 11 | 12 | mod adjacency_list; 13 | pub use adjacency_list::AdjacencyList; 14 | 15 | mod sector_graph; 16 | pub use sector_graph::*; 17 | 18 | mod disk_graph; 19 | pub use disk_graph::*; 20 | 21 | -------------------------------------------------------------------------------- /rust/diskann/src/model/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license.
4 | */ 5 | pub mod neighbor; 6 | pub use neighbor::Neighbor; 7 | pub use neighbor::NeighborPriorityQueue; 8 | 9 | pub mod data_store; 10 | pub use data_store::InmemDataset; 11 | 12 | pub mod graph; 13 | pub use graph::InMemoryGraph; 14 | pub use graph::VertexAndNeighbors; 15 | 16 | pub mod configuration; 17 | pub use configuration::*; 18 | 19 | pub mod scratch; 20 | pub use scratch::*; 21 | 22 | pub mod vertex; 23 | pub use vertex::Vertex; 24 | 25 | pub mod pq; 26 | pub use pq::*; 27 | 28 | pub mod windows_aligned_file_reader; 29 | pub use windows_aligned_file_reader::*; 30 | -------------------------------------------------------------------------------- /rust/diskann/src/model/neighbor/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #[allow(clippy::module_inception)] 6 | mod neighbor; 7 | pub use neighbor::*; 8 | 9 | mod neighbor_priority_queue; 10 | pub use neighbor_priority_queue::*; 11 | 12 | mod sorted_neighbor_vector; 13 | pub use sorted_neighbor_vector::SortedNeighborVector; 14 | -------------------------------------------------------------------------------- /rust/diskann/src/model/neighbor/sorted_neighbor_vector.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #![warn(missing_debug_implementations, missing_docs)] 6 | 7 | //! 
Sorted Neighbor Vector 8 | 9 | use std::ops::{Deref, DerefMut}; 10 | 11 | use super::Neighbor; 12 | 13 | /// A newtype on top of vector of neighbors, is sorted by distance 14 | #[derive(Debug)] 15 | pub struct SortedNeighborVector<'a>(&'a mut Vec<Neighbor>); 16 | 17 | impl<'a> SortedNeighborVector<'a> { 18 | /// Create a new SortedNeighborVector 19 | pub fn new(vec: &'a mut Vec<Neighbor>) -> Self { 20 | vec.sort_unstable(); 21 | Self(vec) 22 | } 23 | } 24 | 25 | impl<'a> Deref for SortedNeighborVector<'a> { 26 | type Target = Vec<Neighbor>; 27 | 28 | fn deref(&self) -> &Self::Target { 29 | self.0 30 | } 31 | } 32 | 33 | impl<'a> DerefMut for SortedNeighborVector<'a> { 34 | fn deref_mut(&mut self) -> &mut Self::Target { 35 | self.0 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /rust/diskann/src/model/pq/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | mod fixed_chunk_pq_table; 6 | pub use fixed_chunk_pq_table::*; 7 | 8 | mod pq_construction; 9 | pub use pq_construction::*; 10 | -------------------------------------------------------------------------------- /rust/diskann/src/model/scratch/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license.
4 | */ 5 | pub mod scratch_traits; 6 | pub use scratch_traits::*; 7 | 8 | pub mod concurrent_queue; 9 | pub use concurrent_queue::*; 10 | 11 | pub mod pq_scratch; 12 | pub use pq_scratch::*; 13 | 14 | 15 | pub mod inmem_query_scratch; 16 | pub use inmem_query_scratch::*; 17 | 18 | pub mod scratch_store_manager; 19 | pub use scratch_store_manager::*; 20 | 21 | pub mod ssd_query_scratch; 22 | pub use ssd_query_scratch::*; 23 | 24 | pub mod ssd_thread_data; 25 | pub use ssd_thread_data::*; 26 | 27 | pub mod ssd_io_context; 28 | pub use ssd_io_context::*; 29 | -------------------------------------------------------------------------------- /rust/diskann/src/model/scratch/scratch_store_manager.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | use crate::common::ANNResult; 6 | 7 | use super::ArcConcurrentBoxedQueue; 8 | use super::{scratch_traits::Scratch}; 9 | use std::time::Duration; 10 | 11 | pub struct ScratchStoreManager { 12 | scratch: Option>, 13 | scratch_pool: ArcConcurrentBoxedQueue, 14 | } 15 | 16 | impl ScratchStoreManager { 17 | pub fn new(scratch_pool: ArcConcurrentBoxedQueue, wait_time: Duration) -> ANNResult { 18 | let mut scratch = scratch_pool.pop()?; 19 | while scratch.is_none() { 20 | scratch_pool.wait_for_push_notify(wait_time)?; 21 | scratch = scratch_pool.pop()?; 22 | } 23 | 24 | Ok(ScratchStoreManager { 25 | scratch, 26 | scratch_pool, 27 | }) 28 | } 29 | 30 | pub fn scratch_space(&mut self) -> Option<&mut T> { 31 | self.scratch.as_deref_mut() 32 | } 33 | } 34 | 35 | impl Drop for ScratchStoreManager { 36 | fn drop(&mut self) { 37 | if let Some(mut scratch) = self.scratch.take() { 38 | scratch.clear(); 39 | let _ = self.scratch_pool.push(scratch); 40 | } 41 | } 42 | } 43 | 44 | #[cfg(test)] 45 | mod tests { 46 | use super::*; 47 | 48 | #[derive(Debug)] 49 | struct MyScratch { 50 | data: Vec, 51 
| } 52 | 53 | impl Scratch for MyScratch { 54 | fn clear(&mut self) { 55 | self.data.clear(); 56 | } 57 | } 58 | 59 | #[test] 60 | fn test_scratch_store_manager() { 61 | let wait_time = Duration::from_millis(100); 62 | 63 | let scratch_pool = ArcConcurrentBoxedQueue::new(); 64 | for i in 1..3 { 65 | scratch_pool.push(Box::new(MyScratch { 66 | data: vec![i, 2 * i, 3 * i], 67 | })).unwrap(); 68 | } 69 | 70 | let mut manager = ScratchStoreManager::new(scratch_pool.clone(), wait_time).unwrap(); 71 | let scratch_space = manager.scratch_space().unwrap(); 72 | 73 | assert_eq!(scratch_space.data, vec![1, 2, 3]); 74 | 75 | // At this point, the ScratchStoreManager will go out of scope, 76 | // causing the Drop implementation to be called, which should 77 | // call the clear method on MyScratch. 78 | drop(manager); 79 | 80 | let current_scratch = scratch_pool.pop().unwrap().unwrap(); 81 | assert_eq!(current_scratch.data, vec![2, 4, 6]); 82 | } 83 | } 84 | 85 | -------------------------------------------------------------------------------- /rust/diskann/src/model/scratch/scratch_traits.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | pub trait Scratch { 6 | fn clear(&mut self); 7 | } 8 | 9 | -------------------------------------------------------------------------------- /rust/diskann/src/model/scratch/ssd_io_context.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #![allow(dead_code)] // Todo: Remove this when the disk index query code is complete. 6 | use crate::common::ANNError; 7 | 8 | use platform::{FileHandle, IOCompletionPort}; 9 | 10 | // The IOContext struct for disk I/O. One for each thread. 
11 | pub struct IOContext { 12 | pub status: Status, 13 | pub file_handle: FileHandle, 14 | pub io_completion_port: IOCompletionPort, 15 | } 16 | 17 | impl Default for IOContext { 18 | fn default() -> Self { 19 | IOContext { 20 | status: Status::ReadWait, 21 | file_handle: FileHandle::default(), 22 | io_completion_port: IOCompletionPort::default(), 23 | } 24 | } 25 | } 26 | 27 | impl IOContext { 28 | pub fn new() -> Self { 29 | Self::default() 30 | } 31 | } 32 | 33 | pub enum Status { 34 | ReadWait, 35 | ReadSuccess, 36 | ReadFailed(ANNError), 37 | ProcessComplete, 38 | } 39 | -------------------------------------------------------------------------------- /rust/diskann/src/model/vertex/dimension.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #![warn(missing_debug_implementations, missing_docs)] 6 | 7 | //! Vertex dimension 8 | 9 | /// 32 vertex dimension 10 | pub const DIM_32: usize = 32; 11 | 12 | /// 64 vertex dimension 13 | pub const DIM_64: usize = 64; 14 | 15 | /// 104 vertex dimension 16 | pub const DIM_104: usize = 104; 17 | 18 | /// 128 vertex dimension 19 | pub const DIM_128: usize = 128; 20 | 21 | /// 256 vertex dimension 22 | pub const DIM_256: usize = 256; 23 | -------------------------------------------------------------------------------- /rust/diskann/src/model/vertex/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | #[allow(clippy::module_inception)] 6 | mod vertex; 7 | pub use vertex::Vertex; 8 | 9 | mod dimension; 10 | pub use dimension::*; 11 | -------------------------------------------------------------------------------- /rust/diskann/src/model/vertex/vertex.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #![warn(missing_debug_implementations, missing_docs)] 6 | 7 | //! Vertex 8 | 9 | use std::array::TryFromSliceError; 10 | 11 | use vector::{FullPrecisionDistance, Metric}; 12 | 13 | /// Vertex with data type T and dimension N 14 | #[derive(Debug)] 15 | pub struct Vertex<'a, T, const N: usize> 16 | where 17 | [T; N]: FullPrecisionDistance, 18 | { 19 | /// Vertex value 20 | val: &'a [T; N], 21 | 22 | /// Vertex Id 23 | id: u32, 24 | } 25 | 26 | impl<'a, T, const N: usize> Vertex<'a, T, N> 27 | where 28 | [T; N]: FullPrecisionDistance, 29 | { 30 | /// Create the vertex with data 31 | pub fn new(val: &'a [T; N], id: u32) -> Self { 32 | Self { 33 | val, 34 | id, 35 | } 36 | } 37 | 38 | /// Compare the vertex with another. 39 | #[inline(always)] 40 | pub fn compare(&self, other: &Vertex<'a, T, N>, metric: Metric) -> f32 { 41 | <[T; N]>::distance_compare(self.val, other.val, metric) 42 | } 43 | 44 | /// Get the vector associated with the vertex. 45 | #[inline] 46 | pub fn vector(&self) -> &[T; N] { 47 | self.val 48 | } 49 | 50 | /// Get the vertex id. 
51 | #[inline] 52 | pub fn vertex_id(&self) -> u32 { 53 | self.id 54 | } 55 | } 56 | 57 | impl<'a, T, const N: usize> TryFrom<(&'a [T], u32)> for Vertex<'a, T, N> 58 | where 59 | [T; N]: FullPrecisionDistance, 60 | { 61 | type Error = TryFromSliceError; 62 | 63 | fn try_from((mem_slice, id): (&'a [T], u32)) -> Result { 64 | let array: &[T; N] = mem_slice.try_into()?; 65 | Ok(Vertex::new(array, id)) 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /rust/diskann/src/model/windows_aligned_file_reader/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #[allow(clippy::module_inception)] 6 | mod windows_aligned_file_reader; 7 | pub use windows_aligned_file_reader::*; 8 | -------------------------------------------------------------------------------- /rust/diskann/src/storage/disk_graph_storage.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #![warn(missing_docs)] 6 | 7 | //! 
Disk graph storage 8 | 9 | use std::sync::Arc; 10 | 11 | use crate::{model::{WindowsAlignedFileReader, IOContext, AlignedRead}, common::ANNResult}; 12 | 13 | /// Graph storage for disk index 14 | /// One thread has one storage instance 15 | pub struct DiskGraphStorage { 16 | /// Disk graph reader 17 | disk_graph_reader: Arc, 18 | 19 | /// IOContext of current thread 20 | ctx: Arc, 21 | } 22 | 23 | impl DiskGraphStorage { 24 | /// Create a new DiskGraphStorage instance 25 | pub fn new(disk_graph_reader: Arc) -> ANNResult { 26 | let ctx = disk_graph_reader.get_ctx()?; 27 | Ok(Self { 28 | disk_graph_reader, 29 | ctx, 30 | }) 31 | } 32 | 33 | /// Read disk graph data 34 | pub fn read(&self, read_requests: &mut [AlignedRead]) -> ANNResult<()> { 35 | self.disk_graph_reader.read(read_requests, &self.ctx) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /rust/diskann/src/storage/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | mod disk_index_storage; 6 | pub use disk_index_storage::*; 7 | 8 | mod disk_graph_storage; 9 | pub use disk_graph_storage::*; 10 | 11 | mod pq_storage; 12 | pub use pq_storage::*; 13 | -------------------------------------------------------------------------------- /rust/diskann/src/test_utils/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | pub mod inmem_index_initialization; 6 | 7 | /// test files should be placed under tests folder 8 | pub fn get_test_file_path(relative_path: &str) -> String { 9 | format!("{}/{}", env!("CARGO_MANIFEST_DIR"), relative_path) 10 | } 11 | 12 | -------------------------------------------------------------------------------- /rust/diskann/src/utils/bit_vec_extension.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | use std::cmp::Ordering; 6 | 7 | use bit_vec::BitVec; 8 | 9 | pub trait BitVecExtension { 10 | fn resize(&mut self, new_len: usize, value: bool); 11 | } 12 | 13 | impl BitVecExtension for BitVec { 14 | fn resize(&mut self, new_len: usize, value: bool) { 15 | let old_len = self.len(); 16 | match new_len.cmp(&old_len) { 17 | Ordering::Less => self.truncate(new_len), 18 | Ordering::Greater => self.grow(new_len - old_len, value), 19 | Ordering::Equal => {} 20 | } 21 | } 22 | } 23 | 24 | #[cfg(test)] 25 | mod bit_vec_extension_test { 26 | use super::*; 27 | 28 | #[test] 29 | fn resize_test() { 30 | let mut bitset = BitVec::new(); 31 | 32 | bitset.resize(10, false); 33 | assert_eq!(bitset.len(), 10); 34 | assert!(bitset.none()); 35 | 36 | bitset.resize(11, true); 37 | assert_eq!(bitset.len(), 11); 38 | assert!(bitset[10]); 39 | 40 | bitset.resize(5, false); 41 | assert_eq!(bitset.len(), 5); 42 | assert!(bitset.none()); 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /rust/diskann/src/utils/hashset_u32.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | /* 6 | * Copyright (c) Microsoft Corporation. All rights reserved. 7 | * Licensed under the MIT license. 
8 | */ 9 | use hashbrown::HashSet; 10 | use std::{hash::BuildHasherDefault, ops::{Deref, DerefMut}}; 11 | use fxhash::FxHasher; 12 | 13 | lazy_static::lazy_static! { 14 | /// Singleton hasher. 15 | static ref HASHER: BuildHasherDefault = { 16 | BuildHasherDefault::::default() 17 | }; 18 | } 19 | 20 | pub struct HashSetForU32 { 21 | hashset: HashSet::>, 22 | } 23 | 24 | impl HashSetForU32 { 25 | pub fn with_capacity(capacity: usize) -> HashSetForU32 { 26 | let hashset = HashSet::>::with_capacity_and_hasher(capacity, HASHER.clone()); 27 | HashSetForU32 { 28 | hashset 29 | } 30 | } 31 | } 32 | 33 | impl Deref for HashSetForU32 { 34 | type Target = HashSet::>; 35 | 36 | fn deref(&self) -> &Self::Target { 37 | &self.hashset 38 | } 39 | } 40 | 41 | impl DerefMut for HashSetForU32 { 42 | fn deref_mut(&mut self) -> &mut Self::Target { 43 | &mut self.hashset 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /rust/diskann/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | pub mod file_util; 6 | pub use file_util::*; 7 | 8 | #[allow(clippy::module_inception)] 9 | pub mod utils; 10 | pub use utils::*; 11 | 12 | pub mod bit_vec_extension; 13 | pub use bit_vec_extension::*; 14 | 15 | pub mod rayon_util; 16 | pub use rayon_util::*; 17 | 18 | pub mod timer; 19 | pub use timer::*; 20 | 21 | pub mod cached_reader; 22 | pub use cached_reader::*; 23 | 24 | pub mod cached_writer; 25 | pub use cached_writer::*; 26 | 27 | pub mod partition; 28 | pub use partition::*; 29 | 30 | pub mod math_util; 31 | pub use math_util::*; 32 | 33 | pub mod kmeans; 34 | pub use kmeans::*; 35 | -------------------------------------------------------------------------------- /rust/diskann/src/utils/rayon_util.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */
use std::ops::Range;
use rayon::prelude::{IntoParallelIterator, ParallelIterator};

use crate::common::ANNResult;

/// Run `f` for every index in `range`, serially or in parallel with Rayon.
///
/// * `num_threads == 1`: `f` is called in order on the calling thread and the
///   first error stops the loop and is returned.
/// * any other value: the range is turned into a Rayon parallel iterator and
///   driven with `try_for_each`. `num_threads` is only compared against 1; it
///   does not size a thread pool here.
#[inline]
pub fn execute_with_rayon<F>(range: Range<usize>, num_threads: u32, f: F) -> ANNResult<()>
where F: Fn(usize) -> ANNResult<()> + Sync + Send + Copy
{
    if num_threads == 1 {
        for i in range {
            f(i)?;
        }
        Ok(())
    } else {
        range.into_par_iter().try_for_each(f)
    }
}

/// set the thread count of Rayon, otherwise it will use threads as many as logical cores.
26 | #[inline] 27 | pub fn set_rayon_num_threads(num_threads: u32) { 28 | std::env::set_var( 29 | "RAYON_NUM_THREADS", 30 | num_threads.to_string(), 31 | ); 32 | } 33 | 34 | -------------------------------------------------------------------------------- /rust/diskann/tests/data/delete_set_50pts.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/delete_set_50pts.bin -------------------------------------------------------------------------------- /rust/diskann/tests/data/disk_index_node_data_aligned_reader_truth.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/disk_index_node_data_aligned_reader_truth.bin -------------------------------------------------------------------------------- /rust/diskann/tests/data/disk_index_siftsmall_learn_256pts_R4_L50_A1.2_alligned_reader_test.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/disk_index_siftsmall_learn_256pts_R4_L50_A1.2_alligned_reader_test.index -------------------------------------------------------------------------------- /rust/diskann/tests/data/disk_index_siftsmall_learn_256pts_R4_L50_A1.2_disk.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/disk_index_siftsmall_learn_256pts_R4_L50_A1.2_disk.index -------------------------------------------------------------------------------- /rust/diskann/tests/data/disk_index_siftsmall_learn_256pts_R4_L50_A1.2_mem.index: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/disk_index_siftsmall_learn_256pts_R4_L50_A1.2_mem.index -------------------------------------------------------------------------------- /rust/diskann/tests/data/siftsmall_learn.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/siftsmall_learn.bin -------------------------------------------------------------------------------- /rust/diskann/tests/data/siftsmall_learn.bin_pq_compressed.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/siftsmall_learn.bin_pq_compressed.bin -------------------------------------------------------------------------------- /rust/diskann/tests/data/siftsmall_learn.bin_pq_pivots.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/siftsmall_learn.bin_pq_pivots.bin -------------------------------------------------------------------------------- /rust/diskann/tests/data/siftsmall_learn_256pts.fbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/siftsmall_learn_256pts.fbin -------------------------------------------------------------------------------- /rust/diskann/tests/data/siftsmall_learn_256pts_2.fbin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/siftsmall_learn_256pts_2.fbin -------------------------------------------------------------------------------- /rust/diskann/tests/data/truth_disk_index_siftsmall_learn_256pts_R4_L50_A1.2_disk.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/truth_disk_index_siftsmall_learn_256pts_R4_L50_A1.2_disk.index -------------------------------------------------------------------------------- /rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_1+2_R4_L50_A1.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_1+2_R4_L50_A1.2 -------------------------------------------------------------------------------- /rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_1+2_saturated_R4_L50_A1.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_1+2_saturated_R4_L50_A1.2 -------------------------------------------------------------------------------- /rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_R4_L50_A1.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_R4_L50_A1.2 -------------------------------------------------------------------------------- /rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_R4_L50_A1.2.data: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/DiskANN/1f9b79c16e43181be95ad0346706e6d9080b35f9/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_R4_L50_A1.2.data -------------------------------------------------------------------------------- /rust/logger/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | [package] 4 | name = "logger" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | lazy_static = "1.4.0" 12 | log="0.4.17" 13 | once_cell = "1.17.1" 14 | prost = "0.11.9" 15 | prost-types = "0.11.9" 16 | thiserror = "1.0.40" 17 | win_etw_macros="0.1.8" 18 | win_etw_provider="0.1.8" 19 | 20 | [build-dependencies] 21 | prost-build = "0.11.9" 22 | 23 | [[example]] 24 | name="trace_example" 25 | path= "src/examples/trace_example.rs" 26 | 27 | [target."cfg(target_os=\"windows\")".build-dependencies.vcpkg] 28 | version = "0.2" 29 | 30 | -------------------------------------------------------------------------------- /rust/logger/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */
use std::env;

extern crate prost_build;

/// Build script: locate the vcpkg `protobuf` package, export `PROTOC` and
/// `PROTOC_INCLUDE` from it, then compile `src/indexlog.proto` with prost.
fn main() {
    let protobuf_pkg = vcpkg::find_package("protobuf").unwrap();
    // Parent directory of the first link path reported by vcpkg.
    let protobuf_root = protobuf_pkg.link_paths[0].parent().unwrap();

    let protoc_exe = protobuf_root.join("tools").join("protobuf").join("protoc.exe");
    env::set_var("PROTOC", protoc_exe.to_str().unwrap().to_string());

    let include_dir = protobuf_root.join("include").join("google").join("protobuf");
    env::set_var("PROTOC_INCLUDE", include_dir.to_str().unwrap().to_string());

    prost_build::compile_protos(&["src/indexlog.proto"], &["src/"]).unwrap();
}

-------------------------------------------------------------------------------- /rust/logger/src/error_logger.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */
use crate::log_error::LogError;
use crate::logger::indexlog::{ErrorLog, Log, LogLevel};
use crate::message_handler::send_log;

/// Wrap `error_message` in an `ErrorLog` at `LogLevel::Error` and pass the
/// resulting `Log` to `send_log`, returning its result.
pub fn log_error(error_message: String) -> Result<(), LogError> {
    let log = Log {
        error_log: Some(ErrorLog {
            log_level: LogLevel::Error as i32,
            error_message,
        }),
        ..Log::default()
    };

    send_log(log)
}

#[cfg(test)]
mod error_logger_test {
    use super::*;

    #[test]
    fn log_error_works() {
        log_error(String::from("Error")).unwrap();
    }
}

-------------------------------------------------------------------------------- /rust/logger/src/examples/trace_example.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license.
4 | */ 5 | use log::{debug, info, log_enabled, warn, Level}; 6 | use logger::trace_logger::TraceLogger; 7 | 8 | // cargo run --example trace_example 9 | 10 | fn main() { 11 | static LOGGER: TraceLogger = TraceLogger {}; 12 | log::set_logger(&LOGGER) 13 | .map(|()| log::set_max_level(log::LevelFilter::Trace)) 14 | .unwrap(); 15 | 16 | info!("Rust logging n = {}", 42); 17 | warn!("This is too much fun!"); 18 | debug!("Maybe we can make this code work"); 19 | 20 | let error_is_enabled = log_enabled!(Level::Error); 21 | let warn_is_enabled = log_enabled!(Level::Warn); 22 | let info_is_enabled = log_enabled!(Level::Info); 23 | let debug_is_enabled = log_enabled!(Level::Debug); 24 | let trace_is_enabled = log_enabled!(Level::Trace); 25 | println!( 26 | "is_enabled? error: {:5?}, warn: {:5?}, info: {:5?}, debug: {:5?}, trace: {:5?}", 27 | error_is_enabled, warn_is_enabled, info_is_enabled, debug_is_enabled, trace_is_enabled, 28 | ); 29 | } 30 | 31 | -------------------------------------------------------------------------------- /rust/logger/src/indexlog.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package diskann_logger; 4 | 5 | message Log { 6 | IndexConstructionLog IndexConstructionLog = 1; 7 | DiskIndexConstructionLog DiskIndexConstructionLog = 2; 8 | ErrorLog ErrorLog = 3; 9 | TraceLog TraceLog = 100; 10 | } 11 | 12 | enum LogLevel { 13 | UNSPECIFIED = 0; 14 | Error = 1; 15 | Warn = 2; 16 | Info = 3; 17 | Debug = 4; 18 | Trace = 5; 19 | } 20 | 21 | message IndexConstructionLog { 22 | float PercentageComplete = 1; 23 | float TimeSpentInSeconds = 2; 24 | float GCyclesSpent = 3; 25 | LogLevel LogLevel = 4; 26 | } 27 | 28 | message DiskIndexConstructionLog { 29 | DiskIndexConstructionCheckpoint checkpoint = 1; 30 | float TimeSpentInSeconds = 2; 31 | float GCyclesSpent = 3; 32 | LogLevel LogLevel = 4; 33 | } 34 | 35 | enum DiskIndexConstructionCheckpoint { 36 | None = 0; 37 | PqConstruction = 1; 38 
| InmemIndexBuild = 2; 39 | DiskLayout = 3; 40 | } 41 | 42 | message TraceLog { 43 | string LogLine = 1; 44 | LogLevel LogLevel = 2; 45 | } 46 | 47 | message ErrorLog { 48 | string ErrorMessage = 1; 49 | LogLevel LogLevel = 2; 50 | } -------------------------------------------------------------------------------- /rust/logger/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #![cfg_attr( 6 | not(test), 7 | warn(clippy::panic, clippy::unwrap_used, clippy::expect_used) 8 | )] 9 | 10 | pub mod logger { 11 | pub mod indexlog { 12 | include!(concat!(env!("OUT_DIR"), "/diskann_logger.rs")); 13 | } 14 | } 15 | 16 | pub mod error_logger; 17 | pub mod log_error; 18 | pub mod message_handler; 19 | pub mod trace_logger; 20 | -------------------------------------------------------------------------------- /rust/logger/src/log_error.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | use std::sync::mpsc::SendError; 6 | 7 | use crate::logger::indexlog::Log; 8 | 9 | #[derive(thiserror::Error, Debug, Clone)] 10 | pub enum LogError { 11 | /// Sender failed to send message to the channel 12 | #[error("IOError: {err}")] 13 | SendError { 14 | #[from] 15 | err: SendError, 16 | }, 17 | 18 | /// PoisonError which can be returned whenever a lock is acquired 19 | /// Both Mutexes and RwLocks are poisoned whenever a thread fails while the lock is held 20 | #[error("LockPoisonError: {err}")] 21 | LockPoisonError { err: String }, 22 | 23 | /// Failed to create EtwPublisher 24 | #[error("EtwProviderError: {err:?}")] 25 | ETWProviderError { err: win_etw_provider::Error }, 26 | } 27 | 28 | -------------------------------------------------------------------------------- /rust/logger/src/trace_logger.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */
use crate::logger::indexlog::{Log, TraceLog};
use crate::message_handler::send_log;

use log;

/// `log::Log` implementation that forwards every record to `send_log` as a
/// `TraceLog` message.
pub struct TraceLogger {}

/// Numeric level stored in `TraceLog::log_level` (Error = 1 … Trace = 5).
fn level_to_i32(value: log::Level) -> i32 {
    use log::Level::{Debug, Error, Info, Trace, Warn};

    match value {
        Error => 1,
        Warn => 2,
        Info => 3,
        Debug => 4,
        Trace => 5,
    }
}

impl log::Log for TraceLogger {
    /// A record is enabled when its level does not exceed `log::max_level()`.
    fn enabled(&self, metadata: &log::Metadata) -> bool {
        metadata.level() <= log::max_level()
    }

    /// Convert the record into a `Log` carrying a `TraceLog` and send it.
    fn log(&self, record: &log::Record) {
        let log_line = record.args().to_string();
        let log_level = level_to_i32(record.metadata().level());

        let entry = Log {
            trace_log: Some(TraceLog {
                log_line,
                log_level,
            }),
            ..Log::default()
        };

        // The send result is intentionally discarded.
        let _ = send_log(entry);
    }

    fn flush(&self) {}
}

-------------------------------------------------------------------------------- /rust/platform/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | [package] 4 | name = "platform" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | log="0.4.18" 12 | winapi = { version = "0.3.9", features = ["errhandlingapi", "fileapi", "ioapiset", "handleapi", "winnt", "minwindef", "basetsd", "winerror", "winbase"] } 13 | 14 | -------------------------------------------------------------------------------- /rust/platform/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license.
4 | */ 5 | #![cfg_attr( 6 | not(test), 7 | warn(clippy::panic, clippy::unwrap_used, clippy::expect_used) 8 | )] 9 | 10 | pub mod perf; 11 | pub use perf::{get_process_cycle_time, get_process_handle}; 12 | 13 | pub mod file_io; 14 | pub use file_io::{get_queued_completion_status, read_file_to_slice}; 15 | 16 | pub mod file_handle; 17 | pub use file_handle::FileHandle; 18 | 19 | pub mod io_completion_port; 20 | pub use io_completion_port::IOCompletionPort; 21 | -------------------------------------------------------------------------------- /rust/platform/src/perf.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */
#[cfg(target_os = "windows")]
#[link(name = "kernel32")]
extern "system" {
    // Win32 `BOOL` is a 32-bit integer, not a one-byte Rust `bool`, so BOOL
    // parameters and return values are declared as `i32` (non-zero = TRUE).
    fn OpenProcess(dwDesiredAccess: u32, bInheritHandle: i32, dwProcessId: u32) -> usize;
    fn QueryProcessCycleTime(hProcess: usize, lpCycleTime: *mut u64) -> i32;
    fn GetCurrentProcessId() -> u32;
}

/// Get current process handle.
///
/// On Windows this opens the current process with
/// `PROCESS_QUERY_INFORMATION | PROCESS_VM_READ` and returns `None` when
/// `OpenProcess` yields a null handle. On every other target it returns `None`.
///
/// NOTE: nothing in this module closes the returned handle.
pub fn get_process_handle() -> Option<usize> {
    open_current_process()
}

/// Cycle time of the process behind `process_handle`.
///
/// Returns `None` when no handle is given, when `QueryProcessCycleTime`
/// reports failure, or on non-Windows targets.
pub fn get_process_cycle_time(process_handle: Option<usize>) -> Option<u64> {
    process_handle.and_then(query_cycle_time)
}

// The platform split uses `#[cfg]` items rather than `if cfg!(..)`: `cfg!` only
// produces a boolean, so the calls to the Windows-only externs would still have
// to compile on other targets, where those externs are not declared.

/// Windows: open a query handle on the current process.
#[cfg(target_os = "windows")]
fn open_current_process() -> Option<usize> {
    const PROCESS_QUERY_INFORMATION: u32 = 0x0400;
    const PROCESS_VM_READ: u32 = 0x0010;

    // bInheritHandle = 0 (FALSE).
    let handle = unsafe {
        let current_process_id = GetCurrentProcessId();
        OpenProcess(
            PROCESS_QUERY_INFORMATION | PROCESS_VM_READ,
            0,
            current_process_id,
        )
    };

    if handle == 0 {
        None
    } else {
        Some(handle)
    }
}

/// Other targets: no process handle is available.
#[cfg(not(target_os = "windows"))]
fn open_current_process() -> Option<usize> {
    None
}

/// Windows: query the cycle time for `handle`.
#[cfg(target_os = "windows")]
fn query_cycle_time(handle: usize) -> Option<u64> {
    let mut cycle_time: u64 = 0;
    let succeeded = unsafe { QueryProcessCycleTime(handle, &mut cycle_time as *mut u64) };

    if succeeded != 0 {
        Some(cycle_time)
    } else {
        None
    }
}

/// Other targets: cycle time is not available.
#[cfg(not(target_os = "windows"))]
fn query_cycle_time(_handle: usize) -> Option<u64> {
    None
}

-------------------------------------------------------------------------------- /rust/project.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "."
5 | } 6 | ], 7 | "settings": { 8 | "search.exclude": { 9 | "target": true, 10 | }, 11 | "files.exclude": { 12 | "target": true, 13 | }, 14 | "rust-analyzer.linkedProjects": [ 15 | ".\\vector\\Cargo.toml", 16 | ".\\vector\\Cargo.toml", 17 | ".\\vector\\Cargo.toml", 18 | ".\\diskann\\Cargo.toml" 19 | ], 20 | "[rust]": { 21 | "editor.defaultFormatter": "rust-lang.rust-analyzer", 22 | "editor.formatOnSave": true, 23 | } 24 | }, 25 | "launch": { 26 | "version": "0.2.0", 27 | "configurations": [ 28 | { 29 | "name": "Build memory index", 30 | "type": "cppvsdbg", 31 | "request": "launch", 32 | "program": "${workspaceRoot}\\target\\debug\\build_memory_index.exe", 33 | "args": [ 34 | "--data_type", 35 | "float", 36 | "--dist_fn", 37 | "l2", 38 | "--data_path", 39 | ".\\base1m.fbin", 40 | "--index_path_prefix", 41 | ".\\rust_index_sift_base_R32_L50_A1.2_T1", 42 | "-R", 43 | "64", 44 | "-L", 45 | "100", 46 | "--alpha", 47 | "1.2", 48 | "-T", 49 | "1" 50 | ], 51 | "stopAtEntry": false, 52 | "cwd": "c:\\data", 53 | "environment": [], 54 | "externalConsole": true 55 | }, 56 | ] 57 | } 58 | } -------------------------------------------------------------------------------- /rust/readme.md: -------------------------------------------------------------------------------- 1 | 2 | # readme 3 | 4 | Run the commands below from the diskann_rust directory. 5 | 6 | build: 7 | ``` 8 | cargo build // Debug 9 | 10 | cargo build -r // Release 11 | ``` 12 | 13 | 14 | run: 15 | ``` 16 | cargo run // Debug 17 | 18 | cargo run -r // Release 19 | ``` 20 | 21 | 22 | test: 23 | ``` 24 | cargo test 25 | ``` 26 | -------------------------------------------------------------------------------- /rust/rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license.
3 | [toolchain] 4 | channel = "stable" 5 | -------------------------------------------------------------------------------- /rust/vector/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | [package] 4 | name = "vector" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | half = "2.2.1" 12 | thiserror = "1.0.40" 13 | bytemuck = "1.7.0" 14 | 15 | [build-dependencies] 16 | cc = "1.0.79" 17 | 18 | [dev-dependencies] 19 | base64 = "0.21.2" 20 | bincode = "1.3.3" 21 | serde = "1.0.163" 22 | approx = "0.5.1" 23 | rand = "0.8.5" 24 | 25 | -------------------------------------------------------------------------------- /rust/vector/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | fn main() { 6 | println!("cargo:rerun-if-changed=distance.c"); 7 | if cfg!(target_os = "macos") { 8 | std::env::set_var("CFLAGS", "-mavx2 -mfma -Wno-error -MP -O2 -D NDEBUG -D MKL_ILP64 -D USE_AVX2 -D USE_ACCELERATED_PQ -D NOMINMAX -D _TARGET_ARM_APPLE_DARWIN"); 9 | 10 | cc::Build::new() 11 | .file("distance.c") 12 | .warnings_into_errors(true) 13 | .debug(false) 14 | .target("x86_64-apple-darwin") 15 | .compile("nativefunctions.lib"); 16 | } else { 17 | std::env::set_var("CFLAGS", "/permissive- /MP /ifcOutput /GS- /W3 /Gy /Zi /Gm- /O2 /Ob2 /Zc:inline /fp:fast /D NDEBUG /D MKL_ILP64 /D USE_AVX2 /D USE_ACCELERATED_PQ /D NOMINMAX /fp:except- /errorReport:prompt /WX /openmp:experimental /Zc:forScope /GR /arch:AVX2 /Gd /Oy /Oi /MD /std:c++14 /FC /EHsc /nologo /Ot"); 18 | // std::env::set_var("CFLAGS", "/permissive- /MP /ifcOutput /GS- /W3 /Gy /Zi /Gm- /Obd /Zc:inline /fp:fast /D DEBUG /D MKL_ILP64 /D USE_AVX2 /D USE_ACCELERATED_PQ /D NOMINMAX /fp:except- /errorReport:prompt /WX /openmp:experimental /Zc:forScope /GR /arch:AVX512 /Gd /Oy /Oi /MD /std:c++14 /FC /EHsc /nologo /Ot"); 19 | 20 | cc::Build::new() 21 | .file("distance.c") 22 | .warnings_into_errors(true) 23 | .debug(false) 24 | .compile("nativefunctions"); 25 | 26 | println!("cargo:rustc-link-arg=nativefunctions.lib"); 27 | } 28 | } 29 | 30 | -------------------------------------------------------------------------------- /rust/vector/distance.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | inline __m256i load_128bit_to_256bit(const __m128i *ptr) 5 | { 6 | __m128i value128 = _mm_loadu_si128(ptr); 7 | __m256i value256 = _mm256_castsi128_si256(value128); 8 | return _mm256_inserti128_si256(value256, _mm_setzero_si128(), 1); 9 | } 10 | 11 | float distance_compare_avx512f_f16(const unsigned char *vec1, const unsigned char *vec2, size_t size) 12 | { 13 | __m512 sum_squared_diff = _mm512_setzero_ps(); 14 | 15 | for (int i = 0; i < size / 
16; i += 1) 16 | { 17 | __m512 v1 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(vec1 + i * 2 * 16))); 18 | __m512 v2 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(vec2 + i * 2 * 16))); 19 | 20 | __m512 diff = _mm512_sub_ps(v1, v2); 21 | sum_squared_diff = _mm512_fmadd_ps(diff, diff, sum_squared_diff); 22 | } 23 | 24 | size_t i = (size / 16) * 16; 25 | 26 | if (i != size) 27 | { 28 | __m512 va = _mm512_cvtph_ps(load_128bit_to_256bit((const __m128i *)(vec1 + i * 2))); 29 | __m512 vb = _mm512_cvtph_ps(load_128bit_to_256bit((const __m128i *)(vec2 + i * 2))); 30 | __m512 diff512 = _mm512_sub_ps(va, vb); 31 | sum_squared_diff = _mm512_fmadd_ps(diff512, diff512, sum_squared_diff); 32 | } 33 | 34 | return _mm512_reduce_add_ps(sum_squared_diff); 35 | } 36 | -------------------------------------------------------------------------------- /rust/vector/src/half.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | use bytemuck::{Pod, Zeroable}; 6 | use half::f16; 7 | use std::convert::AsRef; 8 | use std::fmt; 9 | 10 | // Define the Half type as a new type over f16. 11 | // the memory layout of the Half struct will be the same as the memory layout of the f16 type itself. 12 | // The Half struct serves as a simple wrapper around the f16 type and does not introduce any additional memory overhead. 
13 | // Test function: 14 | // use half::f16; 15 | // pub struct Half(f16); 16 | // fn main() { 17 | // let size_of_half = std::mem::size_of::<Half>(); 18 | // let alignment_of_half = std::mem::align_of::<Half>(); 19 | // println!("Size of Half: {} bytes", size_of_half); 20 | // println!("Alignment of Half: {} bytes", alignment_of_half); 21 | // } 22 | // Output: 23 | // Size of Half: 2 bytes 24 | // Alignment of Half: 2 bytes 25 | pub struct Half(f16); 26 | 27 | unsafe impl Pod for Half {} 28 | unsafe impl Zeroable for Half {} 29 | 30 | // Implement conversion from Half to f32. 31 | impl From for f32 { 32 | fn from(val: Half) -> Self { 33 | val.0.to_f32() 34 | } 35 | } 36 | 37 | // Implement AsRef for Half so that it can be used in distance_compare. 38 | impl AsRef for Half { 39 | fn as_ref(&self) -> &f16 { 40 | &self.0 41 | } 42 | } 43 | 44 | // Constructor: build a Half from an f32 value. 45 | impl Half { 46 | pub fn from_f32(value: f32) -> Self { 47 | Self(f16::from_f32(value)) 48 | } 49 | } 50 | 51 | // Implement Default for Half. 52 | impl Default for Half { 53 | fn default() -> Self { 54 | Self(f16::from_f32(Default::default())) 55 | } 56 | } 57 | 58 | // Implement Clone for Half. 59 | impl Clone for Half { 60 | fn clone(&self) -> Self { 61 | Half(self.0) 62 | } 63 | } 64 | 65 | // Implement Debug for Half. 66 | impl fmt::Debug for Half { 67 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 68 | write!(f, "Half({:?})", self.0) 69 | } 70 | } 71 | 72 | impl Copy for Half {} 73 | 74 | impl Half { 75 | pub fn to_f32(&self) -> f32 { 76 | self.0.to_f32() 77 | } 78 | } 79 | 80 | unsafe impl Send for Half {} 81 | unsafe impl Sync for Half {} 82 | 83 | -------------------------------------------------------------------------------- /rust/vector/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | #![cfg_attr( 6 | not(test), 7 | warn(clippy::panic, clippy::unwrap_used, clippy::expect_used) 8 | )] 9 | 10 | // #![feature(stdsimd)] 11 | // mod f32x16; 12 | // Uncomment above 2 to experiment with f32x16 13 | mod distance; 14 | mod half; 15 | mod l2_float_distance; 16 | mod metric; 17 | mod utils; 18 | 19 | pub use crate::half::Half; 20 | pub use distance::FullPrecisionDistance; 21 | pub use metric::Metric; 22 | pub use utils::prefetch_vector; 23 | 24 | #[cfg(test)] 25 | mod distance_test; 26 | mod test_util; 27 | -------------------------------------------------------------------------------- /rust/vector/src/metric.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | #![warn(missing_debug_implementations, missing_docs)] 6 | use std::str::FromStr; 7 | 8 | /// Distance metric 9 | #[derive(Debug, PartialEq, Eq, Clone, Copy)] 10 | pub enum Metric { 11 | /// Squared Euclidean (L2-Squared) 12 | L2, 13 | 14 | /// Cosine similarity 15 | /// TODO: T should be float for Cosine distance 16 | Cosine, 17 | } 18 | 19 | #[derive(thiserror::Error, Debug)] 20 | pub enum ParseMetricError { 21 | #[error("Invalid format for Metric: {0}")] 22 | InvalidFormat(String), 23 | } 24 | 25 | impl FromStr for Metric { 26 | type Err = ParseMetricError; 27 | 28 | fn from_str(s: &str) -> Result { 29 | match s.to_lowercase().as_str() { 30 | "l2" => Ok(Metric::L2), 31 | "cosine" => Ok(Metric::Cosine), 32 | _ => Err(ParseMetricError::InvalidFormat(String::from(s))), 33 | } 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /rust/vector/src/test_util.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | #[cfg(test)] 6 | use crate::Half; 7 | 8 | #[cfg(test)] 9 | pub fn no_vector_compare_f16(a: &[Half], b: &[Half]) -> f32 { 10 | let mut sum = 0.0; 11 | debug_assert_eq!(a.len(), b.len()); 12 | 13 | for i in 0..a.len() { 14 | sum += (a[i].to_f32() - b[i].to_f32()).powi(2); 15 | } 16 | sum 17 | } 18 | 19 | #[cfg(test)] 20 | pub fn no_vector_compare_f32(a: &[f32], b: &[f32]) -> f32 { 21 | let mut sum = 0.0; 22 | debug_assert_eq!(a.len(), b.len()); 23 | 24 | for i in 0..a.len() { 25 | sum += (a[i] - b[i]).powi(2); 26 | } 27 | sum 28 | } 29 | 30 | -------------------------------------------------------------------------------- /rust/vector/src/utils.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0}; 6 | 7 | /// Prefetch the given vector in chunks of 64 bytes, which is a cache line size 8 | /// NOTE: good efficiency when total_vec_size is integral multiple of 64 9 | #[inline] 10 | pub fn prefetch_vector(vec: &[T]) { 11 | let vec_ptr = vec.as_ptr() as *const i8; 12 | let vecsize = std::mem::size_of_val(vec); 13 | let max_prefetch_size = (vecsize / 64) * 64; 14 | 15 | for d in (0..max_prefetch_size).step_by(64) { 16 | unsafe { 17 | _mm_prefetch(vec_ptr.add(d), _MM_HINT_T0); 18 | } 19 | } 20 | } 21 | 22 | -------------------------------------------------------------------------------- /rust/vector_base64/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 
3 | [package] 4 | name = "vector_base64" 5 | version = "0.1.0" 6 | edition = "2021" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | base64 = "0.21.2" 12 | bincode = "1.3.3" 13 | half = "2.2.1" 14 | serde = "1.0.163" 15 | 16 | -------------------------------------------------------------------------------- /scripts/IndexParser/BinFileParser.py: -------------------------------------------------------------------------------- 1 | import parse_common 2 | import argparse 3 | 4 | def get_data_type_code(data_type_name): 5 | if data_type_name == "float": 6 | return ('f', 4) 7 | elif data_type_name == "int8": 8 | return ('b', 1) 9 | elif data_type_name == "uint8": 10 | return ('B', 1) 11 | else: 12 | raise Exception("Only float, int8 and uint8 are supported.") 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser(description="Parse a file in .bin format") 16 | parser.add_argument("filename", help="The vector/matrix file to parse") 17 | parser.add_argument("data_type", help="Type of data in the vector file. 
Only float, int8 and uint8 are supported.") 18 | parser.add_argument("output_file", help="The file to write the parsed data to") 19 | args = parser.parse_args() 20 | 21 | data_type_code, data_type_size = get_data_type_code(args.data_type) 22 | 23 | datamat = parse_common.DataMat(data_type_code, data_type_size) 24 | datamat.load_bin(args.filename) 25 | 26 | with open(args.output_file, "w") as out_file: 27 | for i in range(len(datamat)): 28 | out_file.write(str(datamat[i].tolist()) + "\n") 29 | 30 | print("Parsed " + str(len(datamat)) + " vectors from " + args.filename + " and wrote output to " + args.output_file) 31 | -------------------------------------------------------------------------------- /scripts/IndexParser/parse_pq.py: -------------------------------------------------------------------------------- 1 | import parse_common as pc 2 | 3 | def parse_compressed_vectors(file_prefix) : 4 | file_name = file_prefix + "_pq_compressed.bin" 5 | compressed_vectors = pc.DataMat('B', 1) 6 | compressed_vectors.load_bin(file_name) 7 | return compressed_vectors 8 | 9 | def parse_pivots_file(file_prefix): 10 | file_name = file_prefix + "_pq_pivots.bin" 11 | with open(file_name, "rb") as file: 12 | metadata_mat = pc.DataMat('Q', 8) 13 | metadata_mat.load_bin_from_opened_file(file) 14 | num_metadata = metadata_mat.num_rows 15 | num_dims = metadata_mat.num_cols 16 | assert num_dims == 1 and (num_metadata == 4 or num_metadata == 5) 17 | 18 | 19 | for i in range(num_metadata): 20 | for j in range(num_dims): 21 | print (metadata_mat[i][j]) 22 | print("\n") 23 | 24 | pivots = pc.DataMat('f', 4) 25 | pivots.load_bin_from_opened_file(file, metadata_mat[0][0]) 26 | assert pivots.num_rows == pc.NUM_PQ_CENTROIDS 27 | 28 | centroids = pc.DataMat('f', 4) 29 | centroids.load_bin_from_opened_file(file, metadata_mat[1][0]) 30 | assert centroids.num_rows == pivots.num_cols 31 | assert centroids.num_cols == 1 32 | 33 | #Assuming new file format =>(chunk offset is at offset 3) because we will 
not encounter old index formats now. 34 | chunk_offsets = pc.DataMat('I', 4) 35 | chunk_offsets.load_bin_from_opened_file(file, metadata_mat[2][0]) 36 | #assert chunk_offsets.num_rows == pivots.num_cols + 1 or chunk_offsets.num_rows == 0 37 | assert chunk_offsets.num_cols == 1 38 | #Ignoring rotmat for now. Also ignoring diskPQ 39 | 40 | return pivots, centroids, chunk_offsets 41 | -------------------------------------------------------------------------------- /scripts/dev/install-dev-deps-ubuntu.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEBIAN_FRONTEND=noninteractive apt install -y cmake \ 4 | g++ \ 5 | libaio-dev \ 6 | libgoogle-perftools-dev \ 7 | libunwind-dev \ 8 | clang-format \ 9 | libboost-dev \ 10 | libboost-program-options-dev \ 11 | libboost-test-dev \ 12 | libmkl-full-dev -------------------------------------------------------------------------------- /scripts/perf/Dockerfile: -------------------------------------------------------------------------------- 1 | #Copyright(c) Microsoft Corporation.All rights reserved. 2 | #Licensed under the MIT license. 3 | 4 | FROM ubuntu:jammy 5 | 6 | # Can be provided at build to point to a specific commit-ish, by default builds from HEAD 7 | ARG GIT_COMMIT_ISH=HEAD 8 | 9 | RUN apt update 10 | RUN apt install -y software-properties-common 11 | RUN add-apt-repository -y ppa:git-core/ppa 12 | RUN apt update 13 | RUN DEBIAN_FRONTEND=noninteractive apt install -y git time 14 | 15 | COPY dev/install-dev-deps-ubuntu.bash /app/fallback/install-dev-deps-ubuntu.bash 16 | WORKDIR /app 17 | RUN git clone https://github.com/microsoft/DiskANN.git 18 | WORKDIR /app/DiskANN 19 | RUN git checkout $GIT_COMMIT_ISH 20 | 21 | # we would prefer to use the deps requested at the same commit. if the script doesn't exist we'll use the current one. 
22 | RUN bash scripts/dev/install-dev-deps-ubuntu.bash || bash /app/fallback/install-dev-deps-ubuntu.bash 23 | 24 | RUN mkdir build 25 | RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DUNIT_TEST=True 26 | RUN cmake --build build -- -j 27 | 28 | RUN mkdir /app/logs 29 | COPY perf/perf_test.sh /app/DiskANN/perf_test.sh 30 | 31 | ENTRYPOINT bash perf_test.sh 32 | -------------------------------------------------------------------------------- /scripts/perf/README.md: -------------------------------------------------------------------------------- 1 | # Performance Tests 2 | 3 | The bash scripts in this folder are responsible for running a suite of performance 4 | tests. 5 | 6 | The timing and recall metrics reported by these tests when run periodically can then 7 | be used to identify performance improvements or regressions as 8 | development continues. 9 | 10 | ## Usage 11 | 12 | `docker build` must be run with the context directory set to `scripts`, but the Dockerfile set to `scripts/perf/Dockerfile` as in: 13 | ```bash 14 | docker build [--build-arg GIT_COMMIT_ISH=<commit-ish>] -f scripts/perf/Dockerfile scripts 15 | ``` 16 | 17 | We prefer to install the dependencies from the commit-ish that we're building against, but as the deps were not stored 18 | in a known file in all commits, we will fall back to the one currently in HEAD if one is not found already. 19 | 20 | The `--build-arg GIT_COMMIT_ISH=<commit-ish>` is optional, with a default value of HEAD if not otherwise specified. 21 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #Copyright(c) Microsoft Corporation.All rights reserved. 2 | #Licensed under the MIT license. 
3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_COMPILE_WARNING_AS_ERROR ON) 6 | 7 | if(MSVC) 8 | add_subdirectory(dll) 9 | else() 10 | #file(GLOB CPP_SOURCES *.cpp) 11 | set(CPP_SOURCES abstract_data_store.cpp ann_exception.cpp disk_utils.cpp 12 | distance.cpp index.cpp in_mem_graph_store.cpp in_mem_data_store.cpp 13 | linux_aligned_file_reader.cpp math_utils.cpp natural_number_map.cpp 14 | in_mem_data_store.cpp in_mem_graph_store.cpp 15 | natural_number_set.cpp memory_mapper.cpp partition.cpp pq.cpp 16 | pq_flash_index.cpp scratch.cpp logger.cpp utils.cpp filter_utils.cpp index_factory.cpp abstract_index.cpp pq_l2_distance.cpp pq_data_store.cpp) 17 | if (RESTAPI) 18 | list(APPEND CPP_SOURCES restapi/search_wrapper.cpp restapi/server.cpp) 19 | endif() 20 | add_library(${PROJECT_NAME} ${CPP_SOURCES}) 21 | add_library(${PROJECT_NAME}_s STATIC ${CPP_SOURCES}) 22 | endif() 23 | 24 | if (NOT MSVC) 25 | install(TARGETS ${PROJECT_NAME} LIBRARY) 26 | endif() 27 | -------------------------------------------------------------------------------- /src/abstract_data_store.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include "abstract_data_store.h" 6 | 7 | namespace diskann 8 | { 9 | 10 | template 11 | AbstractDataStore::AbstractDataStore(const location_t capacity, const size_t dim) 12 | : _capacity(capacity), _dim(dim) 13 | { 14 | } 15 | 16 | template location_t AbstractDataStore::capacity() const 17 | { 18 | return _capacity; 19 | } 20 | 21 | template size_t AbstractDataStore::get_dims() const 22 | { 23 | return _dim; 24 | } 25 | 26 | template location_t AbstractDataStore::resize(const location_t new_num_points) 27 | { 28 | if (new_num_points > _capacity) 29 | { 30 | return expand(new_num_points); 31 | } 32 | else if (new_num_points < _capacity) 33 | { 34 | return shrink(new_num_points); 35 | } 36 | else 37 | { 38 | return _capacity; 39 | } 40 | } 41 | 42 | template DISKANN_DLLEXPORT class AbstractDataStore; 43 | template DISKANN_DLLEXPORT class AbstractDataStore; 44 | template DISKANN_DLLEXPORT class AbstractDataStore; 45 | } // namespace diskann 46 | -------------------------------------------------------------------------------- /src/ann_exception.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include "ann_exception.h" 5 | #include 6 | #include 7 | 8 | namespace diskann 9 | { 10 | ANNException::ANNException(const std::string &message, int errorCode) 11 | : std::runtime_error(message), _errorCode(errorCode) 12 | { 13 | } 14 | 15 | std::string package_string(const std::string &item_name, const std::string &item_val) 16 | { 17 | return std::string("[") + item_name + ": " + std::string(item_val) + std::string("]"); 18 | } 19 | 20 | ANNException::ANNException(const std::string &message, int errorCode, const std::string &funcSig, 21 | const std::string &fileName, uint32_t lineNum) 22 | : ANNException(package_string(std::string("FUNC"), funcSig) + package_string(std::string("FILE"), fileName) + 23 | package_string(std::string("LINE"), std::to_string(lineNum)) + " " + message, 24 | errorCode) 25 | { 26 | } 27 | 28 | FileException::FileException(const std::string &filename, std::system_error &e, const std::string &funcSig, 29 | const std::string &fileName, uint32_t lineNum) 30 | : ANNException(std::string(" While opening file \'") + filename + std::string("\', error code: ") + 31 | std::to_string(e.code().value()) + " " + e.code().message(), 32 | e.code().value(), funcSig, fileName, lineNum) 33 | { 34 | } 35 | 36 | } // namespace diskann -------------------------------------------------------------------------------- /src/dll/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #Copyright(c) Microsoft Corporation.All rights reserved. 2 | #Licensed under the MIT license. 
3 | 4 | add_library(${PROJECT_NAME} SHARED dllmain.cpp ../abstract_data_store.cpp ../partition.cpp ../pq.cpp ../pq_flash_index.cpp ../logger.cpp ../utils.cpp 5 | ../windows_aligned_file_reader.cpp ../distance.cpp ../pq_l2_distance.cpp ../memory_mapper.cpp ../index.cpp 6 | ../in_mem_data_store.cpp ../pq_data_store.cpp ../in_mem_graph_store.cpp ../math_utils.cpp ../disk_utils.cpp ../filter_utils.cpp 7 | ../ann_exception.cpp ../natural_number_set.cpp ../natural_number_map.cpp ../scratch.cpp ../index_factory.cpp ../abstract_index.cpp) 8 | 9 | set(TARGET_DIR "$<$:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}>$<$:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}>") 10 | 11 | set(DISKANN_DLL_IMPLIB "${TARGET_DIR}/${PROJECT_NAME}.lib") 12 | 13 | if (NOT PYBIND) 14 | target_compile_definitions(${PROJECT_NAME} PRIVATE DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS DISKANN_BUILD) 15 | endif() 16 | target_compile_definitions(${PROJECT_NAME} PRIVATE _USRDLL _WINDLL) 17 | target_compile_options(${PROJECT_NAME} PRIVATE /GL) 18 | target_include_directories(${PROJECT_NAME} PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES}) 19 | 20 | target_link_options(${PROJECT_NAME} PRIVATE /DLL /IMPLIB:${DISKANN_DLL_IMPLIB} /LTCG) 21 | target_link_libraries(${PROJECT_NAME} PRIVATE ${DISKANN_MKL_LINK_LIBRARIES}) 22 | target_link_libraries(${PROJECT_NAME} PRIVATE synchronization.lib) 23 | 24 | if (DISKANN_DLL_TCMALLOC_LINK_OPTIONS) 25 | target_link_libraries(${PROJECT_NAME} PUBLIC ${DISKANN_DLL_TCMALLOC_LINK_OPTIONS}) 26 | endif() 27 | 28 | # Copy OpenMP DLL and PDB. 
29 | set(RUNTIME_FILES_TO_COPY ${OPENMP_WINDOWS_RUNTIME_FILES} ${TCMALLOC_WINDOWS_RUNTIME_FILES}) 30 | 31 | foreach(RUNTIME_FILE ${RUNTIME_FILES_TO_COPY}) 32 | add_custom_command(TARGET ${PROJECT_NAME} 33 | POST_BUILD 34 | COMMAND ${CMAKE_COMMAND} -E copy "${RUNTIME_FILE}" "${TARGET_DIR}") 35 | endforeach() -------------------------------------------------------------------------------- /src/dll/dllmain.cpp: -------------------------------------------------------------------------------- 1 | // dllmain.cpp : Defines the entry point for the DLL application. 2 | #include 3 | 4 | BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) 5 | { 6 | switch (ul_reason_for_call) 7 | { 8 | case DLL_PROCESS_ATTACH: 9 | case DLL_THREAD_ATTACH: 10 | case DLL_THREAD_DETACH: 11 | case DLL_PROCESS_DETACH: 12 | break; 13 | } 14 | return TRUE; 15 | } 16 | -------------------------------------------------------------------------------- /src/logger.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | #include 6 | 7 | #include "logger_impl.h" 8 | #include "windows_customizations.h" 9 | 10 | namespace diskann 11 | { 12 | 13 | #ifdef ENABLE_CUSTOM_LOGGER 14 | DISKANN_DLLEXPORT ANNStreamBuf coutBuff(stdout); 15 | DISKANN_DLLEXPORT ANNStreamBuf cerrBuff(stderr); 16 | 17 | DISKANN_DLLEXPORT std::basic_ostream cout(&coutBuff); 18 | DISKANN_DLLEXPORT std::basic_ostream cerr(&cerrBuff); 19 | std::function g_logger; 20 | 21 | void SetCustomLogger(std::function logger) 22 | { 23 | g_logger = logger; 24 | diskann::cout << "Set Custom Logger" << std::endl; 25 | } 26 | 27 | ANNStreamBuf::ANNStreamBuf(FILE *fp) 28 | { 29 | if (fp == nullptr) 30 | { 31 | throw diskann::ANNException("File pointer passed to ANNStreamBuf() cannot be null", -1); 32 | } 33 | if (fp != stdout && fp != stderr) 34 | { 35 | throw diskann::ANNException("The custom logger only supports stdout and stderr.", -1); 36 | } 37 | _fp = fp; 38 | _logLevel = (_fp == stdout) ? LogLevel::LL_Info : LogLevel::LL_Error; 39 | _buf = new char[BUFFER_SIZE + 1]; // See comment in the header 40 | 41 | std::memset(_buf, 0, (BUFFER_SIZE) * sizeof(char)); 42 | setp(_buf, _buf + BUFFER_SIZE - 1); 43 | } 44 | 45 | ANNStreamBuf::~ANNStreamBuf() 46 | { 47 | sync(); 48 | _fp = nullptr; // we'll not close because we can't. 
49 | delete[] _buf; 50 | } 51 | 52 | int ANNStreamBuf::overflow(int c) 53 | { 54 | std::lock_guard lock(_mutex); 55 | if (c != EOF) 56 | { 57 | *pptr() = (char)c; 58 | pbump(1); 59 | } 60 | flush(); 61 | return c; 62 | } 63 | 64 | int ANNStreamBuf::sync() 65 | { 66 | std::lock_guard lock(_mutex); 67 | flush(); 68 | return 0; 69 | } 70 | 71 | int ANNStreamBuf::underflow() 72 | { 73 | throw diskann::ANNException("Attempt to read on streambuf meant only for writing.", -1); 74 | } 75 | 76 | int ANNStreamBuf::flush() 77 | { 78 | const int num = (int)(pptr() - pbase()); 79 | logImpl(pbase(), num); 80 | pbump(-num); 81 | return num; 82 | } 83 | void ANNStreamBuf::logImpl(char *str, int num) 84 | { 85 | str[num] = '\0'; // Safe. See the c'tor. 86 | // Invoke the OLS custom logging function. 87 | if (g_logger) 88 | { 89 | g_logger(_logLevel, str); 90 | } 91 | } 92 | #else 93 | using std::cerr; 94 | using std::cout; 95 | #endif 96 | 97 | } // namespace diskann 98 | -------------------------------------------------------------------------------- /src/natural_number_set.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | 6 | #include "ann_exception.h" 7 | #include "natural_number_set.h" 8 | 9 | namespace diskann 10 | { 11 | template 12 | natural_number_set::natural_number_set() : _values_bitset(std::make_unique>()) 13 | { 14 | } 15 | 16 | template bool natural_number_set::is_empty() const 17 | { 18 | return _values_vector.empty(); 19 | } 20 | 21 | template void natural_number_set::reserve(size_t count) 22 | { 23 | _values_vector.reserve(count); 24 | _values_bitset->reserve(count); 25 | } 26 | 27 | template void natural_number_set::insert(T id) 28 | { 29 | _values_vector.emplace_back(id); 30 | 31 | if (id >= _values_bitset->size()) 32 | _values_bitset->resize(static_cast(id) + 1); 33 | 34 | _values_bitset->set(id, true); 35 | } 36 | 37 | template T natural_number_set::pop_any() 38 | { 39 | if (_values_vector.empty()) 40 | { 41 | throw diskann::ANNException("No values available", -1, __FUNCSIG__, __FILE__, __LINE__); 42 | } 43 | 44 | const T id = _values_vector.back(); 45 | _values_vector.pop_back(); 46 | 47 | _values_bitset->set(id, false); 48 | 49 | return id; 50 | } 51 | 52 | template void natural_number_set::clear() 53 | { 54 | _values_vector.clear(); 55 | _values_bitset->clear(); 56 | } 57 | 58 | template size_t natural_number_set::size() const 59 | { 60 | return _values_vector.size(); 61 | } 62 | 63 | template bool natural_number_set::is_in_set(T id) const 64 | { 65 | return _values_bitset->test(id); 66 | } 67 | 68 | // Instantiate used templates. 69 | template class natural_number_set; 70 | } // namespace diskann 71 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 
3 | 4 | set(CMAKE_COMPILE_WARNING_AS_ERROR ON) 5 | 6 | find_package(Boost COMPONENTS unit_test_framework) 7 | 8 | # For Windows, fall back to nuget version if find_package didn't find it. 9 | if (MSVC AND NOT Boost_FOUND) 10 | set(DISKANN_BOOST_INCLUDE "${DISKANN_MSVC_PACKAGES}/boost/lib/native/include") 11 | # Multi-threaded static library. 12 | set(UNIT_TEST_FRAMEWORK_LIB_PATTERN "${DISKANN_MSVC_PACKAGES}/boost_unit_test_framework-vc${MSVC_TOOLSET_VERSION}/lib/native/libboost_unit_test_framework-vc${MSVC_TOOLSET_VERSION}-mt-x64-*.lib") 13 | file(GLOB DISKANN_BOOST_UNIT_TEST_FRAMEWORK_LIB ${UNIT_TEST_FRAMEWORK_LIB_PATTERN}) 14 | 15 | set(UNIT_TEST_FRAMEWORK_DLIB_PATTERN "${DISKANN_MSVC_PACKAGES}/boost_unit_test_framework-vc${MSVC_TOOLSET_VERSION}/lib/native/libboost_unit_test_framework-vc${MSVC_TOOLSET_VERSION}-mt-gd-x64-*.lib") 16 | file(GLOB DISKANN_BOOST_UNIT_TEST_FRAMEWORK_DLIB ${UNIT_TEST_FRAMEWORK_DLIB_PATTERN}) 17 | 18 | if (EXISTS ${DISKANN_BOOST_INCLUDE} AND EXISTS ${DISKANN_BOOST_UNIT_TEST_FRAMEWORK_LIB} AND EXISTS ${DISKANN_BOOST_UNIT_TEST_FRAMEWORK_DLIB}) 19 | set(Boost_FOUND ON) 20 | set(Boost_INCLUDE_DIR ${DISKANN_BOOST_INCLUDE}) 21 | add_library(Boost::unit_test_framework STATIC IMPORTED) 22 | set_target_properties(Boost::unit_test_framework PROPERTIES IMPORTED_LOCATION_RELEASE "${DISKANN_BOOST_UNIT_TEST_FRAMEWORK_LIB}") 23 | set_target_properties(Boost::unit_test_framework PROPERTIES IMPORTED_LOCATION_DEBUG "${DISKANN_BOOST_UNIT_TEST_FRAMEWORK_DLIB}") 24 | message(STATUS "Falling back to using Boost from the nuget package") 25 | else() 26 | message(WARNING "Couldn't find Boost. 
Was looking for ${DISKANN_BOOST_INCLUDE} and ${UNIT_TEST_FRAMEWORK_LIB_PATTERN}") 27 | endif() 28 | endif() 29 | 30 | if (NOT Boost_FOUND) 31 | message(FATAL_ERROR "Couldn't find Boost dependency") 32 | endif() 33 | 34 | 35 | set(DISKANN_UNIT_TEST_SOURCES main.cpp index_write_parameters_builder_tests.cpp) 36 | 37 | add_executable(${PROJECT_NAME}_unit_tests ${DISKANN_SOURCES} ${DISKANN_UNIT_TEST_SOURCES}) 38 | target_link_libraries(${PROJECT_NAME}_unit_tests ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::unit_test_framework) 39 | 40 | add_test(NAME ${PROJECT_NAME}_unit_tests COMMAND ${PROJECT_NAME}_unit_tests) 41 | 42 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Unit Test project 2 | 3 | This unit test project is based on the [boost unit test framework](https://www.boost.org/doc/libs/1_78_0/libs/test/doc/html/index.html). Below are the simple steps to add a new unit test; for more usage details, see the [boost unit test documentation](https://www.boost.org/doc/libs/1_78_0/libs/test/doc/html/index.html). 
4 | 5 | ## How to add unit test 6 | 7 | - Create new [BOOST_AUTO_TEST_SUITE](https://www.boost.org/doc/libs/1_78_0/libs/test/doc/html/boost_test/utf_reference/test_org_reference/test_org_boost_auto_test_suite.html) for each class in an individual cpp file 8 | 9 | - Add [BOOST_AUTO_TEST_CASE](https://www.boost.org/doc/libs/1_78_0/libs/test/doc/html/boost_test/utf_reference/test_org_reference/test_org_boost_auto_test_case.html) for each test case in the [BOOST_AUTO_TEST_SUITE](https://www.boost.org/doc/libs/1_78_0/libs/test/doc/html/boost_test/utf_reference/test_org_reference/test_org_boost_auto_test_suite.html) 10 | 11 | - Update the [CMakeLists.txt](CMakeLists.txt) file to add the new cpp file to the test project -------------------------------------------------------------------------------- /tests/index_write_parameters_builder_tests.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 
3 | 4 | #include 5 | 6 | #include "parameters.h" 7 | 8 | BOOST_AUTO_TEST_SUITE(IndexWriteParametersBuilder_tests) 9 | 10 | BOOST_AUTO_TEST_CASE(test_build) 11 | { 12 | uint32_t search_list_size = rand(); 13 | uint32_t max_degree = rand(); 14 | float alpha = (float)rand(); 15 | uint32_t filter_list_size = rand(); 16 | uint32_t max_occlusion_size = rand(); 17 | bool saturate_graph = true; 18 | 19 | diskann::IndexWriteParametersBuilder builder(search_list_size, max_degree); 20 | 21 | builder.with_alpha(alpha) 22 | .with_filter_list_size(filter_list_size) 23 | .with_max_occlusion_size(max_occlusion_size) 24 | .with_num_threads(0) 25 | .with_saturate_graph(saturate_graph); 26 | 27 | { 28 | auto parameters = builder.build(); 29 | 30 | BOOST_TEST(search_list_size == parameters.search_list_size); 31 | BOOST_TEST(max_degree == parameters.max_degree); 32 | BOOST_TEST(alpha == parameters.alpha); 33 | BOOST_TEST(filter_list_size == parameters.filter_list_size); 34 | BOOST_TEST(max_occlusion_size == parameters.max_occlusion_size); 35 | BOOST_TEST(saturate_graph == parameters.saturate_graph); 36 | 37 | BOOST_TEST(parameters.num_threads > (uint32_t)0); 38 | } 39 | 40 | { 41 | uint32_t num_threads = rand() + 1; 42 | saturate_graph = false; 43 | builder.with_num_threads(num_threads).with_saturate_graph(saturate_graph); 44 | 45 | auto parameters = builder.build(); 46 | 47 | BOOST_TEST(search_list_size == parameters.search_list_size); 48 | BOOST_TEST(max_degree == parameters.max_degree); 49 | BOOST_TEST(alpha == parameters.alpha); 50 | BOOST_TEST(filter_list_size == parameters.filter_list_size); 51 | BOOST_TEST(max_occlusion_size == parameters.max_occlusion_size); 52 | BOOST_TEST(saturate_graph == parameters.saturate_graph); 53 | 54 | BOOST_TEST(num_threads == parameters.num_threads); 55 | } 56 | } 57 | 58 | BOOST_AUTO_TEST_SUITE_END() 59 | -------------------------------------------------------------------------------- /tests/main.cpp: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT license. 3 | 4 | #define BOOST_TEST_MODULE diskann_unit_tests 5 | 6 | #include 7 | -------------------------------------------------------------------------------- /windows/packages.config.in: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /windows/packages_restapi.config.in: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | --------------------------------------------------------------------------------