├── .gitattributes
├── .github
    └── workflows
    │   ├── page_search.yml
    │   ├── pr-test.yml
    │   └── push-test.yml
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── NOTICE.txt
├── README.md
├── SECURITY.md
├── _clang-format
├── appendix
    └── Starling-appendix.pdf
├── include
    ├── aligned_file_reader.h
    ├── ann_exception.h
    ├── aux_utils.h
    ├── boost_dynamic_bitset_fwd.h
    ├── cached_io.h
    ├── common_includes.h
    ├── concurrent_queue.h
    ├── cosine_similarity.h
    ├── distance.h
    ├── exceptions.h
    ├── index.h
    ├── linux_aligned_file_reader.h
    ├── locking.h
    ├── logger.h
    ├── logger_impl.h
    ├── math_utils.h
    ├── memory_mapper.h
    ├── natural_number_map.h
    ├── natural_number_set.h
    ├── neighbor.h
    ├── parameters.h
    ├── partition_and_pq.h
    ├── percentile_stats.h
    ├── pq_flash_index.h
    ├── pq_flash_index_utils.h
    ├── pq_table.h
    ├── simd_utils.h
    ├── timer.h
    ├── tsl
    │   ├── robin_growth_policy.h
    │   ├── robin_hash.h
    │   ├── robin_map.h
    │   ├── robin_set.h
    │   ├── sparse_growth_policy.h
    │   ├── sparse_hash.h
    │   ├── sparse_map.h
    │   └── sparse_set.h
    ├── utils.h
    ├── windows_aligned_file_reader.h
    ├── windows_customizations.h
    └── windows_slim_lock.h
├── scripts
    ├── config_ci.sh
    ├── config_dataset.sh
    ├── config_sample.sh
    ├── multiple_runs.sh
    ├── run_benchmark.sh
    └── unset.sh
├── src
    ├── CMakeLists.txt
    ├── ann_exception.cpp
    ├── aux_utils.cpp
    ├── distance.cpp
    ├── dll
    │   ├── CMakeLists.txt
    │   └── dllmain.cpp
    ├── index.cpp
    ├── linux_aligned_file_reader.cpp
    ├── logger.cpp
    ├── math_utils.cpp
    ├── memory_mapper.cpp
    ├── natural_number_map.cpp
    ├── natural_number_set.cpp
    ├── page_search.cpp
    ├── partition_and_pq.cpp
    ├── pq_flash_index.cpp
    ├── range_search.cpp
    ├── utils.cpp
    ├── visit_freq.cpp
    └── windows_aligned_file_reader.cpp
├── tests
    ├── CMakeLists.txt
    ├── build_disk_index.cpp
    ├── build_memory_index.cpp
    ├── range_search_disk_different_radius.cpp
    ├── range_search_disk_index.cpp
    ├── search_disk_index.cpp
    ├── search_disk_index_save_freq.cpp
    ├── search_memory_index.cpp
    ├── search_memory_index_dynamic.cpp
    ├── test_incremental_index.cpp
    ├── test_insert_deletes_consolidate.cpp
    ├── test_streaming_scenario.cpp
    └── utils
    │   ├── CMakeLists.txt
    │   ├── bin_to_fvecs.cpp
    │   ├── bin_to_tsv.cpp
    │   ├── calculate_recall.cpp
    │   ├── compute_groundtruth.cpp
    │   ├── create_disk_layout.cpp
    │   ├── dist_gen.py
    │   ├── float_bin_to_int8.cpp
    │   ├── fvecs_to_bin.cpp
    │   ├── fvecs_to_bvecs.cpp
    │   ├── gen_random_slice.cpp
    │   ├── gen_range.cpp
    │   ├── generate_pq.cpp
    │   ├── index_relayout.cpp
    │   ├── int8_to_float.cpp
    │   ├── int8_to_float_scale.cpp
    │   ├── ivecs_to_bin.cpp
    │   ├── merge_shards.cpp
    │   ├── parse_freq_file.cpp
    │   ├── partition_data.cpp
    │   ├── partition_with_ram_budget.cpp
    │   ├── rand_data_gen.cpp
    │   ├── simulate_aggregate_recall.cpp
    │   ├── sq.cpp
    │   ├── tsv_to_bin.cpp
    │   ├── uint32_to_uint8.cpp
    │   ├── uint8_to_float.cpp
    │   └── vector_analysis.cpp
├── tests_data
    ├── l2_rand_float_10D_10K_norm1.0_self_gt10
    ├── l2_rand_uint8_10D_10K_norm50.0_self_gt10
    ├── rand_float_10D_10K_norm1.0.bin
    └── rand_uint8_10D_10K_norm50.0.bin
├── unit_tester.sh
├── windows
    └── packages.config.in
└── workflows
    ├── SSD_index.md
    ├── dynamic_index.md
    └── in_memory_index.md


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Set the default behavior, in case people don't have core.autocrlf set.
 2 | * text=auto
 3 | 
 4 | # Explicitly declare text files you want to always be normalized and converted
 5 | # to native line endings on checkout.
 6 | *.c text
 7 | *.h text
 8 | 
 9 | # Declare files that will always have CRLF line endings on checkout.
10 | *.sln text eol=crlf
11 | 
12 | # Denote all files that are truly binary and should not be modified.
13 | *.png binary
14 | *.jpg binary
15 | 


--------------------------------------------------------------------------------
/.github/workflows/page_search.yml:
--------------------------------------------------------------------------------
  1 | name: DiskANN and Page Search Functionality Tests
  2 | on: [pull_request, workflow_dispatch]
  3 | jobs:
  4 |   build-and-run:
  5 |     name: Run on ${{ matrix.os }} and ${{ matrix.data_type }} dataset
  6 |     runs-on: ${{ matrix.os }}
  7 | 
  8 |     strategy:
  9 |       matrix:
 10 |         os: [ubuntu-20.04] # ubuntu-latest
 11 |         data_type: [float, uint8]
 12 | 
 13 |     defaults:
 14 |       run:
 15 |         shell: bash
 16 |         working-directory: scripts
 17 | 
 18 |     steps:
 19 |     - name: Checkout repository
 20 |       uses: actions/checkout@v2
 21 |       with:
 22 |           submodules: recursive
 23 | 
 24 |     - name: Install deps
 25 |       run: |
 26 |         if [ "${{ matrix.os }}" != "ubuntu-18.04" ]; then
 27 |             sudo apt install cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev
 28 |         else
 29 |             sudo apt install cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev libboost-program-options-dev
 30 |             wget https://registrationcenter-download.intel.com/akdlm/irc_nas/18487/l_BaseKit_p_2022.1.2.146.sh
 31 |             sudo sh l_BaseKit_p_2022.1.2.146.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
 32 |         fi
 33 | 
 34 |     - name: Config Benchmark
 35 |       run: |
 36 |           cp config_ci.sh config_local.sh
 37 |           sed -i 's/\#\ DATASET_PLACEHOLDER/${{ matrix.data_type }}_dataset/' config_local.sh
 38 |     
 39 |     # Benchmark steps (run once per matrix.data_type: float and uint8)
 40 |     - name: Build Disk Index
 41 |       run: ./run_benchmark.sh release build
 42 | 
 43 |     - name: Build Memory Index
 44 |       run: ./run_benchmark.sh release build_mem
 45 | 
 46 |     - name: Graph Partition
 47 |       run: ./run_benchmark.sh release gp
 48 | 
 49 |     - name: Generate Frequency
 50 |       run: ./run_benchmark.sh release freq
 51 | 
 52 |     - name: Beam Search
 53 |       run: ./run_benchmark.sh release search knn
 54 | 
 55 |     - name: Beam Search with In-memory Nav Graph
 56 |       run: |
 57 |           echo "MEM_L=5" >> config_local.sh
 58 |           ./run_benchmark.sh release search knn
 59 |           sed -i '$ d' config_local.sh
 60 |     
 61 |     - name: Page Search
 62 |       run: |
 63 |           echo "USE_PAGE_SEARCH=1" >> config_local.sh
 64 |           ./run_benchmark.sh release search knn
 65 | 
 66 |     - name: Page Search With Cache 1000
 67 |       run: |
 68 |           echo "CACHE=1000" >> config_local.sh
 69 |           ./run_benchmark.sh release search knn
 70 |           echo "CACHE=0" >> config_local.sh
 71 | 
 72 |     - name: Page Search with In-memory Nav Graph
 73 |       run: |
 74 |           echo "MEM_L=5" >> config_local.sh
 75 |           ./run_benchmark.sh release search knn
 76 |     
 77 |     - name: Page Search with Frequency In-memory Nav Graph
 78 |       run: |
 79 |           echo "MEM_USE_FREQ=1" >> config_local.sh
 80 |           ./run_benchmark.sh release build_mem
 81 |           ./run_benchmark.sh release search knn
 82 | 
 83 |     - name: Page Search with Frequency Graph partition
 84 |       run: |
 85 |           echo "MEM_USE_FREQ=0" >> config_local.sh
 86 |           echo "GP_USE_FREQ=1" >> config_local.sh
 87 |           echo "GP_LOCK_NUMS=100" >> config_local.sh
 88 |           echo "GP_CUT=3" >> config_local.sh
 89 |           ./run_benchmark.sh release gp
 90 |           ./run_benchmark.sh release search knn
 91 | 
 92 |     - name: Page Search with Frequency Graph Partition and In-memory Nav Graph
 93 |       run: |
 94 |           echo "MEM_USE_FREQ=1" >> config_local.sh
 95 |           ./run_benchmark.sh release search knn
 96 | 
 97 | 
 98 |     # TODO: Add range search
 99 | 
100 |     # Create comment with results
101 |     - id: get-comment-body
102 |       run: |
103 |           # `::set-output` is deprecated; publish the summary as a multi-line
104 |           # step output through $GITHUB_OUTPUT (name<<DELIM ... DELIM) instead.
105 |           body="$(cat ../indices/summary.log)"
106 |           printf 'body<<SUMMARY_EOF\n%s\nSUMMARY_EOF\n' "$body" \
107 |             >> "$GITHUB_OUTPUT"
108 | 
109 |     # The two steps below need a pull request to comment on, so they are
110 |     # skipped when the workflow is started manually (workflow_dispatch),
111 |     # where github.event.pull_request.number is empty.
112 |     - name: Find Comment
113 |       if: github.event_name == 'pull_request'
114 |       uses: peter-evans/find-comment@v2
115 |       id: fc
116 |       with:
117 |         issue-number: ${{ github.event.pull_request.number }}
118 |         comment-author: 'github-actions[bot]'
119 |         body-includes: Results on ${{ matrix.os }} and ${{ matrix.data_type }} dataset
120 | 
121 |     # Updates the comment found above, or creates one when comment-id is empty.
122 |     - name: Create comment
123 |       if: github.event_name == 'pull_request'
124 |       uses: peter-evans/create-or-update-comment@v2
125 |       with:
126 |         comment-id: ${{ steps.fc.outputs.comment-id }}
127 |         issue-number: ${{ github.event.pull_request.number }}
128 |         body: |
129 |           Results on ${{ matrix.os }} and ${{ matrix.data_type }} dataset
130 |           ${{ steps.get-comment-body.outputs.body }}
131 |         edit-mode: replace


--------------------------------------------------------------------------------
/.github/workflows/push-test.yml:
--------------------------------------------------------------------------------
 1 | name: DiskANN Build
 2 | on: [push]  # runs on every push
 3 | jobs:
 4 |   ubuntu-latest-build:  # Linux: apt dependencies, then a default CMake + make build
 5 |     runs-on: ubuntu-latest
 6 |     steps:
 7 |     - name: Checkout repository
 8 |       uses: actions/checkout@v2
 9 |     - name: Install deps
10 |       run: |
11 |         sudo apt install cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev
12 |     - name: build
13 |       run: |
14 |         mkdir build && cd build && cmake .. && make -j
15 |   # Windows: configure with CMake, then build diskann.sln (Release, x64) via msbuild.
16 |   windows-build:
17 |     name: Build for ${{ matrix.os }}
18 |     runs-on: ${{ matrix.os }}
19 |     # One job is run per runner image listed in matrix.os.
20 |     strategy:
21 |       matrix:
22 |         os: [windows-2019, windows-latest]
23 | 
24 |     steps:
25 |     - name: Checkout repository
26 |       uses: actions/checkout@v2
27 |       with:
28 |           submodules: true  # NOTE(review): the Ubuntu job above checks out without submodules — confirm intended
29 | 
30 |     - name: Add VisualStudio command line tools into path
31 |       uses: ilammy/msvc-dev-cmd@v1
32 | 
33 |     - name: Run configure and build
34 |       run: |
35 |         mkdir build && cd build && cmake .. && msbuild diskann.sln /m /nologo /t:Build /p:Configuration="Release" /property:Platform="x64"
36 |       shell: cmd
37 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "gperftools"]
2 | 	path = gperftools
3 | 	url = https://github.com/gperftools/gperftools.git
4 | [submodule "graph_partition"]
5 | 	path = graph_partition
6 | 	url = https://github.com/SonglinLife/SSD_BASED_PLAN.git
7 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
 6 | 
 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
 9 | provided by the bot. You will only need to do this once across all repos using our CLA.
10 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:16.04
 2 | LABEL maintainer="Changxu Wang <wang_changxu@zju.edu.cn>"
 3 | 
 4 | # Update the index and install in one layer so a stale cached index is never
 5 | # reused. NOTE(review): README also lists libaio-dev/libmkl-full-dev; confirm.
 6 | RUN apt-get update -y && \
 7 |     apt-get install -y g++ cmake libboost-dev libgoogle-perftools-dev
 8 | # Copy the source tree into the image and run a Release build on all cores.
 9 | COPY . /opt/nsg
10 | WORKDIR /opt/nsg
11 | RUN mkdir -p build && cd build && \
12 |     cmake -DCMAKE_BUILD_TYPE=Release .. && \
13 |     make -j $(nproc)
14 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     DiskANN
 2 |     
 3 |     MIT License
 4 | 
 5 |     Copyright (c) Microsoft Corporation.
 6 | 
 7 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 8 |     of this software and associated documentation files (the "Software"), to deal
 9 |     in the Software without restriction, including without limitation the rights
10 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 |     copies of the Software, and to permit persons to whom the Software is
12 |     furnished to do so, subject to the following conditions:
13 | 
14 |     The above copyright notice and this permission notice shall be included in all
15 |     copies or substantial portions of the Software.
16 | 
17 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 |     SOFTWARE
24 | 


--------------------------------------------------------------------------------
/NOTICE.txt:
--------------------------------------------------------------------------------
 1 | This algorithm builds upon [code for NSG](https://github.com/ZJULearning/nsg), commit: 335e8e, licensed under the following terms.
 2 | 
 3 | MIT License
 4 | 
 5 | Copyright (c) 2018 Cong Fu, Changxu Wang, Deng Cai
 6 | 
 7 | Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | of this software and associated documentation files (the "Software"), to deal
 9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Starling
 2 | 
 3 | In this repository, we share the implementations and experiments of our work *Starling: An I/O-Efficient Disk-Resident Graph Index Framework for High-Dimensional Vector Similarity Search on Data Segment* ([arXiv](https://arxiv.org/abs/2401.02116)).
 4 | 
 5 | It contains the following features:
 6 | 
 7 | For build,
 8 | 1. Build disk graph.
 9 | 2. Build in-memory navigation graph, based on
10 |     1. Nodes that are uniformly-sampled.
11 |     2. Nodes that are generated by search frequency.
12 | 3. Perform Graph Partition on given base data
13 | 
14 | For search,
15 | 
16 | |             |  With Cache Nodes  |   With Nav Graph   | With Graph Partition |  With `use_ratio`  |       Use SQ       |
17 | | :---------- | :----------------: | :----------------: | :------------------: | :----------------: | :----------------: |
18 | | Beam Search | :white_check_mark: | :white_check_mark: |                      |                    |                    |
19 | | Page Search | :white_check_mark: | :white_check_mark: |  :white_check_mark:  | :white_check_mark: | :white_check_mark: |
20 | 
21 | ## Datasets
22 | The datasets we used in the experiments can be downloaded and the data formats are explained in [NeurIPS'21 Big-ANN Benchmark](https://big-ann-benchmarks.com/neurips21.html).
23 | 
24 | | Dataset | Data type | Dimensions | Distance | # Query | Query type |
25 | | - | - | - | - | - | - |
26 | | BIGANN | uint8 | 128 | L2 | 10000 | ANNS/RS |
27 | | DEEP | float | 96 | L2 | 10000 | ANNS/RS |
28 | | SSNPP | uint8 | 256 | L2 | 100000 | RS |
29 | | Text2image | float | 200 | IP | 100000 | ANNS |
30 | 
31 | ## Quick Start
32 | 
33 | To install dependencies, run 
34 | 
35 | ```bash
36 | apt install build-essential libboost-all-dev make cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-all-dev libmkl-full-dev
37 | ```
38 | 
 39 | To run benchmarks, go to the `scripts` directory, copy `config_sample.sh` to `config_local.sh`, modify the dataset paths in `config_dataset.sh`, and run
40 | 
41 | ```bash
42 | ./run_benchmark.sh [debug/release] [build/build_mem/freq/gp/search] [knn/range]
43 | ```
44 | 
 45 | | Argument | Description |
46 | | - | - |
47 | | `debug/release` | Debug/Release mode to run, passed to CMake |
48 | | `build` |  Build index  |
49 | | `build_mem` | Build memory index |
50 | | `freq` | Generate visit-frequency file |
51 | | `gp` | Graph partition given index file |
52 | | `search` | Search index |
53 | | `knn` | Find k-nearest neighbors|
54 | | `range` | Range search |
55 | 
56 | Configure datasets and parameters in `config_local.sh`
57 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->


--------------------------------------------------------------------------------
/_clang-format:
--------------------------------------------------------------------------------
 1 | ---
 2 | Language:        Cpp
 3 | # BasedOnStyle:  Google
 4 | AccessModifierOffset: -1
 5 | AlignAfterOpenBracket: Align
 6 | AlignConsecutiveAssignments: false
 7 | AlignConsecutiveDeclarations: true
 8 | AlignEscapedNewlinesLeft: true
 9 | AlignOperands:   true
10 | AlignTrailingComments: true
11 | AllowAllParametersOfDeclarationOnNextLine: true
12 | AllowShortBlocksOnASingleLine: false
13 | AllowShortCaseLabelsOnASingleLine: false
14 | AllowShortFunctionsOnASingleLine: None
15 | AllowShortIfStatementsOnASingleLine: false
16 | AllowShortLoopsOnASingleLine: false
17 | AlwaysBreakAfterDefinitionReturnType: None
18 | AlwaysBreakAfterReturnType: None
19 | AlwaysBreakBeforeMultilineStrings: true
20 | AlwaysBreakTemplateDeclarations: true
21 | BinPackArguments: true
22 | BinPackParameters: true
23 | BraceWrapping:
24 |   AfterClass:      true
25 |   AfterControlStatement: false
26 |   AfterEnum:       false
27 |   AfterFunction:   false
28 |   AfterNamespace:  false
29 |   AfterObjCDeclaration: false
30 |   AfterStruct:     false
31 |   AfterUnion:      false
32 |   BeforeCatch:     false
33 |   BeforeElse:      false
34 |   IndentBraces:    false
35 | BreakBeforeBinaryOperators: None
36 | BreakBeforeBraces: Attach
37 | BreakBeforeTernaryOperators: true
38 | BreakConstructorInitializersBeforeComma: false
39 | BreakAfterJavaFieldAnnotations: false
40 | BreakStringLiterals: true
41 | ColumnLimit:     80
42 | CommentPragmas:  '^ IWYU pragma:'
43 | ConstructorInitializerAllOnOneLineOrOnePerLine: false
44 | ConstructorInitializerIndentWidth: 4
45 | ContinuationIndentWidth: 4
46 | Cpp11BracedListStyle: true
47 | DerivePointerAlignment: true
48 | DisableFormat:   false
49 | ExperimentalAutoDetectBinPacking: false
50 | ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
51 | IncludeCategories:
52 |   - Regex:           '^<.*\.h>'
53 |     Priority:        1
54 |   - Regex:           '^<.*'
55 |     Priority:        2
56 |   - Regex:           '.*'
57 |     Priority:        3
58 | IncludeIsMainRegex: '([-_](test|unittest))?$'
59 | IndentCaseLabels: true
60 | IndentWidth:     2
61 | IndentWrappedFunctionNames: false
62 | JavaScriptQuotes: Leave
63 | JavaScriptWrapImports: true
64 | KeepEmptyLinesAtTheStartOfBlocks: false
65 | MacroBlockBegin: ''
66 | MacroBlockEnd:   ''
67 | MaxEmptyLinesToKeep: 1
68 | NamespaceIndentation: All
69 | ObjCBlockIndentWidth: 2
70 | ObjCSpaceAfterProperty: false
71 | ObjCSpaceBeforeProtocolList: false
72 | PenaltyBreakBeforeFirstCallParameter: 1
73 | PenaltyBreakComment: 300
74 | PenaltyBreakFirstLessLess: 120
75 | PenaltyBreakString: 1000
76 | PenaltyExcessCharacter: 1000000
77 | PenaltyReturnTypeOnItsOwnLine: 200
78 | PointerAlignment: Right
79 | ReflowComments:  true
80 | SortIncludes:    false
81 | SpaceAfterCStyleCast: true
82 | SpaceAfterTemplateKeyword: false
83 | SpaceBeforeAssignmentOperators: true
84 | SpaceBeforeParens: ControlStatements
85 | SpaceInEmptyParentheses: false
86 | SpacesBeforeTrailingComments: 2
87 | SpacesInAngles:  false
88 | SpacesInContainerLiterals: true
89 | SpacesInCStyleCastParentheses: false
90 | SpacesInParentheses: false
91 | SpacesInSquareBrackets: false
92 | Standard:        Cpp11
93 | TabWidth:        4
94 | UseTab:          Never
95 | ...
96 | 


--------------------------------------------------------------------------------
/appendix/Starling-appendix.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zilliztech/starling/17dc3e8a011533a62374445f53963e951b72883a/appendix/Starling-appendix.pdf


--------------------------------------------------------------------------------
/include/aligned_file_reader.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #pragma once
  5 | 
  6 | #define MAX_IO_DEPTH 128
  7 | 
  8 | #include <vector>
  9 | #include <atomic>
 10 | 
 11 | #ifndef _WINDOWS
 12 | #include <fcntl.h>
 13 | #include <libaio.h>
 14 | #include <unistd.h>
 15 | typedef io_context_t IOContext;
 16 | #else
 17 | #include <Windows.h>
 18 | #include <minwinbase.h>
 19 | 
 20 | #ifndef USE_BING_INFRA
 21 | struct IOContext {  // I/O state used on Windows when USE_BING_INFRA is not defined
 22 |   HANDLE                  fhandle = NULL;  // presumably the open file handle — TODO confirm
 23 |   HANDLE                  iocp = NULL;  // presumably an I/O completion port — TODO confirm
 24 |   std::vector<OVERLAPPED> reqs;  // OVERLAPPED records; how they map to requests is not visible here
 25 | };
 26 | #else
 27 | #include "IDiskPriorityIO.h"
 28 | #include <atomic>
 29 | // TODO: Caller code is very callous about copying IOContext objects
 30 | // all over the place. MUST verify that it won't cause leaks/logical
 31 | // errors.
 32 | // Because of such callous copying, we have to use ptr->atomic instead
 33 | // of atomic, as atomic is not copyable.
 34 | struct IOContext {  // I/O state used on Windows when USE_BING_INFRA is defined
 35 |   enum Status { READ_WAIT = 0, READ_SUCCESS, READ_FAILED, PROCESS_COMPLETE };
 36 |   // All members are shared_ptrs, so copies of an IOContext share the same vectors.
 37 |   std::shared_ptr<ANNIndex::IDiskPriorityIO>               m_pDiskIO = nullptr;
 38 |   std::shared_ptr<std::vector<ANNIndex::AsyncReadRequest>> m_pRequests;
 39 |   std::shared_ptr<std::vector<Status>>                     m_pRequestsStatus;
 40 |   // Initializers are listed in declaration order, the order C++ actually runs them.
 41 |   IOContext()
 42 |       : m_pRequests(new std::vector<ANNIndex::AsyncReadRequest>()),
 43 |         m_pRequestsStatus(new std::vector<Status>()) {
 44 |     m_pRequests->reserve(MAX_IO_DEPTH);  // reserve capacity for MAX_IO_DEPTH entries up front
 45 |     m_pRequestsStatus->reserve(MAX_IO_DEPTH);
 46 |   }
 47 | };
 48 | #endif
 49 | 
 50 | #endif
 51 | 
 52 | #include <malloc.h>
 53 | #include <cstdio>
 54 | #include <mutex>
 55 | #include <thread>
 56 | #include "tsl/robin_map.h"
 57 | #include "utils.h"
 58 | 
 59 | // NOTE :: offset, len and buf are all expected to be 512-aligned
 60 | struct AlignedRead {
 61 |   uint64_t offset;  // position to start reading from
 62 |   uint64_t len;     // amount to read
 63 |   void*    buf;     // destination the data is read into
 64 | 
 65 |   // Empty request: zero offset, zero length, no buffer.
 66 |   AlignedRead() : offset{0}, len{0}, buf{nullptr} {}
 67 | 
 68 |   AlignedRead(uint64_t offset, uint64_t len, void* buf)
 69 |       : offset{offset}, len{len}, buf{buf} {
 70 |     assert(IS_512_ALIGNED(offset));  // alignment is only checked where assert is active
 71 |     assert(IS_512_ALIGNED(len));
 72 |     assert(IS_512_ALIGNED(buf));
 73 |     // assert(malloc_usable_size(buf) >= len);
 74 |   }
 75 | };
 76 | 
 77 | class AlignedFileReader {  // abstract interface: every operation is pure virtual
 78 |  protected:
 79 |   tsl::robin_map<std::thread::id, IOContext> ctx_map;  // thread id -> IOContext
 80 |   std::mutex                                 ctx_mut;  // presumably guards ctx_map — confirm
 81 | 
 82 |  public:
 83 |   // Returns the thread-specific context. Original note: yields
 84 |   // (io_context_t)(-1) if the thread is not registered — verify per platform.
 85 |   virtual IOContext& get_ctx() = 0;
 86 |   // Virtual and defaulted, so deleting through a base pointer is well-defined.
 87 |   virtual ~AlignedFileReader() = default;
 88 | 
 89 |   // register thread-id for a context
 90 |   virtual void register_thread() = 0;
 91 |   // de-register thread-id for a context
 92 |   virtual void deregister_thread() = 0;
 93 |   virtual void deregister_all_threads() = 0;
 94 | 
 95 |   // Open & close ops
 96 |   // Blocking calls
 97 |   virtual void open(const std::string& fname) = 0;
 98 |   virtual void close() = 0;
 99 |   // Batch reads of aligned requests, processed in parallel. read() is noted as
100 |   // a blocking call. NOTE(review): submit_reqs()/get_events() look like a split
101 |   // submit-then-wait pair; their exact contract is not visible here — confirm.
102 |   virtual void read(std::vector<AlignedRead>& read_reqs, IOContext& ctx,
103 |                     bool async = false) = 0;
104 |   virtual int submit_reqs(std::vector<AlignedRead>& read_reqs, IOContext& ctx) = 0;
105 |   virtual void get_events(IOContext &ctx, int n_ops) = 0;
106 | };
107 | 


--------------------------------------------------------------------------------
/include/ann_exception.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | #include <string>
 6 | #include <stdexcept>
 7 | #include <system_error>
 8 | #include "windows_customizations.h"
 9 | 
10 | #ifndef _WINDOWS
11 | #define __FUNCSIG__ __PRETTY_FUNCTION__
12 | #endif
13 | 
14 | namespace diskann {
15 |   // Base exception type: a std::runtime_error that also declares an int error-code member.
16 |   class ANNException : public std::runtime_error {
17 |    public:
18 |     DISKANN_DLLEXPORT ANNException(const std::string& message, int errorCode);
19 |     DISKANN_DLLEXPORT ANNException(const std::string& message, int errorCode,
20 |                                    const std::string& funcSig,  // presumably __FUNCSIG__ at the throw site — confirm
21 |                                    const std::string& fileName,  // presumably the source file of the throw site — confirm
22 |                                    unsigned int       lineNum);  // presumably the source line of the throw site — confirm
23 |     // NOTE(review): no accessor for _errorCode is declared in this header.
24 |    private:
25 |     int _errorCode;
26 |   };
27 |   // ANNException subclass constructed from a file name plus a std::system_error.
28 |   class FileException : public ANNException {
29 |    public:
30 |     DISKANN_DLLEXPORT FileException(const std::string& filename,
31 |                                     std::system_error& e,  // taken by non-const reference
32 |                                     const std::string& funcSig,
33 |                                     const std::string& fileName,
34 |                                     unsigned int       lineNum);
35 |   };
36 | }  // namespace diskann
37 | 


--------------------------------------------------------------------------------
/include/aux_utils.h:
--------------------------------------------------------------------------------
  1 | ﻿// Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #pragma once
  5 | #include <algorithm>
  6 | #include <fcntl.h>
  7 | #include <cassert>
  8 | #include <cstdlib>
  9 | #include <cstring>
 10 | #include <fstream>
 11 | #include <iostream>
 12 | #include <memory>
 13 | #include <random>
 14 | #include <set>
 15 | #ifdef __APPLE__
 16 | #else
 17 | #include <malloc.h>
 18 | #endif
 19 | 
 20 | #ifdef _WINDOWS
 21 | #include <Windows.h>
 22 | typedef HANDLE FileHandle;
 23 | #else
 24 | #include <unistd.h>
 25 | typedef int FileHandle;
 26 | #endif
 27 | 
 28 | #include "cached_io.h"
 29 | #include "common_includes.h"
 30 | #include "tsl/robin_set.h"
 31 | 
 32 | #include "utils.h"
 33 | #include "windows_customizations.h"
 34 | 
 35 | namespace diskann {
 36 |   const size_t   MAX_PQ_TRAINING_SET_SIZE = 256000;
 37 |   const size_t   MAX_SAMPLE_POINTS_FOR_WARMUP = 100000;
 38 |   const double   PQ_TRAINING_SET_FRACTION = 0.1;
 39 |   const double   SPACE_FOR_CACHED_NODES_IN_GB = 0.25;
 40 |   const double   THRESHOLD_FOR_CACHING_IN_GB = 1.0;
 41 |   const uint32_t NUM_NODES_TO_CACHE = 250000;
 42 |   const uint32_t WARMUP_L = 20;
 43 |   const uint32_t NUM_KMEANS_REPS = 12;
 44 | 
 45 |   template<typename T>
 46 |   class PQFlashIndex;
 47 | 
 48 |   DISKANN_DLLEXPORT std::pair<bool, std::vector<_u64>> 
 49 |         get_disk_index_meta(const std::string& path);
 50 | 
 51 |   DISKANN_DLLEXPORT double get_memory_budget(const std::string &mem_budget_str);
 52 |   DISKANN_DLLEXPORT double get_memory_budget(double search_ram_budget_in_gb);
 53 |   DISKANN_DLLEXPORT void   add_new_file_to_single_index(std::string index_file,
 54 |                                                         std::string new_file);
 55 | 
 56 |   DISKANN_DLLEXPORT size_t calculate_num_pq_chunks(double final_index_ram_limit,
 57 |                                                    size_t points_num,
 58 |                                                    uint32_t dim);
 59 | 
 60 |   DISKANN_DLLEXPORT double calculate_recall(
 61 |       unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs,
 62 |       unsigned *our_results, unsigned dim_or, unsigned recall_at);
 63 | 
 64 |   DISKANN_DLLEXPORT double calculate_recall(
 65 |       unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs,
 66 |       unsigned *our_results, unsigned dim_or, unsigned recall_at,
 67 |       const tsl::robin_set<unsigned> &active_tags);
 68 | 
 69 |   DISKANN_DLLEXPORT double calculate_range_search_recall(
 70 |       unsigned num_queries, std::vector<std::vector<_u32>> &groundtruth,
 71 |       std::vector<std::vector<_u64>> &our_results);
 72 | 
 73 |   DISKANN_DLLEXPORT void read_idmap(const std::string &    fname,
 74 |                                     std::vector<unsigned> &ivecs);
 75 | 
 76 | #ifdef EXEC_ENV_OLS
 77 |   template<typename T>
 78 |   DISKANN_DLLEXPORT T *load_warmup(MemoryMappedFiles &files,
 79 |                                    const std::string &cache_warmup_file,
 80 |                                    uint64_t &warmup_num, uint64_t warmup_dim,
 81 |                                    uint64_t warmup_aligned_dim);
 82 | #else
 83 |   template<typename T>
 84 |   DISKANN_DLLEXPORT T *load_warmup(const std::string &cache_warmup_file,
 85 |                                    uint64_t &warmup_num, uint64_t warmup_dim,
 86 |                                    uint64_t warmup_aligned_dim);
 87 | #endif
 88 | 
 89 |   DISKANN_DLLEXPORT int merge_shards(const std::string &vamana_prefix,
 90 |                                      const std::string &vamana_suffix,
 91 |                                      const std::string &idmaps_prefix,
 92 |                                      const std::string &idmaps_suffix,
 93 |                                      const _u64 nshards, unsigned max_degree,
 94 |                                      const std::string &output_vamana,
 95 |                                      const std::string &medoids_file);
 96 | 
 97 |   template<typename T>
 98 |   DISKANN_DLLEXPORT std::string preprocess_base_file(
 99 |       const std::string &infile, const std::string &indexPrefix,
100 |       diskann::Metric &distMetric);
101 | 
102 |   template<typename T>
103 |   DISKANN_DLLEXPORT int build_merged_vamana_index(
104 |       std::string base_file, diskann::Metric _compareMetric, unsigned L,
105 |       unsigned R, double sampling_rate, double ram_budget,
106 |       std::string mem_index_path, std::string medoids_file,
107 |       std::string centroids_file);
108 | 
109 |   template<typename T>
110 |   DISKANN_DLLEXPORT uint32_t optimize_beamwidth(
111 |       std::unique_ptr<diskann::PQFlashIndex<T>> &_pFlashIndex, T *tuning_sample,
112 |       _u64 tuning_sample_num, _u64 tuning_sample_aligned_dim, uint32_t L,
113 |       uint32_t nthreads, uint32_t start_bw = 2);
114 | 
115 |   template<typename T>
116 |   DISKANN_DLLEXPORT int build_disk_index(const char *    dataFilePath,
117 |                                          const char *    indexFilePath,
118 |                                          const char *    indexBuildParameters,
119 |                                          diskann::Metric _compareMetric);
120 | 
121 |   template<typename T>
122 |   DISKANN_DLLEXPORT void create_disk_layout(
123 |       const std::string base_file, const std::string mem_index_file,
124 |       const std::string output_file,
125 |       const std::string reorder_data_file = std::string(""));
126 | 
127 | }  // namespace diskann
128 | 


--------------------------------------------------------------------------------
/include/boost_dynamic_bitset_fwd.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | 
 6 | namespace boost {
 7 | #ifndef BOOST_DYNAMIC_BITSET_FWD_HPP
 8 |   template<typename Block = unsigned long,
 9 |            typename Allocator = std::allocator<Block>>
10 |   class dynamic_bitset;
11 | #endif
12 | }  // namespace boost
13 | 


--------------------------------------------------------------------------------
/include/cached_io.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #pragma once
  5 | #include <cstring>
  6 | #include <fstream>
  7 | #include <iostream>
  8 | #include <sstream>
  9 | 
 10 | #include "logger.h"
 11 | #include "ann_exception.h"
 12 | 
 13 | // sequential cached reads
 14 | class cached_ifstream {
 15 |  public:
 16 |   // Creates an unopened reader; open() must be called before read().
 17 |   cached_ifstream() = default;
 18 |   cached_ifstream(const std::string& filename, uint64_t cacheSize)
 19 |       : cache_size(cacheSize), cur_off(0) {
 20 |     reader.exceptions(std::ifstream::failbit | std::ifstream::badbit);
 21 |     this->open(filename, cache_size);
 22 |   }
 23 |   ~cached_ifstream() {
 24 |     delete[] cache_buf;  // reader closes itself in its own destructor
 25 |   }
 26 | 
 27 |   // Opens filename and caches its first min(cacheSize, file size) bytes.
 28 |   void open(const std::string& filename, uint64_t cacheSize) {
 29 |     delete[] cache_buf;  // free any buffer left by an earlier open()
 30 |     cache_buf = nullptr;
 31 |     this->cur_off = 0;
 32 |     try {
 33 |       reader.open(filename, std::ios::binary | std::ios::ate);
 34 |       assert(reader.is_open());
 35 |       assert(cacheSize > 0);
 36 |       fsize = reader.tellg();  // opened at the end, so this is the size
 37 |       reader.seekg(0, std::ios::beg);
 38 |       this->cache_size = (std::min)(cacheSize, fsize);
 39 |       cache_buf = new char[this->cache_size];
 40 |       reader.read(cache_buf, this->cache_size);
 41 |       diskann::cout << "Opened: " << filename.c_str() << ", size: " << fsize
 42 |                     << ", cache_size: " << this->cache_size << std::endl;
 43 |     } catch (std::system_error& e) {
 44 |       throw diskann::FileException(filename, e, __FUNCSIG__, __FILE__,
 45 |                                    __LINE__);
 46 |     }
 47 |   }
 48 | 
 49 |   size_t get_file_size() const {
 50 |     return fsize;
 51 |   }
 52 | 
 53 |   // Copies the next n_bytes of the file into read_buf. Throws ANNException
 54 |   // when the request runs past the end of the file.
 55 |   void read(char* read_buf, uint64_t n_bytes) {
 56 |     assert(cache_buf != nullptr);
 57 |     assert(read_buf != nullptr);
 58 | 
 59 |     const uint64_t cached_bytes = cache_size - cur_off;
 60 |     if (n_bytes <= cached_bytes) {
 61 |       // case 1: cache contains all data
 62 |       memcpy(read_buf, cache_buf + cur_off, n_bytes);
 63 |       cur_off += n_bytes;
 64 |       return;
 65 |     }
 66 | 
 67 |     // case 2: cache contains some data, the rest is fetched from disk
 68 |     if (n_bytes - cached_bytes > fsize - reader.tellg()) {
 69 |       std::stringstream stream;
 70 |       stream << "Reading beyond end of file" << std::endl;
 71 |       stream << "n_bytes: " << n_bytes << " cached_bytes: " << cached_bytes
 72 |              << " fsize: " << fsize << " current pos:" << reader.tellg()
 73 |              << std::endl;
 74 |       diskann::cout << stream.str() << std::endl;
 75 |       throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
 76 |                                   __LINE__);
 77 |     }
 78 |     memcpy(read_buf, cache_buf + cur_off, cached_bytes);
 79 |     // go to disk and fetch the part the cache could not serve
 80 |     reader.read(read_buf + cached_bytes, n_bytes - cached_bytes);
 81 |     cur_off = cache_size;  // cache is exhausted until it is refilled below
 82 | 
 83 |     // Refill the cache only if a full cache_size chunk is left. Otherwise
 84 |     // cur_off stays at cache_size and later reads go straight to the file.
 85 |     const uint64_t size_left = fsize - reader.tellg();
 86 |     if (size_left >= cache_size) {
 87 |       reader.read(cache_buf, cache_size);
 88 |       cur_off = 0;
 89 |     }
 90 |   }
 91 | 
 92 |  private:
 93 |   // underlying ifstream
 94 |   std::ifstream reader;
 95 |   // # bytes to cache in one shot read
 96 |   uint64_t cache_size = 0;
 97 |   // underlying buf for cache
 98 |   char* cache_buf = nullptr;
 99 |   // offset into cache_buf for cur_pos
100 |   uint64_t cur_off = 0;
101 |   // file size
102 |   uint64_t fsize = 0;
103 | };
104 | 
105 | // sequential cached writes
106 | class cached_ofstream {
107 |  public:
108 |   // Opens filename for binary writing and allocates a cache_size-byte cache.
109 |   cached_ofstream(const std::string& filename, uint64_t cache_size)
110 |       : cache_size(cache_size), cur_off(0) {
111 |     writer.exceptions(std::ofstream::failbit | std::ofstream::badbit);
112 |     try {
113 |       writer.open(filename, std::ios::binary);
114 |       assert(writer.is_open());
115 |       assert(cache_size > 0);
116 |       cache_buf = new char[cache_size];
117 |       diskann::cout << "Opened: " << filename.c_str()
118 |                     << ", cache_size: " << cache_size << std::endl;
119 |     } catch (std::system_error& e) {
120 |       throw diskann::FileException(filename, e, __FUNCSIG__, __FILE__,
121 |                                    __LINE__);
122 |     }
123 |   }
124 | 
125 |   ~cached_ofstream() {
126 |     // Destructors are implicitly noexcept, so a failed final flush must not
127 |     // escape from here: that would call std::terminate.
128 |     try {
129 |       this->close();
130 |     } catch (...) {
131 |       delete[] cache_buf;
132 |       cache_buf = nullptr;
133 |       diskann::cerr << "cached_ofstream: final flush failed" << std::endl;
134 |     }
135 |   }
136 | 
137 |   // Flushes cached bytes, frees the cache and closes the file.
138 |   void close() {
139 |     if (cur_off > 0) {
140 |       this->flush_cache();
141 |     }
142 |     delete[] cache_buf;  // deleting a null pointer is a no-op
143 |     cache_buf = nullptr;
144 |     if (writer.is_open())
145 |       writer.close();
146 |     diskann::cout << "Finished writing " << fsize << "B" << std::endl;
147 |   }
148 | 
149 |   size_t get_file_size() const {
150 |     return fsize;  // bytes handed to the stream; excludes unflushed cache
151 |   }
152 |   // writes n_bytes from write_buf to the underlying ofstream/cache
153 |   void write(const char* write_buf, uint64_t n_bytes) {
154 |     assert(cache_buf != nullptr);
155 |     if (n_bytes <= (cache_size - cur_off)) {
156 |       // case 1: cache can take all data
157 |       memcpy(cache_buf + cur_off, write_buf, n_bytes);
158 |       cur_off += n_bytes;
159 |     } else {
160 |       // case 2: cache cant take all data; flush what is cached, then
161 |       // write the new data straight to disk
162 |       flush_cache();
163 |       writer.write(write_buf, n_bytes);
164 |       fsize += n_bytes;
165 |     }
166 |   }
167 | 
168 |   void flush_cache() {
169 |     assert(cache_buf != nullptr);
170 |     writer.write(cache_buf, cur_off);
171 |     fsize += cur_off;
172 |     memset(cache_buf, 0, cache_size);
173 |     cur_off = 0;
174 |   }
175 | 
176 |   void reset() {
177 |     flush_cache();
178 |     writer.seekp(0);  // rewind the write position; fsize is left untouched
179 |   }
180 | 
181 |  private:
182 |   // underlying ofstream
183 |   std::ofstream writer;
184 |   // # bytes to cache for one shot write
185 |   uint64_t cache_size = 0;
186 |   // underlying buf for cache
187 |   char* cache_buf = nullptr;
188 |   // offset into cache_buf for cur_pos
189 |   uint64_t cur_off = 0;
190 | 
191 |   // file size
192 |   uint64_t fsize = 0;
193 | };
194 | 


--------------------------------------------------------------------------------
/include/common_includes.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <algorithm>
 5 | #include <cassert>
 6 | #include <chrono>
 7 | #include <cmath>
 8 | #include <cstdio>
 9 | #include <cstring>
10 | #include <fcntl.h>
11 | #include <fstream>
12 | #include <iostream>
13 | #include <queue>
14 | #include <random>
15 | #include <string.h>
16 | #include <sys/stat.h>
17 | #include <time.h>
18 | #include <vector>
19 | 


--------------------------------------------------------------------------------
/include/concurrent_queue.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #pragma once
  5 | #include <atomic>
  6 | #include <chrono>
  7 | #include <condition_variable>
  8 | #include <mutex>
  9 | #include <queue>
 10 | #include <thread>
 11 | #include <type_traits>
 12 | #include <unordered_set>
 13 | 
 14 | namespace diskann {
 15 | 
 16 |   // FIFO queue guarded by a single mutex. pop() never blocks: on an empty
 17 |   // queue it returns the null_T sentinel. The push/pop condition variables
 18 |   // are only signalled through the explicit *_notify_*() calls.
 19 |   template<typename T>
 20 |   class ConcurrentQueue {
 21 |     typedef std::chrono::microseconds    chrono_us_t;
 22 |     typedef std::unique_lock<std::mutex> mutex_locker;
 23 |     typedef std::lock_guard<std::mutex>  scoped_guard;
 24 | 
 25 |     std::queue<T>           q;
 26 |     std::mutex              mut;
 27 |     std::mutex              push_mut;
 28 |     std::mutex              pop_mut;
 29 |     std::condition_variable push_cv;
 30 |     std::condition_variable pop_cv;
 31 |     T                       null_T;
 32 | 
 33 |    public:
 34 |     // Value-initializes the sentinel so that pop() on an empty queue never
 35 |     // returns an indeterminate value (e.g. when T is a pointer or integer).
 36 |     ConcurrentQueue() : null_T() {
 37 |     }
 38 | 
 39 |     // Uses nullT as the value pop() returns for an empty queue.
 40 |     ConcurrentQueue(T nullT) : null_T(nullT) {
 41 |     }
 42 | 
 43 |     // Signals both condition variables before the queue goes away.
 44 |     ~ConcurrentQueue() {
 45 |       this->push_cv.notify_all();
 46 |       this->pop_cv.notify_all();
 47 |     }
 48 | 
 49 |     // queue stats
 50 |     // Number of queued elements at the time of the call.
 51 |     uint64_t size() {
 52 |       scoped_guard lk(this->mut);
 53 |       return q.size();
 54 |     }
 55 | 
 56 |     bool empty() {
 57 |       return (this->size() == 0);
 58 |     }
 59 | 
 60 |     // PUSH BACK
 61 |     void push(const T& new_val) {
 62 |       scoped_guard lk(this->mut);
 63 |       this->q.push(new_val);
 64 |     }
 65 | 
 66 |     // Appends every element of [iter_begin, iter_end) under one lock.
 67 |     template<class Iterator>
 68 |     void insert(Iterator iter_begin, Iterator iter_end) {
 69 |       scoped_guard lk(this->mut);
 70 |       for (Iterator it = iter_begin; it != iter_end; it++) {
 71 |         this->q.push(*it);
 72 |       }
 73 |     }
 74 | 
 75 |     // POP FRONT
 76 |     // Returns the oldest element, or null_T when the queue is empty.
 77 |     T pop() {
 78 |       scoped_guard lk(this->mut);
 79 |       if (this->q.empty()) {
 80 |         return this->null_T;
 81 |       }
 82 |       T ret = this->q.front();
 83 |       this->q.pop();
 84 |       return ret;
 85 |     }
 86 | 
 87 |     // register for notifications
 88 |     // Both waits return after wait_time at the latest and do not inspect
 89 |     // the queue, so callers must re-check its state afterwards.
 90 |     void wait_for_push_notify(chrono_us_t wait_time = chrono_us_t{10}) {
 91 |       mutex_locker lk(this->push_mut);
 92 |       this->push_cv.wait_for(lk, wait_time);
 93 |     }
 94 | 
 95 |     void wait_for_pop_notify(chrono_us_t wait_time = chrono_us_t{10}) {
 96 |       mutex_locker lk(this->pop_mut);
 97 |       this->pop_cv.wait_for(lk, wait_time);
 98 |     }
 99 | 
100 |     // just notify functions
101 |     void push_notify_one() {
102 |       this->push_cv.notify_one();
103 |     }
104 |     void push_notify_all() {
105 |       this->push_cv.notify_all();
106 |     }
107 |     void pop_notify_one() {
108 |       this->pop_cv.notify_one();
109 |     }
110 |     void pop_notify_all() {
111 |       this->pop_cv.notify_all();
112 |     }
113 |   };
114 | }  // namespace diskann
115 | 


--------------------------------------------------------------------------------
/include/distance.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include "windows_customizations.h"
  3 | 
  4 | namespace diskann {
  5 | 
  6 |   template<typename T>
  7 |   class Distance {
  8 |    public:
  9 |     virtual float compare(const T *a, const T *b, uint32_t length) const = 0;
 10 |     virtual ~Distance() {
 11 |     }
 12 |   };
 13 | 
 14 |   class DistanceCosineInt8 : public Distance<int8_t> {
 15 |    public:
 16 |     DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b,
 17 |                                             uint32_t length) const;
 18 |   };
 19 | 
 20 |   class DistanceL2Int8 : public Distance<int8_t> {
 21 |    public:
 22 |     DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b,
 23 |                                             uint32_t size) const;
 24 |   };
 25 | 
 26 |   // AVX implementations. Borrowed from HNSW code.
 27 |   class AVXDistanceL2Int8 : public Distance<int8_t> {
 28 |    public:
 29 |     DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b,
 30 |                                             uint32_t length) const;
 31 |   };
 32 | 
 33 |   // Slow implementations of the distance functions to get diskann to
 34 |   // work in pre-AVX machines. Performance here is not a concern, so we are
 35 |   // using the simplest possible implementation.
 36 |   template<typename T>
 37 |   class SlowDistanceL2Int : public Distance<T> {
 38 |    public:
 39 |     // Squared L2 distance; defined inline because the class is a template.
 40 |     DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b,
 41 |                                             uint32_t length) const {
 42 |       uint32_t sum_sq = 0;
 43 |       for (uint32_t idx = 0; idx != length; ++idx) {
 44 |         const int32_t delta = (int32_t)((int16_t) a[idx] - (int16_t) b[idx]);
 45 |         // each element contributes its squared difference
 46 |         sum_sq += delta * delta;
 47 |       }
 48 |       return (float) sum_sq;
 49 |     }
 50 |   };
 50 | 
 51 |   class DistanceCosineFloat : public Distance<float> {
 52 |    public:
 53 |     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
 54 |                                             uint32_t length) const;
 55 |   };
 56 | 
 57 |   class DistanceL2Float : public Distance<float> {
 58 |    public:
 59 | #ifdef _WINDOWS
 60 |     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
 61 |                                             uint32_t size) const;
 62 | #else
 63 |     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
 64 |                                             uint32_t size) const
 65 |         __attribute__((hot));
 66 | #endif
 67 |   };
 68 | 
 69 |   class AVXDistanceL2Float : public Distance<float> {
 70 |    public:
 71 |     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
 72 |                                             uint32_t length) const;
 73 |   };
 74 | 
 75 |   class SlowDistanceL2Float : public Distance<float> {
 76 |    public:
 77 |     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
 78 |                                             uint32_t length) const;
 79 |   };
 80 | 
 81 |   class SlowDistanceCosineUInt8 : public Distance<uint8_t> {
 82 |    public:
 83 |     DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b,
 84 |                                             uint32_t length) const;
 85 |   };
 86 | 
 87 |   class DistanceL2UInt8 : public Distance<uint8_t> {
 88 |    public:
 89 |     DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b,
 90 |                                             uint32_t size) const;
 91 |   };
 92 | 
 93 |   template<typename T>
 94 |   class DistanceInnerProduct : public Distance<T> {
 95 |    public:
 96 |     float inner_product(const T *a, const T *b, unsigned size) const;
 97 |     float compare(const T *a, const T *b, unsigned size) const {
 98 |       // Distance comparisons minimize, so a larger inner product must
 99 |       // give a smaller value: return the negated inner product, -x.
100 |       float result = inner_product(a, b, size);
101 |       //      if (result < 0)
102 |       //      return std::numeric_limits<float>::max();
103 |       //      else
104 |       return -result;
105 |     }
106 |   };
107 | 
108 |   template<typename T>
109 |   class DistanceFastL2
110 |       : public DistanceInnerProduct<T> {  // currently defined only for float.
111 |                                           // templated for future use.
112 |    public:
113 |     float norm(const T *a, unsigned size) const;
114 |     float compare(const T *a, const T *b, float norm, unsigned size) const;
115 |   };
116 | 
117 |   class AVXDistanceInnerProductFloat : public Distance<float> {
118 |    public:
119 |     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
120 |                                             uint32_t length) const;
121 |   };
122 | 
123 |   class AVXNormalizedCosineDistanceFloat : public Distance<float> {
124 |    private:
125 |     AVXDistanceInnerProductFloat _innerProduct;
126 | 
127 |    public:
128 |     DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
129 |                                             uint32_t length) const {
130 |       // Assumes _innerProduct.compare() yields -<a, b> (defined elsewhere);
131 |       // then this is 1 - <a, b>, i.e. in [0, 2] for unit-norm a and b.
132 |       return 1.0f + _innerProduct.compare(a, b, length);
133 |     }
134 |   };
135 | 
136 | }  // namespace diskann
137 | 


--------------------------------------------------------------------------------
/include/exceptions.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | #include <stdexcept>
 6 | 
 7 | namespace diskann {
 8 | 
 9 |   // Signals that the called function has not been implemented yet.
10 |   class NotImplementedException : public std::logic_error {
11 |    public:
12 |     NotImplementedException()
13 |         : std::logic_error("Function not yet implemented.") {}
14 |   };
15 | }  // namespace diskann
16 | 


--------------------------------------------------------------------------------
/include/linux_aligned_file_reader.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | #ifndef _WINDOWS
 6 | 
 7 | #include "aligned_file_reader.h"
 8 | 
 9 | class LinuxAlignedFileReader : public AlignedFileReader {
10 |  private:
11 |   uint64_t     file_sz;
12 |   FileHandle   file_desc;
13 |   io_context_t bad_ctx = (io_context_t) -1;
14 | 
15 |  public:
16 |   LinuxAlignedFileReader();
17 |   ~LinuxAlignedFileReader();
18 | 
19 |   IOContext &get_ctx();
20 | 
21 |   // register thread-id for a context
22 |   void register_thread();
23 | 
24 |   // de-register thread-id for a context
25 |   void deregister_thread();
26 |   void deregister_all_threads();
27 | 
28 |   // Open & close ops
29 |   // Blocking calls
30 |   void open(const std::string &fname);
31 |   void close();
32 | 
33 |   // process batch of aligned requests in parallel
34 |   // NOTE :: blocking call
35 |   void read(std::vector<AlignedRead> &read_reqs, IOContext &ctx,
36 |             bool async = false);
37 | 
38 |   int submit_reqs(std::vector<AlignedRead> &read_reqs, IOContext &ctx);
39 |   void get_events(IOContext &ctx, int n_ops);
40 | };
41 | 
42 | #endif
43 | 


--------------------------------------------------------------------------------
/include/locking.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <mutex>
 5 | 
 6 | #ifdef _WINDOWS
 7 | #include "windows_slim_lock.h"
 8 | #endif
 9 | 
10 | namespace diskann {
11 | #ifdef _WINDOWS
12 |   using non_recursive_mutex = windows_exclusive_slim_lock;
13 |   using LockGuard = windows_exclusive_slim_lock_guard;
14 | #else
15 |   using non_recursive_mutex = std::mutex;
16 |   using LockGuard = std::lock_guard<non_recursive_mutex>;
17 | #endif
18 | }  // namespace diskann
19 | 


--------------------------------------------------------------------------------
/include/logger.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include "windows_customizations.h"
 6 | 
 7 | namespace diskann {
 8 |   DISKANN_DLLEXPORT extern std::basic_ostream<char> cout;
 9 |   DISKANN_DLLEXPORT extern std::basic_ostream<char> cerr;
10 | }  // namespace diskann
11 | 


--------------------------------------------------------------------------------
/include/logger_impl.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <sstream>
 7 | #include <mutex>
 8 | 
 9 | #ifdef EXEC_ENV_OLS
10 | #include "IANNIndex.h"
11 | #include "ANNLogging.h"
12 | #endif
13 | 
14 | #include "ann_exception.h"
15 | 
16 | #ifndef EXEC_ENV_OLS
17 | namespace ANNIndex {
18 |   enum LogLevel {
19 |     LL_Debug = 0,
20 |     LL_Info,
21 |     LL_Status,
22 |     LL_Warning,
23 |     LL_Error,
24 |     LL_Assert,
25 |     LL_Count
26 |   };
27 | }  // namespace ANNIndex
28 | #endif
29 | 
30 | namespace diskann {
31 |   class ANNStreamBuf : public std::basic_streambuf<char> {
32 |    public:
33 |     DISKANN_DLLEXPORT explicit ANNStreamBuf(FILE* fp);
34 |     DISKANN_DLLEXPORT ~ANNStreamBuf();
35 | 
36 |     DISKANN_DLLEXPORT bool is_open() const {
37 |       return true;  // because stdout and stderr are always open.
38 |     }
39 |     DISKANN_DLLEXPORT void        close();
40 |     DISKANN_DLLEXPORT virtual int underflow();
41 |     DISKANN_DLLEXPORT virtual int overflow(int c);
42 |     DISKANN_DLLEXPORT virtual int sync();
43 | 
44 |    private:
45 |     FILE*              _fp;
46 |     char*              _buf;
47 |     int                _bufIndex;
48 |     std::mutex         _mutex;
49 |     ANNIndex::LogLevel _logLevel;
50 | 
51 |     int  flush();
52 |     void logImpl(char* str, int numchars);
53 | 
54 | // Why the two buffer-sizes? If we are running normally, we are basically
55 | // interacting with a character output system, so we short-circuit the
56 | // output process by keeping an empty buffer and writing each character
57 | // to stdout/stderr. But if we are running in OLS, we have to take all
58 | // the text that is written to diskann::cout/diskann::cerr, consolidate it
59 | // and push it out in one-shot, because the OLS infra does not give us
60 | // character based output. Therefore, we use a larger buffer that is large
61 | // enough to store the longest message, and continuously add characters
62 | // to it. When the calling code outputs a std::endl or std::flush, sync()
63 | // will be called and will output a log level, component name, and the text
64 | // that has been collected. (sync() is also called if the buffer is full, so
65 | // overflows/missing text are not a concern).
66 | // This implies calling code _must_ either print std::endl or std::flush
67 | // to ensure that the message is written immediately.
68 | #ifdef EXEC_ENV_OLS
69 |     static const int BUFFER_SIZE = 1024;
70 | #else
71 |     static const int BUFFER_SIZE = 0;
72 | #endif
73 | 
74 |     ANNStreamBuf(const ANNStreamBuf&);
75 |     ANNStreamBuf& operator=(const ANNStreamBuf&);
76 |   };
77 | }  // namespace diskann
78 | 


--------------------------------------------------------------------------------
/include/math_utils.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | 
 6 | #include "common_includes.h"
 7 | #include "utils.h"
 8 | 
 9 | namespace math_utils {
10 | 
11 |   float calc_distance(float* vec_1, float* vec_2, size_t dim);
12 | 
13 |   // compute l2-squared norms of data stored in row major num_points * dim;
14 |   // the output array vecs_l2sq (presumably num_points floats) needs to be
15 |   // pre-allocated
16 |   void compute_vecs_l2sq(float* vecs_l2sq, float* data, const size_t num_points,
17 |                          const size_t dim);
18 | 
19 |   void rotate_data_randomly(float* data, size_t num_points, size_t dim,
20 |                             float* rot_mat, float*& new_mat,
21 |                             bool transpose_rot = false);
22 | 
23 |   // calculate closest center to data of num_points * dim (row major)
24 |   // centers is num_centers * dim (row major)
25 |   // data_l2sq has pre-computed squared norms of data
26 |   // centers_l2sq has pre-computed squared norms of centers
27 |   // pre-allocated center_index will contain id of k nearest centers
28 |   // pre-allocated dist_matrix should be num_points * num_centers and contain
29 |   // squared distances
30 | 
31 |   // Ideally used only by compute_closest_centers
32 |   void compute_closest_centers_in_block(
33 |       const float* const data, const size_t num_points, const size_t dim,
34 |       const float* const centers, const size_t num_centers,
35 |       const float* const docs_l2sq, const float* const centers_l2sq,
36 |       uint32_t* center_index, float* const dist_matrix, size_t k = 1);
37 | 
38 |   // Given data in num_points * new_dim row major
39 |   // Pivots stored in full_pivot_data as k * new_dim row major
40 |   // Calculate the closest pivot for each point and store it in vector
41 |   // closest_centers_ivf (which needs to be allocated outside)
42 |   // Additionally, if inverted_index is not null (and pre-allocated), it
43 |   // will return the inverted index for each center.
44 |   // Additionally, if pts_norms_squared is not null, then it will assume
45 |   // that the point norms are pre-computed and will use those values
46 |   // rather than recomputing them.
47 | 
48 |   void compute_closest_centers(float* data, size_t num_points, size_t dim,
49 |                                float* pivot_data, size_t num_centers, size_t k,
50 |                                uint32_t*            closest_centers_ivf,
51 |                                std::vector<size_t>* inverted_index = NULL,
52 |                                float*               pts_norms_squared = NULL);
53 | 
54 |   // If to_subtract is true, will subtract the nearest center from each row;
55 |   // otherwise will add it. Output will be in data_load itself.
56 |   // Nearest centers need to be provided in closest_centers.
57 | 
58 |   void process_residuals(float* data_load, size_t num_points, size_t dim,
59 |                          float* cur_pivot_data, size_t num_centers,
60 |                          uint32_t* closest_centers, bool to_subtract);
61 | 
62 | }  // namespace math_utils
63 | 
64 | namespace kmeans {
65 | 
66 |   // run Lloyds one iteration
67 |   // Given data in row major num_points * dim, and centers in row major
68 |   // num_centers * dim
69 |   // And squared lengths of data points, output the closest center to each data
70 |   // point, update centers, and also return inverted index.
71 |   // If closest_center == NULL, will allocate memory and return.
72 |   // Similarly, if closest_docs == NULL, will allocate memory and return.
73 | 
74 |   float lloyds_iter(float* data, size_t num_points, size_t dim, float* centers,
75 |                     size_t num_centers, float* docs_l2sq,
76 |                     std::vector<size_t>* closest_docs,
77 |                     uint32_t*&           closest_center);
78 | 
79 |   // Run Lloyds until max_reps or stopping criterion
80 |   // If you pass NULL for closest_docs and closest_center, it will NOT return
81 |   // the results, else it will assume appropriate allocation as closest_docs =
82 |   // new vector<size_t>[num_centers], and closest_center = new
83 |   // uint32_t[num_points]. Final centers are output in centers as row major
84 |   //
85 |   float run_lloyds(float* data, size_t num_points, size_t dim, float* centers,
86 |                    const size_t num_centers, const size_t max_reps,
87 |                    std::vector<size_t>* closest_docs, uint32_t* closest_center);
88 | 
89 |   // assumes already memory allocated for pivot_data as new
90 |   // float[num_centers*dim] and select randomly num_centers points as pivots
91 |   void selecting_pivots(float* data, size_t num_points, size_t dim,
92 |                         float* pivot_data, size_t num_centers);
93 | 
94 |   void kmeanspp_selecting_pivots(float* data, size_t num_points, size_t dim,
95 |                                  float* pivot_data, size_t num_centers);
96 | }  // namespace kmeans
97 | 


--------------------------------------------------------------------------------
/include/memory_mapper.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifndef _WINDOWS
 7 | #include <fcntl.h>
 8 | #include <sys/mman.h>
 9 | #include <sys/stat.h>
10 | #include <sys/types.h>
11 | #include <unistd.h>
12 | 
13 | #else
14 | #include <Windows.h>
15 | #endif
16 | #include <string>
17 | 
18 | namespace diskann {
19 |   class MemoryMapper {  // presumably maps a file into memory; implementation is not in this header
20 |    private:
21 | #ifndef _WINDOWS
22 |     int _fd;  // POSIX-style integer file descriptor
23 | #else
24 |     HANDLE _bareFile;
25 |     HANDLE _fd;
26 | 
27 | #endif
28 |     char*       _buf;       // presumably what getBuf() returns; TODO confirm
29 |     size_t      _fileSize;  // presumably what getFileSize() returns; TODO confirm
30 |     const char* _fileName;  // NOTE(review): raw, non-owning pointer; confirm it cannot dangle after the std::string overload
31 | 
32 |    public:
33 |     MemoryMapper(const char* filename);
34 |     MemoryMapper(const std::string& filename);
35 | 
36 |     char*  getBuf();
37 |     size_t getFileSize();
38 | 
39 |     ~MemoryMapper();
40 |   };
41 | }  // namespace diskann
42 | 


--------------------------------------------------------------------------------
/include/natural_number_map.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <memory>
 7 | #include <type_traits>
 8 | 
 9 | #include "boost_dynamic_bitset_fwd.h"
10 | 
11 | namespace diskann {
12 |   // A map whose key is a natural number (from 0 onwards) and maps to a value.
13 |   // Made as both memory and performance efficient map for scenario such as
14 |   // DiskANN location-to-tag map. There, the pool of numbers is consecutive from
15 |   // zero to some max value, and it's expected that most if not all keys from 0
16 |   // up to some current maximum will be present in the map. The memory usage of
17 |   // the map is determined by the largest inserted key since it uses vector as a
18 |   // backing store and bitset for presence indication.
19 |   //
20 |   // Thread-safety: this class is not thread-safe in general.
21 |   // Exception: multiple read-only operations are safe on the object only if
22 |   // there are no writers to it in parallel.
23 |   template<typename Key, typename Value>
24 |   class natural_number_map {
25 |    public:
26 |     static_assert(std::is_trivial<Key>::value, "Key must be a trivial type");
27 |     // Some of the class member prototypes are done with this assumption to
28 |     // minimize verbosity since it's the only use case.
29 |     static_assert(std::is_trivial<Value>::value,
30 |                   "Value must be a trivial type");
31 | 
32 |     // Represents a reference to a element in the map. Used while iterating
33 |     // over map entries.
34 |     struct position {
35 |       size_t _key;
36 |       // The number of keys that were enumerated when iterating through the map
37 |       // so far. Used to early-terminate enumeration when ithere
38 |       // are no more entries in the map.
39 |       size_t _keys_already_enumerated;
40 | 
41 |       // Returns whether it's valid to access the element at this position in
42 |       // the map.
43 |       bool is_valid() const;
44 |     };
45 | 
46 |     natural_number_map();
47 | 
48 |     void   reserve(size_t count);
49 |     size_t size() const;
50 | 
51 |     void set(Key key, Value value);
52 |     void erase(Key key);
53 | 
54 |     bool contains(Key key) const;
55 |     bool try_get(Key key, Value& value) const;
56 | 
57 |     // Returns the value at the specified position. Prerequisite: position is
58 |     // valid.
59 |     Value get(const position& pos) const;
60 | 
61 |     // Finds the first element in the map, if any. Invalidated by changes in the
62 |     // map.
63 |     position find_first() const;
64 | 
65 |     // Finds the next element in the map after the specified position.
66 |     // Invalidated by changes in the map.
67 |     position find_next(const position& after_position) const;
68 | 
69 |     void clear();
70 | 
71 |    private:
72 |     // Number of entries in the map. Not the same as size() of the
73 |     // _values_vector below.
74 |     size_t _size;
75 | 
76 |     // Array of values. The key is the index of the value.
77 |     std::vector<Value> _values_vector;
78 | 
79 |     // Values that are in the set have the corresponding bit index set
80 |     // to 1.
81 |     //
82 |     // Use a pointer here to allow for forward declaration of dynamic_bitset
83 |     // in public headers to avoid making boost a dependency for clients
84 |     // of DiskANN.
85 |     std::unique_ptr<boost::dynamic_bitset<>> _values_bitset;
86 |   };
87 | }  // namespace diskann
88 | 


--------------------------------------------------------------------------------
/include/natural_number_set.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <memory>
 7 | #include <type_traits>
 8 | 
 9 | #include "boost_dynamic_bitset_fwd.h"
10 | 
11 | namespace diskann {
12 |   // A set of natural numbers (from 0 onwards). Made for scenario where the
13 |   // pool of numbers is consecutive from zero to some max value and very
14 |   // efficient methods for "add to set", "get any value from set", "is in set"
15 |   // are needed. The memory usage of the set is determined by the largest
16 |   // number of inserted entries (uses a vector as a backing store) as well as
17 |   // the largest value to be placed in it (uses bitset as well).
18 |   //
19 |   // Thread-safety: this class is not thread-safe in general.
20 |   // Exception: multiple read-only operations (e.g. is_in_set, is_empty, size)
21 |   // are safe on the object only if there are no writers to it in parallel.
22 |   template<typename T>
23 |   class natural_number_set {
24 |    public:
25 |     static_assert(std::is_trivial<T>::value,
26 |                   "Identifier must be a trivial type");
27 | 
28 |     natural_number_set();
29 | 
30 |     bool   is_empty() const;
31 |     void   reserve(size_t count);
32 |     void   insert(T id);
33 |     T      pop_any();
34 |     void   clear();
35 |     size_t size() const;
36 |     bool   is_in_set(T id) const;
37 | 
38 |    private:
39 |     // Values that are currently in set.
40 |     std::vector<T> _values_vector;
41 | 
42 |     // Values that are in the set have the corresponding bit index set
43 |     // to 1.
44 |     //
45 |     // Use a pointer here to allow for forward declaration of dynamic_bitset
46 |     // in public headers to avoid making boost a dependency for clients
47 |     // of DiskANN.
48 |     std::unique_ptr<boost::dynamic_bitset<>> _values_bitset;
49 |   };
50 | }  // namespace diskann
51 | 


--------------------------------------------------------------------------------
/include/neighbor.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #pragma once
  5 | 
  6 | #include <cstddef>
  7 | #include <mutex>
  8 | #include <vector>
  9 | #include "utils.h"
 10 | 
 11 | namespace diskann {
 12 | 
 13 |   struct Neighbor {  // an id with its distance, a flag and an origin id
 14 |     unsigned id;
 15 |     float    distance;
 16 |     bool     flag;
 17 |     unsigned rev_id = 0; // where this neighbor comes from
 18 | 
 19 |     Neighbor() = default;  // id, distance and flag have no default initializer
 20 |     Neighbor(unsigned id, float distance, bool f)
 21 |         : id{id}, distance{distance}, flag(f) {
 22 |     }
 23 | 
 24 |     Neighbor(unsigned id, float distance, bool f, unsigned r_id)
 25 |         : id{id}, distance{distance}, flag(f), rev_id{r_id} {
 26 |     }
 27 | 
 28 |     inline bool operator<(const Neighbor &other) const {  // orders by distance only
 29 |       return distance < other.distance;
 30 |     }
 31 |     inline bool operator==(const Neighbor &other) const {  // equality is by id only
 32 |       return (id == other.id);
 33 |     }
 34 |   };
 35 | 
 36 |   struct MemNavNeighbor {  // plain (id, distance, tag) triple; no default constructor
 37 |     unsigned id;
 38 |     float distance;
 39 |     unsigned tag;
 40 | 
 41 |     MemNavNeighbor(unsigned i, float d, unsigned t)
 42 |         : id{i}, distance{d}, tag{t} {
 43 |     }
 44 |   };
 45 | 
 46 | 
 47 |   struct SimpleNeighbor {  // an (id, distance) pair
 48 |     unsigned id;
 49 |     float    distance;
 50 | 
 51 |     SimpleNeighbor() = default;  // members have no default initializer
 52 |     SimpleNeighbor(unsigned id, float distance) : id(id), distance(distance) {
 53 |     }
 54 | 
 55 |     inline bool operator<(const SimpleNeighbor &other) const {  // orders by distance only
 56 |       return distance < other.distance;
 57 |     }
 58 | 
 59 |     inline bool operator==(const SimpleNeighbor &other) const {  // equality is by id only
 60 |       return id == other.id;
 61 |     }
 62 |   };
 63 |   struct SimpleNeighbors {  // wrapper around a vector of SimpleNeighbor
 64 |     std::vector<SimpleNeighbor> pool;
 65 |   };
 66 | 
 67 |   static inline unsigned InsertIntoPool(Neighbor *addr, unsigned K,
 68 |                                         Neighbor nn) {
 69 |     // find the location to insert nn in addr[0..K) (ascending distance)
 70 |     unsigned left = 0, right = K - 1;  // K == 0 would wrap right (unsigned)
 71 |     if (addr[left].distance > nn.distance) {  // nn is closer than every entry
 72 |       memmove((char *) &addr[left + 1], &addr[left], K * sizeof(Neighbor));
 73 |       addr[left] = nn;
 74 |       return left;
 75 |     }
 76 |     if (addr[right].distance < nn.distance) {  // nn is farther than every entry
 77 |       addr[K] = nn;  // append; addr must have room for K + 1 entries
 78 |       return K;
 79 |     }
 80 |     while (right > 1 && left < right - 1) {  // binary search on distance
 81 |       unsigned mid = (left + right) / 2;
 82 |       if (addr[mid].distance > nn.distance)
 83 |         right = mid;
 84 |       else
 85 |         left = mid;
 86 |     }
 87 |     // check equal ID: scan back over entries that are not closer than nn
 88 |     // (K + 1 is the "same id found, nothing inserted" return value)
 89 |     while (left > 0) {
 90 |       if (addr[left].distance < nn.distance)
 91 |         break;
 92 |       if (addr[left].id == nn.id)
 93 |         return K + 1;
 94 |       left--;
 95 |     }
 96 |     if (addr[left].id == nn.id || addr[right].id == nn.id)
 97 |       return K + 1;
 98 |     memmove((char *) &addr[right + 1], &addr[right],
 99 |             (K - right) * sizeof(Neighbor));  // shift addr[right..K) up by one
100 |     addr[right] = nn;
101 |     return right;  // index at which nn was stored
102 |   }
103 | 
104 |   // This class maintains a fixed-capacity vector sorted by distance. It supports
105 |   //    1. insert(): early return if the vector is full and the new element is
106 |   //       not closer than the last one; otherwise insert via InsertIntoPool
107 |   //    2. move_to(): move the first `num` elements to another vector
108 |   class NeighborVec {
109 |   public:
110 |     void set_cap(size_t cap) {
111 |       v.resize(cap+1);  // one spare slot: InsertIntoPool may write index size_
112 |       cap_ = cap;
113 |       size_ = std::min(size_, cap_);
114 |     }
115 | 
116 |     void insert(const Neighbor &nn) {
117 |       if (cap_ == 0) return;  // no room at all; also keeps v[size_-1] in range
118 |       if (size_ == 0) {
119 |         v[size_++] = nn;
120 |         return;
121 |       }
122 |       if (size_ == cap_ && nn.distance >= v[size_-1].distance) return;
123 |       // InsertIntoPool returns size_+1 for a duplicate id and stores nothing.
124 |       if (InsertIntoPool(v.data(), size_, nn) <= size_ && size_ < cap_) ++size_;
125 |     }
126 | 
127 |     size_t move_to(std::vector<Neighbor>& des, size_t des_idx, size_t num) {  // returns # moved
128 |       if (num > size_) {
129 |         std::cout << "warning: require more neighbors than having. num: " << num << " size_: " << size_ << std::endl;
130 |       }
131 |       num = std::min(num, size_);
132 |       if (des_idx + num >= des.size()) {  // NOTE(review): also rejects an exact fit; confirm des keeps a spare slot
133 |         std::cerr << "des size error" << std::endl;
134 |         exit(1);
135 |       }
136 |       memmove(&(des.data()[des_idx]), v.data(), num * sizeof(Neighbor));
137 |       if (size_ - num) {  // slide the remaining elements to the front
138 |         memmove(v.data(), &(v.data()[num]), (size_ - num)*sizeof(Neighbor));
139 |       }
140 |       size_ -= num;
141 |       return num;
142 |     }
143 |   private:
144 |     std::vector<Neighbor> v;  // cap_ + 1 slots once set_cap() has run
145 |     size_t cap_ = 0, size_ = 0;
146 |   };
147 | 
148 |   static inline unsigned InsertIntoPool(Neighbor *addr, unsigned K,
149 |                                         Neighbor nn, NeighborVec& kicked, unsigned L) {
150 |     // find the location to insert nn in addr[0..K) (ascending distance)
151 |     unsigned left = 0, right = K - 1;  // K == 0 would wrap right (unsigned)
152 |     if (addr[left].distance > nn.distance) {  // nn is closer than every entry
153 |       memmove((char *) &addr[left + 1], &addr[left], K * sizeof(Neighbor));
154 |       if (K == L) kicked.insert(addr[K]);  // hand the entry shifted into addr[K] to kicked
155 |       addr[left] = nn;
156 |       return left;
157 |     }
158 |     if (addr[right].distance < nn.distance) {  // nn is farther than every entry
159 |       addr[K] = nn;  // append; nothing is passed to kicked on this path
160 |       return K;
161 |     }
162 |     while (right > 1 && left < right - 1) {  // binary search on distance
163 |       unsigned mid = (left + right) / 2;
164 |       if (addr[mid].distance > nn.distance)
165 |         right = mid;
166 |       else
167 |         left = mid;
168 |     }
169 |     // scan back over entries not closer than nn; K + 1 means "same id found"
170 |     while (left > 0) {
171 |       if (addr[left].distance < nn.distance)
172 |         break;
173 |       if (addr[left].id == nn.id)
174 |         return K + 1;
175 |       left--;
176 |     }
177 |     if (addr[left].id == nn.id || addr[right].id == nn.id)
178 |       return K + 1;
179 |     memmove((char *) &addr[right + 1], &addr[right],
180 |             (K - right) * sizeof(Neighbor));  // shift addr[right..K) up by one
181 |     if (K == L) kicked.insert(addr[K]);  // hand the entry shifted into addr[K] to kicked
182 |     addr[right] = nn;
183 |     return right;  // index at which nn was stored
184 |   }
185 | }  // namespace diskann
186 | 


--------------------------------------------------------------------------------
/include/parameters.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | #include <sstream>
 6 | #include <typeinfo>
 7 | #include <unordered_map>
 8 | 
 9 | namespace diskann {
10 | 
11 |   // Type-erased bag of named parameters. Each value is a heap copy owned by
12 |   // the bag; Get<T> must use the same T that Set<T> stored under that name
13 |   // (the stored type is not checked).
14 |   class Parameters {
15 |    public:
16 |     Parameters() {
17 |       Set<int>("num_threads", 0);  // the only parameter present by default
18 |     }
19 | 
20 |     // Stores a copy of value under name, destroying any previous value first.
21 |     template<typename ParamType>
22 |     inline void Set(const std::string &name, const ParamType &value) {
23 |       Slot &slot = params[name];  // creates an empty slot for a new name
24 |       Release(slot);
25 |       slot.ptr = new ParamType(value);
26 |       slot.destroy = &Destroy<ParamType>;
27 |     }
28 | 
29 |     // Returns the value stored under name.
30 |     // Throws std::invalid_argument if name is unknown or its slot is empty.
31 |     template<typename ParamType>
32 |     inline ParamType Get(const std::string &name) const {
33 |       auto item = params.find(name);
34 |       if (item == params.end()) {
35 |         throw std::invalid_argument("Invalid parameter name.");
36 |       }
37 |       if (item->second.ptr == nullptr) {
38 |         throw std::invalid_argument(std::string("Parameter ") + name +
39 |                                     " has value null.");
40 |       }
41 |       return *(static_cast<ParamType *>(item->second.ptr));
42 |     }
43 | 
44 |     // Like Get(name), but returns default_value instead of throwing.
45 |     template<typename ParamType>
46 |     inline ParamType Get(const std::string &name,
47 |                          const ParamType &  default_value) const {
48 |       try {
49 |         return Get<ParamType>(name);
50 |       } catch (const std::invalid_argument &) {
51 |         return default_value;
52 |       }
53 |     }
54 | 
55 |     ~Parameters() {
56 |       for (auto &entry : params) {
57 |         Release(entry.second);
58 |       }
59 |     }
60 | 
61 |    private:
62 |     // A stored value plus the function that knows its real type, so that it
63 |     // is destroyed with the matching typed delete instead of free().
64 |     struct Slot {
65 |       void *ptr = nullptr;
66 |       void (*destroy)(void *) = nullptr;
67 |     };
68 | 
69 |     template<typename ParamType>
70 |     static void Destroy(void *raw) {
71 |       delete static_cast<ParamType *>(raw);
72 |     }
73 | 
74 |     // Destroys the value held by slot, if any, and leaves the slot empty.
75 |     static void Release(Slot &slot) {
76 |       if (slot.ptr != nullptr)
77 |         slot.destroy(slot.ptr);
78 |       slot.ptr = nullptr;
79 |     }
80 | 
81 |     std::unordered_map<std::string, Slot> params;
82 | 
83 |     // Not copyable: a copy would share (and double-destroy) the stored values.
84 |     Parameters(const Parameters &) = delete;
85 |     Parameters &operator=(const Parameters &) = delete;
86 | 
87 |     // Parses str into a ParamType via operator>>; throws std::runtime_error
88 |     // when the text does not convert completely.
89 |     template<typename ParamType>
90 |     inline ParamType ConvertStrToValue(const std::string &str) const {
91 |       std::stringstream sstream(str);
92 |       ParamType         value;
93 |       if (!(sstream >> value) || !sstream.eof()) {
94 |         std::stringstream err;
95 |         err << "Failed to convert value '" << str
96 |             << "' to type: " << typeid(value).name();
97 |         throw std::runtime_error(err.str());
98 |       }
99 |       return value;
100 |     }
101 |   };
102 | }  // namespace diskann
84 | 


--------------------------------------------------------------------------------
/include/partition_and_pq.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | #include <cassert>
 6 | #include <sstream>
 7 | #include <stack>
 8 | #include <string>
 9 | #include <unordered_map>
10 | 
11 | #include "neighbor.h"
12 | #include "parameters.h"
13 | #include "tsl/robin_set.h"
14 | #include "utils.h"
15 | 
16 | #include "windows_customizations.h"
17 | 
18 | template<typename T>
19 | void gen_random_slice(const std::string base_file,
20 |                       const std::string output_prefix, double sampling_rate);
21 | 
22 | template<typename T>
23 | void gen_random_slice(const std::string data_file, double p_val,
24 |                       float *&sampled_data, size_t &slice_size, size_t &ndims);
25 | 
26 | template<typename T>
27 | void gen_random_slice(const T *inputdata, size_t npts, size_t ndims,
28 |                       double p_val, float *&sampled_data, size_t &slice_size);
29 | 
30 | int estimate_cluster_sizes(float *test_data_float, size_t num_test,
31 |                            float *pivots, const size_t num_centers,
32 |                            const size_t dim, const size_t k_base,
33 |                            std::vector<size_t> &cluster_sizes);
34 | 
35 | template<typename T>
36 | int shard_data_into_clusters(const std::string data_file, float *pivots,
37 |                              const size_t num_centers, const size_t dim,
38 |                              const size_t k_base, std::string prefix_path);
39 | 
40 | template<typename T>
41 | int shard_data_into_clusters_only_ids(const std::string data_file,
42 |                                       float *pivots, const size_t num_centers,
43 |                                       const size_t dim, const size_t k_base,
44 |                                       std::string prefix_path);
45 | 
46 | template<typename T>
47 | int retrieve_shard_data_from_ids(const std::string data_file,
48 |                                  std::string       idmap_filename,
49 |                                  std::string       data_filename);
50 | 
51 | template<typename T>
52 | int partition(const std::string data_file, const float sampling_rate,
53 |               size_t num_centers, size_t max_k_means_reps,
54 |               const std::string prefix_path, size_t k_base);
55 | 
56 | template<typename T>
57 | int partition_with_ram_budget(const std::string data_file,
58 |                               const double sampling_rate, double ram_budget,
59 |                               size_t            graph_degree,
60 |                               const std::string prefix_path, size_t k_base);
61 | 
62 | DISKANN_DLLEXPORT int generate_pq_pivots(
63 |     const float *train_data, size_t num_train, unsigned dim,
64 |     unsigned num_centers, unsigned num_pq_chunks, unsigned max_k_means_reps,
65 |     std::string pq_pivots_path, bool make_zero_mean = false);
66 | 
67 | template<typename T>
68 | int generate_pq_data_from_pivots(const std::string data_file,
69 |                                  unsigned num_centers, unsigned num_pq_chunks,
70 |                                  std::string pq_pivots_path,
71 |                                  std::string pq_compressed_vectors_path);
72 | 


--------------------------------------------------------------------------------
/include/percentile_stats.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <cstddef>
 7 | #include <cstdint>
 8 | #include <fstream>
 9 | #include <functional>
10 | #include <algorithm>
11 | #ifdef _WINDOWS
12 | #include <numeric>
13 | #endif
14 | #include <string>
15 | #include <vector>
16 | 
17 | #include "distance.h"
18 | #include "parameters.h"
19 | 
20 | namespace diskann {
21 |   // Counters and timings recorded for a single query.
22 |   struct QueryStats {
23 |     float total_us = 0;  // total time to process query in micros
24 |     float io_us = 0;     // total time spent in IO
25 |     float cpu_us = 0;    // total time spent in CPU
26 | 
27 |     unsigned n_4k = 0;          // # of 4kB reads
28 |     unsigned n_8k = 0;          // # of 8kB reads
29 |     unsigned n_12k = 0;         // # of 12kB reads
30 |     unsigned n_ios = 0;         // total # of IOs issued
31 |     unsigned read_size = 0;     // total # of bytes read
32 |     unsigned n_cmps_saved = 0;  // # cmps saved
33 |     unsigned n_cmps = 0;        // # cmps
34 |     unsigned n_cache_hits = 0;  // # cache_hits
35 |     unsigned n_hops = 0;        // # search hops
36 |   };
37 | 
38 |   // Position of `percentile` (a fraction such as 0.95) within `count` sorted
39 |   // samples, clamped to [0, count - 1]. Requires count > 0.
40 |   inline uint64_t clamped_percentile_index(float percentile, uint64_t count) {
41 |     if (!(percentile > 0.0f))  // zero, negative or NaN
42 |       return 0;
43 |     if (percentile >= 1.0f)
44 |       return count - 1;
45 |     const uint64_t idx = (uint64_t)(percentile * count);
46 |     return idx < count ? idx : count - 1;  // float rounding can reach count
47 |   }
48 | 
49 |   // Returns the requested percentile of member_fn(stats[i]) for i in
50 |   // [0, len), or T() when len is 0.
51 |   template<typename T>
52 |   inline T get_percentile_stats(
53 |       QueryStats *stats, uint64_t len, float percentile,
54 |       const std::function<T(const QueryStats &)> &member_fn) {
55 |     if (len == 0)
56 |       return T();
57 |     std::vector<T> vals(len);
58 |     for (uint64_t i = 0; i < len; i++) {
59 |       vals[i] = member_fn(stats[i]);
60 |     }
61 | 
62 |     std::sort(vals.begin(), vals.end());
63 |     return vals[clamped_percentile_index(percentile, len)];
64 |   }
65 | 
66 |   // Returns the mean of member_fn(stats[i]) for i in [0, len), or 0 when
67 |   // len is 0.
68 |   template<typename T>
69 |   inline double get_mean_stats(
70 |       QueryStats *stats, uint64_t len,
71 |       const std::function<T(const QueryStats &)> &member_fn) {
72 |     if (len == 0)
73 |       return 0.0;
74 |     double avg = 0;
75 |     for (uint64_t i = 0; i < len; i++) {
76 |       avg += (double) member_fn(stats[i]);
77 |     }
78 |     return avg / len;
79 |   }
80 | 
81 |   // The following two functions are used when getting statistics while range
82 |   // searching on only queries with non-zero gt lengths. gt must have at least
83 |   // len entries; queries whose gt[i] is empty are skipped.
84 | 
85 |   // Percentile over the queries with non-empty gt[i]; T() if there are none.
86 |   template<typename T>
87 |   inline T get_percentile_stats_gt(
88 |       QueryStats *stats, uint64_t len, float percentile,
89 |       const std::function<T(const QueryStats &)> &member_fn,
90 |       const std::vector<std::vector<uint32_t>> &gt) {
91 |     std::vector<T> vals;
92 |     for (uint64_t i = 0; i < len; i++) {
93 |       if (gt[i].size()) vals.push_back(member_fn(stats[i]));
94 |     }
95 |     if (vals.empty())
96 |       return T();
97 | 
98 |     std::sort(vals.begin(), vals.end());
99 |     return vals[clamped_percentile_index(percentile, vals.size())];
100 |   }
101 | 
102 |   // Mean over the queries with non-empty gt[i]; 0 if there are none.
103 |   template<typename T>
104 |   inline double get_mean_stats_gt(
105 |       QueryStats *stats, uint64_t len,
106 |       const std::function<T(const QueryStats &)> &member_fn,
107 |       const std::vector<std::vector<uint32_t>> &gt) {
108 |     uint32_t cnt = 0;
109 |     double avg = 0;
110 |     for (uint64_t i = 0; i < len; i++) {
111 |       if (gt[i].size()) {
112 |         ++cnt;
113 |         avg += (double) member_fn(stats[i]);
114 |       }
115 |     }
116 |     return cnt == 0 ? 0.0 : avg / cnt;
117 |   }
118 | }  // namespace diskann
99 | 


--------------------------------------------------------------------------------
/include/pq_flash_index_utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "distance.h"
 4 | #include "cosine_similarity.h"
 5 | 
 6 | #ifdef _WINDOWS
 7 | #include "windows_aligned_file_reader.h"
 8 | #else
 9 | #include "linux_aligned_file_reader.h"
10 | #endif
11 | 
12 | #define READ_U64(stream, val) stream.read((char *) &val, sizeof(_u64))
13 | #define READ_U32(stream, val) stream.read((char *) &val, sizeof(_u32))
14 | #define READ_UNSIGNED(stream, val) stream.read((char *) &val, sizeof(unsigned))
15 | // The macros below use names (nnodes_per_sector, max_node_len, ...) that must be in scope where they expand.
16 | // sector # on disk where node_id is present within the graph part
17 | #define NODE_SECTOR_NO(node_id) (((_u64)(node_id)) / nnodes_per_sector + 1)
18 | 
19 | // obtains region of sector containing node
20 | #define OFFSET_TO_NODE(sector_buf, node_id) \
21 |   ((char *) sector_buf + (((_u64) node_id) % nnodes_per_sector) * max_node_len)
22 | 
23 | // returns region of `node_buf` containing [NNBRS][NBR_ID(_u32)]
24 | #define OFFSET_TO_NODE_NHOOD(node_buf) \
25 |   (unsigned *) ((char *) node_buf + disk_bytes_per_point)
26 | 
27 | // returns region of `node_buf` containing [COORD(T)]
28 | #define OFFSET_TO_NODE_COORDS(node_buf) (T *) (node_buf)
29 | 
30 | // sector # beyond the end of graph where data for id is present for reordering
31 | #define VECTOR_SECTOR_NO(id) \
32 |   (((_u64)(id)) / nvecs_per_sector + reorder_data_start_sector)
33 | 
34 | // byte offset within that sector where the reordering data for id starts
35 | #define VECTOR_SECTOR_OFFSET(id) \
36 |   ((((_u64)(id)) % nvecs_per_sector) * data_dim * sizeof(float))
37 | 
38 | namespace diskann {
39 |   namespace pq_flash_index_utils {
40 |     inline void aggregate_coords(const unsigned *ids, const _u64 n_ids,
41 |                           const _u8 *all_coords, const _u64 ndims, _u8 *out) {
42 |       for (_u64 i = 0; i < n_ids; i++) {  // gather row ids[i] of all_coords into row i of out
43 |         memcpy(out + i * ndims, all_coords + ids[i] * ndims, ndims * sizeof(_u8));
44 |       }
45 |     }
46 |     // dists_out[idx] = sum over chunks of pq_dists[256 * chunk + pq_ids[pq_nchunks * idx + chunk]]
47 |     inline void pq_dist_lookup(const _u8 *pq_ids, const _u64 n_pts,
48 |                         const _u64 pq_nchunks, const float *pq_dists,
49 |                         float *dists_out) {
50 |       _mm_prefetch((char *) dists_out, _MM_HINT_T0);
51 |       _mm_prefetch((char *) pq_ids, _MM_HINT_T0);
52 |       _mm_prefetch((char *) (pq_ids + 64), _MM_HINT_T0);
53 |       _mm_prefetch((char *) (pq_ids + 128), _MM_HINT_T0);
54 |       memset(dists_out, 0, n_pts * sizeof(float));
55 |       for (_u64 chunk = 0; chunk < pq_nchunks; chunk++) {
56 |         const float *chunk_dists = pq_dists + 256 * chunk;  // 256 floats per chunk, indexed by a _u8 id
57 |         if (chunk < pq_nchunks - 1) {  // prefetch the next chunk's table
58 |           _mm_prefetch((char *) (chunk_dists + 256), _MM_HINT_T0);
59 |         }
60 |         for (_u64 idx = 0; idx < n_pts; idx++) {
61 |           _u8 pq_centerid = pq_ids[pq_nchunks * idx + chunk];
62 |           dists_out[idx] += chunk_dists[pq_centerid];
63 |         }
64 |       }
65 |     }
66 |   }  // namespace pq_flash_index_utils
67 | }  // namespace diskann


--------------------------------------------------------------------------------
/include/simd_utils.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #ifdef _WINDOWS
  4 | #include <immintrin.h>
  5 | #include <smmintrin.h>
  6 | #include <tmmintrin.h>
  7 | #include <intrin.h>
  8 | #else
  9 | #include <immintrin.h>
 10 | #endif
 11 | 
 12 | namespace diskann {
 13 |   static inline __m256 _mm256_mul_epi8(__m256i X) {  // squares of the 32 signed bytes of X
 14 |     __m256i zero = _mm256_setzero_si256();
 15 |     // sign_x: 0xFF for every negative byte of X, 0x00 otherwise
 16 |     __m256i sign_x = _mm256_cmpgt_epi8(zero, X);
 17 |     // interleaving X with sign_x sign-extends each byte to 16 bits
 18 |     __m256i xlo = _mm256_unpacklo_epi8(X, sign_x);
 19 |     __m256i xhi = _mm256_unpackhi_epi8(X, sign_x);
 20 |     // each of the 8 float lanes ends up holding the sum of four squares
 21 |     return _mm256_cvtepi32_ps(_mm256_add_epi32(_mm256_madd_epi16(xlo, xlo),
 22 |                                                _mm256_madd_epi16(xhi, xhi)));
 23 |   }
 24 | 
 25 |   static inline __m128 _mm_mulhi_epi8(__m128i X) {  // squares of the high 8 bytes of X only
 26 |     __m128i zero = _mm_setzero_si128();
 27 |     __m128i sign_x = _mm_cmplt_epi8(X, zero);
 28 |     __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
 29 |     // adding the zero vector leaves the madd result unchanged
 30 |     return _mm_cvtepi32_ps(
 31 |         _mm_add_epi32(_mm_setzero_si128(), _mm_madd_epi16(xhi, xhi)));
 32 |   }
 33 | 
 34 |   static inline __m128 _mm_mulhi_epi8_shift32(__m128i X) {
 35 |     __m128i zero = _mm_setzero_si128();
 36 |     X = _mm_srli_epi64(X, 32);  // each 64-bit half keeps only its upper 4 bytes, moved down
 37 |     __m128i sign_x = _mm_cmplt_epi8(X, zero);
 38 |     __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
 39 |     // so only original bytes 12..15 contribute non-zero squares
 40 |     return _mm_cvtepi32_ps(
 41 |         _mm_add_epi32(_mm_setzero_si128(), _mm_madd_epi16(xhi, xhi)));
 42 |   }
 43 |   static inline __m128 _mm_mul_epi8(__m128i X, __m128i Y) {  // products of corresponding signed bytes
 44 |     __m128i zero = _mm_setzero_si128();
 45 |     // sign masks used to sign-extend the bytes to 16 bits below
 46 |     __m128i sign_x = _mm_cmplt_epi8(X, zero);
 47 |     __m128i sign_y = _mm_cmplt_epi8(Y, zero);
 48 | 
 49 |     __m128i xlo = _mm_unpacklo_epi8(X, sign_x);
 50 |     __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
 51 |     __m128i ylo = _mm_unpacklo_epi8(Y, sign_y);
 52 |     __m128i yhi = _mm_unpackhi_epi8(Y, sign_y);
 53 |     // each of the 4 float lanes holds the sum of four byte products
 54 |     return _mm_cvtepi32_ps(
 55 |         _mm_add_epi32(_mm_madd_epi16(xlo, ylo), _mm_madd_epi16(xhi, yhi)));
 56 |   }
 57 |   static inline __m128 _mm_mul_epi8(__m128i X) {  // 128-bit variant of the squares helper
 58 |     __m128i zero = _mm_setzero_si128();
 59 |     __m128i sign_x = _mm_cmplt_epi8(X, zero);
 60 |     __m128i xlo = _mm_unpacklo_epi8(X, sign_x);
 61 |     __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
 62 | 
 63 |     return _mm_cvtepi32_ps(
 64 |         _mm_add_epi32(_mm_madd_epi16(xlo, xlo), _mm_madd_epi16(xhi, xhi)));
 65 |   }
 66 |   // products of only the first 4 bytes (32 bits) of X and Y; other lanes are zero
 67 |   static inline __m128 _mm_mul32_pi8(__m128i X, __m128i Y) {
 68 |     __m128i xlo = _mm_cvtepi8_epi16(X), ylo = _mm_cvtepi8_epi16(Y);
 69 |     return _mm_cvtepi32_ps(
 70 |         _mm_unpacklo_epi32(_mm_madd_epi16(xlo, ylo), _mm_setzero_si128()));
 71 |   }
 72 | 
 73 |   static inline __m256 _mm256_mul_epi8(__m256i X, __m256i Y) {  // products of corresponding signed bytes
 74 |     __m256i zero = _mm256_setzero_si256();
 75 | 
 76 |     __m256i sign_x = _mm256_cmpgt_epi8(zero, X);
 77 |     __m256i sign_y = _mm256_cmpgt_epi8(zero, Y);
 78 | 
 79 |     __m256i xlo = _mm256_unpacklo_epi8(X, sign_x);
 80 |     __m256i xhi = _mm256_unpackhi_epi8(X, sign_x);
 81 |     __m256i ylo = _mm256_unpacklo_epi8(Y, sign_y);
 82 |     __m256i yhi = _mm256_unpackhi_epi8(Y, sign_y);
 83 | 
 84 |     return _mm256_cvtepi32_ps(_mm256_add_epi32(_mm256_madd_epi16(xlo, ylo),
 85 |                                                _mm256_madd_epi16(xhi, yhi)));
 86 |   }
 87 |   // products of only the first 4 bytes of X and Y: blend mask 252 zeroes lanes 2..7
 88 |   static inline __m256 _mm256_mul32_pi8(__m128i X, __m128i Y) {
 89 |     __m256i xlo = _mm256_cvtepi8_epi16(X), ylo = _mm256_cvtepi8_epi16(Y);
 90 |     return _mm256_blend_ps(_mm256_cvtepi32_ps(_mm256_madd_epi16(xlo, ylo)),
 91 |                            _mm256_setzero_ps(), 252);
 92 |   }
 93 | 
 94 |   static inline float _mm256_reduce_add_ps(__m256 x) {  // horizontal sum of the 8 lanes
 95 |     /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
 96 |     const __m128 x128 =
 97 |         _mm_add_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
 98 |     /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
 99 |     const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
100 |     /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
101 |     const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
102 |     /* Conversion to float is a no-op on x86-64 */
103 |     return _mm_cvtss_f32(x32);
104 |   }
105 | }  // namespace diskann
106 | 


--------------------------------------------------------------------------------
/include/timer.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <chrono>
 5 | 
 6 | namespace diskann {
 7 |   class Timer {
 8 |     typedef std::chrono::high_resolution_clock _clock;
 9 |     std::chrono::time_point<_clock>            check_point;
10 | 
11 |    public:
12 |     Timer() : check_point(_clock::now()) {
13 |     }
14 | 
15 |     void reset() {
16 |       check_point = _clock::now();
17 |     }
18 | 
19 |     long long elapsed() const {
20 |       return std::chrono::duration_cast<std::chrono::microseconds>(
21 |                  _clock::now() - check_point)
22 |           .count();
23 |     }
24 |   };
25 | }  // namespace diskann
26 | 


--------------------------------------------------------------------------------
/include/windows_aligned_file_reader.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | #ifdef _WINDOWS
 6 | #ifndef USE_BING_INFRA
 7 | #include <Windows.h>
 8 | #include <fcntl.h>
 9 | #include <malloc.h>
10 | #include <minwinbase.h>
11 | 
12 | #include <cstdio>
13 | #include <mutex>
14 | #include <thread>
15 | #include "aligned_file_reader.h"
16 | #include "tsl/robin_map.h"
17 | #include "utils.h"
18 | #include "windows_customizations.h"
19 | 
20 | class WindowsAlignedFileReader : public AlignedFileReader {
21 |  private:
22 |   std::wstring m_filename;
23 | 
24 |  protected:
25 |   // virtual IOContext createContext();
26 | 
27 |  public:
28 |   DISKANN_DLLEXPORT WindowsAlignedFileReader(){};
29 |   DISKANN_DLLEXPORT virtual ~WindowsAlignedFileReader(){};
30 | 
31 |   // Open & close ops
32 |   // Blocking calls
33 |   DISKANN_DLLEXPORT virtual void open(const std::string &fname) override;
34 |   DISKANN_DLLEXPORT virtual void close() override;
35 | 
36 |   DISKANN_DLLEXPORT virtual void register_thread() override;
37 |   DISKANN_DLLEXPORT virtual void deregister_thread() override {
38 |     // TODO: Needs implementation.
39 |   }
40 |   DISKANN_DLLEXPORT virtual void deregister_all_threads() override {
41 |     // TODO: Needs implementation.
42 |   }
43 |   DISKANN_DLLEXPORT virtual IOContext &get_ctx() override;
44 | 
45 |   // process batch of aligned requests in parallel
46 |   // NOTE :: blocking call for the calling thread, but can thread-safe
47 |   DISKANN_DLLEXPORT virtual void read(std::vector<AlignedRead> &read_reqs,
48 |                                       IOContext &ctx, bool async) override;
49 | };
50 | #endif  // USE_BING_INFRA
51 | #endif  //_WINDOWS
52 | 


--------------------------------------------------------------------------------
/include/windows_customizations.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #pragma once
 5 | 
 6 | #ifdef _WINDOWS
 7 | 
 8 | #ifdef _WINDLL
 9 | #define DISKANN_DLLEXPORT __declspec(dllexport)
10 | #else
11 | #define DISKANN_DLLEXPORT __declspec(dllimport)
12 | #endif
13 | 
14 | #else
15 | #define DISKANN_DLLEXPORT
16 | #endif
17 | 


--------------------------------------------------------------------------------
/include/windows_slim_lock.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #ifndef WIN32_LEAN_AND_MEAN
 5 | #define WIN32_LEAN_AND_MEAN
 6 | #endif
 7 | #include "Windows.h"
 8 | 
 9 | namespace diskann {
10 |   // A thin C++ wrapper around Windows exclusive functionality of Windows
11 |   // SlimReaderWriterLock.
12 |   //
13 |   // The SlimReaderWriterLock is simpler/more lightweight than std::mutex
14 |   // (8 bytes vs 80 bytes), which is useful in the scenario where DiskANN has
15 |   // one lock per vector in the index. It does not support recursive locking and
16 |   // requires Windows Vista or later.
17 |   //
18 |   // Full documentation can be found at.
19 |   // https://msdn.microsoft.com/en-us/library/windows/desktop/aa904937(v=vs.85).aspx
20 |   class windows_exclusive_slim_lock {
21 |    public:
22 |     windows_exclusive_slim_lock() : _lock(SRWLOCK_INIT) {
23 |     }
24 | 
25 |     // The lock is non-copyable. This also disables move constructor/operator=.
26 |     windows_exclusive_slim_lock(const windows_exclusive_slim_lock&) = delete;
27 |     windows_exclusive_slim_lock& operator=(const windows_exclusive_slim_lock&) =
28 |         delete;
29 | 
30 |     void lock() {
31 |       return AcquireSRWLockExclusive(&_lock);
32 |     }
33 | 
34 |     bool try_lock() {
35 |       return TryAcquireSRWLockExclusive(&_lock) != FALSE;
36 |     }
37 | 
38 |     void unlock() {
39 |       return ReleaseSRWLockExclusive(&_lock);
40 |     }
41 | 
42 |    private:
43 |     SRWLOCK _lock;
44 |   };
45 | 
46 |   // An exclusive lock over a SlimReaderWriterLock.
47 |   class windows_exclusive_slim_lock_guard {
48 |    public:
49 |     windows_exclusive_slim_lock_guard(windows_exclusive_slim_lock& p_lock)
50 |         : _lock(p_lock) {
51 |       _lock.lock();
52 |     }
53 | 
54 |     // The lock is non-copyable. This also disables move constructor/operator=.
55 |     windows_exclusive_slim_lock_guard(
56 |         const windows_exclusive_slim_lock_guard&) = delete;
57 |     windows_exclusive_slim_lock_guard& operator=(
58 |         const windows_exclusive_slim_lock_guard&) = delete;
59 | 
60 |     ~windows_exclusive_slim_lock_guard() {
61 |       _lock.unlock();
62 |     }
63 | 
64 |    private:
65 |     windows_exclusive_slim_lock& _lock;
66 |   };
67 | }  // namespace diskann
68 | 


--------------------------------------------------------------------------------
/scripts/config_ci.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | float_dataset() {
 4 |   BASE_PATH=../tests_data/rand_float_10D_10K_norm1.0.bin
 5 |   QUERY_FILE=../tests_data/rand_float_10D_10K_norm1.0.bin
 6 |   GT_FILE=../tests_data/l2_rand_float_10D_10K_norm1.0_self_gt10
 7 |   PREFIX=float_10k
 8 |   DATA_TYPE=float
 9 |   DIST_FN=l2
10 |   B=0.00003
11 |   K=5
12 |   DATA_DIM=10
13 |   DATA_N=10000
14 | }
15 | 
16 | uint8_dataset() {
17 |   BASE_PATH=../tests_data/rand_uint8_10D_10K_norm50.0.bin
18 |   QUERY_FILE=../tests_data/rand_uint8_10D_10K_norm50.0.bin
19 |   GT_FILE=../tests_data/l2_rand_uint8_10D_10K_norm50.0_self_gt10
20 |   PREFIX=int_10k
21 |   DATA_TYPE=uint8
22 |   DIST_FN=l2
23 |   B=0.00003
24 |   K=5
25 |   DATA_DIM=10
26 |   DATA_N=10000
27 | }
28 | 
29 | # DATASET_PLACEHOLDER
30 | 
31 | ##################
32 | #   Disk Build   #
33 | ##################
34 | R=16
35 | BUILD_L=32
36 | M=1
37 | BUILD_T=16
38 | 
39 | ##################################
40 | #   In-Memory Navigation Graph   #
41 | ##################################
42 | MEM_R=16
43 | MEM_BUILD_L=32
44 | MEM_ALPHA=1.2
45 | MEM_RAND_SAMPLING_RATE=0.001
46 | MEM_USE_FREQ=0
47 | MEM_FREQ_USE_RATE=0.001
48 | 
49 | ##########################
50 | #   Generate Frequency   #
51 | ##########################
52 | FREQ_QUERY_FILE=$QUERY_FILE
53 | FREQ_QUERY_CNT=0 # Set 0 to use all (default)
54 | FREQ_BM=4
55 | FREQ_L=100 # only support one value at a time for now
56 | FREQ_T=16
57 | FREQ_CACHE=0
58 | FREQ_MEM_L=0 # non-zero to enable
59 | FREQ_MEM_TOPK=10
60 | 
61 | #######################
62 | #   Graph Partition   #
63 | #######################
64 | GP_TIMES=5
65 | GP_T=16
66 | GP_USE_FREQ=0
67 | GP_LOCK_NUMS=0
68 | GP_CUT=4096 
69 | 
70 | ##############
71 | #   Search   #
72 | ##############
73 | BM_LIST=(2)
74 | T_LIST=(16)
75 | CACHE=0
76 | MEM_L=0 # non-zero to enable
77 | MEM_TOPK=3
78 | 
79 | #############
80 | #    SQ     #  
81 | #############
82 | USE_SQ=0
83 | 
84 | 
85 | # Page Search
86 | USE_PAGE_SEARCH=0 # Set 0 for beam search, 1 for page search (default)
87 | PS_USE_RATIO=1.0
88 | 
89 | # KNN
90 | LS="10 12 14 16"
91 | 
92 | # Range search
93 | RS_LS="80 100"
94 | 


--------------------------------------------------------------------------------
/scripts/config_dataset.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Switch dataset in the config_local.sh file by calling the desired function
 4 | 
 5 | #################
 6 | #   BIGANN10M   #
 7 | #################
 8 | dataset_bigann10M() {
 9 |   BASE_PATH=/data/datasets/BIGANN/base.10M.u8bin
10 |   QUERY_FILE=/data/datasets/BIGANN/query.public.10K.128.u8bin
11 |   GT_FILE=/data/datasets/BIGANN/bigann-10M-gt.bin 
12 |   PREFIX=bigann_10m
13 |   DATA_TYPE=uint8
14 |   DIST_FN=l2
15 |   B=0.3
16 |   K=10
17 |   DATA_DIM=128
18 |   DATA_N=10000000
19 | }
20 | 


--------------------------------------------------------------------------------
/scripts/config_sample.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | source config_dataset.sh
 3 | 
 4 | # Choose the dataset by uncomment the line below
 5 | # If multiple lines are uncommented, only the last dataset is effective
 6 | # dataset_bigann10M
 7 | 
 8 | ##################
 9 | #   Disk Build   #
10 | ##################
11 | R=48
12 | BUILD_L=128
13 | M=32
14 | BUILD_T=8
15 | 
16 | ##################
17 | #       SQ       #
18 | ##################
19 | USE_SQ=0
20 | 
21 | ##################################
22 | #   In-Memory Navigation Graph   #
23 | ##################################
24 | MEM_R=48
25 | MEM_BUILD_L=128
26 | MEM_ALPHA=1.2
27 | MEM_RAND_SAMPLING_RATE=0.01
28 | MEM_USE_FREQ=0
29 | MEM_FREQ_USE_RATE=0.01
30 | 
31 | ##########################
32 | #   Generate Frequency   #
33 | ##########################
34 | FREQ_QUERY_FILE=$QUERY_FILE
35 | FREQ_QUERY_CNT=0 # Set 0 to use all (default)
36 | FREQ_BM=4
37 | FREQ_L=100 # only support one value at a time for now
38 | FREQ_T=16
39 | FREQ_CACHE=0
40 | FREQ_MEM_L=0 # non-zero to enable
41 | FREQ_MEM_TOPK=10
42 | 
43 | #######################
44 | #   Graph Partition   #
45 | #######################
46 | GP_TIMES=16
47 | GP_T=16
48 | GP_LOCK_NUMS=0 # will lock nodes at init, the lock_node_nums = partition_size * GP_LOCK_NUMS
49 | GP_USE_FREQ=0 # use freq file to partition graph
50 | GP_CUT=4096 # the graph's degree will been limited at 4096
51 | 
52 | 
53 | ##############
54 | #   Search   #
55 | ##############
56 | BM_LIST=(4)
57 | T_LIST=(8)
58 | CACHE=0
59 | MEM_L=0 # non-zero to enable
60 | 
61 | # Page Search
62 | USE_PAGE_SEARCH=1 # Set 0 for beam search, 1 for page search (default)
63 | PS_USE_RATIO=1.0
64 | 
65 | # KNN
66 | LS="100"
67 | 
68 | # Range search
69 | RS_LS="80"
70 | RS_ITER_KNN_TO_RANGE_SEARCH=1 # 0 for custom search, 1 for iterating via KNN, combine with USE_PAGE_SEARCH
71 | KICKED_SIZE=0 # non-zero to reuse intermediate states during page search
72 | RS_CUSTOM_ROUND=0 # set when use custom search, 0 for all pages within radius
73 | 


--------------------------------------------------------------------------------
/scripts/multiple_runs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | SUMMARY_PATH=../indices/summary.log
 4 | 
 5 | GREEN='\033[0;32m'
 6 | NC='\033[0m'
 7 | 
 8 | rm $SUMMARY_PATH
 9 | 
10 | for l in 5 25 50 ; do
11 |   echo "RS_LS=\"${l}\"" >> config_local.sh
12 | 
13 | for bm in 4 8 16; do
14 |   echo "BM_LIST=(${bm})" >> config_local.sh
15 | # for ml in 50 100 200; do
16 | # for ml in 50 200; do
17 | #   echo "MEM_L=${ml}" >> config_local.sh
18 | 
19 | # for knn_ml in 20 40 80; do
20 | #   echo "RS_KNN_MEM_L=${ml}" >> config_local.sh
21 | 
22 | # for ks in 500; do
23 |   echo "KICKED_SIZE=${ks}" >> config_local.sh
24 |   for i in $(seq 1 $1); do
25 |     printf "${GREEN}Run $i ${NC}\n"
26 |     ./run_benchmark.sh release search range
27 |   done
28 | done
29 | done
30 | # done
31 | # done
32 | 
33 | printf "${GREEN}Summary${NC}\n"
34 | cat $SUMMARY_PATH | grep -E "([0-9]+(\.[0-9]+\s+)){5,}"
35 | 


--------------------------------------------------------------------------------
/scripts/unset.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | source config_local.sh
  3 | 
  4 | INDEX_PREFIX_PATH="${PREFIX}_M${M}_R${R}_L${BUILD_L}_B${B}/"
  5 | MEM_SAMPLE_PATH="${INDEX_PREFIX_PATH}SAMPLE_RATE_${MEM_RAND_SAMPLING_RATE}/"
  6 | MEM_INDEX_PATH="${INDEX_PREFIX_PATH}MEM_R_${MEM_R}_L_${MEM_BUILD_L}_ALPHA_${MEM_ALPHA}_MEM_USE_FREQ${MEM_USE_FREQ}/"
  7 | GP_PATH="${INDEX_PREFIX_PATH}GP_TIMES_${GP_TIMES}_LOCK_${GP_LOCK_NUMS}_GP_USE_FREQ${GP_USE_FREQ}_CUT${GP_CUT}/"
  8 | FREQ_PATH="${INDEX_PREFIX_PATH}FREQ/NQ_${FREQ_QUERY_CNT}_BM_${FREQ_BM}_L_${FREQ_L}_T_${FREQ_T}/"
  9 | 
 10 | print_usage_and_exit(){
 11 |     echo "Usage: ./unset.sh [compile/index_file/gp/mem_index/freq/sample_file/index_dir/relayout] [release/debug]"
 12 |     exit -1;
 13 | }
 14 | cd ../indices
 15 | case $1 in 
 16 |     compile)
 17 |         echo "remove all compiled file."
 18 |         rm -rf ../debug ../release
 19 |     ;;
 20 |     index_file)
 21 |         echo "copy the un-gp disk index to disk index"
 22 |         OLD_INDEX_FILE=${INDEX_PREFIX_PATH}_disk_beam_search.index
 23 |         INDEX_FILE=${INDEX_PREFIX_PATH}_disk.index
 24 |         if [ -f $OLD_INDEX_FILE ]; then
 25 |             cp $OLD_INDEX_FILE $INDEX_FILE
 26 |         else 
 27 |             echo "Wrong! make sure you have the old index file copy."
 28 |         fi
 29 |     ;;
 30 |     gp)
 31 |         echo "remove gp dir and reset the gp index to no-gp index."
 32 |         echo "unset index_file."
 33 |         OLD_INDEX_FILE=${INDEX_PREFIX_PATH}_disk_beam_search.index
 34 |         INDEX_FILE=${INDEX_PREFIX_PATH}_disk.index
 35 |         if [ -f $OLD_INDEX_FILE ]; then
 36 |             echo "copy the un-gp disk index to disk index..."
 37 |             cp $OLD_INDEX_FILE $INDEX_FILE
 38 |         fi
 39 |         echo ""
 40 |         rm -rf $GP_PATH
 41 |         rm -f ${INDEX_PREFIX_PATH}_partition.bin
 42 |     ;;
 43 |     mem_index)
 44 |         echo "remove mem index dir."
 45 |         rm -rf ${MEM_INDEX_PATH}
 46 |     ;;
 47 |     freq)
 48 |         echo "remove freq dir."
 49 |         rm -rf ${FREQ_PATH}
 50 |     ;;
 51 |     sample_file)
 52 |         echo "remove sample data dir."
 53 |         rm -rf ${MEM_SAMPLE_PATH}
 54 |     ;;
 55 |     index_dir)
 56 |         echo "remove index file dir."
 57 |         rm -rf ${INDEX_PREFIX_PATH}
 58 |     ;;
 59 |     relayout)
 60 |         case $2 in
 61 |             debug)
 62 |                 cmake -DCMAKE_BUILD_TYPE=Debug .. -B ../debug
 63 |                 EXE_PATH=../debug
 64 |             ;;
 65 |             release)
 66 |                 cmake -DCMAKE_BUILD_TYPE=Release .. -B ../release
 67 |                 EXE_PATH=../release
 68 |             ;;
 69 |             *)
 70 |                 print_usage_and_exit
 71 |             ;;
 72 |         esac
 73 |         pushd $EXE_PATH
 74 |         make -j
 75 |         popd
 76 |     
 77 |         echo "will relayout the index file using the gpfile in gp dir."
 78 |         echo "unset index_file."
 79 |         OLD_INDEX_FILE=${INDEX_PREFIX_PATH}_disk_beam_search.index
 80 |         INDEX_FILE=${INDEX_PREFIX_PATH}_disk.index
 81 |         if [ ! -f $INDEX_FILE ]; then
 82 |             echo "ERRO! no disk index file!"
 83 |             exit 1; 
 84 |         fi
 85 | 
 86 |         if [ ! -f $OLD_INDEX_FILE ]; then
 87 |             echo "no old file, will copy the index to old index file."
 88 |             cp $INDEX_FILE $OLD_INDEX_FILE
 89 |         fi
 90 | 
 91 |         if [ ! -d ${GP_PATH} ]; then
 92 |             echo "ERRO! no gp dir, maybe you should run './run_benchmark.sh release gp knn' first."
 93 |             exit 1;
 94 |         fi
 95 | 
 96 |         if [ ! -f ${GP_PATH}_part.bin ]; then
 97 |             echo "ERRO! no gp file in gp dir, maybe you should run './run_benchmark.sh release gp knn' first."
 98 |             exit 1;
 99 |         fi
100 |         echo ${EXE_PATH}
101 |         time ${EXE_PATH}/tests/utils/index_relayout ${OLD_INDEX_FILE} ${GP_PATH}_part.bin > ${GP_PATH}relayout.log
102 |         cp ${GP_PATH}_part_tmp.index ${INDEX_PREFIX_PATH}_disk.index
103 |         cp ${GP_PATH}_part.bin ${INDEX_PREFIX_PATH}_partition.bin
104 |     ;;
105 |     search)
106 |         echo rm ${INDEX_PREFIX_PATH}search
107 |         rm ${INDEX_PREFIX_PATH}search/*
108 |     ;;
109 | esac
110 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT license.
 3 | 
 4 | set(CMAKE_CXX_STANDARD 17)
 5 | 
 6 | if(MSVC)
 7 |     add_subdirectory(dll)
 8 | else()
 9 |     #file(GLOB CPP_SOURCES *.cpp)
10 |     set(CPP_SOURCES ann_exception.cpp aux_utils.cpp distance.cpp index.cpp
11 |         linux_aligned_file_reader.cpp math_utils.cpp natural_number_map.cpp
12 |         natural_number_set.cpp memory_mapper.cpp partition_and_pq.cpp
13 |         pq_flash_index.cpp logger.cpp utils.cpp page_search.cpp visit_freq.cpp
14 |         range_search.cpp)
15 |     add_library(${PROJECT_NAME} ${CPP_SOURCES})
16 |     add_library(${PROJECT_NAME}_s STATIC ${CPP_SOURCES})
17 | endif()
18 | install()
19 | 


--------------------------------------------------------------------------------
/src/ann_exception.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include "ann_exception.h"
 5 | #include <sstream>
 6 | #include <string>
 7 | 
 8 | namespace diskann {
 9 |   ANNException::ANNException(const std::string& message, int errorCode)
10 |       : std::runtime_error(message), _errorCode(errorCode) {
11 |   }
12 | 
13 |   std::string package_string(const std::string& item_name,
14 |                              const std::string& item_val) {
15 |     return std::string("[") + item_name + ": " + std::string(item_val) +
16 |            std::string("]");
17 |   }
18 | 
19 |   ANNException::ANNException(const std::string& message, int errorCode,
20 |                              const std::string& funcSig,
21 |                              const std::string& fileName, unsigned lineNum)
22 |       : ANNException(
23 |             package_string(std::string("FUNC"), funcSig) +
24 |                 package_string(std::string("FILE"), fileName) +
25 |                 package_string(std::string("LINE"), std::to_string(lineNum)) +
26 |                 "  " + message,
27 |             errorCode) {
28 |   }
29 | 
30 |   FileException::FileException(const std::string& filename,
31 |                                std::system_error& e, const std::string& funcSig,
32 |                                const std::string& fileName,
33 |                                unsigned int       lineNum)
34 |       : ANNException(std::string(" While opening file \'") + filename +
35 |                          std::string("\', error code: ") +
36 |                          std::to_string(e.code().value()) + "  " +
37 |                          e.code().message(),
38 |                      e.code().value(), funcSig, fileName, lineNum) {
39 |   }
40 | 
41 | }  // namespace diskann


--------------------------------------------------------------------------------
/src/dll/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT license.
 3 | 
 4 | add_library(${PROJECT_NAME} SHARED dllmain.cpp ../partition_and_pq.cpp ../pq_flash_index.cpp ../logger.cpp ../utils.cpp 
 5 |     ../windows_aligned_file_reader.cpp ../distance.cpp ../memory_mapper.cpp ../index.cpp ../math_utils.cpp ../aux_utils.cpp
 6 |     ../ann_exception.cpp ../natural_number_set.cpp ../natural_number_map.cpp)
 7 | 
 8 | set(TARGET_DIR "$<$<CONFIG:Debug>:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}>$<$<CONFIG:Release>:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}>")
 9 | set(DISKANN_DLL_IMPLIB "${TARGET_DIR}/${PROJECT_NAME}.lib")
10 | 
11 | target_compile_definitions(${PROJECT_NAME} PRIVATE _USRDLL _WINDLL)
12 | target_compile_options(${PROJECT_NAME} PRIVATE /MD /GL)
13 | target_include_directories(${PROJECT_NAME} PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES})
14 | 
15 | target_link_options(${PROJECT_NAME} PRIVATE /DLL /IMPLIB:${DISKANN_DLL_IMPLIB} /LTCG)
16 | target_link_libraries(${PROJECT_NAME} PRIVATE ${DISKANN_MKL_LINK_LIBRARIES})
17 | 
18 | if (DISKANN_DLL_TCMALLOC_LINK_OPTIONS)
19 |     target_link_libraries(${PROJECT_NAME} PUBLIC ${DISKANN_DLL_TCMALLOC_LINK_OPTIONS})
20 | endif()
21 | 
22 | # Copy OpenMP DLL and PDB.
23 | set(RUNTIME_FILES_TO_COPY ${OPENMP_WINDOWS_RUNTIME_FILES} ${TCMALLOC_WINDOWS_RUNTIME_FILES})
24 | 
25 | foreach(RUNTIME_FILE ${RUNTIME_FILES_TO_COPY})
26 |     add_custom_command(TARGET ${PROJECT_NAME}
27 |                        POST_BUILD
28 |                        COMMAND ${CMAKE_COMMAND} -E copy "${RUNTIME_FILE}" "${TARGET_DIR}")
29 | endforeach()
30 | 


--------------------------------------------------------------------------------
/src/dll/dllmain.cpp:
--------------------------------------------------------------------------------
 1 | // dllmain.cpp : Defines the entry point for the DLL application.
 2 | #include <windows.h>
 3 | 
 4 | BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call,
 5 |                       LPVOID lpReserved) {
 6 |   switch (ul_reason_for_call) {
 7 |     case DLL_PROCESS_ATTACH:
 8 |     case DLL_THREAD_ATTACH:
 9 |     case DLL_THREAD_DETACH:
10 |     case DLL_PROCESS_DETACH:
11 |       break;
12 |   }
13 |   return TRUE;
14 | }
15 | 


--------------------------------------------------------------------------------
/src/logger.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <cstring>
 5 | #include <iostream>
 6 | 
 7 | #ifdef EXEC_ENV_OLS
 8 | #include "ANNLoggingImpl.hpp"
 9 | #endif
10 | 
11 | #include "logger_impl.h"
12 | #include "windows_customizations.h"
13 | 
14 | namespace diskann {
15 | 
16 |   DISKANN_DLLEXPORT ANNStreamBuf coutBuff(stdout);
17 |   DISKANN_DLLEXPORT ANNStreamBuf cerrBuff(stderr);
18 | 
19 |   DISKANN_DLLEXPORT std::basic_ostream<char> cout(&coutBuff);
20 |   DISKANN_DLLEXPORT std::basic_ostream<char> cerr(&cerrBuff);
21 | 
22 |   ANNStreamBuf::ANNStreamBuf(FILE* fp) {
23 |     if (fp == nullptr) {
24 |       throw diskann::ANNException(
25 |           "File pointer passed to ANNStreamBuf() cannot be null", -1);
26 |     }
27 |     if (fp != stdout && fp != stderr) {
28 |       throw diskann::ANNException(
29 |           "The custom logger only supports stdout and stderr.", -1);
30 |     }
31 |     _fp = fp;
32 |     _logLevel = (_fp == stdout) ? ANNIndex::LogLevel::LL_Info
33 |                                 : ANNIndex::LogLevel::LL_Error;
34 | #ifdef EXEC_ENV_OLS
35 |     _buf = new char[BUFFER_SIZE + 1];  // See comment in the header
36 | #else
37 |     _buf = new char[BUFFER_SIZE];  // See comment in the header
38 | #endif
39 | 
40 |     std::memset(_buf, 0, (BUFFER_SIZE) * sizeof(char));
41 |     setp(_buf, _buf + BUFFER_SIZE);
42 |   }
43 | 
44 |   ANNStreamBuf::~ANNStreamBuf() {
45 |     sync();
46 |     _fp = nullptr;  // we'll not close because we can't.
47 |     delete[] _buf;
48 |   }
49 | 
50 |   int ANNStreamBuf::overflow(int c) {
51 |     std::lock_guard<std::mutex> lock(_mutex);
52 |     if (c != EOF) {
53 |       *pptr() = (char) c;
54 |       pbump(1);
55 |     }
56 |     flush();
57 |     return c;
58 |   }
59 | 
60 |   int ANNStreamBuf::sync() {
61 |     std::lock_guard<std::mutex> lock(_mutex);
62 |     flush();
63 |     return 0;
64 |   }
65 | 
66 |   int ANNStreamBuf::underflow() {
67 |     throw diskann::ANNException(
68 |         "Attempt to read on streambuf meant only for writing.", -1);
69 |   }
70 | 
71 |   int ANNStreamBuf::flush() {
72 |     const int num = (int) (pptr() - pbase());
73 |     logImpl(pbase(), num);
74 |     pbump(-num);
75 |     return num;
76 |   }
77 |   void ANNStreamBuf::logImpl(char* str, int num) {
78 | #ifdef EXEC_ENV_OLS
79 |     str[num] = '\0';  // Safe. See the c'tor.
80 |     DiskANNLogging(_logLevel, str);
81 | #else
82 |     fwrite(str, sizeof(char), num, _fp);
83 |     fflush(_fp);
84 | #endif
85 |   }
86 | 
87 | }  // namespace diskann
88 | 


--------------------------------------------------------------------------------
/src/memory_mapper.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include "logger.h"
 5 | #include "memory_mapper.h"
 6 | #include <iostream>
 7 | #include <sstream>
 8 | 
 9 | using namespace diskann;
10 | 
11 | MemoryMapper::MemoryMapper(const std::string& filename)
12 |     : MemoryMapper(filename.c_str()) {
13 | }
14 | 
15 | MemoryMapper::MemoryMapper(const char* filename) {
16 | #ifndef _WINDOWS
17 |   _fd = open(filename, O_RDONLY);
18 |   if (_fd <= 0) {
19 |     std::cerr << "Inner vertices file not found" << std::endl;
20 |     return;
21 |   }
22 |   struct stat sb;
23 |   if (fstat(_fd, &sb) != 0) {
24 |     std::cerr << "Inner vertices file not dound. " << std::endl;
25 |     return;
26 |   }
27 |   _fileSize = sb.st_size;
28 |   diskann::cout << "File Size: " << _fileSize << std::endl;
29 |   _buf = (char*) mmap(NULL, _fileSize, PROT_READ, MAP_PRIVATE, _fd, 0);
30 | #else
31 |   _bareFile = CreateFileA(filename, GENERIC_READ | GENERIC_EXECUTE, 0, NULL,
32 |                           OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
33 |   if (_bareFile == nullptr) {
34 |     std::ostringstream message;
35 |     message << "CreateFileA(" << filename << ") failed with error "
36 |             << GetLastError() << std::endl;
37 |     std::cerr << message.str();
38 |     throw std::exception(message.str().c_str());
39 |   }
40 | 
41 |   _fd = CreateFileMapping(_bareFile, NULL, PAGE_EXECUTE_READ, 0, 0, NULL);
42 |   if (_fd == nullptr) {
43 |     std::ostringstream message;
44 |     message << "CreateFileMapping(" << filename << ") failed with error "
45 |             << GetLastError() << std::endl;
46 |     std::cerr << message.str() << std::endl;
47 |     throw std::exception(message.str().c_str());
48 |   }
49 | 
50 |   _buf = (char*) MapViewOfFile(_fd, FILE_MAP_READ, 0, 0, 0);
51 |   if (_buf == nullptr) {
52 |     std::ostringstream message;
53 |     message << "MapViewOfFile(" << filename
54 |             << ") failed with error: " << GetLastError() << std::endl;
55 |     std::cerr << message.str() << std::endl;
56 |     throw std::exception(message.str().c_str());
57 |   }
58 | 
59 |   LARGE_INTEGER fSize;
60 |   if (TRUE == GetFileSizeEx(_bareFile, &fSize)) {
61 |     _fileSize = fSize.QuadPart;  // take the 64-bit value
62 |     diskann::cout << "File Size: " << _fileSize << std::endl;
63 |   } else {
64 |     std::cerr << "Failed to get size of file " << filename << std::endl;
65 |   }
66 | #endif
67 | }
68 | char* MemoryMapper::getBuf() {
69 |   return _buf;
70 | }
71 | 
72 | size_t MemoryMapper::getFileSize() {
73 |   return _fileSize;
74 | }
75 | 
76 | MemoryMapper::~MemoryMapper() {
77 | #ifndef _WINDOWS
78 |   if (munmap(_buf, _fileSize) != 0)
79 |     std::cerr << "ERROR unmapping. CHECK!" << std::endl;
80 |   close(_fd);
81 | #else
82 |   if (FALSE == UnmapViewOfFile(_buf)) {
83 |     std::cerr << "Unmap view of file failed. Error: " << GetLastError()
84 |               << std::endl;
85 |   }
86 | 
87 |   if (FALSE == CloseHandle(_fd)) {
88 |     std::cerr << "Failed to close memory mapped file. Error: " << GetLastError()
89 |               << std::endl;
90 |   }
91 | 
92 |   if (FALSE == CloseHandle(_bareFile)) {
93 |     std::cerr << "Failed to close file: " << _fileName
94 |               << " Error: " << GetLastError() << std::endl;
95 |   }
96 | 
97 | #endif
98 | }
99 | 


--------------------------------------------------------------------------------
/src/natural_number_map.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #include <assert.h>
  5 | #include <boost/dynamic_bitset.hpp>
  6 | 
  7 | #include "natural_number_map.h"
  8 | 
  9 | namespace diskann {
 10 |   // Key carried by a position that does not refer to any stored entry.
 11 |   static constexpr auto invalid_position = boost::dynamic_bitset<>::npos;
 12 | 
 13 |   // Builds an empty map: zero entries and an empty presence bitset.
 14 |   template<typename Key, typename Value>
 15 |   natural_number_map<Key, Value>::natural_number_map()
 16 |       : _size(0), _values_bitset(std::make_unique<boost::dynamic_bitset<>>()) {
 17 |   }
 18 | 
 19 |   // Reserves capacity for `count` slots in the value vector and the bitset.
 20 |   template<typename Key, typename Value>
 21 |   void natural_number_map<Key, Value>::reserve(size_t count) {
 22 |     _values_vector.reserve(count);
 23 |     _values_bitset->reserve(count);
 24 |   }
 25 | 
 26 |   // Number of keys currently present in the map.
 27 |   template<typename Key, typename Value>
 28 |   size_t natural_number_map<Key, Value>::size() const {
 29 |     return _size;
 30 |   }
 31 | 
 32 |   // Stores `value` under `key`, growing both containers when `key` lies past
 33 |   // their current end. The entry count only grows for keys not yet present.
 34 |   template<typename Key, typename Value>
 35 |   void natural_number_map<Key, Value>::set(Key key, Value value) {
 36 |     const size_t slot = static_cast<size_t>(key);
 37 |     if (slot >= _values_bitset->size()) {
 38 |       _values_bitset->resize(slot + 1);
 39 |       _values_vector.resize(_values_bitset->size());
 40 |     }
 41 | 
 42 |     _values_vector[slot] = value;
 43 | 
 44 |     // test_set() sets the bit and hands back the state it had before.
 45 |     if (!_values_bitset->test_set(slot, true)) {
 46 |       ++_size;
 47 |     }
 48 |   }
 49 | 
 50 |   // Removes `key` if present; keys beyond the bitset are ignored. The slot in
 51 |   // the value vector is left untouched, only the presence bit is cleared.
 52 |   template<typename Key, typename Value>
 53 |   void natural_number_map<Key, Value>::erase(Key key) {
 54 |     if (key >= _values_bitset->size()) {
 55 |       return;
 56 |     }
 57 | 
 58 |     if (_values_bitset->test_set(key, false)) {
 59 |       --_size;
 60 |     }
 61 |   }
 62 | 
 63 |   // True when `key` is inside the bitset range and its presence bit is set.
 64 |   template<typename Key, typename Value>
 65 |   bool natural_number_map<Key, Value>::contains(Key key) const {
 66 |     if (key >= _values_bitset->size()) {
 67 |       return false;
 68 |     }
 69 |     return _values_bitset->test(key);
 70 |   }
 71 | 
 72 |   // Copies the value stored under `key` into `value` and returns true; when
 73 |   // the key is absent, returns false and leaves `value` unmodified.
 74 |   template<typename Key, typename Value>
 75 |   bool natural_number_map<Key, Value>::try_get(Key key, Value& value) const {
 76 |     const bool found = contains(key);
 77 |     if (found) {
 78 |       value = _values_vector[key];
 79 |     }
 80 |     return found;
 81 |   }
 82 | 
 83 |   // Position of the smallest present key, or an invalid position when the
 84 |   // map holds no entries. The enumeration counter starts at zero.
 85 |   template<typename Key, typename Value>
 86 |   typename natural_number_map<Key, Value>::position
 87 |   natural_number_map<Key, Value>::find_first() const {
 88 |     const auto first_key =
 89 |         (_size == 0) ? invalid_position : _values_bitset->find_first();
 90 |     return position{first_key, 0};
 91 |   }
 92 | 
 93 |   // Position following `after_position`. The bitset is only searched while
 94 |   // the number of keys already enumerated is below the entry count;
 95 |   // otherwise the returned position is invalid. The counter always advances.
 96 |   template<typename Key, typename Value>
 97 |   typename natural_number_map<Key, Value>::position
 98 |   natural_number_map<Key, Value>::find_next(
 99 |       const position& after_position) const {
100 |     const bool keys_remaining =
101 |         after_position._keys_already_enumerated < _size;
102 |     const auto next_key = keys_remaining
103 |                               ? _values_bitset->find_next(after_position._key)
104 |                               : invalid_position;
105 |     return position{next_key, after_position._keys_already_enumerated + 1};
106 |   }
107 | 
108 |   // A position is valid unless it carries the invalid_position sentinel.
109 |   template<typename Key, typename Value>
110 |   bool natural_number_map<Key, Value>::position::is_valid() const {
111 |     return !(_key == invalid_position);
112 |   }
113 | 
114 |   // Returns a copy of the value at `pos`; `pos` must be valid (asserted).
115 |   template<typename Key, typename Value>
116 |   Value natural_number_map<Key, Value>::get(const position& pos) const {
117 |     assert(pos.is_valid());
118 |     return _values_vector[pos._key];
119 |   }
120 | 
121 |   // Drops every entry: empties the bitset and the vector, resets the count.
122 |   template<typename Key, typename Value>
123 |   void natural_number_map<Key, Value>::clear() {
124 |     _values_bitset->clear();
125 |     _values_vector.clear();
126 |     _size = 0;
127 |   }
128 | 
129 |   // Instantiate used templates.
130 |   template class natural_number_map<unsigned, int32_t>;
131 |   template class natural_number_map<unsigned, uint32_t>;
132 |   template class natural_number_map<unsigned, int64_t>;
133 |   template class natural_number_map<unsigned, uint64_t>;
134 | }  // namespace diskann
110 | 


--------------------------------------------------------------------------------
/src/natural_number_set.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <boost/dynamic_bitset.hpp>
 5 | 
 6 | #include "ann_exception.h"
 7 | #include "natural_number_set.h"
 8 | 
 9 | namespace diskann {
10 |   // Creates an empty set backed by an empty membership bitset.
11 |   template<typename T>
12 |   natural_number_set<T>::natural_number_set()
13 |       : _values_bitset(std::make_unique<boost::dynamic_bitset<>>()) {
14 |   }
15 | 
16 |   // True when the value vector holds no ids.
17 |   template<typename T>
18 |   bool natural_number_set<T>::is_empty() const {
19 |     return _values_vector.empty();
20 |   }
21 | 
22 |   // Reserves capacity for `count` ids in the value vector and the bitset.
23 |   template<typename T>
24 |   void natural_number_set<T>::reserve(size_t count) {
25 |     _values_vector.reserve(count);
26 |     _values_bitset->reserve(count);
27 |   }
28 | 
29 |   // Appends `id` to the value vector and sets its membership bit, growing
30 |   // the bitset first when `id` lies past its current end.
31 |   // NOTE(review): an id inserted twice is stored twice in the vector;
32 |   // presumably callers never insert an id that is already present -- confirm.
33 |   template<typename T>
34 |   void natural_number_set<T>::insert(T id) {
35 |     _values_vector.emplace_back(id);
36 | 
37 |     if (id >= _values_bitset->size())
38 |       _values_bitset->resize(static_cast<size_t>(id) + 1);
39 | 
40 |     _values_bitset->set(id, true);
41 |   }
42 | 
43 |   // Removes and returns the id at the back of the value vector and clears
44 |   // its membership bit. Throws diskann::ANNException when the set is empty.
45 |   template<typename T>
46 |   T natural_number_set<T>::pop_any() {
47 |     if (_values_vector.empty()) {
48 |       throw diskann::ANNException("No values available", -1, __FUNCSIG__,
49 |                                   __FILE__, __LINE__);
50 |     }
51 | 
52 |     const T id = _values_vector.back();
53 |     _values_vector.pop_back();
54 | 
55 |     _values_bitset->set(id, false);
56 | 
57 |     return id;
58 |   }
59 | 
60 |   // Removes every id from both the value vector and the bitset.
61 |   template<typename T>
62 |   void natural_number_set<T>::clear() {
63 |     _values_vector.clear();
64 |     _values_bitset->clear();
65 |   }
66 | 
67 |   // Number of ids held in the value vector.
68 |   template<typename T>
69 |   size_t natural_number_set<T>::size() const {
70 |     return _values_vector.size();
71 |   }
72 | 
73 |   // True when `id` is in the set. Ids at or beyond the bitset size were
74 |   // never inserted, so they are reported as absent instead of being passed
75 |   // to dynamic_bitset::test(), whose precondition is pos < size().
76 |   template<typename T>
77 |   bool natural_number_set<T>::is_in_set(T id) const {
78 |     return id < _values_bitset->size() && _values_bitset->test(id);
79 |   }
80 | 
81 |   // Instantiate used templates.
82 |   template class natural_number_set<unsigned>;
83 | }  // namespace diskann
70 | 


--------------------------------------------------------------------------------
/src/visit_freq.cpp:
--------------------------------------------------------------------------------
 1 | #include "pq_flash_index.h"
 2 | 
 3 | namespace diskann {
 4 |   // Runs cached_beam_search() for each of the `query_num` queries with node
 5 |   // and neighbor visit counting switched on, then writes the collected
 6 |   // counters to "<freq_save_path>_freq.bin".
 7 |   //
 8 |   // Layout of the file, every field written as sizeof(unsigned) bytes:
 9 |   //   num                                  number of points
10 |   //   num x visit count                    node_visit_counter[i].second
11 |   //   for each point: entry count, then (neighbor id, frequency) pairs
12 |   //
13 |   // The search arguments are forwarded unchanged to cached_beam_search();
14 |   // `res_ids` / `res_dists` receive `k_search` results per query and `stats`
15 |   // is indexed per query. The process exits with status 1 when the counter
16 |   // size does not match num_points or when the file cannot be written.
17 |   template<typename T>
18 |   void PQFlashIndex<T>::generate_node_nbrs_freq(
19 |         const std::string& freq_save_path,
20 |         const size_t query_num,
21 |         const T *query, const size_t query_aligned_dim, const _u64 k_search,
22 |         const _u64 l_search, _u64 *res_ids,
23 |         float *res_dists, const _u64 beam_width, const _u32 io_limit,
24 |         const bool use_reorder_data, QueryStats *stats, const _u32 mem_L) {
25 | 
26 |     this->count_visited_nodes = true;
27 |     this->count_visited_nbrs = true;
28 | 
29 |     init_node_visit_counter();
30 | 
31 |     nbrs_freq_counter_.resize(this->num_points);
32 |     for (auto& m : nbrs_freq_counter_) m.clear();
33 | 
34 | #pragma omp parallel for schedule(dynamic, 1)
35 |     for (_s64 i = 0; i < (int64_t) query_num; i++) {
36 |       cached_beam_search(
37 |           query + (i * query_aligned_dim), k_search, l_search,
38 |           res_ids + (i * k_search),
39 |           res_dists + (i * k_search),
40 |           beam_width, io_limit, use_reorder_data, stats + i, mem_L);
41 |     }
42 |     this->count_visited_nbrs = false;
43 |     this->count_visited_nodes = false;
44 | 
45 |     // Validate before touching the disk so a mismatch leaves no empty file.
46 |     unsigned num = node_visit_counter.size(); // number of data points
47 |     if (num != this->num_points) {
48 |       diskann::cerr << "Total number of elements mismatch" << std::endl;
49 |       exit(1);
50 |     }
51 | 
52 |     // save freq file
53 |     const std::string freq_file = freq_save_path + "_freq.bin";
54 |     std::ofstream writer(freq_file, std::ios::binary | std::ios::out);
55 |     if (!writer.is_open()) {
56 |       diskann::cerr << "Failed to open frequency file for writing: "
57 |                     << freq_file << std::endl;
58 |       exit(1);
59 |     }
60 |     diskann::cout << "Writing visited nodes and neighbors frequency: " << freq_file << std::endl;
61 |     writer.write((char *)&num, sizeof(unsigned));
62 | 
63 |     for (size_t i = 0; i < num; ++i) {
64 |       writer.write((char *)(&(node_visit_counter[i].second)), sizeof(unsigned));
65 |     }
66 |     for (size_t i = 0; i < num; ++i) {
67 |       unsigned p_size = nbrs_freq_counter_[i].size();
68 |       writer.write((char *)(&p_size), sizeof(unsigned));
69 |       for (const auto& [nbr_id, nbr_freq] : nbrs_freq_counter_[i]) {
70 |         writer.write((char *)(&nbr_id), sizeof(unsigned));
71 |         writer.write((char *)(&nbr_freq), sizeof(unsigned));
72 |       }
73 |     }
74 | 
75 |     // Flush and close explicitly so a failed write is reported instead of
76 |     // being followed by the success message.
77 |     writer.close();
78 |     if (writer.fail()) {
79 |       diskann::cerr << "Failed to write frequency file: " << freq_file
80 |                     << std::endl;
81 |       exit(1);
82 |     }
83 |     diskann::cout << "Writing frequency file finished" << std::endl;
84 |   }
85 | 
86 |   // instantiations
87 |   template class PQFlashIndex<_u8>;
88 |   template class PQFlashIndex<_s8>;
89 |   template class PQFlashIndex<float>;
90 | } // namespace diskann


--------------------------------------------------------------------------------
/src/windows_aligned_file_reader.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #ifdef _WINDOWS
  5 | #ifndef USE_BING_INFRA
  6 | #include "windows_aligned_file_reader.h"
  7 | #include <iostream>
  8 | #include "utils.h"
  9 | 
 10 | #define SECTOR_LEN 4096
 11 | 
 12 | // Remembers `fname` as a wide string (each char widened individually) and
 13 | // registers the calling thread.
 14 | void WindowsAlignedFileReader::open(const std::string& fname) {
 15 |   m_filename.assign(fname.begin(), fname.end());
 16 |   this->register_thread();
 17 | }
 16 | 
 17 | // Closes the handles held by every registered IOContext: the completion
 18 | // port created in register_thread() as well as the file handle. Handles
 19 | // that were never opened are skipped and closed ones are reset, so calling
 20 | // close() a second time does not hand a stale handle to CloseHandle().
 21 | void WindowsAlignedFileReader::close() {
 22 |   for (auto& k_v : ctx_map) {
 23 |     IOContext& ctx = k_v.second;
 24 | 
 25 |     // CreateIoCompletionPort() yields NULL on failure.
 26 |     if (ctx.iocp != NULL) {
 27 |       CloseHandle(ctx.iocp);
 28 |       ctx.iocp = NULL;
 29 |     }
 30 | 
 31 |     // CreateFile() yields INVALID_HANDLE_VALUE on failure.
 32 |     if (ctx.fhandle != NULL && ctx.fhandle != INVALID_HANDLE_VALUE) {
 33 |       CloseHandle(ctx.fhandle);
 34 |       ctx.fhandle = INVALID_HANDLE_VALUE;
 35 |     }
 36 |   }
 37 | }
 23 | 
 24 | // Creates the I/O context of the calling thread: opens m_filename for
 25 | // unbuffered overlapped reads, attaches a completion port to the handle,
 26 | // prepares MAX_IO_DEPTH zeroed OVERLAPPED slots and stores the context in
 27 | // ctx_map under the thread id. A thread that is already registered only
 28 | // triggers a warning.
 29 | void WindowsAlignedFileReader::register_thread() {
 30 |   std::unique_lock<std::mutex> lk(this->ctx_mut);
 31 |   if (this->ctx_map.find(std::this_thread::get_id()) != ctx_map.end()) {
 32 |     diskann::cout << "Warning:: Duplicate registration for thread_id : "
 33 |                   << std::this_thread::get_id() << std::endl;
 34 |     // The insert() at the end would keep the existing entry (assuming
 35 |     // ctx_map is a standard associative map), so opening another file
 36 |     // handle and completion port here would only leak them.
 37 |     return;
 38 |   }
 39 | 
 40 |   IOContext ctx;
 41 |   ctx.fhandle = CreateFile(m_filename.c_str(), GENERIC_READ, FILE_SHARE_READ,
 42 |                            NULL, OPEN_EXISTING,
 43 |                            FILE_ATTRIBUTE_READONLY | FILE_FLAG_NO_BUFFERING |
 44 |                                FILE_FLAG_OVERLAPPED | FILE_FLAG_RANDOM_ACCESS,
 45 |                            NULL);
 46 |   if (ctx.fhandle == INVALID_HANDLE_VALUE) {
 47 |     // Read the error code before any other call can overwrite it.
 48 |     const auto error = GetLastError();
 49 |     // m_filename is a wide string; streaming its c_str() into a narrow
 50 |     // stream would not print the name, so narrow it the way open() widened it.
 51 |     const std::string narrow_name(m_filename.begin(), m_filename.end());
 52 |     diskann::cout << "Error opening " << narrow_name << " -- error=" << error
 53 |                   << std::endl;
 54 |     // NOTE(review): registration still proceeds with the invalid handle, as
 55 |     // before; consider failing here instead -- confirm with callers.
 56 |   }
 57 | 
 58 |   // create IOCompletionPort
 59 |   ctx.iocp = CreateIoCompletionPort(ctx.fhandle, ctx.iocp, 0, 0);
 60 | 
 61 |   // create MAX_DEPTH # of reqs
 62 |   for (_u64 i = 0; i < MAX_IO_DEPTH; i++) {
 63 |     OVERLAPPED os;
 64 |     memset(&os, 0, sizeof(OVERLAPPED));
 65 |     ctx.reqs.push_back(os);
 66 |   }
 67 |   this->ctx_map.insert(std::make_pair(std::this_thread::get_id(), ctx));
 68 | }
 54 | 
 55 | // Returns the IOContext registered for the calling thread. The lookup runs
 56 | // under ctx_mut; an unregistered thread gets a diskann::ANNException with
 57 | // error code -2.
 58 | IOContext& WindowsAlignedFileReader::get_ctx() {
 59 |   const auto caller_id = std::this_thread::get_id();
 60 | 
 61 |   std::unique_lock<std::mutex> guard(this->ctx_mut);
 62 |   auto entry = ctx_map.find(caller_id);
 63 |   if (entry == ctx_map.end()) {
 64 |     std::stringstream message;
 65 |     message << "unable to find IOContext for thread_id : " << caller_id
 66 |             << "\n";
 67 |     throw diskann::ANNException(message.str(), -2, __FUNCSIG__, __FILE__,
 68 |                                 __LINE__);
 69 |   }
 70 | 
 71 |   // The guard releases the mutex when it goes out of scope.
 72 |   return entry->second;
 73 | }
 68 | 
 69 | // Executes every request in `read_reqs` through the file handle and
 70 | // completion port of `ctx`. Requests are issued in batches of at most
 71 | // MAX_IO_DEPTH overlapped reads; the call blocks until each batch is done.
 72 | // Buffer, offset and length of a request must be SECTOR_LEN aligned
 73 | // (asserted). `async` is not consulted by this implementation.
 74 | //
 75 | // Throws diskann::ANNException when a completion cannot be dequeued, when a
 76 | // completed read reports failure, or when a request could not be queued.
 77 | void WindowsAlignedFileReader::read(std::vector<AlignedRead>& read_reqs,
 78 |                                     IOContext& ctx, bool async) {
 79 |   using namespace std::chrono_literals;
 80 |   // execute the requests batch by batch
 81 |   _u64 n_reqs = read_reqs.size();
 82 |   _u64 n_batches = ROUND_UP(n_reqs, MAX_IO_DEPTH) / MAX_IO_DEPTH;
 83 |   for (_u64 i = 0; i < n_batches; i++) {
 84 |     // reset all OVERLAPPED objects
 85 |     for (auto& os : ctx.reqs) {
 86 |       memset(&os, 0, sizeof(os));
 87 |     }
 88 | 
 89 |     // batch start/end
 90 |     _u64 batch_start = MAX_IO_DEPTH * i;
 91 |     _u64 batch_size =
 92 |         std::min((_u64)(n_reqs - batch_start), (_u64) MAX_IO_DEPTH);
 93 | 
 94 |     // fill OVERLAPPED and issue them
 95 |     _u64 n_queued = 0;
 96 |     for (_u64 j = 0; j < batch_size; j++) {
 97 |       AlignedRead& req = read_reqs[batch_start + j];
 98 |       OVERLAPPED&  os = ctx.reqs[j];
 99 | 
100 |       _u64  offset = req.offset;
101 |       _u64  nbytes = req.len;
102 |       char* read_buf = (char*) req.buf;
103 |       assert(IS_ALIGNED(read_buf, SECTOR_LEN));
104 |       assert(IS_ALIGNED(offset, SECTOR_LEN));
105 |       assert(IS_ALIGNED(nbytes, SECTOR_LEN));
106 | 
107 |       // fill in OVERLAPPED struct
108 |       os.Offset = offset & 0xffffffff;
109 |       os.OffsetHigh = (offset >> 32);
110 | 
111 |       BOOL ret = ReadFile(ctx.fhandle, read_buf, (DWORD) nbytes, NULL, &os);
112 |       if (ret == FALSE) {
113 |         auto error = GetLastError();
114 |         if (error != ERROR_IO_PENDING) {
115 |           diskann::cerr << "Error queuing IO -- " << error << "\n";
116 |           // The request was never queued, so no completion packet will
117 |           // arrive for it; it must not be waited for below.
118 |           continue;
119 |         }
120 |       } else {
121 |         diskann::cerr << "Error queueing IO -- ReadFile returned TRUE"
122 |                       << std::endl;
123 |       }
124 |       n_queued++;
125 |     }
126 | 
127 |     // wait for exactly the requests that were queued
128 |     DWORD       n_read = 0;
129 |     _u64        n_complete = 0;
130 |     ULONG_PTR   completion_key = 0;
131 |     OVERLAPPED* lp_os = NULL;
132 |     while (n_complete < n_queued) {
133 |       if (GetQueuedCompletionStatus(ctx.iocp, &n_read, &completion_key, &lp_os,
134 |                                     INFINITE) != 0) {
135 |         // successfully dequeued a completed I/O
136 |         n_complete++;
137 |       } else {
138 |         // failed to dequeue OR dequeued failed I/O
139 |         DWORD error = GetLastError();
140 |         if (lp_os == NULL) {
141 |           if (error != WAIT_TIMEOUT) {
142 |             diskann::cerr << "GetQueuedCompletionStatus() failed with error = "
143 |                           << error << std::endl;
144 |             throw diskann::ANNException(
145 |                 "GetQueuedCompletionStatus failed with error: ", error,
146 |                 __FUNCSIG__, __FILE__, __LINE__);
147 |           }
148 |           // no completion packet dequeued ==> sleep for 5us and try again
149 |           std::this_thread::sleep_for(5us);
150 |         } else {
151 |           // completion packet for failed IO dequeued; lp_os points into
152 |           // ctx.reqs, whose slot j serves read_reqs[batch_start + j]
153 |           auto              op_idx = lp_os - ctx.reqs.data();
154 |           std::stringstream stream;
155 |           stream << "I/O failed , offset: "
156 |                  << read_reqs[batch_start + op_idx].offset
157 |                  << " with error code: " << error << std::endl;
158 |           throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
159 |                                       __LINE__);
160 |         }
161 |       }
162 |     }
163 | 
164 |     // Everything that was queued has completed; report requests that could
165 |     // not be queued instead of returning buffers that were never filled.
166 |     if (n_queued < batch_size) {
167 |       std::stringstream stream;
168 |       stream << "Failed to queue " << (batch_size - n_queued) << " of "
169 |              << batch_size << " read requests" << std::endl;
170 |       throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
171 |                                   __LINE__);
172 |     }
173 |   }
174 | }
157 | #endif
158 | #endif
159 | 


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT license.
 3 | 
 4 | set(CMAKE_CXX_STANDARD 14)
 5 | # In-memory index build and search tools.
 6 | add_executable(build_memory_index build_memory_index.cpp)
 7 | target_link_libraries(build_memory_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
 8 | 
 9 | add_executable(search_memory_index search_memory_index.cpp)
10 | target_link_libraries(search_memory_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
11 | # Disk index build, search and range-search tools.
12 | add_executable(build_disk_index build_disk_index.cpp)
13 | target_link_libraries(build_disk_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB} Boost::program_options)
14 | 
15 | add_executable(search_disk_index search_disk_index.cpp)
16 | target_link_libraries(search_disk_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
17 | 
18 | add_executable(range_search_disk_index range_search_disk_index.cpp)
19 | target_link_libraries(range_search_disk_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
20 | # The two targets below are disabled: their commands are commented out.
21 | # add_executable(range_search_disk_different_radius range_search_disk_different_radius.cpp)
22 | # target_link_libraries(range_search_disk_different_radius ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
23 | #add_executable(test_incremental_index test_incremental_index.cpp)
24 | #target_link_libraries(test_incremental_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
25 | # Executables built from the streaming, insert/delete and dynamic-search sources.
26 | add_executable(test_streaming_scenario test_streaming_scenario.cpp)
27 | target_link_libraries(test_streaming_scenario ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
28 | 
29 | add_executable(test_insert_deletes_consolidate test_insert_deletes_consolidate.cpp)
30 | target_link_libraries(test_insert_deletes_consolidate ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
31 | 
32 | add_executable(search_memory_index_dynamic search_memory_index_dynamic.cpp)
33 | target_link_libraries(search_memory_index_dynamic ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
34 | # Executable built from search_disk_index_save_freq.cpp.
35 | add_executable(search_disk_index_save_freq search_disk_index_save_freq.cpp)
36 | target_link_libraries(search_disk_index_save_freq ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)


--------------------------------------------------------------------------------
/tests/build_disk_index.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #include <omp.h>
  5 | #include <boost/program_options.hpp>
  6 | 
  7 | #include "aux_utils.h"
  8 | #include "index.h"
  9 | #include "math_utils.h"
 10 | #include "partition_and_pq.h"
 11 | #include "utils.h"
 12 | 
 13 | namespace po = boost::program_options;
 14 | 
 15 | // Command-line front end for diskann::build_disk_index(): parses the
 16 | // options, validates the metric and the reorder-data combination, packs the
 17 | // numeric settings into a space-separated string and dispatches on the data
 18 | // type. Returns 0 after printing help, -1 on any argument or build error,
 19 | // otherwise whatever build_disk_index() returns.
 20 | int main(int argc, char** argv) {
 21 |   std::string data_type, dist_fn, data_path, index_path_prefix;
 22 |   unsigned    num_threads, R, L, disk_PQ;
 23 |   float       B, M;
 24 |   bool        append_reorder_data = false;
 25 | 
 26 |   po::options_description desc{"Arguments"};
 27 |   try {
 28 |     // One chained registration; the order is the order shown by --help.
 29 |     desc.add_options()
 30 |         ("help,h", "Print information on arguments")
 31 |         ("data_type", po::value<std::string>(&data_type)->required(),
 32 |          "data type <int8/uint8/float>")
 33 |         ("dist_fn", po::value<std::string>(&dist_fn)->required(),
 34 |          "distance function <l2/mips>")
 35 |         ("data_path", po::value<std::string>(&data_path)->required(),
 36 |          "Input data file in bin format")
 37 |         ("index_path_prefix",
 38 |          po::value<std::string>(&index_path_prefix)->required(),
 39 |          "Path prefix for saving index file components")
 40 |         ("max_degree,R", po::value<uint32_t>(&R)->default_value(64),
 41 |          "Maximum graph degree")
 42 |         ("Lbuild,L", po::value<uint32_t>(&L)->default_value(100),
 43 |          "Build complexity, higher value results in better graphs")
 44 |         ("search_DRAM_budget,B", po::value<float>(&B)->required(),
 45 |          "DRAM budget in GB for searching the index to set the "
 46 |          "compressed level for data while search happens")
 47 |         ("build_DRAM_budget,M", po::value<float>(&M)->required(),
 48 |          "DRAM budget in GB for building the index")
 49 |         ("num_threads,T",
 50 |          po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
 51 |          "Number of threads used for building index (defaults to "
 52 |          "omp_get_num_procs())")
 53 |         ("PQ_disk_bytes", po::value<uint32_t>(&disk_PQ)->default_value(0),
 54 |          "Number of bytes to which vectors should be compressed "
 55 |          "on SSD; 0 for no compression")
 56 |         ("append_reorder_data", po::bool_switch()->default_value(false),
 57 |          "Include full precision data in the index. Use only in "
 58 |          "conjuction with compressed data on SSD.");
 59 | 
 60 |     po::variables_map vm;
 61 |     po::store(po::parse_command_line(argc, argv, desc), vm);
 62 |     if (vm.count("help") != 0) {
 63 |       std::cout << desc;
 64 |       return 0;
 65 |     }
 66 |     po::notify(vm);
 67 |     append_reorder_data = vm["append_reorder_data"].as<bool>();
 68 |   } catch (const std::exception& ex) {
 69 |     std::cerr << ex.what() << '\n';
 70 |     return -1;
 71 |   }
 72 | 
 73 |   diskann::Metric metric;
 74 |   if (dist_fn == "l2") {
 75 |     metric = diskann::Metric::L2;
 76 |   } else if (dist_fn == "mips") {
 77 |     metric = diskann::Metric::INNER_PRODUCT;
 78 |   } else {
 79 |     std::cout << "Error. Only l2 and mips distance functions are supported"
 80 |               << std::endl;
 81 |     return -1;
 82 |   }
 83 | 
 84 |   // Reorder data needs compressed vectors on disk and float input.
 85 |   if (append_reorder_data && disk_PQ == 0) {
 86 |     std::cout << "Error: It is not necessary to append data for reordering "
 87 |                  "when vectors are not compressed on disk."
 88 |               << std::endl;
 89 |     return -1;
 90 |   }
 91 |   if (append_reorder_data && data_type != "float") {
 92 |     std::cout << "Error: Appending data for reordering currently only "
 93 |                  "supported for float data type."
 94 |               << std::endl;
 95 |     return -1;
 96 |   }
 97 | 
 98 |   // "R L B M num_threads disk_PQ append_reorder_data", single-space separated.
 99 |   const std::string sep = " ";
100 |   const std::string params =
101 |       std::to_string(R) + sep + std::to_string(L) + sep + std::to_string(B) +
102 |       sep + std::to_string(M) + sep + std::to_string(num_threads) + sep +
103 |       std::to_string(disk_PQ) + sep + std::to_string(append_reorder_data);
104 | 
105 |   try {
106 |     const char* data_file = data_path.c_str();
107 |     const char* index_prefix = index_path_prefix.c_str();
108 |     const char* build_params = params.c_str();
109 | 
110 |     if (data_type == "int8") {
111 |       return diskann::build_disk_index<int8_t>(data_file, index_prefix,
112 |                                                build_params, metric);
113 |     }
114 |     if (data_type == "uint8") {
115 |       return diskann::build_disk_index<uint8_t>(data_file, index_prefix,
116 |                                                 build_params, metric);
117 |     }
118 |     if (data_type == "float") {
119 |       return diskann::build_disk_index<float>(data_file, index_prefix,
120 |                                               build_params, metric);
121 |     }
122 | 
123 |     diskann::cerr << "Error. Unsupported data type" << std::endl;
124 |     return -1;
125 |   } catch (const std::exception& e) {
126 |     std::cout << std::string(e.what()) << std::endl;
127 |     diskann::cerr << "Index build failed." << std::endl;
128 |     return -1;
129 |   }
130 | }
128 | 


--------------------------------------------------------------------------------
/tests/build_memory_index.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #include <index.h>
  5 | #include <omp.h>
  6 | #include <string.h>
  7 | #include <boost/program_options.hpp>
  8 | 
  9 | #include "utils.h"
 10 | 
 11 | #ifndef _WINDOWS
 12 | #include <sys/mman.h>
 13 | #include <unistd.h>
 14 | #else
 15 | #include <Windows.h>
 16 | #endif
 17 | 
 18 | #include "memory_mapper.h"
 19 | #include "ann_exception.h"
 20 | 
 21 | namespace po = boost::program_options;
 22 | 
 23 | // Builds an in-memory index from "<data_prefix>_data.bin", tagging the
 24 | // points with the values read from "<data_prefix>_ids.bin", and saves the
 25 | // result under `save_path`.
 26 | //
 27 | // R, L, alpha and num_threads are passed on as build parameters, together
 28 | // with C = 750 and saturate_graph = false. The tag values are read as _u32
 29 | // starting right after the two leading _u32 header fields of the ids file.
 30 | // Exits with status 1 when the two files disagree on the number of points;
 31 | // stream failures while reading the tags raise std::ios_base::failure.
 32 | // Returns 0 on success.
 33 | template<typename T, typename TagT = uint32_t>
 34 | int build_in_memory_index(const diskann::Metric& metric,
 35 |                           const std::string& data_prefix, const unsigned R,
 36 |                           const unsigned L, const float alpha,
 37 |                           const std::string& save_path,
 38 |                           const unsigned     num_threads) {
 39 |   diskann::Parameters paras;
 40 |   paras.Set<unsigned>("R", R);
 41 |   paras.Set<unsigned>("L", L);
 42 |   paras.Set<unsigned>(
 43 |       "C", 750);  // maximum candidate set size during pruning procedure
 44 |   paras.Set<float>("alpha", alpha);
 45 |   paras.Set<bool>("saturate_graph", 0);
 46 |   paras.Set<unsigned>("num_threads", num_threads);
 47 | 
 48 |   const std::string data_path = data_prefix + "_data.bin";
 49 |   const std::string tags_path = data_prefix + "_ids.bin";
 50 | 
 51 |   _u64 data_num, data_dim, tags_num, tags_dim;
 52 |   diskann::get_bin_metadata(data_path, data_num, data_dim);
 53 |   diskann::get_bin_metadata(tags_path, tags_num, tags_dim);
 54 |   if (data_num != tags_num) {
 55 |     diskann::cerr << "The number of data and tags mismatch" << std::endl;
 56 |     exit(1);
 57 |   }
 58 |   std::ifstream reader;
 59 |   reader.exceptions(std::ifstream::failbit | std::ifstream::badbit);
 60 |   diskann::cout << "Opening bin file " << tags_path << "... " << std::endl;
 61 |   reader.open(tags_path, std::ios::binary | std::ios::ate);
 62 |   // skip the two _u32 header fields that precede the tag values
 63 |   reader.seekg(2 * sizeof(_u32), std::ios::beg);
 64 |   // Both factors are 64-bit; keep the element count in size_t so it is not
 65 |   // truncated to 32 bits for large tag files.
 66 |   const size_t tags_size =
 67 |       static_cast<size_t>(tags_num) * static_cast<size_t>(tags_dim);
 68 |   std::vector<_u32> tags(tags_size);
 69 |   reader.read((char*)tags.data(), tags_size * sizeof(_u32));
 70 |   reader.close();
 71 | 
 72 |   diskann::Index<T, TagT> index(metric, data_dim, data_num, false, true);
 73 |   auto                    s = std::chrono::high_resolution_clock::now();
 74 |   index.build(data_path.c_str(), data_num, paras, tags);
 75 | 
 76 |   std::chrono::duration<double> diff =
 77 |       std::chrono::high_resolution_clock::now() - s;
 78 | 
 79 |   std::cout << "Indexing time: " << diff.count() << "\n";
 80 |   index.save(save_path.c_str());
 81 | 
 82 |   return 0;
 83 | }
 70 | 
 71 | // Command-line front end for build_in_memory_index(): parses the options,
 72 | // maps the distance-function name to a diskann::Metric and dispatches on
 73 | // the data type. Returns 0 after printing help, -1 on any argument or build
 74 | // error, otherwise the result of build_in_memory_index().
 75 | //
 76 | // The value of --data_path is handed to build_in_memory_index() as its
 77 | // `data_prefix` argument, which appends "_data.bin" and "_ids.bin" to it;
 78 | // the option's help text describes it accordingly.
 79 | int main(int argc, char** argv) {
 80 |   std::string data_type, dist_fn, data_path, index_path_prefix;
 81 |   unsigned    num_threads, R, L;
 82 |   float       alpha;
 83 | 
 84 |   po::options_description desc{"Arguments"};
 85 |   try {
 86 |     desc.add_options()("help,h", "Print information on arguments");
 87 |     desc.add_options()("data_type",
 88 |                        po::value<std::string>(&data_type)->required(),
 89 |                        "data type <int8/uint8/float>");
 90 |     desc.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
 91 |                        "distance function <l2/mips>");
 92 |     desc.add_options()("data_path",
 93 |                        po::value<std::string>(&data_path)->required(),
 94 |                        "Path prefix of the input files: <prefix>_data.bin "
 95 |                        "(data) and <prefix>_ids.bin (tags)");
 96 |     desc.add_options()("index_path_prefix",
 97 |                        po::value<std::string>(&index_path_prefix)->required(),
 98 |                        "Path prefix for saving index file components");
 99 |     desc.add_options()("max_degree,R",
100 |                        po::value<uint32_t>(&R)->default_value(64),
101 |                        "Maximum graph degree");
102 |     desc.add_options()(
103 |         "Lbuild,L", po::value<uint32_t>(&L)->default_value(100),
104 |         "Build complexity, higher value results in better graphs");
105 |     desc.add_options()(
106 |         "alpha", po::value<float>(&alpha)->default_value(1.2f),
107 |         "alpha controls density and diameter of graph, set 1 for sparse graph, "
108 |         "1.2 or 1.4 for denser graphs with lower diameter");
109 |     desc.add_options()(
110 |         "num_threads,T",
111 |         po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
112 |         "Number of threads used for building index (defaults to "
113 |         "omp_get_num_procs())");
114 | 
115 |     po::variables_map vm;
116 |     po::store(po::parse_command_line(argc, argv, desc), vm);
117 |     if (vm.count("help")) {
118 |       std::cout << desc;
119 |       return 0;
120 |     }
121 |     // notify() enforces the required() options and fills the bound variables
122 |     po::notify(vm);
123 |   } catch (const std::exception& ex) {
124 |     std::cerr << ex.what() << '\n';
125 |     return -1;
126 |   }
127 | 
128 |   diskann::Metric metric;
129 |   if (dist_fn == std::string("mips")) {
130 |     metric = diskann::Metric::INNER_PRODUCT;
131 |   } else if (dist_fn == std::string("l2")) {
132 |     metric = diskann::Metric::L2;
133 |   } else if (dist_fn == std::string("cosine")) {
134 |     metric = diskann::Metric::COSINE;
135 |   } else {
136 |     std::cout << "Unsupported distance function. Currently only L2/ Inner "
137 |                  "Product/Cosine are supported."
138 |               << std::endl;
139 |     return -1;
140 |   }
141 | 
142 |   try {
143 |     diskann::cout << "Starting index build with R: " << R << "  Lbuild: " << L
144 |                   << "  alpha: " << alpha << "  #threads: " << num_threads
145 |                   << std::endl;
146 |     if (data_type == std::string("int8"))
147 |       return build_in_memory_index<int8_t>(metric, data_path, R, L, alpha,
148 |                                            index_path_prefix, num_threads);
149 |     else if (data_type == std::string("uint8"))
150 |       return build_in_memory_index<uint8_t>(metric, data_path, R, L, alpha,
151 |                                             index_path_prefix, num_threads);
152 |     else if (data_type == std::string("float"))
153 |       return build_in_memory_index<float>(metric, data_path, R, L, alpha,
154 |                                           index_path_prefix, num_threads);
155 |     else {
156 |       std::cout << "Unsupported type. Use one of int8, uint8 or float."
157 |                 << std::endl;
158 |       return -1;
159 |     }
160 |   } catch (const std::exception& e) {
161 |     std::cout << std::string(e.what()) << std::endl;
162 |     diskann::cerr << "Index build failed." << std::endl;
163 |     return -1;
164 |   }
165 | }
156 | 


--------------------------------------------------------------------------------
/tests/test_incremental_index.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #include <index.h>
  5 | #include <numeric>
  6 | #include <omp.h>
  7 | #include <string.h>
  8 | #include <time.h>
  9 | #include <timer.h>
 10 | 
 11 | #include "utils.h"
 12 | 
 13 | #ifndef _WINDOWS
 14 | #include <sys/mman.h>
 15 | #include <sys/stat.h>
 16 | #include <unistd.h>
 17 | #endif
 18 | 
 19 | #include "memory_mapper.h"
 20 | 
 21 | int main(int argc, char** argv) {
 22 |   if (argc != 10) {
 23 |     std::cout << "Correct usage: " << argv[0]
 24 |               << " data_file L R C alpha num_rounds "
 25 |               << "save_graph_file #incr_points #frozen_points" << std::endl;
 26 |     exit(-1);
 27 |   }
 28 | 
 29 |   float* data_load = NULL;
 30 |   size_t num_points, dim, aligned_dim;
 31 | 
 32 |   diskann::load_aligned_bin<float>(argv[1], data_load, num_points, dim,
 33 |                                    aligned_dim);
 34 | 
 35 |   unsigned    L = (unsigned) atoi(argv[2]);
 36 |   unsigned    R = (unsigned) atoi(argv[3]);
 37 |   unsigned    C = (unsigned) atoi(argv[4]);
 38 |   float       alpha = (float) std::atof(argv[5]);
 39 |   unsigned    num_rnds = (unsigned) std::atoi(argv[6]);
 40 |   std::string save_path(argv[7]);
 41 |   unsigned    num_incr = (unsigned) atoi(argv[8]);
 42 |   unsigned    num_frozen = (unsigned) atoi(argv[9]);
 43 | 
 44 |   diskann::Parameters paras;
 45 |   paras.Set<unsigned>("L", L);
 46 |   paras.Set<unsigned>("R", R);
 47 |   paras.Set<unsigned>("C", C);
 48 |   paras.Set<float>("alpha", alpha);
 49 |   paras.Set<bool>("saturate_graph", false);
 50 |   paras.Set<unsigned>("num_rnds", num_rnds);
 51 | 
 52 |   typedef int                 TagT;
 53 |   diskann::Index<float, TagT> index(diskann::L2, argv[1], num_points,
 54 |                                     num_points - num_incr, num_frozen, true,
 55 |                                     true, true);
 56 |   {
 57 |     std::vector<TagT> tags(num_points - num_incr);
 58 |     std::iota(tags.begin(), tags.end(), 0);
 59 | 
 60 |     if (argc > 10) {
 61 |       std::string frozen_points_file(argv[10]);
 62 |       index.generate_random_frozen_points(frozen_points_file.c_str());
 63 |     } else
 64 |       index.generate_random_frozen_points();
 65 | 
 66 |     diskann::Timer timer;
 67 |     index.build(paras, tags);
 68 |     std::cout << "Index build time: " << timer.elapsed() / 1000 << "ms\n";
 69 |   }
 70 | 
 71 |   std::vector<diskann::Neighbor>       pool, tmp;
 72 |   tsl::robin_set<unsigned>             visited;
 73 |   std::vector<diskann::SimpleNeighbor> cut_graph;
 74 |   index.readjust_data(num_frozen);
 75 | 
 76 |   {
 77 |     diskann::Timer timer;
 78 |     for (size_t i = num_points - num_incr; i < num_points; ++i) {
 79 |       index.insert_point(data_load + i * aligned_dim, paras, pool, tmp, visited,
 80 |                          cut_graph, i);
 81 |     }
 82 |     std::cout << "Incremental time: " << timer.elapsed() / 1000 << "ms\n";
 83 |     auto save_path_inc = save_path + ".inc";
 84 |     index.save(save_path_inc.c_str());
 85 |   }
 86 | 
 87 |   tsl::robin_set<unsigned> delete_list;
 88 |   while (delete_list.size() < num_incr)
 89 |     delete_list.insert(rand() % num_points);
 90 |   std::cout << "Deleting " << delete_list.size() << " elements" << std::endl;
 91 | 
 92 |   {
 93 |     diskann::Timer timer;
 94 |     index.enable_delete();
 95 |     for (auto p : delete_list)
 96 | 
 97 |       if (index.eager_delete(p, paras) != 0)
 98 |         //    if (index.delete_point(p) != 0)
 99 |         std::cerr << "Delete tag " << p << " not found" << std::endl;
100 | 
101 |     if (index.disable_delete(paras, true) != 0) {
102 |       std::cerr << "Disable delete failed" << std::endl;
103 |       return -1;
104 |     }
105 |     std::cout << "Delete time: " << timer.elapsed() / 1000 << "ms\n";
106 |   }
107 | 
108 |   auto save_path_del = save_path + ".del";
109 |   index.save(save_path_del.c_str());
110 | 
111 |   index.readjust_data(num_frozen);
112 |   {
113 |     diskann::Timer timer;
114 |     for (auto p : delete_list) {
115 |       index.insert_point(data_load + (size_t) p * (size_t) aligned_dim, paras,
116 |                          pool, tmp, visited, cut_graph, p);
117 |     }
118 |     std::cout << "Re-incremental time: " << timer.elapsed() / 1000 << "ms\n";
119 |   }
120 | 
121 |   auto save_path_reinc = save_path + ".reinc";
122 |   index.save(save_path_reinc.c_str());
123 | 
124 |   delete[] data_load;
125 | 
126 |   return 0;
127 | }
128 | 


--------------------------------------------------------------------------------
/tests/utils/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT license.
 3 | 
 4 | set(CMAKE_CXX_STANDARD 14)
 5 | 
 6 | add_executable(fvecs_to_bin fvecs_to_bin.cpp)
 7 | 
 8 | add_executable(fvecs_to_bvecs fvecs_to_bvecs.cpp)
 9 | 
10 | add_executable(rand_data_gen rand_data_gen.cpp)
11 | target_link_libraries(rand_data_gen ${PROJECT_NAME} Boost::program_options)
12 | 
13 | add_executable(float_bin_to_int8 float_bin_to_int8.cpp)
14 | 
15 | add_executable(ivecs_to_bin ivecs_to_bin.cpp)
16 | 
17 | add_executable(tsv_to_bin tsv_to_bin.cpp)
18 | 
19 | add_executable(bin_to_tsv bin_to_tsv.cpp)
20 | 
21 | add_executable(int8_to_float int8_to_float.cpp)
22 | target_link_libraries(int8_to_float ${PROJECT_NAME})
23 | 
24 | add_executable(int8_to_float_scale int8_to_float_scale.cpp)
25 | target_link_libraries(int8_to_float_scale ${PROJECT_NAME})
26 | 
27 | add_executable(uint8_to_float uint8_to_float.cpp)
28 | target_link_libraries(uint8_to_float ${PROJECT_NAME})
29 | 
30 | add_executable(uint32_to_uint8 uint32_to_uint8.cpp)
31 | target_link_libraries(uint32_to_uint8 ${PROJECT_NAME})
32 | 
33 | add_executable(vector_analysis vector_analysis.cpp)
34 | target_link_libraries(vector_analysis ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
35 | 
36 | add_executable(gen_random_slice gen_random_slice.cpp)
37 | target_link_libraries(gen_random_slice ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
38 | 
39 | add_executable(simulate_aggregate_recall simulate_aggregate_recall.cpp)
40 | 
41 | add_executable(calculate_recall calculate_recall.cpp)
42 | target_link_libraries(calculate_recall ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
43 | 
44 | # This is the only thing outside of DiskANN main source that depends on MKL.
45 | add_executable(compute_groundtruth compute_groundtruth.cpp)
46 | target_include_directories(compute_groundtruth PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES})
47 | target_link_libraries(compute_groundtruth ${PROJECT_NAME} ${DISKANN_MKL_LINK_LIBRARIES} ${DISKANN_ASYNC_LIB} Boost::program_options)
48 | 
49 | add_executable(gen_range gen_range.cpp)
50 | target_include_directories(gen_range PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES})
51 | target_link_libraries(gen_range ${PROJECT_NAME} ${DISKANN_MKL_LINK_LIBRARIES} ${DISKANN_ASYNC_LIB} Boost::program_options)
52 | 
53 | add_executable(generate_pq generate_pq.cpp)
54 | target_link_libraries(generate_pq ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
55 | 
56 | add_executable(partition_data partition_data.cpp)
57 | target_link_libraries(partition_data ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
58 | 
59 | add_executable(partition_with_ram_budget partition_with_ram_budget.cpp)
60 | target_link_libraries(partition_with_ram_budget ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
61 | 
62 | add_executable(merge_shards merge_shards.cpp)
63 | target_link_libraries(merge_shards ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB})
64 | 
65 | add_executable(create_disk_layout create_disk_layout.cpp)
66 | target_link_libraries(create_disk_layout ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
67 | 
68 | add_executable(index_relayout index_relayout.cpp)
69 | target_link_libraries(index_relayout ${PROJECT_NAME})
70 | 
71 | add_executable(parse_freq_file parse_freq_file.cpp)
72 | target_link_libraries(parse_freq_file ${PROJECT_NAME})
73 | 
74 | add_executable(sq sq.cpp)
75 | target_link_libraries(sq ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
76 | # formatter
77 | # if (LINUX)
78 | # 	add_custom_command(TARGET gen_random_slice PRE_BUILD COMMAND clang-format -i ../../../include/*.h ../../../include/dll/*.h ../../../src/*.cpp  ../../../tests/*.cpp ../../../src/dll/*.cpp ../../../tests/utils/*.cpp)
79 | # endif()
80 | 
81 | 


--------------------------------------------------------------------------------
/tests/utils/bin_to_fvecs.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include "util.h"
 6 | 
 7 | void block_convert(std::ifstream& writr, std::ofstream& readr, float* read_buf,
 8 |                    float* write_buf, _u64 npts, _u64 ndims) {
 9 |   writr.write((char*) read_buf,
10 |               npts * (ndims * sizeof(float) + sizeof(unsigned)));
11 | #pragma omp parallel for
12 |   for (_u64 i = 0; i < npts; i++) {
13 |     memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1,
14 |            ndims * sizeof(float));
15 |   }
16 |   readr.read((char*) write_buf, npts * ndims * sizeof(float));
17 | }
18 | 
19 | int main(int argc, char** argv) {
20 |   if (argc != 3) {
21 |     std::cout << argv[0] << " input_bin output_fvecs" << std::endl;
22 |     exit(-1);
23 |   }
24 |   std::ifstream readr(argv[1], std::ios::binary);
25 |   int           npts_s32;
26 |   int           ndims_s32;
27 |   readr.read((char*) &npts_s32, sizeof(_s32));
28 |   readr.read((char*) &ndims_s32, sizeof(_s32));
29 |   size_t npts = npts_s32;
30 |   size_t ndims = ndims_s32;
31 |   _u32   ndims_u32 = (_u32) ndims_s32;
32 |   //  _u64          fsize = writr.tellg();
33 |   readr.seekg(0, std::ios::beg);
34 | 
35 |   unsigned ndims_u32;
36 |   writr.write((char*) &ndims_u32, sizeof(unsigned));
37 |   writr.seekg(0, std::ios::beg);
38 |   _u64 ndims = (_u64) ndims_u32;
39 |   _u64 npts = fsize / ((ndims + 1) * sizeof(float));
40 |   std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
41 |             << std::endl;
42 | 
43 |   _u64 blk_size = 131072;
44 |   _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
45 |   std::cout << "# blks: " << nblks << std::endl;
46 | 
47 |   std::ofstream writr(argv[2], std::ios::binary);
48 |   float*        read_buf = new float[npts * (ndims + 1)];
49 |   float*        write_buf = new float[npts * ndims];
50 |   for (_u64 i = 0; i < nblks; i++) {
51 |     _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
52 |     block_convert(writr, readr, read_buf, write_buf, cblk_size, ndims);
53 |     std::cout << "Block #" << i << " written" << std::endl;
54 |   }
55 | 
56 |   delete[] read_buf;
57 |   delete[] write_buf;
58 | 
59 |   writr.close();
60 |   readr.close();
61 | }
62 | 


--------------------------------------------------------------------------------
/tests/utils/bin_to_tsv.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include "utils.h"
 6 | 
 7 | template<class T>
 8 | void block_convert(std::ofstream& writer, std::ifstream& reader, T* read_buf,
 9 |                    _u64 npts, _u64 ndims) {
10 |   reader.read((char*) read_buf, npts * ndims * sizeof(float));
11 | 
12 |   for (_u64 i = 0; i < npts; i++) {
13 |     for (_u64 d = 0; d < ndims; d++) {
14 |       writer << read_buf[d + i * ndims];
15 |       if (d < ndims - 1)
16 |         writer << "\t";
17 |       else
18 |         writer << "\n";
19 |     }
20 |   }
21 | }
22 | 
23 | int main(int argc, char** argv) {
24 |   if (argc != 4) {
25 |     std::cout << argv[0] << " <float/int8/uint8> input_bin output_tsv"
26 |               << std::endl;
27 |     exit(-1);
28 |   }
29 |   std::string type_string(argv[1]);
30 |   if ((type_string != std::string("float")) &&
31 |       (type_string != std::string("int8")) &&
32 |       (type_string != std::string("uin8"))) {
33 |     std::cerr << "Error: type not supported. Use float/int8/uint8" << std::endl;
34 |   }
35 | 
36 |   std::ifstream reader(argv[2], std::ios::binary);
37 |   _u32          npts_u32;
38 |   _u32          ndims_u32;
39 |   reader.read((char*) &npts_u32, sizeof(_s32));
40 |   reader.read((char*) &ndims_u32, sizeof(_s32));
41 |   size_t npts = npts_u32;
42 |   size_t ndims = ndims_u32;
43 |   std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
44 |             << std::endl;
45 | 
46 |   _u64 blk_size = 131072;
47 |   _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
48 | 
49 |   std::ofstream writer(argv[3]);
50 |   char*         read_buf = new char[blk_size * ndims * 4];
51 |   for (_u64 i = 0; i < nblks; i++) {
52 |     _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
53 |     if (type_string == std::string("float"))
54 |       block_convert<float>(writer, reader, (float*) read_buf, cblk_size, ndims);
55 |     else if (type_string == std::string("int8"))
56 |       block_convert<int8_t>(writer, reader, (int8_t*) read_buf, cblk_size,
57 |                             ndims);
58 |     else if (type_string == std::string("uint8"))
59 |       block_convert<uint8_t>(writer, reader, (uint8_t*) read_buf, cblk_size,
60 |                              ndims);
61 |     std::cout << "Block #" << i << " written" << std::endl;
62 |   }
63 | 
64 |   delete[] read_buf;
65 | 
66 |   writer.close();
67 |   reader.close();
68 | }
69 | 


--------------------------------------------------------------------------------
/tests/utils/calculate_recall.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <cstddef>
 5 | #include <cstdlib>
 6 | #include <fstream>
 7 | #include <iostream>
 8 | #include <set>
 9 | #include <string>
10 | #include <vector>
11 | 
12 | #include "aux_utils.h"
13 | #include "utils.h"
14 | 
15 | int main(int argc, char** argv) {
16 |   if (argc != 4) {
17 |     std::cout << argv[0] << " <ground_truth_bin> <our_results_bin>  <r> "
18 |               << std::endl;
19 |     return -1;
20 |   }
21 |   unsigned* gold_std = NULL;
22 |   float*    gs_dist = nullptr;
23 |   unsigned* our_results = NULL;
24 |   float*    or_dist = nullptr;
25 |   size_t    points_num, points_num_gs, points_num_or;
26 |   size_t    dim_gs;
27 |   size_t    dim_or;
28 |   diskann::load_truthset(argv[1], gold_std, gs_dist, points_num_gs, dim_gs);
29 |   diskann::load_truthset(argv[2], our_results, or_dist, points_num_or, dim_or);
30 | 
31 |   if (points_num_gs != points_num_or) {
32 |     std::cout
33 |         << "Error. Number of queries mismatch in ground truth and our results"
34 |         << std::endl;
35 |     return -1;
36 |   }
37 |   points_num = points_num_gs;
38 | 
39 |   uint32_t recall_at = std::atoi(argv[3]);
40 | 
41 |   if ((dim_or < recall_at) || (recall_at > dim_gs)) {
42 |     std::cout << "ground truth has size " << dim_gs << "; our set has "
43 |               << dim_or << " points. Asking for recall " << recall_at
44 |               << std::endl;
45 |     return -1;
46 |   }
47 |   std::cout << "Calculating recall@" << recall_at << std::endl;
48 |   float recall_val = diskann::calculate_recall(
49 |       points_num, gold_std, gs_dist, dim_gs, our_results, dim_or, recall_at);
50 | 
51 |   //  double avg_recall = (recall*1.0)/(points_num*1.0);
52 |   std::cout << "Avg. recall@" << recall_at << " is " << recall_val << "\n";
53 | }
54 | 


--------------------------------------------------------------------------------
/tests/utils/create_disk_layout.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <cmath>
 5 | #include <cstring>
 6 | #include <fstream>
 7 | #include <iostream>
 8 | #include <limits>
 9 | #include <vector>
10 | 
11 | #include "aux_utils.h"
12 | #include "cached_io.h"
13 | #include "utils.h"
14 | 
15 | template<typename T>
16 | int create_disk_layout(char **argv) {
17 |   std::string base_file(argv[2]);
18 |   std::string vamana_file(argv[3]);
19 |   std::string output_file(argv[4]);
20 |   diskann::create_disk_layout<T>(base_file, vamana_file, output_file);
21 |   return 0;
22 | }
23 | 
24 | int main(int argc, char **argv) {
25 |   if (argc != 5) {
26 |     std::cout << argv[0]
27 |               << " data_type <float/int8/uint8> data_bin "
28 |                  "vamana_index_file output_diskann_index_file"
29 |               << std::endl;
30 |     exit(-1);
31 |   }
32 | 
33 |   int ret_val = -1;
34 |   if (std::string(argv[1]) == std::string("float"))
35 |     ret_val = create_disk_layout<float>(argv);
36 |   else if (std::string(argv[1]) == std::string("int8"))
37 |     ret_val = create_disk_layout<int8_t>(argv);
38 |   else if (std::string(argv[1]) == std::string("uint8"))
39 |     ret_val = create_disk_layout<uint8_t>(argv);
40 |   else {
41 |     std::cout << "unsupported type. use int8/uint8/float " << std::endl;
42 |     ret_val = -2;
43 |   }
44 |   return ret_val;
45 | }
46 | 


--------------------------------------------------------------------------------
/tests/utils/dist_gen.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | from re import L
  3 | import struct
  4 | import numpy as np
  5 | import matplotlib.pyplot as plt
  6 | import statsmodels.api as sm 
  7 | import seaborn as sns
  8 | import scipy.stats
  9 | 
 10 | from numpy import random
 11 | 
 12 | def normal_distribution(size, exp, var, lim):
 13 |   assert(2 * exp < lim)
 14 |   standard_var = math.sqrt(var)
 15 |   data = np.array(random.normal(loc= exp, scale = standard_var, size=size)).astype(np.int32)
 16 |   for i in range(data.shape[0]):
 17 |     if data[i] < 0:
 18 |       data[i] = 0
 19 |     if data[i] > lim:
 20 |       data[i] = lim 
 21 |   return data
 22 | 
 23 | def uniform_distribution(size, low, high):
 24 |     
 25 |   assert(low >=0 & low <= high)
 26 |   data = random.uniform(low = low, high = high, size = size).astype(np.int32)
 27 |   return data
 28 | 
 29 | 
 30 | def poisson_distribution(size, exp, lim):
 31 |   data = np.array(random.poisson(lam = exp, size = size)).astype(np.int32)
 32 |   for i in range(data.shape[0]):
 33 |     if data[i] < 0:
 34 |       data[i] = 0
 35 |     if data[i] > lim:
 36 |       data[i] = lim 
 37 |   return data
 38 | 
 39 | def exponential_distribution(size, exp, lim):
 40 |   data = np.array(random.exponential(scale = exp, size = size)).astype(np.int32)
 41 |   for i in range(data.shape[0]):
 42 |     if data[i] < 0:
 43 |       data[i] = 0
 44 |     if data[i] > lim:
 45 |       data[i] = lim 
 46 |   return data
 47 | 
 48 | 
 49 | 
 50 | 
 51 | 
 52 | class DataAnalysis:
 53 |     """
 54 |     Generate a histogram, cdf of the data distribution,
 55 |     Get the max, min, mean, and variance of the data.
 56 | 
 57 |     Attributes:
 58 |         prefix_path: result folder prefix.
 59 |     """
 60 |     def __init__(self, result_prefix_path):
 61 |         self.prefix_path = result_prefix_path
 62 |         pass
 63 | 
 64 |     def plot_hist_and_kde(self, fname, x):
 65 |         plt.xlim((x.min(), x.max()))
 66 |         sns.distplot(x, hist=True)
 67 |         plt.title(fname + "hist and kde")
 68 |         plt.xlabel('result length of range search')
 69 |         plt.savefig(self.prefix_path + fname)
 70 |         plt.close()
 71 |         return 
 72 | 
 73 |     def plot_kde(self, fname, x_list, label_list):
 74 |         assert(len(x_list) == len(label_list))
 75 |         for x, label in zip(x_list, label_list):
 76 |             plt.xlim((x.min(), x.max()))
 77 |             sns.distplot(x, hist=False, label = label)
 78 |         plt.xlabel('result length of range search')
 79 |         plt.savefig(self.prefix_path + fname+"_kde")
 80 |         plt.close()
 81 |         return 
 82 | 
 83 |     def plot_ecdf(self, fname, x_list, label_list):
 84 |         assert(len(x_list) == len(label_list))
 85 |         for x, label in zip(x_list, label_list):
 86 |             plt.xlim(x.min(), x.max())
 87 |             sns.ecdfplot(data = x, label = label)
 88 |         plt.xlabel('result length of range search')
 89 |         plt.grid()
 90 |         plt.savefig(self.prefix_path + fname+"_cdf")
 91 |         plt.close()
 92 |         return 
 93 |     def save_dis_to_bin_file(self, data, filename):
 94 |       with open(self.prefix_path + filename, "wb") as f:
 95 |         f.write(struct.pack('I', len(data)))
 96 |         f.write(struct.pack('I'*len(data), *data))
 97 |         f.close()
 98 | 
 99 | def print_mean_var(data):
100 |   print("data len: %d mean: %d var: %d max %d min %d\n" %(len(data), data.mean(), data.var(), data.max(), data.min()))
101 | 
def save_res(data, aly, file_name):
  """Report and persist one generated distribution.

  Prints the summary statistics of `data`, then asks `aly` to plot its KDE
  and empirical CDF and to write the values to a binary file, all named
  after `file_name`.
  """
  print(file_name + ": ")
  print_mean_var(data=data)
  series, labels = [data], [file_name]
  aly.plot_kde(file_name, series, labels)
  # A fresh pair of lists for the second plot, as in two separate calls.
  series, labels = [data], [file_name]
  aly.plot_ecdf(file_name, series, labels)
  aly.save_dis_to_bin_file(data, filename=file_name)
108 | 
# Driver: generate four synthetic distributions and store their statistics,
# plots and binary dumps under ./dis/.
import os

npts = 100000
mean_k = 1000
max_k = 50000
output_dir = "./dis/"
# Saving plots and binary files fails if the directory is missing, so create
# it up front.
os.makedirs(output_dir, exist_ok=True)
aly = DataAnalysis(output_dir)

data = normal_distribution(npts, mean_k, 100000, max_k)
save_res(data, aly, "norm")

data = uniform_distribution(npts, 0, 2000)
save_res(data, aly, "uniform")

data = poisson_distribution(npts, mean_k, max_k)
save_res(data, aly, "poisson")

data = exponential_distribution(npts, mean_k, max_k)
save_res(data, aly, "exponential")
125 | 
126 | 
127 | 
128 | 
129 | 


--------------------------------------------------------------------------------
/tests/utils/float_bin_to_int8.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include "utils.h"
 6 | 
 7 | void block_convert(std::ofstream& writer, int8_t* write_buf,
 8 |                    std::ifstream& reader, float* read_buf, _u64 npts,
 9 |                    _u64 ndims, float bias, float scale) {
10 |   reader.read((char*) read_buf, npts * ndims * sizeof(float));
11 | 
12 |   for (_u64 i = 0; i < npts; i++) {
13 |     for (_u64 d = 0; d < ndims; d++) {
14 |       write_buf[d + i * ndims] =
15 |           (int8_t)((read_buf[d + i * ndims] - bias) * (254.0 / scale));
16 |     }
17 |   }
18 |   writer.write((char*) write_buf, npts * ndims);
19 | }
20 | 
21 | int main(int argc, char** argv) {
22 |   if (argc != 5) {
23 |     std::cout << "Usage: " << argv[0] << "  input_bin  output_tsv  bias  scale"
24 |               << std::endl;
25 |     exit(-1);
26 |   }
27 | 
28 |   std::ifstream reader(argv[1], std::ios::binary);
29 |   _u32          npts_u32;
30 |   _u32          ndims_u32;
31 |   reader.read((char*) &npts_u32, sizeof(_s32));
32 |   reader.read((char*) &ndims_u32, sizeof(_s32));
33 |   size_t npts = npts_u32;
34 |   size_t ndims = ndims_u32;
35 |   std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
36 |             << std::endl;
37 | 
38 |   _u64 blk_size = 131072;
39 |   _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
40 | 
41 |   std::ofstream writer(argv[2], std::ios::binary);
42 |   auto          read_buf = new float[blk_size * ndims];
43 |   auto          write_buf = new int8_t[blk_size * ndims];
44 |   float         bias = atof(argv[3]);
45 |   float         scale = atof(argv[4]);
46 | 
47 |   writer.write((char*) (&npts_u32), sizeof(_u32));
48 |   writer.write((char*) (&ndims_u32), sizeof(_u32));
49 | 
50 |   for (_u64 i = 0; i < nblks; i++) {
51 |     _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
52 |     block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias,
53 |                   scale);
54 |     std::cout << "Block #" << i << " written" << std::endl;
55 |   }
56 | 
57 |   delete[] read_buf;
58 |   delete[] write_buf;
59 | 
60 |   writer.close();
61 |   reader.close();
62 | }
63 | 


--------------------------------------------------------------------------------
/tests/utils/fvecs_to_bin.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include "utils.h"
 6 | 
 7 | void block_convert(std::ifstream& reader, std::ofstream& writer,
 8 |                    float* read_buf, float* write_buf, _u64 npts, _u64 ndims) {
 9 |   reader.read((char*) read_buf,
10 |               npts * (ndims * sizeof(float) + sizeof(unsigned)));
11 |   for (_u64 i = 0; i < npts; i++) {
12 |     memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1,
13 |            ndims * sizeof(float));
14 |   }
15 |   writer.write((char*) write_buf, npts * ndims * sizeof(float));
16 | }
17 | 
18 | int main(int argc, char** argv) {
19 |   if (argc != 3) {
20 |     std::cout << argv[0] << " input_fvecs output_bin" << std::endl;
21 |     exit(-1);
22 |   }
23 |   std::ifstream reader(argv[1], std::ios::binary | std::ios::ate);
24 |   _u64          fsize = reader.tellg();
25 |   reader.seekg(0, std::ios::beg);
26 | 
27 |   unsigned ndims_u32;
28 |   reader.read((char*) &ndims_u32, sizeof(unsigned));
29 |   reader.seekg(0, std::ios::beg);
30 |   _u64 ndims = (_u64) ndims_u32;
31 |   _u64 npts = fsize / ((ndims + 1) * sizeof(float));
32 |   std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
33 |             << std::endl;
34 | 
35 |   _u64 blk_size = 131072;
36 |   _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
37 |   std::cout << "# blks: " << nblks << std::endl;
38 |   std::ofstream writer(argv[2], std::ios::binary);
39 |   int           npts_s32 = (_s32) npts;
40 |   int           ndims_s32 = (_s32) ndims;
41 |   writer.write((char*) &npts_s32, sizeof(_s32));
42 |   writer.write((char*) &ndims_s32, sizeof(_s32));
43 |   float* read_buf = new float[npts * (ndims + 1)];
44 |   float* write_buf = new float[npts * ndims];
45 |   for (_u64 i = 0; i < nblks; i++) {
46 |     _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
47 |     block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims);
48 |     std::cout << "Block #" << i << " written" << std::endl;
49 |   }
50 | 
51 |   delete[] read_buf;
52 |   delete[] write_buf;
53 | 
54 |   reader.close();
55 |   writer.close();
56 | }
57 | 


--------------------------------------------------------------------------------
/tests/utils/fvecs_to_bvecs.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include "utils.h"
 6 | 
 7 | void block_convert(std::ifstream& reader, std::ofstream& writer,
 8 |                    float* read_buf, uint8_t* write_buf, _u64 npts, _u64 ndims) {
 9 |   reader.read((char*) read_buf,
10 |               npts * (ndims * sizeof(float) + sizeof(unsigned)));
11 |   for (_u64 i = 0; i < npts; i++) {
12 |     memcpy(write_buf + i * (ndims + 4), read_buf + i * (ndims + 1),
13 |            sizeof(unsigned));
14 |     for (_u64 d = 0; d < ndims; d++)
15 |       write_buf[i * (ndims + 4) + 4 + d] =
16 |           (uint8_t) read_buf[i * (ndims + 1) + 1 + d];
17 |   }
18 |   writer.write((char*) write_buf, npts * (ndims * 1 + 4));
19 | }
20 | 
21 | int main(int argc, char** argv) {
22 |   if (argc != 3) {
23 |     std::cout << argv[0] << " input_fvecs output_bvecs(uint8)" << std::endl;
24 |     exit(-1);
25 |   }
26 |   std::ifstream reader(argv[1], std::ios::binary | std::ios::ate);
27 |   _u64          fsize = reader.tellg();
28 |   reader.seekg(0, std::ios::beg);
29 | 
30 |   unsigned ndims_u32;
31 |   reader.read((char*) &ndims_u32, sizeof(unsigned));
32 |   reader.seekg(0, std::ios::beg);
33 |   _u64 ndims = (_u64) ndims_u32;
34 |   _u64 npts = fsize / ((ndims + 1) * sizeof(float));
35 |   std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
36 |             << std::endl;
37 | 
38 |   _u64 blk_size = 131072;
39 |   _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
40 |   std::cout << "# blks: " << nblks << std::endl;
41 |   std::ofstream writer(argv[2], std::ios::binary);
42 |   auto          read_buf = new float[npts * (ndims + 1)];
43 |   auto          write_buf = new uint8_t[npts * (ndims + 4)];
44 |   for (_u64 i = 0; i < nblks; i++) {
45 |     _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
46 |     block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims);
47 |     std::cout << "Block #" << i << " written" << std::endl;
48 |   }
49 | 
50 |   delete[] read_buf;
51 |   delete[] write_buf;
52 | 
53 |   reader.close();
54 |   writer.close();
55 | }
56 | 


--------------------------------------------------------------------------------
/tests/utils/gen_random_slice.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <omp.h>
 5 | #include <algorithm>
 6 | #include <chrono>
 7 | #include <cmath>
 8 | #include <cstdio>
 9 | #include <ctime>
10 | #include <iostream>
11 | #include <iterator>
12 | #include <map>
13 | #include <sstream>
14 | #include <string>
15 | #include "partition_and_pq.h"
16 | #include "utils.h"
17 | 
18 | #include <fcntl.h>
19 | #include <sys/stat.h>
20 | #include <time.h>
21 | #include <typeinfo>
22 | 
23 | template<typename T>
24 | int aux_main(char** argv) {
25 |   std::string base_file(argv[2]);
26 |   std::string output_prefix(argv[3]);
27 |   float       sampling_rate = (float) (std::atof(argv[4]));
28 |   gen_random_slice<T>(base_file, output_prefix, sampling_rate);
29 |   return 0;
30 | }
31 | 
32 | int main(int argc, char** argv) {
33 |   if (argc != 5) {
34 |     std::cout << argv[0]
35 |               << " data_type [float/int8/uint8] base_bin_file "
36 |                  "sample_output_prefix sampling_probability"
37 |               << std::endl;
38 |     exit(-1);
39 |   }
40 | 
41 |   if (std::string(argv[1]) == std::string("float")) {
42 |     aux_main<float>(argv);
43 |   } else if (std::string(argv[1]) == std::string("int8")) {
44 |     aux_main<int8_t>(argv);
45 |   } else if (std::string(argv[1]) == std::string("uint8")) {
46 |     aux_main<uint8_t>(argv);
47 |   } else
48 |     std::cout << "Unsupported type. Use float/int8/uint8." << std::endl;
49 |   return 0;
50 | }
51 | 


--------------------------------------------------------------------------------
/tests/utils/generate_pq.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include "math_utils.h"
 5 | #include "partition_and_pq.h"
 6 | 
 7 | #define KMEANS_ITERS_FOR_PQ 15
 8 | 
 9 | template<typename T>
10 | bool generate_pq(const std::string& data_path,
11 |                  const std::string& index_prefix_path,
12 |                  const size_t num_pq_centers, const size_t num_pq_chunks,
13 |                  const float sampling_rate) {
14 |   std::string pq_pivots_path = index_prefix_path + "_pq_pivots.bin";
15 |   std::string pq_compressed_vectors_path =
16 |       index_prefix_path + "_compressed.bin";
17 | 
18 |   // generates random sample and sets it to train_data and updates train_size
19 |   size_t train_size, train_dim;
20 |   float* train_data;
21 |   gen_random_slice<T>(data_path, sampling_rate, train_data, train_size,
22 |                       train_dim);
23 |   std::cout << "For computing pivots, loaded sample data of size " << train_size
24 |             << std::endl;
25 | 
26 |   generate_pq_pivots(train_data, train_size, train_dim, num_pq_centers,
27 |                      num_pq_chunks, KMEANS_ITERS_FOR_PQ, pq_pivots_path);
28 |   generate_pq_data_from_pivots<T>(data_path, num_pq_centers, num_pq_chunks,
29 |                                   pq_pivots_path, pq_compressed_vectors_path);
30 | 
31 |   delete[] train_data;
32 | 
33 |   return 0;
34 | }
35 | 
36 | int main(int argc, char** argv) {
37 |   if (argc != 6) {
38 |     std::cout
39 |         << "Usage: \n"
40 |         << argv[0]
41 |         << "  <data_type[float/uint8/int8]>   <data_file[.bin]>"
42 |            "  <PQ_prefix_path>  <target-bytes/data-point>  <sampling_rate>"
43 |         << std::endl;
44 |   } else {
45 |     const std::string data_path(argv[2]);
46 |     const std::string index_prefix_path(argv[3]);
47 |     const size_t      num_pq_centers = 256;
48 |     const size_t      num_pq_chunks = (size_t) atoi(argv[4]);
49 |     const float       sampling_rate = atof(argv[5]);
50 | 
51 |     if (std::string(argv[1]) == std::string("float"))
52 |       generate_pq<float>(data_path, index_prefix_path, num_pq_centers,
53 |                          num_pq_chunks, sampling_rate);
54 |     else if (std::string(argv[1]) == std::string("int8"))
55 |       generate_pq<int8_t>(data_path, index_prefix_path, num_pq_centers,
56 |                           num_pq_chunks, sampling_rate);
57 |     else if (std::string(argv[1]) == std::string("uint8"))
58 |       generate_pq<uint8_t>(data_path, index_prefix_path, num_pq_centers,
59 |                            num_pq_chunks, sampling_rate);
60 |     else
61 |       std::cout << "Error. wrong file type" << std::endl;
62 |   }
63 | }
64 | 


--------------------------------------------------------------------------------
/tests/utils/index_relayout.cpp:
--------------------------------------------------------------------------------
  1 | //
  2 | // Created by Songlin Wu on 2022/6/30.
  3 | //
  4 | #include <chrono>
  5 | #include <string>
  6 | #include <utils.h>
  7 | #include <memory>
  8 | #include <set>
  9 | #include <vector>
 10 | #include <iostream>
 11 | #include <fstream>
 12 | #include <limits>
 13 | #include <cstring>
 14 | #include <map>
 15 | #include <unordered_map>
 16 | #include <unordered_set>
 17 | #include <algorithm>
 18 | #include <utility>
 19 | #include <omp.h>
 20 | #include <cmath>
 21 | #include <mutex>
 22 | #include <queue>
 23 | #include <random>
 24 | 
 25 | #include "cached_io.h"
 26 | #include "pq_flash_index.h"
 27 | #include "aux_utils.h"
 28 | 
 29 | #define READ_SECTOR_LEN (size_t) 4096
 30 | #define READ_SECTOR_OFFSET(node_id) \
 31 |   ((_u64) node_id / nnodes_per_sector  + 1) * READ_SECTOR_LEN + ((_u64) node_id % nnodes_per_sector) * max_node_len;
 32 | #define INF 0xffffffff
 33 | 
 34 | const std::string partition_index_filename = "_tmp.index";
 35 | 
 36 | // Write DiskANN sector data according to graph-partition layout 
 37 | // The new index data
 38 | void relayout(const char* indexname, const char* partition_name) {
 39 |   _u64                               C;
 40 |   _u64                               _partition_nums;
 41 |   _u64                               _nd;
 42 |   _u64                               max_node_len;
 43 |   std::vector<std::vector<unsigned>> layout;
 44 |   std::vector<std::vector<unsigned>> _partition;
 45 | 
 46 |   std::ifstream part(partition_name);
 47 |   part.read((char*) &C, sizeof(_u64));
 48 |   part.read((char*) &_partition_nums, sizeof(_u64));
 49 |   part.read((char*) &_nd, sizeof(_u64));
 50 |   std::cout << "C: " << C << " partition_nums:" << _partition_nums
 51 |             << " _nd:" << _nd << std::endl;
 52 |   
 53 |   auto meta_pair = diskann::get_disk_index_meta(indexname);
 54 |   _u64 actual_index_size = get_file_size(indexname);
 55 |   _u64 expected_file_size, expected_npts;
 56 | 
 57 |   if (meta_pair.first) {
 58 |       // new version
 59 |       expected_file_size = meta_pair.second.back();
 60 |       expected_npts = meta_pair.second.front();
 61 |   } else {
 62 |       expected_file_size = meta_pair.second.front();
 63 |       expected_npts = meta_pair.second[1];
 64 |   }
 65 | 
 66 |   if (expected_file_size != actual_index_size) {
 67 |     diskann::cout << "File size mismatch for " << indexname
 68 |                   << " (size: " << actual_index_size << ")"
 69 |                   << " with meta-data size: " << expected_file_size << std::endl;
 70 |     exit(-1);
 71 |   }
 72 |   if (expected_npts != _nd) {
 73 |     diskann::cout << "expect _nd: " << _nd
 74 |                   << " actual _nd: " << expected_npts << std::endl;
 75 |     exit(-1);
 76 |   }
 77 |   max_node_len = meta_pair.second[3];
 78 |   unsigned nnodes_per_sector = meta_pair.second[4];
 79 |   if (SECTOR_LEN / max_node_len != C) {
 80 |     diskann::cout << "nnodes per sector: " << SECTOR_LEN / max_node_len << " C: " << C
 81 |                   << std::endl;
 82 |     exit(-1);
 83 |   }
 84 | 
 85 |   layout.resize(_partition_nums);
 86 |   for (unsigned i = 0; i < _partition_nums; i++) {
 87 |     unsigned s;
 88 |     part.read((char*) &s, sizeof(unsigned));
 89 |     layout[i].resize(s);
 90 |     part.read((char*) layout[i].data(), sizeof(unsigned) * s);
 91 |   }
 92 |   part.close();
 93 | 
 94 |   _u64            read_blk_size = 64 * 1024 * 1024;
 95 |   _u64            write_blk_size = read_blk_size;
 96 | 
 97 |   std::string partition_path(partition_name);
 98 |   partition_path = partition_path.substr(0, partition_path.find_last_of('.')) + partition_index_filename;
 99 |   cached_ofstream diskann_writer(partition_path, write_blk_size);
100 |   // cached_ifstream diskann_reader(indexname, read_blk_size);
101 | 
102 |   std::unique_ptr<char[]> sector_buf = std::make_unique<char[]>(SECTOR_LEN);
103 |   std::unique_ptr<char[]> node_buf = std::make_unique<char[]>(max_node_len);
104 | 
105 |   // this time, we load all index into mem;
106 |   std::cout << "nnodes per sector "<<nnodes_per_sector << std::endl;
107 |   _u64 file_size = READ_SECTOR_LEN + READ_SECTOR_LEN * ((_nd + nnodes_per_sector - 1) / nnodes_per_sector);
108 |   std::unique_ptr<char[]> mem_index =
109 |       std::make_unique<char[]>(file_size);
110 |   std::ifstream diskann_reader(indexname);
111 |   diskann_reader.read(mem_index.get(),file_size);
112 |   std::cout << "C: " << C << " partition_nums:" << _partition_nums
113 |             << " _nd:" << _nd << std::endl;
114 | 
115 |   const _u64 disk_file_size = _partition_nums * SECTOR_LEN + SECTOR_LEN;
116 |   if (meta_pair.first) {
117 |     char* meta_buf = mem_index.get() + 2 * sizeof(int);
118 |     *(reinterpret_cast<_u64*>(meta_buf + 4 * sizeof(_u64))) = C;
119 |     *(reinterpret_cast<_u64*>(meta_buf + (meta_pair.second.size()-1) * sizeof(_u64)))
120 |         = disk_file_size;
121 |   } else {
122 |     _u64* meta_buf = reinterpret_cast<_u64*>(mem_index.get());
123 |     *meta_buf = disk_file_size;
124 |     *(meta_buf + 4) = C;
125 |   }
126 |   std::cout << "size "<< disk_file_size << std::endl;
127 |   diskann_writer.write((char*) mem_index.get(),
128 |                        SECTOR_LEN);  // copy meta data;
129 |   for (unsigned i = 0; i < _partition_nums; i++) {
130 |     if (i % 100000 == 0) {
131 |       diskann::cout << "relayout has done " << (float) i / _partition_nums
132 |                     << std::endl;
133 |       diskann::cout.flush();
134 |     }
135 |     memset(sector_buf.get(), 0, SECTOR_LEN);
136 |     for (unsigned j = 0; j < layout[i].size(); j++) {
137 |       unsigned id = layout[i][j];
138 |       memset(node_buf.get(), 0, max_node_len);
139 |       uint64_t index_offset = READ_SECTOR_OFFSET(id);
140 |       uint64_t buf_offset = (uint64_t)j * max_node_len;
141 |       memcpy((char*) sector_buf.get() + buf_offset,
142 |              (char*) mem_index.get() + index_offset, max_node_len);
143 |     }
144 |     // memcpy((char*)sector_buf.get() + C*max_node_len, (char*)layout[i].data(), sizeof(unsigned));
145 |     diskann_writer.write(sector_buf.get(), SECTOR_LEN);
146 |   }
147 |   diskann::cout << "Relayout index." << std::endl;
148 | }
149 | 
150 | int main(int argc, char** argv){
151 |   char* indexName = argv[1];
152 |   char* partitonName = argv[2];
153 |   relayout(indexName, partitonName);
154 |   return 0;
155 | }


--------------------------------------------------------------------------------
/tests/utils/int8_to_float.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include "utils.h"
 6 | 
 7 | int main(int argc, char** argv) {
 8 |   if (argc != 3) {
 9 |     std::cout << argv[0] << " input_int8_bin output_float_bin" << std::endl;
10 |     exit(-1);
11 |   }
12 | 
13 |   int8_t* input;
14 |   size_t  npts, nd;
15 |   diskann::load_bin<int8_t>(argv[1], input, npts, nd);
16 |   float* output = new float[npts * nd];
17 |   diskann::convert_types<int8_t, float>(input, output, npts, nd);
18 |   diskann::save_bin<float>(argv[2], output, npts, nd);
19 |   delete[] output;
20 |   delete[] input;
21 | }
22 | 


--------------------------------------------------------------------------------
/tests/utils/int8_to_float_scale.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include "utils.h"
 6 | 
 7 | void block_convert(std::ofstream& writer, float* write_buf,
 8 |                    std::ifstream& reader, int8_t* read_buf, _u64 npts,
 9 |                    _u64 ndims, float bias, float scale) {
10 |   reader.read((char*) read_buf, npts * ndims * sizeof(int8_t));
11 | 
12 |   for (_u64 i = 0; i < npts; i++) {
13 |     for (_u64 d = 0; d < ndims; d++) {
14 |       write_buf[d + i * ndims] =
15 |           (((float) read_buf[d + i * ndims] - bias) * scale);
16 |     }
17 |   }
18 |   writer.write((char*) write_buf, npts * ndims * sizeof(float));
19 | }
20 | 
21 | int main(int argc, char** argv) {
22 |   if (argc != 5) {
23 |     std::cout << "Usage: " << argv[0]
24 |               << "  input-int8.bin  output-float.bin  bias  scale" << std::endl;
25 |     exit(-1);
26 |   }
27 | 
28 |   std::ifstream reader(argv[1], std::ios::binary);
29 |   _u32          npts_u32;
30 |   _u32          ndims_u32;
31 |   reader.read((char*) &npts_u32, sizeof(_s32));
32 |   reader.read((char*) &ndims_u32, sizeof(_s32));
33 |   size_t npts = npts_u32;
34 |   size_t ndims = ndims_u32;
35 |   std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
36 |             << std::endl;
37 | 
38 |   _u64 blk_size = 131072;
39 |   _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
40 | 
41 |   std::ofstream writer(argv[2], std::ios::binary);
42 |   auto          read_buf = new int8_t[blk_size * ndims];
43 |   auto          write_buf = new float[blk_size * ndims];
44 |   float         bias = atof(argv[3]);
45 |   float         scale = atof(argv[4]);
46 | 
47 |   writer.write((char*) (&npts_u32), sizeof(_u32));
48 |   writer.write((char*) (&ndims_u32), sizeof(_u32));
49 | 
50 |   for (_u64 i = 0; i < nblks; i++) {
51 |     _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
52 |     block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias,
53 |                   scale);
54 |     std::cout << "Block #" << i << " written" << std::endl;
55 |   }
56 | 
57 |   delete[] read_buf;
58 |   delete[] write_buf;
59 | 
60 |   writer.close();
61 |   reader.close();
62 | }
63 | 


--------------------------------------------------------------------------------
/tests/utils/ivecs_to_bin.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include "utils.h"
 6 | 
 7 | void block_convert(std::ifstream& reader, std::ofstream& writer, _u32* read_buf,
 8 |                    _u32* write_buf, _u64 npts, _u64 ndims) {
 9 |   reader.read((char*) read_buf,
10 |               npts * (ndims * sizeof(_u32) + sizeof(unsigned)));
11 |   for (_u64 i = 0; i < npts; i++) {
12 |     memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1,
13 |            ndims * sizeof(_u32));
14 |   }
15 |   writer.write((char*) write_buf, npts * ndims * sizeof(_u32));
16 | }
17 | 
18 | int main(int argc, char** argv) {
19 |   if (argc != 3) {
20 |     std::cout << argv[0] << " input_ivecs output_bin" << std::endl;
21 |     exit(-1);
22 |   }
23 |   std::ifstream reader(argv[1], std::ios::binary | std::ios::ate);
24 |   _u64          fsize = reader.tellg();
25 |   reader.seekg(0, std::ios::beg);
26 | 
27 |   unsigned ndims_u32;
28 |   reader.read((char*) &ndims_u32, sizeof(unsigned));
29 |   reader.seekg(0, std::ios::beg);
30 |   _u64 ndims = (_u64) ndims_u32;
31 |   _u64 npts = fsize / ((ndims + 1) * sizeof(_u32));
32 |   std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
33 |             << std::endl;
34 | 
35 |   _u64 blk_size = 131072;
36 |   _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
37 |   std::cout << "# blks: " << nblks << std::endl;
38 |   std::ofstream writer(argv[2], std::ios::binary);
39 |   int           npts_s32 = (_s32) npts;
40 |   int           ndims_s32 = (_s32) ndims;
41 |   writer.write((char*) &npts_s32, sizeof(_s32));
42 |   writer.write((char*) &ndims_s32, sizeof(_s32));
43 |   _u32* read_buf = new _u32[npts * (ndims + 1)];
44 |   _u32* write_buf = new _u32[npts * ndims];
45 |   for (_u64 i = 0; i < nblks; i++) {
46 |     _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
47 |     block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims);
48 |     std::cout << "Block #" << i << " written" << std::endl;
49 |   }
50 | 
51 |   delete[] read_buf;
52 |   delete[] write_buf;
53 | 
54 |   reader.close();
55 |   writer.close();
56 | }
57 | 


--------------------------------------------------------------------------------
/tests/utils/merge_shards.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <algorithm>
 5 | #include <atomic>
 6 | #include <cassert>
 7 | #include <fstream>
 8 | #include <iostream>
 9 | #include <set>
10 | #include <string>
11 | #include <vector>
12 | 
13 | #include "aux_utils.h"
14 | #include "cached_io.h"
15 | #include "utils.h"
16 | 
17 | int main(int argc, char **argv) {
18 |   if (argc != 9) {
19 |     std::cout
20 |         << argv[0]
21 |         << " vamana_index_prefix[1] vamana_index_suffix[2] idmaps_prefix[3] "
22 |            "idmaps_suffix[4] n_shards[5] max_degree[6] output_vamana_path[7] "
23 |            "output_medoids_path[8]"
24 |         << std::endl;
25 |     exit(-1);
26 |   }
27 | 
28 |   std::string vamana_prefix(argv[1]);
29 |   std::string vamana_suffix(argv[2]);
30 |   std::string idmaps_prefix(argv[3]);
31 |   std::string idmaps_suffix(argv[4]);
32 |   _u64        nshards = (_u64) std::atoi(argv[5]);
33 |   _u32        max_degree = (_u64) std::atoi(argv[6]);
34 |   std::string output_index(argv[7]);
35 |   std::string output_medoids(argv[8]);
36 | 
37 |   return diskann::merge_shards(vamana_prefix, vamana_suffix, idmaps_prefix,
38 |                                idmaps_suffix, nshards, max_degree, output_index,
39 |                                output_medoids);
40 | }
41 | 


--------------------------------------------------------------------------------
/tests/utils/parse_freq_file.cpp:
--------------------------------------------------------------------------------
  1 | #include <omp.h>
  2 | #include <algorithm>
  3 | #include <cassert>
  4 | #include <chrono>
  5 | #include <cmath>
  6 | #include <cstdio>
  7 | #include <cstdlib>
  8 | #include <ctime>
  9 | #include <fstream>
 10 | #include <ios>
 11 | #include <iostream>
 12 | #include <iterator>
 13 | #include <map>
 14 | #include <memory>
 15 | #include <sstream>
 16 | #include <string>
 17 | #include "partition_and_pq.h"
 18 | #include "utils.h"
 19 | 
 20 | #include <fcntl.h>
 21 | #include <sys/stat.h>
 22 | #include <time.h>
 23 | #include <typeinfo>
 24 | #include <utility>
 25 | #include <vector>
 26 | 
 27 | template<typename T>
 28 | int aux_main(char** argv) {
 29 |   std::string base_file(argv[2]);
 30 |   std::string freq_file(argv[3]);
 31 |   std::string output_prefix(argv[4]);
 32 |   float       use_ratio = atof(argv[5]);
 33 | 
 34 |   std::string output_data_file = output_prefix + "_data.bin";
 35 |   std::string output_ids_file = output_prefix + "_ids.bin";
 36 | 
 37 |   std::ifstream freq_reader(freq_file, std::ios_base::binary);
 38 |   std::ifstream base_reader(base_file, std::ios_base::binary);
 39 |   std::ofstream data_writer(output_data_file, std::ios_base::binary);
 40 |   std::ofstream ids_writer(output_ids_file, std::ios_base::binary);
 41 | 
 42 |   unsigned npts = 0;
 43 |   freq_reader.read((char*) &npts, sizeof(unsigned));
 44 |   std::vector<unsigned> freq_vec(npts);
 45 |   freq_reader.read((char*) freq_vec.data(), sizeof(unsigned) * npts);
 46 | 
 47 |   std::vector<std::pair<unsigned, unsigned>> freq_pair_vec(npts);
 48 |   for (unsigned i = 0; i < npts; i++) {
 49 |     freq_pair_vec[i] = std::make_pair(i, freq_vec[i]);
 50 |   }
 51 |   std::sort(
 52 |       freq_pair_vec.begin(), freq_pair_vec.end(),
 53 |       [](std::pair<unsigned, unsigned>& a, std::pair<unsigned, unsigned>& b) {
 54 |         return a.second > b.second;
 55 |       });
 56 |   size_t   dim = 0, nums = 0;
 57 |   unsigned one = 1;
 58 | 
 59 |   diskann::get_bin_metadata(base_file, nums, dim);
 60 |   unsigned _dim = dim;
 61 |   unsigned use_npt = nums * use_ratio;
 62 |   assert(nums == npts);
 63 |   assert(use_npt > 0);
 64 |   auto buf = std::make_unique<char[]>(sizeof(T) * dim);
 65 | 
 66 |   data_writer.write((char*) &use_npt, sizeof(unsigned));
 67 |   data_writer.write((char*) &_dim, sizeof(unsigned));
 68 | 
 69 |   ids_writer.write((char*) &use_npt, sizeof(unsigned));
 70 |   ids_writer.write((char*) &one, sizeof(unsigned));
 71 | 
 72 |   for (unsigned i = 0; i < use_npt; i++) {
 73 |     unsigned node_id = freq_pair_vec[i].first;
 74 |     ids_writer.write((char*) &node_id, sizeof(unsigned));
 75 |     unsigned offset = 8 + sizeof(T) * dim * node_id;
 76 |     base_reader.seekg(offset, std::ios_base::beg);
 77 |     base_reader.read((char*) buf.get(), sizeof(T) * dim);
 78 |     data_writer.write((char*) buf.get(), sizeof(T) * dim);
 79 |   }
 80 | 
 81 |   return 0;
 82 | }
 83 | 
 84 | int main(int argc, char** argv) {
 85 |   if (argc != 6) {
 86 |     std::cout << argv[0]
 87 |               << " data_type [float/int8/uint8] base_bin_file "
 88 |                  "freq_file_path output_prefix sample_ratio"
 89 |               << std::endl;
 90 |     exit(-1);
 91 |   }
 92 | 
 93 |   if (std::string(argv[1]) == std::string("float")) {
 94 |     aux_main<float>(argv);
 95 |   } else if (std::string(argv[1]) == std::string("int8")) {
 96 |     aux_main<int8_t>(argv);
 97 |   } else if (std::string(argv[1]) == std::string("uint8")) {
 98 |     aux_main<uint8_t>(argv);
 99 |   } else
100 |     std::cout << "Unsupported type. Use float/int8/uint8." << std::endl;
101 |   return 0;
102 | }
103 | 


--------------------------------------------------------------------------------
/tests/utils/partition_data.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <index.h>
 5 | #include <math_utils.h>
 6 | #include "cached_io.h"
 7 | #include "partition_and_pq.h"
 8 | 
 9 | // DEPRECATED: NEED TO REPROGRAM
10 | 
11 | int main(int argc, char** argv) {
12 |   if (argc != 7) {
13 |     std::cout << "Usage:\n"
14 |               << argv[0]
15 |               << "  datatype<int8/uint8/float>  <data_path>"
16 |                  "  <prefix_path>  <sampling_rate>  "
17 |                  "  <num_partitions>  <k_index>"
18 |               << std::endl;
19 |     exit(-1);
20 |   }
21 | 
22 |   const std::string data_path(argv[2]);
23 |   const std::string prefix_path(argv[3]);
24 |   const float       sampling_rate = atof(argv[4]);
25 |   const size_t      num_partitions = (size_t) std::atoi(argv[5]);
26 |   const size_t      max_reps = 15;
27 |   const size_t      k_index = (size_t) std::atoi(argv[6]);
28 | 
29 |   if (std::string(argv[1]) == std::string("float"))
30 |     partition<float>(data_path, sampling_rate, num_partitions, max_reps,
31 |                      prefix_path, k_index);
32 |   else if (std::string(argv[1]) == std::string("int8"))
33 |     partition<int8_t>(data_path, sampling_rate, num_partitions, max_reps,
34 |                       prefix_path, k_index);
35 |   else if (std::string(argv[1]) == std::string("uint8"))
36 |     partition<uint8_t>(data_path, sampling_rate, num_partitions, max_reps,
37 |                        prefix_path, k_index);
38 |   else
39 |     std::cout << "unsupported data format. use float/int8/uint8" << std::endl;
40 | }
41 | 


--------------------------------------------------------------------------------
/tests/utils/partition_with_ram_budget.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <index.h>
 5 | #include <math_utils.h>
 6 | #include "cached_io.h"
 7 | #include "partition_and_pq.h"
 8 | 
 9 | // DEPRECATED: NEED TO REPROGRAM
10 | 
11 | int main(int argc, char** argv) {
12 |   if (argc != 8) {
13 |     std::cout << "Usage:\n"
14 |               << argv[0]
15 |               << "  datatype<int8/uint8/float>  <data_path>"
16 |                  "  <prefix_path>  <sampling_rate>  "
17 |                  "  <ram_budget(GB)> <graph_degree>  <k_index>"
18 |               << std::endl;
19 |     exit(-1);
20 |   }
21 | 
22 |   const std::string data_path(argv[2]);
23 |   const std::string prefix_path(argv[3]);
24 |   const float       sampling_rate = atof(argv[4]);
25 |   const double      ram_budget = (double) std::atof(argv[5]);
26 |   const size_t      graph_degree = (size_t) std::atoi(argv[6]);
27 |   const size_t      k_index = (size_t) std::atoi(argv[7]);
28 | 
29 |   if (std::string(argv[1]) == std::string("float"))
30 |     partition_with_ram_budget<float>(data_path, sampling_rate, ram_budget,
31 |                                      graph_degree, prefix_path, k_index);
32 |   else if (std::string(argv[1]) == std::string("int8"))
33 |     partition_with_ram_budget<int8_t>(data_path, sampling_rate, ram_budget,
34 |                                       graph_degree, prefix_path, k_index);
35 |   else if (std::string(argv[1]) == std::string("uint8"))
36 |     partition_with_ram_budget<uint8_t>(data_path, sampling_rate, ram_budget,
37 |                                        graph_degree, prefix_path, k_index);
38 |   else
39 |     std::cout << "unsupported data format. use float/int8/uint8" << std::endl;
40 | }
41 | 


--------------------------------------------------------------------------------
/tests/utils/rand_data_gen.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #include <iostream>
  5 | #include <cstdlib>
  6 | #include <random>
  7 | #include <cmath>
  8 | #include <boost/program_options.hpp>
  9 | 
 10 | #include "utils.h"
 11 | 
 12 | namespace po = boost::program_options;
 13 | 
 14 | int block_write_float(std::ofstream& writer, _u64 ndims, _u64 npts,
 15 |                       float norm) {
 16 |   auto vec = new float[ndims];
 17 | 
 18 |   std::random_device         rd{};
 19 |   std::mt19937               gen{rd()};
 20 |   std::normal_distribution<> normal_rand{0, 1};
 21 | 
 22 |   for (_u64 i = 0; i < npts; i++) {
 23 |     float sum = 0;
 24 |     for (_u64 d = 0; d < ndims; ++d)
 25 |       vec[d] = normal_rand(gen);
 26 |     for (_u64 d = 0; d < ndims; ++d)
 27 |       sum += vec[d] * vec[d];
 28 |     for (_u64 d = 0; d < ndims; ++d)
 29 |       vec[d] = vec[d] * norm / std::sqrt(sum);
 30 | 
 31 |     writer.write((char*) vec, ndims * sizeof(float));
 32 |   }
 33 | 
 34 |   delete[] vec;
 35 |   return 0;
 36 | }
 37 | 
 38 | int block_write_int8(std::ofstream& writer, _u64 ndims, _u64 npts, float norm) {
 39 |   auto vec = new float[ndims];
 40 |   auto vec_T = new int8_t[ndims];
 41 | 
 42 |   std::random_device         rd{};
 43 |   std::mt19937               gen{rd()};
 44 |   std::normal_distribution<> normal_rand{0, 1};
 45 | 
 46 |   for (_u64 i = 0; i < npts; i++) {
 47 |     float sum = 0;
 48 |     for (_u64 d = 0; d < ndims; ++d)
 49 |       vec[d] = normal_rand(gen);
 50 |     for (_u64 d = 0; d < ndims; ++d)
 51 |       sum += vec[d] * vec[d];
 52 |     for (_u64 d = 0; d < ndims; ++d)
 53 |       vec[d] = vec[d] * norm / std::sqrt(sum);
 54 | 
 55 |     for (_u64 d = 0; d < ndims; ++d) {
 56 |       vec_T[d] = std::round<int>(vec[d]);
 57 |     }
 58 | 
 59 |     writer.write((char*) vec_T, ndims * sizeof(int8_t));
 60 |   }
 61 | 
 62 |   delete[] vec;
 63 |   delete[] vec_T;
 64 |   return 0;
 65 | }
 66 | 
 67 | int block_write_uint8(std::ofstream& writer, _u64 ndims, _u64 npts,
 68 |                       float norm) {
 69 |   auto vec = new float[ndims];
 70 |   auto vec_T = new int8_t[ndims];
 71 | 
 72 |   std::random_device         rd{};
 73 |   std::mt19937               gen{rd()};
 74 |   std::normal_distribution<> normal_rand{0, 1};
 75 | 
 76 |   for (_u64 i = 0; i < npts; i++) {
 77 |     float sum = 0;
 78 |     for (_u64 d = 0; d < ndims; ++d)
 79 |       vec[d] = normal_rand(gen);
 80 |     for (_u64 d = 0; d < ndims; ++d)
 81 |       sum += vec[d] * vec[d];
 82 |     for (_u64 d = 0; d < ndims; ++d)
 83 |       vec[d] = vec[d] * norm / std::sqrt(sum);
 84 | 
 85 |     for (_u64 d = 0; d < ndims; ++d) {
 86 |       vec_T[d] = 128 + std::round<int>(vec[d]);
 87 |     }
 88 | 
 89 |     writer.write((char*) vec_T, ndims * sizeof(uint8_t));
 90 |   }
 91 | 
 92 |   delete[] vec;
 93 |   delete[] vec_T;
 94 |   return 0;
 95 | }
 96 | 
 97 | int main(int argc, char** argv) {
 98 |   std::string data_type, output_file;
 99 |   _u64        ndims, npts;
100 |   float       norm;
101 | 
102 |   try {
103 |     po::options_description desc{"Arguments"};
104 | 
105 |     desc.add_options()("help,h", "Print information on arguments");
106 | 
107 |     desc.add_options()("data_type",
108 |                        po::value<std::string>(&data_type)->required(),
109 |                        "data type <int8/uint8/float>");
110 |     desc.add_options()("output_file",
111 |                        po::value<std::string>(&output_file)->required(),
112 |                        "File name for saving the random vectors");
113 |     desc.add_options()("ndims,D", po::value<uint64_t>(&ndims)->required(),
114 |                        "Dimensoinality of the vector");
115 |     desc.add_options()("npts,N", po::value<uint64_t>(&npts)->required(),
116 |                        "Number of vectors");
117 |     desc.add_options()("norm", po::value<float>(&norm)->required(),
118 |                        "Norm of the vectors");
119 |     po::variables_map vm;
120 |     po::store(po::parse_command_line(argc, argv, desc), vm);
121 |     if (vm.count("help")) {
122 |       std::cout << desc;
123 |       return 0;
124 |     }
125 |     po::notify(vm);
126 |   } catch (const std::exception& ex) {
127 |     std::cerr << ex.what() << '\n';
128 |     return -1;
129 |   }
130 | 
131 |   if (data_type != std::string("float") && data_type != std::string("int8") &&
132 |       data_type != std::string("uint8")) {
133 |     std::cout << "Unsupported type. float, int8 and uint8 types are supported."
134 |               << std::endl;
135 |     return -1;
136 |   }
137 | 
138 |   if (norm <= 0.0) {
139 |     std::cerr << "Error: Norm must be a positive number" << std::endl;
140 |     return -1;
141 |   }
142 | 
143 |   if (data_type == std::string("int8") || data_type == std::string("uint8")) {
144 |     if (norm > 127) {
145 |       std::cerr
146 |           << "Error: for int8/uint8 datatypes, L2 norm can not be greater "
147 |              "than 127"
148 |           << std::endl;
149 |       return -1;
150 |     }
151 |   }
152 | 
153 |   try {
154 |     std::ofstream writer;
155 |     writer.exceptions(std::ofstream::failbit | std::ofstream::badbit);
156 |     writer.open(output_file, std::ios::binary);
157 |     auto npts_s32 = (_u32) npts;
158 |     auto ndims_s32 = (_u32) ndims;
159 |     writer.write((char*) &npts_s32, sizeof(_u32));
160 |     writer.write((char*) &ndims_s32, sizeof(_u32));
161 | 
162 |     _u64 blk_size = 131072;
163 |     _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
164 |     std::cout << "# blks: " << nblks << std::endl;
165 | 
166 |     int ret = 0;
167 |     for (_u64 i = 0; i < nblks; i++) {
168 |       _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
169 |       if (data_type == std::string("float")) {
170 |         ret = block_write_float(writer, ndims, cblk_size, norm);
171 |       } else if (data_type == std::string("int8")) {
172 |         ret = block_write_int8(writer, ndims, cblk_size, norm);
173 |       } else if (data_type == std::string("uint8")) {
174 |         ret = block_write_uint8(writer, ndims, cblk_size, norm);
175 |       }
176 |       if (ret == 0)
177 |         std::cout << "Block #" << i << " written" << std::endl;
178 |       else {
179 |         writer.close();
180 |         std::cout << "failed to write" << std::endl;
181 |         return -1;
182 |       }
183 |     }
184 |     writer.close();
185 | 
186 |   } catch (const std::exception& e) {
187 |     std::cout << std::string(e.what()) << std::endl;
188 |     diskann::cerr << "Index build failed." << std::endl;
189 |     return -1;
190 |   }
191 | 
192 |   return 0;
193 | }
194 | 


--------------------------------------------------------------------------------
/tests/utils/simulate_aggregate_recall.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include <cstdlib>
 6 | #include <random>
 7 | #include <cmath>
 8 | 
 9 | inline float aggregate_recall(const unsigned k_aggr, const unsigned k,
10 |                               const unsigned npart, unsigned* count,
11 |                               const std::vector<float>& recalls) {
12 |   float found = 0;
13 |   for (unsigned i = 0; i < npart; ++i) {
14 |     size_t max_found = std::min(count[i], k);
15 |     found += recalls[max_found - 1] * max_found;
16 |   }
17 |   return found / (float) k_aggr;
18 | }
19 | 
20 | void simulate(const unsigned k_aggr, const unsigned k, const unsigned npart,
21 |               const unsigned nsim, const std::vector<float>& recalls) {
22 |   std::random_device                 r;
23 |   std::default_random_engine         randeng(r());
24 |   std::uniform_int_distribution<int> uniform_dist(0, npart - 1);
25 | 
26 |   unsigned* count = new unsigned[npart];
27 |   double    aggr_recall = 0;
28 | 
29 |   for (unsigned i = 0; i < nsim; ++i) {
30 |     for (unsigned p = 0; p < npart; ++p) {
31 |       count[p] = 0;
32 |     }
33 |     for (unsigned t = 0; t < k_aggr; ++t) {
34 |       count[uniform_dist(randeng)]++;
35 |     }
36 |     aggr_recall += aggregate_recall(k_aggr, k, npart, count, recalls);
37 |   }
38 | 
39 |   std::cout << "Aggregate recall is " << aggr_recall / (double) nsim
40 |             << std::endl;
41 |   delete[] count;
42 | }
43 | 
44 | int main(int argc, char** argv) {
45 |   if (argc < 6) {
46 |     std::cout << argv[0]
47 |               << " k_aggregate k_out npart nsim recall@1 recall@2 ... recall@k"
48 |               << std::endl;
49 |     exit(-1);
50 |   }
51 | 
52 |   const unsigned k_aggr = atoi(argv[1]);
53 |   const unsigned k = atoi(argv[2]);
54 |   const unsigned npart = atoi(argv[3]);
55 |   const unsigned nsim = atoi(argv[4]);
56 | 
57 |   std::vector<float> recalls;
58 |   for (int ctr = 5; ctr < argc; ctr++) {
59 |     recalls.push_back(atof(argv[ctr]));
60 |   }
61 | 
62 |   if (recalls.size() != k) {
63 |     std::cerr << "Please input k numbers for recall@1, recall@2 .. recall@k"
64 |               << std::endl;
65 |   }
66 |   if (k_aggr > npart * k) {
67 |     std::cerr << "k_aggr must be <= k * npart" << std::endl;
68 |     exit(-1);
69 |   }
70 |   if (nsim <= npart * k_aggr) {
71 |     std::cerr << "Choose nsim > npart*k_aggr" << std::endl;
72 |     exit(-1);
73 |   }
74 | 
75 |   simulate(k_aggr, k, npart, nsim, recalls);
76 | 
77 |   return 0;
78 | }
79 | 


--------------------------------------------------------------------------------
/tests/utils/tsv_to_bin.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #include <iostream>
  5 | #include "utils.h"
  6 | 
  7 | // Parses npts * ndims whitespace-separated values from `reader` and appends
  8 | // them to `writer` as raw floats; warns on stderr if parsing failed.
  9 | void block_convert_float(std::ifstream& reader, std::ofstream& writer,
 10 |                          _u64 npts, _u64 ndims) {
 11 |   const _u64 nvals = npts * ndims;
 12 |   auto       read_buf = new float[nvals];  // exactly the values written below
 13 |   float      val = 0;
 14 |   for (_u64 i = 0; i < nvals; i++) {
 15 |     reader >> val;
 16 |     read_buf[i] = val;
 17 |   }
 18 |   // A failed stream means some value could not be read (short or bad input).
 19 |   if (reader.fail())
 20 |     std::cerr << "Warning: could not parse all values from input" << std::endl;
 21 |   writer.write((char*) read_buf, nvals * sizeof(float));
 22 |   delete[] read_buf;
 23 | }
 24 | 
 25 | // Parses npts * ndims whitespace-separated integers from `reader` and appends
 26 | // them to `writer` as raw int8_t bytes; warns on stderr if parsing failed.
 27 | void block_convert_int8(std::ifstream& reader, std::ofstream& writer, _u64 npts,
 28 |                         _u64 ndims) {
 29 |   const _u64 nvals = npts * ndims;
 30 |   auto       read_buf = new int8_t[nvals];
 31 |   int        val = 0;  // parsed as int, then narrowed to int8_t per element
 32 |   for (_u64 i = 0; i < nvals; i++) {
 33 |     reader >> val;
 34 |     read_buf[i] = (int8_t) val;
 35 |   }
 36 |   // A failed stream means some value could not be read (short or bad input).
 37 |   if (reader.fail())
 38 |     std::cerr << "Warning: could not parse all values from input" << std::endl;
 39 |   writer.write((char*) read_buf, nvals * sizeof(int8_t));
 40 |   delete[] read_buf;
 41 | }
 42 | 
 43 | // Parses npts * ndims whitespace-separated integers from `reader` and appends
 44 | // them to `writer` as raw uint8_t bytes; warns on stderr if parsing failed.
 45 | void block_convert_uint8(std::ifstream& reader, std::ofstream& writer,
 46 |                          _u64 npts, _u64 ndims) {
 47 |   const _u64 nvals = npts * ndims;
 48 |   auto       read_buf = new uint8_t[nvals];
 49 |   int        val = 0;  // parsed as int, then narrowed to uint8_t per element
 50 |   for (_u64 i = 0; i < nvals; i++) {
 51 |     reader >> val;
 52 |     read_buf[i] = (uint8_t) val;
 53 |   }
 54 |   // A failed stream means some value could not be read (short or bad input).
 55 |   if (reader.fail())
 56 |     std::cerr << "Warning: could not parse all values from input" << std::endl;
 57 |   writer.write((char*) read_buf, nvals * sizeof(uint8_t));
 58 |   delete[] read_buf;
 59 | }
 60 | 
 61 | // Converts a whitespace-separated text file of num_pts x dim values to a
 62 | // binary file: u32 num_pts, u32 dim, then the values in the requested type.
 63 | int main(int argc, char** argv) {
 64 |   if (argc != 6) {
 65 |     std::cout << argv[0]
 66 |               << " <float/int8/uint8> input_filename.tsv output_filename.bin "
 67 |                  "dim num_pts"
 68 |               << std::endl;
 69 |     exit(-1);
 70 |   }
 71 | 
 72 |   const std::string type(argv[1]);
 73 |   if (type != "float" && type != "int8" && type != "uint8") {
 74 |     std::cout << "Unsupported type. float, int8 and uint8 types are supported."
 75 |               << std::endl;
 76 |     exit(-1);  // do not go on to write a header with no data behind it
 77 |   }
 78 |   _u64 ndims = atoi(argv[4]);
 79 |   _u64 npts = atoi(argv[5]);
 80 | 
 81 |   std::ifstream reader(argv[2], std::ios::binary);
 82 |   if (!reader.is_open()) {
 83 |     std::cerr << "Could not open input file " << argv[2] << std::endl;
 84 |     exit(-1);
 85 |   }
 86 |   _u64 blk_size = 131072;
 87 |   _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
 88 |   std::cout << "# blks: " << nblks << std::endl;
 89 |   std::ofstream writer(argv[3], std::ios::binary);
 90 |   auto          npts_s32 = (_u32) npts;
 91 |   auto          ndims_s32 = (_u32) ndims;
 92 |   writer.write((char*) &npts_s32, sizeof(_u32));
 93 |   writer.write((char*) &ndims_s32, sizeof(_u32));
 94 | 
 95 |   for (_u64 i = 0; i < nblks; i++) {
 96 |     _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
 97 |     if (type == "float") {
 98 |       block_convert_float(reader, writer, cblk_size, ndims);
 99 |     } else if (type == "int8") {
100 |       block_convert_int8(reader, writer, cblk_size, ndims);
101 |     } else {
102 |       block_convert_uint8(reader, writer, cblk_size, ndims);
103 |     }
104 |     std::cout << "Block #" << i << " written" << std::endl;
105 |   }
106 |   reader.close();
107 |   writer.close();
108 | }
109 | 


--------------------------------------------------------------------------------
/tests/utils/uint32_to_uint8.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include "utils.h"
 6 | 
 7 | // Loads a uint32 .bin (argv[1]) and saves it as a uint8 .bin (argv[2]).
 8 | int main(int argc, char** argv) {
 9 |   if (argc != 3) {
10 |     std::cout << argv[0] << " input_uint32_bin output_uint8_bin" << std::endl;
11 |     exit(-1);
12 |   }
13 |   uint32_t* input = nullptr;
14 |   size_t    npts, nd;
15 |   diskann::load_bin<uint32_t>(argv[1], input, npts, nd);
16 |   uint8_t* output = new uint8_t[npts * nd];
17 |   diskann::convert_types<uint32_t, uint8_t>(input, output, npts, nd);
18 |   diskann::save_bin<uint8_t>(argv[2], output, npts, nd);
19 |   delete[] output;
20 |   delete[] input;
21 | }
22 | 


--------------------------------------------------------------------------------
/tests/utils/uint8_to_float.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation. All rights reserved.
 2 | // Licensed under the MIT license.
 3 | 
 4 | #include <iostream>
 5 | #include "utils.h"
 6 | 
 7 | // Loads a uint8 .bin (argv[1]) and saves it as a float .bin (argv[2]).
 8 | int main(int argc, char** argv) {
 9 |   if (argc != 3) {
10 |     std::cout << argv[0] << " input_uint8_bin output_float_bin" << std::endl;
11 |     exit(-1);
12 |   }
13 |   size_t   num_points, dim;
14 |   uint8_t* src;
15 |   diskann::load_bin<uint8_t>(argv[1], src, num_points, dim);
16 |   float* dst = new float[num_points * dim];
17 |   diskann::convert_types<uint8_t, float>(src, dst, num_points, dim);
18 |   diskann::save_bin<float>(argv[2], dst, num_points, dim);
19 |   delete[] dst;
20 |   delete[] src;
21 | }
22 | 


--------------------------------------------------------------------------------
/tests/utils/vector_analysis.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation. All rights reserved.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #include <omp.h>
  5 | #include <algorithm>
  6 | #include <chrono>
  7 | #include <cmath>
  8 | #include <cstdio>
  9 | #include <ctime>
 10 | #include <iostream>
 11 | #include <iterator>
 12 | #include <map>
 13 | #include <sstream>
 14 | #include <string>
 15 | #include "partition_and_pq.h"
 16 | #include "utils.h"
 17 | 
 18 | #include <fcntl.h>
 19 | #include <sys/stat.h>
 20 | #include <time.h>
 21 | #include <typeinfo>
 22 | 
 23 | // Prints every 5th percentile and the maximum of the points' L2 norms.
 24 | template<typename T>
 25 | int analyze_norm(std::string base_file) {
 26 |   std::cout << "Analyzing data norms" << std::endl;
 27 |   T*   data;
 28 |   _u64 npts, ndims;
 29 |   diskann::load_bin<T>(base_file, data, npts, ndims);
 30 |   std::vector<float> norms(npts, 0);
 31 | #pragma omp parallel for schedule(dynamic)
 32 |   for (_s64 i = 0; i < (_s64) npts; i++) {
 33 |     for (_u32 d = 0; d < ndims; d++)
 34 |       norms[i] += data[i * ndims + d] * data[i * ndims + d];
 35 |     norms[i] = std::sqrt(norms[i]);
 36 |   }
 37 |   std::sort(norms.begin(), norms.end());
 38 |   for (_u32 p = 0; npts > 0 && p < 100; p += 5)  // nothing to print if empty
 39 |     std::cout << "percentile " << p << ": "
 40 |               << norms[std::floor((p / 100.0) * npts)] << std::endl;
 41 |   if (npts > 0) std::cout << "percentile 100: " << norms[npts - 1] << std::endl;
 42 |   delete[] data;
 43 |   return 0;
 44 | }
 45 | 
 46 | // Loads base_file, divides each point by its L2 norm and saves to out_file.
 47 | template<typename T>
 48 | int normalize_base(std::string base_file, std::string out_file) {
 49 |   std::cout << "Normalizing base" << std::endl;
 50 |   T*   data;
 51 |   _u64 npts, ndims;
 52 |   diskann::load_bin<T>(base_file, data, npts, ndims);
 53 | #pragma omp parallel for schedule(dynamic)
 54 |   for (_s64 i = 0; i < (_s64) npts; i++) {
 55 |     float pt_norm = 0;
 56 |     for (_u32 d = 0; d < ndims; d++)
 57 |       pt_norm += data[i * ndims + d] * data[i * ndims + d];
 58 |     pt_norm = std::sqrt(pt_norm);
 59 |     for (_u32 d = 0; pt_norm > 0 && d < ndims; d++)  // skip all-zero points
 60 |       data[i * ndims + d] = data[i * ndims + d] / pt_norm;
 61 |   }
 62 |   diskann::save_bin<T>(out_file, data, npts, ndims);
 63 |   delete[] data;
 64 |   return 0;
 65 | }
 66 | 
 67 | // Loads base_file and saves out_file with one extra trailing coordinate per
 68 | // point. prep_base: scale by 1 / max norm, append sqrt(1 - |x|^2 / max^2).
 69 | // Otherwise: divide every point by its own norm and append 0.
 70 | template<typename T>
 71 | int augment_base(std::string base_file, std::string out_file,
 72 |                  bool prep_base = true) {
 73 |   std::cout << "Analyzing data norms" << std::endl;
 74 |   T*   data;
 75 |   _u64 npts, ndims;
 76 |   diskann::load_bin<T>(base_file, data, npts, ndims);
 77 |   std::vector<float> norms(npts, 0);  // squared L2 norm of each point
 78 | #pragma omp parallel for schedule(dynamic)
 79 |   for (_s64 i = 0; i < (_s64) npts; i++) {
 80 |     for (_u32 d = 0; d < ndims; d++)
 81 |       norms[i] += data[i * ndims + d] * data[i * ndims + d];
 82 |   }
 83 |   // Serial max: a shared max_norm updated by all threads was a data race.
 84 |   float max_norm = 0;
 85 |   for (_u64 i = 0; i < npts; i++)
 86 |     max_norm = norms[i] > max_norm ? norms[i] : max_norm;
 87 |   max_norm = std::sqrt(max_norm);
 88 |   std::cout << "Max norm: " << max_norm << std::endl;
 89 |   _u64 newdims = ndims + 1;
 90 |   T*   new_data = new T[npts * newdims];
 91 |   for (_u64 i = 0; i < npts; i++) {
 92 |     if (prep_base) {
 93 |       for (_u64 j = 0; j < ndims; j++)
 94 |         new_data[i * newdims + j] = data[i * ndims + j] / max_norm;
 95 |       float diff = 1 - (norms[i] / (max_norm * max_norm));
 96 |       diff = diff <= 0 ? 0 : std::sqrt(diff);
 97 |       new_data[i * newdims + ndims] = diff;
 98 |       if (diff <= 0)
 99 |         std::cout << i << " has large max norm, investigate if needed. diff = "
100 |                   << diff << std::endl;
101 |     } else {
102 |       for (_u64 j = 0; j < ndims; j++)
103 |         new_data[i * newdims + j] = data[i * ndims + j] / std::sqrt(norms[i]);
104 |       new_data[i * newdims + ndims] = 0;
105 |     }
106 |   }
107 |   diskann::save_bin<T>(out_file, new_data, npts, newdims);
108 |   delete[] new_data;
109 |   delete[] data;
110 |   return 0;
111 | }
112 | 
113 | // Runs the action selected by argv[3] on argv[2]; options 2-4 need argv[4].
114 | template<typename T>
115 | int aux_main(char** argv) {
116 |   std::string base_file(argv[2]);
117 |   _u32        option = atoi(argv[3]);
118 |   if (option == 1) return analyze_norm<T>(base_file);
119 |   if (option < 2 || option > 4 || argv[4] == nullptr) {  // argv ends in NULL
120 |     std::cout << "Invalid option or missing out_file" << std::endl;
121 |     return -1;
122 |   }
123 |   std::string out_file(argv[4]);
124 |   if (option == 4) return normalize_base<T>(base_file, out_file);
125 |   return augment_base<T>(base_file, out_file, option == 2);
126 | }
127 | 
128 | // Picks the element type from argv[1] and forwards argv to aux_main<T>.
129 | // Returns aux_main's status, or -1 when the data type is not supported.
130 | int main(int argc, char** argv) {
131 |   if (argc < 4) {
132 |     std::cout << argv[0]
133 |               << " data_type [float/int8/uint8] base_bin_file "
134 |                  "[option: 1-norm analysis, 2-prep_base_for_mip, "
135 |                  "3-prep_query_for_mip, 4-normalize-vecs] [out_file for "
136 |                  "options 2/3/4]"
137 |               << std::endl;
138 |     exit(-1);
139 |   }
140 | 
141 |   const std::string data_type(argv[1]);
142 |   if (data_type == "float") return aux_main<float>(argv);
143 |   if (data_type == "int8") return aux_main<int8_t>(argv);
144 |   if (data_type == "uint8") return aux_main<uint8_t>(argv);
145 | 
146 |   std::cout << "Unsupported type. Use float/int8/uint8." << std::endl;
147 |   return -1;
148 | }
149 | 


--------------------------------------------------------------------------------
/tests_data/l2_rand_float_10D_10K_norm1.0_self_gt10:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zilliztech/starling/17dc3e8a011533a62374445f53963e951b72883a/tests_data/l2_rand_float_10D_10K_norm1.0_self_gt10


--------------------------------------------------------------------------------
/tests_data/l2_rand_uint8_10D_10K_norm50.0_self_gt10:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zilliztech/starling/17dc3e8a011533a62374445f53963e951b72883a/tests_data/l2_rand_uint8_10D_10K_norm50.0_self_gt10


--------------------------------------------------------------------------------
/tests_data/rand_float_10D_10K_norm1.0.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zilliztech/starling/17dc3e8a011533a62374445f53963e951b72883a/tests_data/rand_float_10D_10K_norm1.0.bin


--------------------------------------------------------------------------------
/tests_data/rand_uint8_10D_10K_norm50.0.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zilliztech/starling/17dc3e8a011533a62374445f53963e951b72883a/tests_data/rand_uint8_10D_10K_norm50.0.bin


--------------------------------------------------------------------------------
/unit_tester.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Performs build and search test on disk and memory indices (parameters are tuned for 100K-1M sized datasets)
 3 | # All indices and logs will be stored in working_folder after run is complete
 4 | # To run, create a catalog text file consisting of the following entries
 5 | # For each dataset, specify the following 5 lines, in a line by line format, and then move on to next dataset
 6 | # dataset_name[used for save file names]
 7 | # /path/to/base.bin
 8 | # /path/to/query.bin
 9 | # data_type[float/uint8/int8]
10 | # metric[l2/mips]
11 | if [ "$#" -ne "3" ]; then
12 |   echo "usage: ./unit_test.sh [build_folder_path] [catalog] [working_folder]"
13 | else
14 | 
15 | BUILD_FOLDER=${1}
16 | CATALOG1=${2}
17 | WORK_FOLDER=${3}
18 | mkdir ${WORK_FOLDER}
19 | CATALOG="${WORK_FOLDER}/catalog_formatted.txt"
20 | sed -e '/^$/d' ${CATALOG1} > ${CATALOG}
21 | 
22 | echo Running unit testing on various files, with build folder as ${BUILD_FOLDER} and working folder as ${WORK_FOLDER}
23 | # download all unit test files
24 | 
25 | #iterate over them and run the corresponding test
26 | 
27 | 
28 | while IFS= read -r line; do
29 |   DATASET=${line}
30 |   read -r BASE
31 |   read -r QUERY
32 |   read -r TYPE
33 |   read -r METRIC
34 |   GT="${WORK_FOLDER}/${DATASET}_gt30_${METRIC}"
35 |   MEM="${WORK_FOLDER}/${DATASET}_mem"
36 |   DISK="${WORK_FOLDER}/${DATASET}_disk"
37 |   MBLOG="${WORK_FOLDER}/${DATASET}_mb.log"
38 |   DBLOG="${WORK_FOLDER}/${DATASET}_db.log"
39 |   MSLOG="${WORK_FOLDER}/${DATASET}_ms.log"
40 |   DSLOG="${WORK_FOLDER}/${DATASET}_ds.log"
41 |   
42 |   FILESIZE=`wc -c "${BASE}" | awk '{print $1}'`
43 |   BUDGETBUILD=`bc <<< "scale=4; 0.0001 + ${FILESIZE}/(5*1024*1024*1024)"`
44 |   BUDGETSERVE=`bc <<< "scale=4; 0.0001 + ${FILESIZE}/(10*1024*1024*1024)"`
45 |   echo "============================================================================================================================================="
46 |   echo "Running tests on ${DATASET} dataset, ${TYPE} datatype, $METRIC metric, ${BUDGETBUILD} GiB and ${BUDGETSERVE} GiB build and serve budget"
47 |   echo "============================================================================================================================================="
48 |   rm ${DISK}_*
49 |   
50 |   #echo "Going to run test on ${BASE} base, ${QUERY} query, ${TYPE} datatype, ${METRIC} metric, saving gt at ${GT}"
51 |   echo "Computing Groundtruth"
52 |   ${BUILD_FOLDER}/tests/utils/compute_groundtruth ${TYPE} ${BASE} ${QUERY} 30 ${GT} ${METRIC} > /dev/null
53 |   echo "Building Mem Index"
54 |   /usr/bin/time ${BUILD_FOLDER}/tests/build_memory_index ${TYPE} ${METRIC} ${BASE} ${MEM}  32  50  1.2 0 > ${MBLOG}
55 |   awk '/^Degree/' ${MBLOG}
56 |   awk '/^Indexing/' ${MBLOG}
57 |   echo "Searching Mem Index"
58 |   ${BUILD_FOLDER}/tests/search_memory_index ${TYPE} ${METRIC} ${BASE} ${MEM} 16 ${QUERY} ${GT} 10 /tmp/res 10 20 30 40 50 60 70 80 90 100 > ${MSLOG}
59 |   awk '/===/{x=NR+10}(NR<=x){print}' ${MSLOG}
60 |   echo "Building Disk Index"
61 |   ${BUILD_FOLDER}/tests/build_disk_index  ${TYPE} ${METRIC} ${BASE} ${DISK} 32 50 ${BUDGETSERVE} ${BUDGETBUILD} 32 0 > ${DBLOG}
62 |   awk '/^Compressing/' ${DBLOG}
63 |   echo "#shards in disk index"
64 |   awk '/^Indexing/' ${DBLOG}
65 |   echo "Searching Disk Index"
66 |   ${BUILD_FOLDER}/tests/search_disk_index ${TYPE} ${METRIC} ${DISK} 10000 10 4 ${QUERY} ${GT} 10 /tmp/res 20 40 60 80 100 > ${DSLOG}
67 |   echo "# shards used during index construction:"
68 |   awk '/medoids/{x=NR+1}(NR<=x){print}' ${DSLOG}
69 |   awk '/===/{x=NR+10}(NR<=x){print}' ${DSLOG}
70 | done < "${CATALOG}"
71 | fi
72 | 


--------------------------------------------------------------------------------
/windows/packages.config.in:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <packages>
 3 |   <package id="boost" version="1.78.0" />
 4 |   <!-- This will be replaced by CMake with the corresponding VisualStudio
 5 |   version (e.g. vc142 for Visual Studio 2019). -->
 6 |   <package id="boost_program_options-vc${MSVC_TOOLSET_VERSION}" version="1.78.0" />
 7 |   <package id="intelopenmp.redist.win" version="2022.0.3.3747" />
 8 |   <package id="intelopenmp.devel.win" version="2022.0.3.3747" />
 9 |   <package id="intelmkl.static.win-x64" version="2022.0.3.171" />
10 | </packages>
11 | 


--------------------------------------------------------------------------------
/workflows/in_memory_index.md:
--------------------------------------------------------------------------------
 1 | **Usage for in-memory indices**
 2 | ================================
 3 | 
 4 | To generate index, use the `tests/build_memory_index` program. 
 5 | --------------------------------------------------------------
 6 | 
 7 | The arguments are as follows:
 8 | 
 9 | 1. **--data_type**: The type of dataset you wish to build an index on. float(32 bit), signed int8 and unsigned uint8 are supported. 
10 | 2. **--dist_fn**: There are two distance functions supported: minimum Euclidean distance (l2) and maximum inner product (mips).
11 | 3. **--data_file**: The input data over which to build an index, in .bin format. The first 4 bytes represent number of points as integer. The next 4 bytes represent the dimension of data as integer. The following `n*d*sizeof(T)` bytes contain the contents of the data, one data point at a time. sizeof(T) is 1 for byte indices, and 4 for float indices. This will be read by the program as int8_t for signed indices, uint8_t for unsigned indices or float for float indices.
12 | 4. **--index_path_prefix**: The constructed index components will be saved to this path prefix.
13 | 5. **-R (--max_degree)** (default is 64): the degree of the graph index, typically between 32 and 150. Larger R will result in larger indices and longer indexing times, but might yield better search quality. 
14 | 6. **-L (--Lbuild)** (default is 100): the size of search list we maintain during index building. Typical values are between 75 to 400. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. Ensure that value of L is at least that of R value unless you need to build indices really quickly and can somewhat compromise on quality. 
15 | 7. **--alpha** (default is 1.2): A float value between 1.0 and 1.5 which determines the diameter of the graph, which will be approximately *log n* to the base alpha. Typical values are between 1 to 1.5. 1 will yield the sparsest graph, 1.5 will yield denser graphs. 
16 | 8. **-T (--num_threads)** (default is `omp_get_num_procs()`): number of threads used by the index build process. Since the code is highly parallel, the indexing time improves almost linearly with the number of threads (subject to the cores available on the machine and DRAM bandwidth).
17 | 
18 | 
19 | To search the generated index, use the `tests/search_memory_index` program:
20 | ---------------------------------------------------------------------------
21 | 
22 | 
23 | The arguments are as follows:
24 | 
25 | 1. **data_type**: The type of dataset you built the index on. float(32 bit), signed int8 and unsigned uint8 are supported. Use the same data type as in arg (1) above used in building the index.
26 | 2. **dist_fn**: There are two distance functions supported: l2 and mips. There is an additional *fast_l2* implementation that could provide faster results for small (about a million-sized) indices. Use the same distance as in arg (2) above used in building the index.
27 | 3. **memory_index_path**: index built above in argument (4).
28 | 4. **T**: The number of threads used for searching. Threads run in parallel and one thread handles one query at a time. More threads will result in higher aggregate query throughput, but may lead to higher per-query latency, especially if the DRAM bandwidth is a bottleneck. So find the balance depending on throughput and latency required for your application.
29 | 5. **query_bin**: The queries to be searched on, in the same binary file format as the data file described in argument (3) of the build program above. The query file must be the same type as in argument (1).
30 | 6. **truthset.bin**: The ground truth file for the queries in arg (5) and data file used in index construction.  The binary file must start with *n*, the number of queries (4 bytes), followed by *d*, the number of ground truth elements per query (4 bytes), followed by `n*d` entries per query representing the d closest IDs per query in integer format,  followed by `n*d` entries representing the corresponding distances (float). Total file size is `8 + 4*n*d + 4*n*d` bytes. The groundtruth file, if not available, can be calculated using the program `tests/utils/compute_groundtruth`. Use "null" if you do not have this file and if you do not want to compute recall.
31 | 7. **K**: search for *K* neighbors and measure *K*-recall@*K*, meaning the intersection between the retrieved top-*K* nearest neighbors and ground truth *K* nearest neighbors.
32 | 8. **result_output_prefix**: search results will be stored in files, one per L value (see next arg), with specified prefix, in binary format.
33 | 9. **-L (--search_list)**: A list of search_list sizes to perform search with. Larger parameters will result in slower latencies, but higher accuracies. Must be at least the value of *K* in (7).
34 | 
35 | 
36 | Example with BIGANN:
37 | --------------------
38 | 
39 | This example demonstrates the use of the commands above on a 100K slice of the [BIGANN dataset](http://corpus-texmex.irisa.fr/) with 128 dimensional SIFT descriptors applied to images. 
40 | 
41 | Download the base and query set and convert the data to binary format
42 | ```bash
43 | mkdir -p DiskANN/build/data && cd DiskANN/build/data
44 | wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
45 | tar -xf sift.tar.gz
46 | cd ..
47 | ./tests/utils/fvecs_to_bin data/sift/sift_learn.fvecs data/sift/sift_learn.fbin
48 | ./tests/utils/fvecs_to_bin data/sift/sift_query.fvecs data/sift/sift_query.fbin
49 | ```
50 | 
51 | Now build and search the index and measure the recall using ground truth computed by brute force. 
52 | ```bash
53 | ./tests/utils/compute_groundtruth  --data_type float --dist_fn l2 --base_file data/sift/sift_learn.fbin --query_file  data/sift/sift_query.fbin --gt_file data/sift/sift_query_learn_gt100 --K 100
54 | ./tests/build_memory_index  --data_type float --dist_fn l2 --data_path data/sift/sift_learn.fbin --index_path_prefix data/sift/index_sift_learn_R32_L50_A1.2 -R 32 -L 50 --alpha 1.2
55 |  ./tests/search_memory_index  --data_type float --dist_fn l2 --index_path_prefix data/sift/index_sift_learn_R32_L50_A1.2 --query_file data/sift/sift_query.fbin  --gt_file data/sift/sift_query_learn_gt100 -K 10 -L 10 20 30 40 50 100 --result_path data/sift/res
56 |  ```
57 |  
58 | 
59 |  The output of search lists the throughput (Queries/sec) as well as the mean and 99.9th percentile latency in microseconds for each `L` parameter provided. (We measured on a 32-core 64-vCPU D-series Azure VM)
60 |  ```
61 |   Ls        QPS      Avg dist cmps  Mean Latency (mus)   99.9 Latency   Recall@10
62 | =================================================================================
63 |   10   319901.78            348.93              174.51        4943.35       97.80
64 |   20   346572.72            525.85              183.36         376.60       98.93
65 |   30   292060.12            688.86              217.73         421.60       99.30
66 |   40   248945.22            841.74              255.41         476.80       99.45
67 |   50   215888.81            986.67              294.62         542.21       99.56
68 |  100   129711.39           1631.94              490.58         848.61       99.88
69 |  ```
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------