├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── binReader.py
├── generateRandomTokens.py
├── getGT-filterSearch.py
├── getGT-filterSearchVariable.py
├── include
    ├── .DS_Store
    ├── FilterIndex.h
    ├── bliss
    │   ├── binReader.py
    │   ├── config.py
    │   ├── construct.py
    │   ├── dataPrepare.py
    │   ├── dataPrepare_constrained.py
    │   ├── index.py
    │   ├── net.py
    │   ├── query.py
    │   ├── train.py
    │   └── utils.py
    ├── cluster.h
    ├── readfile.h
    └── utils.h
├── plots.py
└── src
    ├── .DS_Store
    ├── FilterIndexHamming.cpp
    ├── FilterIndexHammingThNprobe.cpp
    ├── index.cpp
    ├── libfaiss.a
    ├── query.cpp
    ├── readfile.cpp
    └── utils.cpp


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.bin
 2 | index
 3 | query
 4 | sift*/
 5 | indices/
 6 | indices2/
 7 | results2/
 8 | plots.py   
 9 | script.sh 
10 | unused/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | CXX=g++
 3 | CFLAGS = -std=gnu++17 -lgfortran -Wall -O3 -w -mavx
 4 | INC=-I faiss -I include/
 5 | LFLAGS=faiss/build/faiss/libfaiss.a OpenBLAS/libopenblas.a -lpthread -lm -ldl -lgfortran -fopenmp
 6 | 
 7 | index: clean_index
 8 | 	$(CXX) $(INC) $(CFLAGS) src/readfile.cpp \
 9 | 							src/utils.cpp \
10 | 							src/FilterIndexHamming.cpp \
11 | 							src/index.cpp \
12 | 	-o index $(LFLAGS)
13 | 
14 | query: clean_query
15 | 	$(CXX) $(INC) $(CFLAGS) src/readfile.cpp \
16 | 							src/utils.cpp \
17 | 							src/FilterIndexHamming.cpp \
18 | 							src/query.cpp \
19 | 	-o query $(LFLAGS)
20 | 	
21 | clean_index:
22 | 	rm -f index
23 | clean_query:
24 | 	rm -f query
25 | 
26 | .PHONY: clean all
27 | 
28 | debug: CXXFLAGS += -DDEBUG -g
29 | debug: all
30 | 
31 | release: CXXFLAGS += -O2
32 | release: all
33 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | With the surging popularity of approximate near-neighbor search (ANNS), driven by advances in neural representation learning, the ability to serve queries accompanied by a set of constraints has become an area of intense interest. While the community has recently proposed several algorithms for constrained ANNS, almost all of these methods focus on integration with graph-based indexes, the predominant class of algorithms achieving state-of-the-art performance in latency-recall tradeoffs. In this work, we take a different approach and focus on developing a constrained ANNS algorithm via space partitioning as opposed to graphs. To that end, we introduce Constrained Approximate Partitioned Search (CAPS), an index for ANNS with filters via space partitions that not only retains the benefits of a partition-based algorithm but also outperforms state-of-the-art graph-based constrained search techniques in recall-latency tradeoffs, with only 10% of the index size.
 3 | 
 4 | 
 5 | Install (only for Faiss Kmeans clustering)
 6 | - Faiss
 7 |    ```
 8 |    cd ..
 9 |    git clone https://github.com/facebookresearch/faiss.git
10 |    cd faiss
11 |    cmake -B build .
12 |    make -C build -j faiss
13 |    make -C build install
14 |    ```
15 |    This will generate the libfaiss.a
16 | 
17 | - OpenBLAS
18 |   ```
19 |   git clone https://github.com/xianyi/OpenBLAS.git
20 |   make
21 |   ```
22 |   This will generate the libopenblas.a
23 |   
24 | 
25 | Provide path at INC and LFLAGS in Makefile
26 | 
27 | - INC=-I faiss -I include/
28 | - LFLAGS=faiss/build/faiss/libfaiss.a OpenBLAS/libopenblas.a -lpthread -lm -ldl -lgfortran -fopenmp
29 | 
30 | 
31 | Get Data:
32 | - Download from https://github.com/AshenOn3/NHQ
33 | - To generate synthetic data: run generateRandomTokens.py and then getGT-filterSearch.py, changing the attribute length (default = 3) each time, to produce the synthetic attributes and ground truth.
34 |  
35 | For your own data
36 | - base vectors and query vectors are stored in .fvecs format
37 | - base and query attributes are stored in .txt files. 
38 | - Example -
39 | ```
40 | <num points> <num attributes>
41 | 2 outdoor night
42 | 1 indoor daytime
43 | 3 outdoor night
44 | 2 indoor daytime
45 | 3 outdoor daytime
46 | ```
47 |  
48 | Where "2 outdoor night" is an example of a point with 3 space-separated attributes.
49 | 
50 | Make sure to have these files in the data folder
51 | ```
52 | data/sift/base.fvecs 
53 | data/sift/label_base_3.txt
54 | data/sift/query.fvecs 
55 | data/sift/label_query_3.txt 
56 | data/sift/label_3_hard_groundtruth.ivecs
57 | ```
58 | 
59 | If using bliss run
60 | ```
61 | python3 include/bliss/dataPrepare_constrained.py --data="data/sift"
62 | python3 include/bliss/construct.py --index='sift_epc40_K10_B1024_R1' --hdim=256 --mode=1 --kn=10
63 | make index
64 | ./index data/sift/base.fvecs data/sift/label_base_3.txt indices/sift1024blissMode1 1024 bliss 1
65 | make query
66 | ./query data/sift/base.fvecs data/sift/label_base_3.txt data/sift/query.fvecs data/sift/label_query_3.txt indices/sift1024blissMode1 data/sift/label_3_hard_groundtruth.ivecs 1024 bliss 1 500
67 | ```
68 | 
69 | If using faiss kmeans run
70 | ```
71 | make index
72 | ./index data/sift/base.fvecs data/sift/label_base_3.txt indices/sift1024blissMode1 1024 kmeans 1
73 | make query
74 | ./query data/sift/base.fvecs data/sift/label_base_3.txt data/sift/query.fvecs data/sift/label_query_3.txt indices/sift1024blissMode1 data/sift/label_3_hard_groundtruth.ivecs 1024 kmeans 1 500
75 | ```
76 | 
77 | Functionalities: 
78 | - Variable number of attributes
79 | - AND among attributes
80 | - Large number of attributes
81 | 


--------------------------------------------------------------------------------
/binReader.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pdb
 3 | def fvecs_read(filename, c_contiguous=True):
 4 |     fv = np.fromfile(filename, dtype=np.float32)
 5 |     if fv.size == 0:
 6 |         return np.zeros((0, 0))
 7 |     dim = fv.view(np.int32)[0]
 8 |     assert dim > 0
 9 |     fv = fv.reshape(-1, 1 + dim)
10 |     if not all(fv.view(np.int32)[:, 0] == dim):
11 |         raise IOError("Non-uniform vector sizes in " + filename)
12 |     fv = fv[:, 1:]
13 |     if c_contiguous:
14 |         fv = fv.copy()
15 |     return fv
16 | 
def ivecs_read(fname):
    """Read an ``.ivecs`` file into a 2-D int32 array.

    The file is parsed as int32 records, each starting with a dimension
    field; the dimension of the first record is used for every row.

    Args:
        fname: Path of the file to read.

    Returns:
        Array of shape ``(num_rows, d)`` with the dimension column removed;
        an empty ``(0, 0)`` int32 array when the file is empty.
    """
    a = np.fromfile(fname, dtype='int32')
    if a.size == 0:
        # Mirror fvecs_read: an empty file yields an empty array, not IndexError.
        return np.zeros((0, 0), dtype='int32')
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy()
21 | 
def ibin_read(fname):
    """Load an int32 ``.ibin`` file as a 2-D array.

    The first two int32 values form a header: a row count, which is not
    needed because the row count is inferred while reshaping, and the row
    width. Everything after the header is the payload.
    """
    raw = np.fromfile(fname, dtype='int32')
    width = raw[1]
    payload = raw[2:]
    return payload.reshape(-1, width).copy()
28 | 
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/generateRandomTokens.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import random
 3 | import pdb
 4 | 
 5 | def createLargeAttributes(Na, Nb, Nq, datasetname):
 6 |     base_file = open(datasetname+"/label_base_{}.txt".format(Na), "w")
 7 |     query_file = open(datasetname+"/label_query_{}.txt".format(Na), "w")
 8 |     rn = 0.2 +0.8*np.random.random((Nb,Na))
 9 |     qidx = random.sample(range(Nb), Nq)
10 | 
11 |     factor = np.floor((np.random.random(Na)*2)**3) +2
12 |     print (factor)
13 |     rn = np.log(1/rn)
14 |     for a in range(Na):
15 |         rn[:,a] = np.floor(factor[a]*rn[:,a])
16 |         print (np.unique(rn[:,a]))
17 |     rn = rn.astype(np.int32)
18 |     base_file.write("{} {}\n".format(Nb, Na))
19 |     query_file.write("{} {}\n".format(Nq, Na))
20 |     for i in range(Nb):
21 |         base_file.write("{}_{}".format(0,rn[i,0]))
22 |         for a in range(1,Na):
23 |             constrint = "{}_{}".format(a,rn[i,a])
24 |             base_file.write(" " + constrint)
25 |         base_file.write("\n")
26 |     for i in qidx:
27 |         query_file.write("{}_{}".format(0,rn[i,0]))
28 |         for a in range(1, Na):
29 |             constrint = "{}_{}".format(a,rn[i,a])
30 |             query_file.write(" " + constrint)
31 |         query_file.write("\n")
32 | 
# createLargeAttributes(3, 53387, 200, "audio")
# createLargeAttributes(3, 992272, 200, "msong")
# Generate 10- and 100-attribute label files (10000 queries each) per dataset.
for dataset, num_base in (("glove-100", 1183514), ("gist", 1000000)):
    for num_attr in (10, 100):
        createLargeAttributes(num_attr, num_base, 10000, dataset)
39 | 


--------------------------------------------------------------------------------
/getGT-filterSearch.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | sys.path.insert(0, '../')
 4 | from binReader import *
 5 | import time
 6 | import pdb
 7 | from tqdm import tqdm
 8 | import argparse
 9 | 
# Command-line interface: --data is the dataset directory; every input and
# output path used by this script is built by appending a file name to it.
parser = argparse.ArgumentParser()
parser.add_argument("--data", default='nthing', type=str)
args = parser.parse_args()
13 | 
def getInvertedIndex(sentencesTokenised):
    """Build an inverted index over rows of tokens.

    Returns a dict mapping each token to the list of row positions in which
    it occurs, in increasing order; a token repeated inside one row
    contributes that row position once per occurrence.
    """
    postings = {}
    for row_id, tokens in enumerate(sentencesTokenised):
        for token in tokens:
            postings.setdefault(token, []).append(row_id)
    return postings
24 | 
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    Order and duplicates follow ``lst1``; membership is tested against a set
    built once from ``lst2``.
    """
    lookup = set(lst2)
    kept = []
    for item in lst1:
        if item in lookup:
            kept.append(item)
    return kept
30 | 
# attr = 3
TOPK = 100  # neighbours stored per query in the ground-truth file
dataname = args.data
trainPath = dataname+"/base.fvecs"
testPath = dataname+"/query.fvecs"
trainConstPath = dataname+"/label_base_3.txt"
testConstPath = dataname+"/label_query_3.txt"
print (trainPath, testPath, trainConstPath, testConstPath)

train = fvecs_read(trainPath, c_contiguous=True)
test = fvecs_read(testPath, c_contiguous=True)
# The first line of each label file is a header, hence skip_header=1.
trainConst = np.genfromtxt(trainConstPath, skip_header=1, delimiter=" ",dtype='str')
testConst = np.genfromtxt(testConstPath, skip_header=1, delimiter=" ",dtype='str')

# get inv index on constraint
Invbins = getInvertedIndex(trainConst)
# 0.5*||x||^2 - x.q differs from 0.5*||x - q||^2 only by a per-query constant,
# so it ranks the base points exactly like the squared L2 distance.
norms = 0.5*(np.linalg.norm(train,axis=1))**2

# do filter then search
# get neighbors
nt = test.shape[0]
largest_indices = []
print ("starting")
print (time.time())
for i in tqdm(range(0, nt)):
    # A label that never occurs in the base set matches no point at all.
    a = [Invbins.get(key, []) for key in testConst[i]]
    candidates = a[0]
    for k in range(1,len(a)):
        candidates = intersection(candidates,a[k])
    # Explicit integer dtype keeps an empty candidate list usable as an index.
    candidates = np.array(candidates, dtype=np.int64)
    dist = norms[candidates] - train[candidates, :]@test[i]
    if len(dist) > TOPK:
        temp = np.argpartition(dist, TOPK)[:TOPK]
        temp = temp[np.argsort(dist[temp])]
    else:
        temp = np.argsort(dist)
    # Translate positions into base ids BEFORE padding: padding the position
    # array with -1 and then indexing would turn every pad into candidates[-1].
    neighbors = candidates[temp]
    neighbors = np.append(neighbors, -np.ones(TOPK-len(neighbors), dtype=np.int64))
    assert len(neighbors) == TOPK
    largest_indices.append(neighbors)

print (len(largest_indices), nt)
a = np.array(largest_indices).astype('int32')
N,d = a.shape
# ivecs layout: every row is prefixed with its own length.
a = np.column_stack((d*np.ones(N, dtype='int32'),a)).flatten()
a.tofile(dataname+"/label_3_hard_groundtruth.ivecs")
78 | 


--------------------------------------------------------------------------------
/getGT-filterSearchVariable.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | sys.path.insert(0, '../')
 4 | from binReader import *
 5 | import time
 6 | import pdb
 7 | from tqdm import tqdm
 8 | import argparse
 9 | 
# Command-line interface: --data is the dataset directory; every input and
# output path used by this script is built by appending a file name to it.
parser = argparse.ArgumentParser()
parser.add_argument("--data", default='nthing', type=str)
args = parser.parse_args()
13 | 
def getInvertedIndex(sentencesTokenised):
    """Build an inverted index over rows of tokens.

    Returns a dict mapping each token to the list of row positions in which
    it occurs, in increasing order; a token repeated inside one row
    contributes that row position once per occurrence.
    """
    postings = {}
    for row_id, tokens in enumerate(sentencesTokenised):
        for token in tokens:
            postings.setdefault(token, []).append(row_id)
    return postings
24 | 
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    Order and duplicates follow ``lst1``; membership is tested against a set
    built once from ``lst2``.
    """
    lookup = set(lst2)
    kept = []
    for item in lst1:
        if item in lookup:
            kept.append(item)
    return kept
30 | 
# attr = 3
TOPK = 100  # neighbours stored per query in the ground-truth file
dataname = args.data
trainPath = dataname+"/base.fvecs"
testPath = dataname+"/query.fvecs"
trainConstPath = dataname+"/label_base_100.txt"
testConstPath = dataname+"/label_query_100.txt"
print (trainPath, testPath, trainConstPath, testConstPath)

train = fvecs_read(trainPath, c_contiguous=True)
test = fvecs_read(testPath, c_contiguous=True)
# The first line of each label file is a header, hence skip_header=1.
trainConst = np.genfromtxt(trainConstPath, skip_header=1, delimiter=" ",dtype='str')
testConst = np.genfromtxt(testConstPath, skip_header=1, delimiter=" ",dtype='str')

# get inv index on constraint
Invbins = getInvertedIndex(trainConst)
# 0.5*||x||^2 - x.q differs from 0.5*||x - q||^2 only by a per-query constant,
# so it ranks the base points exactly like the squared L2 distance.
norms = 0.5*(np.linalg.norm(train,axis=1))**2

# for Probab in [0.1, 0.3, 0.5, 0.7, 0.9]:
for Probab in [1-0.03]:
    # Mask a fresh copy: masking testConst in place would let the "X" entries
    # accumulate from one Probab value to the next.
    maskedConst = testConst.copy()
    select = np.random.uniform(low=0.0, high=1.0, size=(maskedConst.shape[0],maskedConst.shape[1]))
    # Each attribute is replaced by the wildcard "X" with probability Probab.
    select = select<Probab
    maskedConst[select] = "X"
    # save the masked query labels, with the same header as the input file
    with open(dataname + "/label_query_100_{}.txt".format(Probab), "w") as file:
        file.write("{} {}\n".format(maskedConst.shape[0], maskedConst.shape[1]))
        np.savetxt(file, maskedConst, delimiter=" ", fmt="%s")

    nt = test.shape[0]
    largest_indices = []
    print ("starting")
    print (time.time())
    for i in tqdm(range(0, nt)):
        # A label that never occurs in the base set matches no point at all.
        a = [Invbins.get(key, []) for key in maskedConst[i] if key != "X"]
        if len(a)==0: # regular NNS: every attribute is masked
            candidates = None
            dist = norms -train@test[i]
        else:           # filtered NNS
            candidates = a[0]
            for k in range(1,len(a)):
                candidates = intersection(candidates,a[k])
            # Explicit integer dtype keeps an empty candidate list usable as an index.
            candidates = np.array(candidates, dtype=np.int64)
            dist = norms[candidates]  -train[candidates, :]@test[i]
        if len(dist) > TOPK:
            temp = np.argpartition(dist, TOPK)[:TOPK]
            temp = temp[np.argsort(dist[temp])]
        else:
            temp = np.argsort(dist)
        # Translate positions into base ids BEFORE padding: padding the
        # position array with -1 and then indexing would turn every pad into
        # candidates[-1].
        neighbors = temp if candidates is None else candidates[temp]
        neighbors = np.append(neighbors, -np.ones(TOPK-len(neighbors), dtype=np.int64))
        assert len(neighbors) == TOPK
        largest_indices.append(neighbors)
    print (len(largest_indices), nt)
    a = np.array(largest_indices).astype('int32')
    N,d = a.shape
    # ivecs layout: every row is prefixed with its own length.
    a = np.column_stack((d*np.ones(N, dtype='int32'),a)).flatten()
    a.tofile(dataname+"/label_100_{}_hard_groundtruth.ivecs".format(Probab))
93 | 


--------------------------------------------------------------------------------
/include/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gaurav16gupta/constrainedANN/61eae4c56dc0d6c9906bee608cecbf23b11e0260/include/.DS_Store


--------------------------------------------------------------------------------
/include/FilterIndex.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <iostream>
 3 | #include <fstream>
 4 | #include <string>
 5 | #include <cstdlib>
 6 | #include <vector>
 7 | #include <set>
 8 | #include <iterator>
 9 | #include <stdlib.h>     
10 | #include <numeric>
11 | #include <algorithm>
12 | #include <string> 
13 | #include <cstdint>
14 | #include <map>
15 | #include <chrono>
16 | 
17 | #include <cstdio>
18 | #include "faiss/AutoTune.h"
19 | #include "faiss/index_factory.h"
20 | #include "faiss/index_io.h"
21 | 
22 | #include "cluster.h"
23 | using namespace std;
24 | 
// Declaration of the filtered nearest-neighbour index: construction, building
// and loading of the index, and attribute-constrained querying. All members
// are public. Only declarations are visible in this header; behavioural notes
// below are therefore hedged and should be confirmed against the .cpp files.
// NOTE(review): unordered_map is used below but <unordered_map> is not among
// the headers included directly by this file; presumably it arrives
// transitively (faiss headers or cluster.h) -- confirm.
class FilterIndex
{
    public:
        // Takes the raw vectors, three sizes (presumably dimension, number of
        // base points and number of clusters -- confirm), the per-point
        // attribute strings, a clustering algorithm name and a mode flag.
        FilterIndex(float* data, size_t d_, size_t nb_, size_t nc_, vector<vector<string>>properties_, string algo, int mode);
        void get_index(string metric, string indexpath, int mode);
        void get_cluster_propertiesIndex();

        void loadIndex(string indexpath);
        // Batch entry point: nq queries with one attribute list per query.
        void query(float* queryset, int nq, vector<vector<string>> queryprops, int num_results, int num_mini_probes);
        // Single-query search; qnum is presumably the query's position in the
        // batch (e.g. its row in neighbor_set) -- confirm.
        void findNearestNeighbor(float* query, vector<string> Stprops, int num_results, int nprobe, size_t qnum);
        vector<uint32_t> satisfyingIDs(vector<uint16_t> props);
        void get_mc_propertiesIndex();
        // bool not_in(uint16_t x, vector<pair<uint16_t, pair<uint16_t, int>>> &maxMC);

        // Raw buffers. Ownership and lifetime are not visible from this
        // header -- TODO confirm who allocates and frees them.
        float *dataset; //use <dtype> array instead
        float *dataset_reordered;
        float *centroids; 
        float* cen_norms;
        float* data_norms;
        float *data_norms_reordered;
        uint32_t* invLookup;
        uint32_t* Lookup; 
        uint32_t* counts;
        int32_t* neighbor_set;
        int treelen;
        int numAttr;

        cluster* clusterAlgo; // parent class
        unordered_map<uint8_t, uint8_t>PrpAtrMap;
        // Stored as uint8_t although the constructor receives strings, so the
        // attributes are evidently encoded to small integer ids somewhere.
        vector<vector<uint8_t>>properties;
        uint8_t* properties_reordered;
        vector<vector<uint16_t>> ClusterProperties;//properties of each cluster
        // vector<pair<uint16_t, pair<uint16_t, int>>> maxMC;
        uint16_t* maxMC;

        // d, nb, nc presumably mirror the constructor's d_, nb_, nc_ -- confirm.
        uint32_t d, nb, nc, k;
        unordered_map<uint16_t, vector<uint32_t>> inverted_index; //can use other efficient maps
        // String -> 16-bit id lookup; presumably attribute string to property
        // id -- confirm.
        unordered_map<string, uint16_t> prLook; 
};


--------------------------------------------------------------------------------
/include/bliss/binReader.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pdb
 3 | def fvecs_read(filename, c_contiguous=True):
 4 |     fv = np.fromfile(filename, dtype=np.float32)
 5 |     if fv.size == 0:
 6 |         return np.zeros((0, 0))
 7 |     dim = fv.view(np.int32)[0]
 8 |     assert dim > 0
 9 |     fv = fv.reshape(-1, 1 + dim)
10 |     if not all(fv.view(np.int32)[:, 0] == dim):
11 |         raise IOError("Non-uniform vector sizes in " + filename)
12 |     fv = fv[:, 1:]
13 |     if c_contiguous:
14 |         fv = fv.copy()
15 |     return fv
16 | 
def ivecs_read(fname):
    """Read an ``.ivecs`` file into a 2-D int32 array.

    The file is loaded as a flat int32 buffer whose first value is taken as
    the vector dimension ``d``; each record is ``d + 1`` values wide and the
    leading header column is dropped.

    Args:
        fname: path of the file to read.

    Returns:
        A contiguous int32 array of shape ``(n_vectors, d)``. An empty file
        yields an empty ``(0, 0)`` array (mirroring ``fvecs_read``) instead
        of failing with an IndexError on the header lookup.
    """
    raw = np.fromfile(fname, dtype='int32')
    if raw.size == 0:
        return np.zeros((0, 0), dtype='int32')
    dim = raw[0]
    return raw.reshape(-1, dim + 1)[:, 1:].copy()
21 | 
def ibin_read(fname):
    """Read an int32 binary matrix with a two-value header.

    The file is loaded as a flat int32 buffer. The first value (a row count)
    is not used; the second value is the row width. The remaining values are
    reshaped to that width.

    Args:
        fname: path of the file to read.

    Returns:
        A contiguous int32 array of shape ``(-1, width)``.
    """
    raw = np.fromfile(fname, dtype='int32')
    width = raw[1]
    payload = raw[2:]
    return payload.reshape(-1, width).copy()
28 | 
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/include/bliss/config.py:
--------------------------------------------------------------------------------
class config:
    """Static per-dataset metadata, keyed by dataset name.

    Each entry holds the number of points ``N``, the vector dimension ``d``,
    the distance ``metric`` and the storage dtype ``dt``.
    """

    # Every listed dataset shares the same metric and dtype; only the size
    # and dimensionality differ.
    DATASET = {
        name: {'N': n_points, 'd': dim, 'metric': 'L2', 'dt': 'float32'}
        for name, n_points, dim in (
            ('glove-100', 1183514, 100),
            ('sift', 1000000, 128),
            ('gist', 1000000, 960),
            ('crawl', 1989995, 300),
            ('audio', 53387, 192),
            ('msong', 992272, 420),
        )
    }
9 | 


--------------------------------------------------------------------------------
/include/bliss/construct.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import argparse
 3 | import os, sys
 4 | sys.path.append('../indices/')
 5 | import pdb
 6 | from utils import *
 7 | from train import trainIndex
 8 | from config import config
 9 | import argparse
10 | 
# `time` is not imported at the top of this script; import it explicitly
# rather than relying on it leaking in through `from utils import *`.
import time


def _str2bool(text):
    """Parse a command-line boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value (including "False") would become True.
    """
    return str(text).strip().lower() in ('1', 'true', 't', 'yes', 'y')


parser = argparse.ArgumentParser()
parser.add_argument("--index", default='sift_epc40_K10_B1024_R1', type=str)
parser.add_argument("--gpu", default='0', type=str)
parser.add_argument("--memmap", default=False, type=_str2bool)
parser.add_argument("--mode", default=1, type=int)
parser.add_argument("--hdim", default=256, type=int)
parser.add_argument("--kn", default=10, type=int)
args = parser.parse_args()

# The index name encodes the run: "<dataset>_epc<epochs>_K<k>_B<buckets>_R<reps>".
index_fields = args.index.split('_')
datasetName = index_fields[0]
n_epochs = int(index_fields[1].split('epc')[1])
K = int(index_fields[2].split('K')[1])
B = int(index_fields[3].split('B')[1])
R = int(index_fields[4].split('R')[1])
feat_dim = config.DATASET[datasetName]['d']
N = config.DATASET[datasetName]['N']
metric = config.DATASET[datasetName]['metric']
dtype = config.DATASET[datasetName]['dt']

# if not os.path.exists("../logs/{}".format(datasetName)):
#     os.makedirs("../logs/{}".format(datasetName))

mode = args.mode
lookups_loc = "indices/{}/".format(datasetName+"blissMode"+str(mode))
train_data_loc = "data/{}/".format(datasetName)
model_save_loc = lookups_loc  # models are stored next to the lookup tables
batch_size = 5000
hidden_dim = args.hdim #512 initially, should be an argumment, observation lower numbers like 4-16 are best
# logfile = "../logs/{}/".format(datasetName)
gpu = 0
gpu_usage = 0.9
load_epoch = 0

# Train one model per repetition r = 0..R-1 and report the total wall time.
t1 = time.time()
for r in range(R):
    trainIndex(lookups_loc, train_data_loc, datasetName, model_save_loc, batch_size, B, feat_dim, hidden_dim,
                    r, gpu, gpu_usage, load_epoch, K, n_epochs, mode, args.kn)

print ("Training finished in: ",time.time()-t1, " sec")
50 | 
51 | 


--------------------------------------------------------------------------------
/include/bliss/dataPrepare.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import h5py
  3 | from utils import *
  4 | from config import config
  5 | 
  6 | # add SIFT and other data as well
  7 | 
  8 | def getTraindata(dataname):
  9 |     metric = config.DATASET[dataname]['metric']
 10 |     datapath = '../../../../data/{}/'.format(dataname)
 11 |     trainpath = datapath + 'train.npy'
 12 |     gtpath = datapath + 'groundTruth.npy'
 13 | 
 14 |     if os.path.exists(trainpath) and os.path.exists(gtpath):     #check file size as well   
 15 |         print ("GT already there")
 16 |     else:
 17 |         #load the full data and get fraction
 18 |         fulldata = getFulldata(dataname, datapath)
 19 |         N = fulldata.shape[0]
 20 |         if N>10**6:
 21 |             # pick = np.random.choice(N, np.clip(N//100, 10**4, 10**6), replace=False) # fix seed
 22 |             np.random.seed(0)
 23 |             pick = np.random.choice(N, 10**6, replace=False) # fix seed
 24 |             data_train = fulldata[pick,:]
 25 |         else:
 26 |             data_train = fulldata
 27 |         
 28 |         gt = getTrueNNS(data_train, metric, 100)
 29 |         np.save(gtpath, gt)
 30 |         np.save(trainpath, data_train)
 31 |         del fulldata
 32 | 
 33 | def getFulldata(dataname, datapath):
 34 |     if dataname == 'glove':
 35 |         if os.path.exists(datapath+'fulldata.dat'):
 36 |             dt = config.DATASET[dataname]['dt'] 
 37 |             N = config.DATASET[dataname]['N']
 38 |             d = config.DATASET[dataname]['d']
 39 |             return np.array(np.memmap(datapath+'fulldata.dat', dtype=dt, mode='c', shape=(N,d)))
 40 |         else:
 41 |             data = np.array(h5py.File('../../../../data/glove/glove-100-angular.hdf5', 'r').get('train'))
 42 |             norms = np.linalg.norm(data,axis=1)
 43 |             savememmap(datapath+'fulldata.dat', data)
 44 |             np.save(datapath+'norms.npy', norms)
 45 |             return data
 46 |     if dataname == 'sift':
 47 |         if os.path.exists(datapath+'fulldata.dat'):
 48 |             dt = config.DATASET[dataname]['dt'] 
 49 |             N = config.DATASET[dataname]['N']
 50 |             d = config.DATASET[dataname]['d']
 51 |             return np.array(np.memmap(datapath+'fulldata.dat', dtype=dt, mode='c', shape=(N,d)))
 52 |         else:
 53 |             data = np.array(h5py.File('../../../../data/sift/sift-128-euclidean.hdf5', 'r').get('train'))
 54 |             norms = np.linalg.norm(data,axis=1)
 55 |             savememmap(datapath+'fulldata.dat', data)
 56 |             np.save(datapath+'norms.npy', norms)
 57 |             return data
 58 | 
 59 | def getQueries(dataname):
 60 |     datapath = '../../../../data/{}/'.format(dataname)
 61 |     if dataname == 'glove':
 62 |         if os.path.exists(datapath+'queries.npy') and os.path.exists(datapath+ 'neighbors100.npy'): 
 63 |             queries = np.load(datapath+'queries.npy')
 64 |             neighbors100 = np.load(datapath+ 'neighbors100.npy')
 65 |         else:
 66 |             queries = np.array(h5py.File('../../../../data/glove/glove-100-angular.hdf5', 'r').get('test'))
 67 |             neighbors100 = np.array(h5py.File('../../../../data/glove/glove-100-angular.hdf5', 'r').get('neighbors'))
 68 |             np.save(datapath+'queries.npy', queries)
 69 |             np.save(datapath+ 'neighbors100.npy', neighbors100)
 70 |         return [queries, neighbors100]
 71 | 
 72 |     if dataname == 'sift':
 73 |         if os.path.exists(datapath+'queries.npy') and os.path.exists(datapath+ 'neighbors100.npy'): 
 74 |             queries = np.load(datapath+'queries.npy')
 75 |             neighbors100 = np.load(datapath+ 'neighbors100.npy')
 76 |         else:
 77 |             queries = np.array(h5py.File('../../../../data/sift/sift-128-euclidean.hdf5', 'r').get('test'))
 78 |             neighbors100 = np.array(h5py.File('../../../../data/sift/sift-128-euclidean.hdf5', 'r').get('neighbors'))
 79 |             np.save(datapath+'queries.npy', queries)
 80 |             np.save(datapath+ 'neighbors100.npy', neighbors100)
 81 |         return [queries, neighbors100]
 82 | 
 83 | # if dataname == 'deep-1b':
 84 | #     import subprocess
 85 | #     import os
 86 | #     yadiskLink = "https://yadi.sk/d/11eDCm7Dsn9GA"
 87 | 
 88 | #     # download base files
 89 | #     for i in range(0,4):
 90 | #         command = 'curl ' + '"https://cloud-api.yandex.net/v1/disk/public/resources/download?public_key=' \
 91 | #                 + yadiskLink + '&path=/base/base_' + str(i).zfill(2) + '"'
 92 |         
 93 | #         process = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
 94 | #         (out, err) = process.communicate()
 95 | #         out = out.decode()
 96 | #         wgetLink = out.split(',')[0][:]
 97 | #         wgetCommand = 'wget ' + wgetLink + ' -O base_' + str(i).zfill(2)
 98 | #         os.system(wgetCommand.split('{"href":')[0]+ wgetCommand.split('{"href":')[1])
 99 | 
100 | #         print ("Downloading base chunk " + str(i).zfill(2) + ' ...')
101 | #         #process = subprocess.Popen(wgetCommand, stdin=subprocess.PIPE, shell=True)
102 | #         #process.stdin.write('e')
103 | #         #process.wait()
104 | 
105 | #     #curate
106 | #     #convert
107 | #     #split
108 | #     #get groundtruth
109 | 
110 | #     DATASET = {'name':'deep-1b','N':10**9, 'd':96, 'metric': 'ip', 'dt':'float32', 
111 | #                     'fullpath':'../../../../data/deep-1b/fulldata.dat', 'trainpath':'../../../../data/deep-1b/traindata.dat'}
112 | 
113 | 
114 | 


--------------------------------------------------------------------------------
/include/bliss/dataPrepare_constrained.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import h5py
  3 | from utils import *
  4 | from binReader import *
  5 | import tensorflow as tf
  6 | import argparse
  7 | import multiprocessing as mp
  8 | 
  9 | # add SIFT and other data as well
 10 | def getInvertedIndex(sentencesTokenised):
 11 |     Invbins = {}
 12 |     for i,sentence in enumerate(sentencesTokenised):
 13 |         # for word in sentence:
 14 |         try:
 15 |             Invbins[sentence].append(i)
 16 |         except(KeyError):
 17 |             Invbins[sentence] = []
 18 |             Invbins[sentence].append(i)
 19 |     return Invbins
 20 | 
 21 | def intersection(lst1, lst2):
 22 |     # Use of hybrid method
 23 |     temp = set(lst2)
 24 |     lst3 = [value for value in lst1 if value in temp]
 25 |     return lst3
 26 | 
 27 | def makeTraindata(dataname, K):
 28 |     x_train, trainConst = loaddata(dataname)
 29 |     os.environ['CUDA_VISIBLE_DEVICES'] = '3'
 30 |     begin_time = time.time()
 31 |     batch_size = 500
 32 |     output = np.zeros([x_train.shape[0], K], dtype=np.int32) # for upto 2B
 33 | 
 34 |     #L2 metric
 35 |     W = x_train.T
 36 |     W_norm = np.square(np.linalg.norm(W,axis=0))
 37 |     W = tf.constant(W)
 38 |     for i in tqdm(range(x_train.shape[0]//batch_size)):
 39 |         start_idx = i*batch_size
 40 |         end_idx = start_idx+batch_size
 41 |         x_batch = x_train[start_idx:end_idx]
 42 |         # sim = 2*x_batch@W - W_norm
 43 |         sim = 2*tf.matmul(x_batch,W)- W_norm
 44 |         # top_idxs = np.argpartition(sim, -K)[:,-K:]
 45 |         top_idxs = tf.nn.top_k(sim, k=K, sorted=True)[1]
 46 |         output[start_idx:end_idx] = top_idxs
 47 | 
 48 |     N,d = output.shape
 49 |     output = np.column_stack((d*np.ones(N, dtype='int32'),output)).flatten()
 50 |     output.tofile(dataname+"/BLISS_train_groundtruth.ivecs")
 51 |     print(time.time()-begin_time)
 52 | 
 53 | def makeTraindata_wAttr(dataname, K):
 54 |     train, trainConst = loaddata(dataname)
 55 |     N = train.shape[0]
 56 |     # if N>10**6:
 57 |     #     np.random.seed(0)
 58 |     #     pick = np.random.choice(N, 10**6, replace=False) # fix seed
 59 |     #     train = train[pick,:]
 60 |     #     trainConst = trainConst[pick,:]
 61 |   
 62 |     Invbins = getInvertedIndex(trainConst)
 63 |     norms = 0.5*np.linalg.norm(train,axis=1)**2 # if L2 distance
 64 | 
 65 |     # do filter then search
 66 |     # get neighbors
 67 |     nt = train.shape[0]
 68 |     largest_indices = []
 69 |     print ("starting")
 70 |     print (time.time())
 71 | 
 72 |     # def process_chunk(chunk):
 73 |     #     result = []
 74 |     #     for i in chunk:
 75 |     #         candidates = Invbins[trainConst[i]]
 76 |     #         candidates = np.array(candidates)
 77 |     #         dist = (-train[candidates, :]@train[i] +norms[candidates])
 78 |     #         if len(dist)>K:
 79 |     #             temp = np.argpartition(dist, K)[:K]
 80 |     #             temp = temp[np.argsort(dist[temp])]
 81 |     #         else:
 82 |     #             temp = np.argsort(dist)
 83 |     #             temp = np.append(temp,-np.ones(K-len(temp),dtype=np.dtype(temp[0])))
 84 |     #         assert len(temp)==K
 85 |     #         result.append(candidates[temp])
 86 |     #     return result
 87 |     # num_processes = 32
 88 |     # chunk_size = int(nt/num_processes)
 89 |     # pool = mp.Pool(processes=num_processes)
 90 |     # chunks = [range(i*chunk_size, (i+1)*chunk_size) for i in range(num_processes-1)]
 91 |     # chunks.append(range((num_processes-1)*chunk_size, nt))
 92 |     # results = []
 93 |     # for chunk in tqdm(chunks):
 94 |     #     result = pool.apply_async(process_chunk, args=(chunk,))
 95 |     #     results.append(result)
 96 |     # # largest_indices = []
 97 |     # for result in results:
 98 |     #     largest_indices.extend(result.get())
 99 |     # pool.close()
100 |     # pool.join()
101 | 
102 |     for i in tqdm(range(0, nt)):
103 |         candidates = Invbins[trainConst[i]]
104 |         candidates = np.array(candidates)
105 |         dist = (-train[candidates, :]@train[i] +norms[candidates])
106 |         if len(dist)>K:
107 |             temp = np.argpartition(dist, K)[:K]
108 |             temp = temp[np.argsort(dist[temp])]
109 |         else:
110 |             temp = np.argsort(dist)
111 |             temp = np.append(temp,-np.ones(K-len(temp),dtype=np.dtype(temp[0])))
112 |         assert len(temp)==K
113 |         largest_indices.append(candidates[temp]) # to verify this
114 |     print (len(largest_indices), nt)
115 |     a = np.array(largest_indices).astype('int32')
116 |     N,d = a.shape
117 |     a = np.column_stack((d*np.ones(N, dtype='int32'),a)).flatten()
118 |     a.tofile(dataname+"/BLISS_train_3_groundtruth.ivecs")
119 | 
120 | 
121 | # get data and properties
def loaddata(dataname):
    """Load the base vectors and their label strings for a dataset folder.

    Reads ``<dataname>/base.fvecs`` with ``fvecs_read`` and
    ``<dataname>/label_base_3.txt`` (first line skipped, comma-delimited,
    read as strings).

    Returns:
        Tuple ``(vectors, labels)``.
    """
    vectors = fvecs_read(dataname+"/base.fvecs", c_contiguous=True)
    labels = np.genfromtxt(dataname+"/label_base_3.txt", skip_header=1, delimiter=",",dtype='str')
    return vectors, labels
128 | 
129 | # queries and query properties
def loadQueries(dataname):
    """Load the query vectors and their label strings for a dataset folder.

    Reads ``<dataname>/query.fvecs`` with ``fvecs_read`` and
    ``<dataname>/label_query_3.txt`` (first line skipped, comma-delimited,
    read as strings).

    Returns:
        Tuple ``(vectors, labels)``.
    """
    vectors = fvecs_read(dataname+"/query.fvecs", c_contiguous=True)
    labels = np.genfromtxt(dataname+"/label_query_3.txt", skip_header=1, delimiter=",",dtype='str')
    return vectors, labels
136 | 
def appendVecAtt(train, trainConst):
    """Append a multi-hot encoding of each point's attributes to its vector.

    Every label string is split on single spaces into tokens, the distinct
    tokens form the vocabulary, and each row gets a 0/1 vector with ones at
    the positions of its tokens.

    Args:
        train: 2-D array of vectors, one row per point.
        trainConst: sequence of space-separated label strings, one per row.

    Returns:
        Array of shape ``(n_points, train.shape[1] + len(vocab))``.
    """
    # convert to binary vector
    tokens = np.array([sublist.split(" ") for sublist in trainConst])
    vocab = np.unique(tokens)
    token_to_id = {v: i for i, v in enumerate(vocab)}
    token_ids = np.array([[token_to_id[v] for v in sublist] for sublist in tokens])

    # Must start from zeros: np.empty leaves arbitrary memory contents in
    # every position that is not explicitly set to 1 below.
    constArray = np.zeros((token_ids.shape[0], len(vocab)))
    for i in range(constArray.shape[0]):
        constArray[i][token_ids[i]] = 1
    return np.column_stack((train, constArray))
149 | 
if __name__ == "__main__":
    # Command-line entry point: build the attribute-filtered ground truth
    # (100 neighbours per point) for the dataset folder given by --data.
    cli = argparse.ArgumentParser()
    cli.add_argument("--data", default='sift', type=str)
    cli_args = cli.parse_args()

    dataname = '{}/'.format(cli_args.data)
    # makeTraindata(dataname, 100)  # unfiltered variant, currently disabled
    makeTraindata_wAttr(dataname, 100)


--------------------------------------------------------------------------------
/include/bliss/index.py:
--------------------------------------------------------------------------------
  1 | from config import config
  2 | import tensorflow as tf
  3 | import time
  4 | import numpy as np
  5 | import argparse
  6 | import os
  7 | import pdb
  8 | import sys
  9 | from dataPrepare import *
 10 | from net import MyModule
 11 | 
 12 | parser = argparse.ArgumentParser()
 13 | parser.add_argument("--index", default='glove_epc20_K2_B4096_R4', type=str)
 14 | parser.add_argument("--mode", default=1, type=int)
 15 | 
 16 | args = parser.parse_args()
 17 | datasetName = args.index.split('_')[0]  
 18 | n_epochs = int(args.index.split('_')[1].split('epc')[1]) 
 19 | K = int(args.index.split('_')[2].split('K')[1])  
 20 | B = int(args.index.split('_')[3].split('B')[1])
 21 | R = int(args.index.split('_')[4].split('R')[1])
 22 | 
 23 | def Index(B,R,datasetName, load_epoch, K):
 24 |     bucketSort = True
 25 |     # if not gpu=='all':
 26 |     os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 27 | 
 28 |     #########################Tochange
 29 |     model_save_loc = "../indices/{}/".format(datasetName+"Mode"+str(args.mode))
 30 |     lookups_loc  = "../indices/{}/".format(datasetName+"Mode"+str(args.mode))
 31 |     N = config.DATASET[datasetName]['N'] 
 32 |     train_data_loc = "../../../../data/{}/".format(datasetName)
 33 |     batch_size = 5000
 34 | 
 35 |     ##########################
 36 |     # N = n_classes
 37 | 
 38 |     Model = MyModule(R)
 39 |     Model.load([model_save_loc+'/r_'+str(r)+'.npz' for r in range(R)]) # node 0 for all
 40 |     print ("model loaded")
 41 |     # print (lookups_loc+'epoch_'+str(load_epoch))
 42 | 
 43 |     datapath = train_data_loc
 44 |     dataset = tf.data.Dataset.from_tensor_slices(getFulldata(datasetName, datapath).astype(np.float32))
 45 |     dataset = dataset.batch(batch_size = batch_size)
 46 |     iterator = iter(dataset)
 47 |     print("data loaded")
 48 | 
 49 |     top_preds = np.zeros([R, N, K], dtype=np.int32)
 50 | 
 51 |     # p = Pool(n_cores)
 52 |     t1 = time.time()
 53 |     # pdb.set_trace()
 54 |     start_idx = 0
 55 |     while True: # this loops for batches
 56 |         try:
 57 |             # print (start_idx)
 58 |             top_preds[:, start_idx:min(start_idx+batch_size, N)]  = Model(iterator.get_next(), K) # should give top K bucket IDs
 59 |             pdb.set_trace()
 60 |             start_idx += batch_size
 61 |             sys.stdout.write("Inference progress: %d%%   \r" % (start_idx*100/N) )
 62 |             sys.stdout.flush()
 63 |         except:
 64 |             print (start_idx)
 65 |             # pdb.set_trace()
 66 |             assert (start_idx >=N), "batch iterator issue!"
 67 |             break
 68 | 
 69 |     t2 = time.time()
 70 |     print("Inference time: ", t2-t1)
 71 |     #####################################
 72 |     try:
 73 |         #make it parallel
 74 |         for r in range(R):
 75 |             counts = np.zeros(B+1, dtype=np.int32)
 76 |             bucket_order = np.zeros(N, dtype=np.int32)
 77 |             for i in range(N):
 78 |                 bucket = top_preds[r, i, np.argmin(counts[top_preds[r,i]+1])] 
 79 |                 bucket_order[i] = bucket
 80 |                 counts[bucket+1] += 1  
 81 |                         
 82 |             counts = np.cumsum(counts)
 83 |             class_order = np.zeros(N,dtype=np.int32)
 84 |             class_order = np.argsort(bucket_order)
 85 |             # sorting buckets
 86 |             if bucketSort:
 87 |                 for b in range(B):
 88 |                     class_order[counts[b]:counts[b+1]] = np.sort(class_order[counts[b]:counts[b+1]])
 89 |             ###
 90 |             folder_path = lookups_loc+'epoch_'+str(load_epoch)
 91 |             if not os.path.exists(folder_path):
 92 |                 os.makedirs(folder_path)
 93 |             np.save(folder_path+'/class_order_'+str(r)+'.npy', class_order)
 94 |             np.save(folder_path+'/counts_'+str(r)+'.npy', counts)
 95 |             np.save(folder_path+'/bucket_order_'+str(r)+'.npy', bucket_order)
 96 |             print (r)
 97 |     except:
 98 |         print ("check indexing issue", r)
 99 |     t3 = time.time()
100 |     print("indexed and saved in time: ", t3-t2)
101 | 
102 | Index(B, R, datasetName, n_epochs, K)
103 | 


--------------------------------------------------------------------------------
/include/bliss/net.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import numpy as np
 3 | 
 4 | class MyModule(tf.Module):
 5 |   def __init__(self, R):
 6 |     self.R = R
 7 |     self.W1 = [None for r in range(R)]
 8 |     self.b1 = [None for r in range(R)]
 9 |     self.hidden_layer = [None for r in range(R)]
10 |     self.W2 = [None for r in range(R)]
11 |     self.b2 = [None for r in range(R)]
12 |     self.logits = [None for r in range(R)]
13 |     self.top_buckets = [None for r in range(R)]
14 | 
15 |   def load(self,paths):
16 |     params = [np.load(path) for path in paths]
17 |     self.W1 = [tf.constant(params[r]['W1']) for r in range(self.R)]
18 |     self.b1 = [tf.constant(params[r]['b1']) for r in range(self.R)]
19 |     self.W2 = [tf.constant(params[r]['W2']) for r in range(self.R)]
20 |     self.b2 = [tf.constant(params[r]['b2']) for r in range(self.R)]
21 | 
22 |   @tf.function
23 |   def __call__(self, x, topk):
24 |     for r in range(self.R):
25 |         self.hidden_layer[r] = tf.nn.relu(tf.matmul(x, self.W1[r])+self.b1[r])
26 |         self.logits[r] = tf.matmul(self.hidden_layer[r],self.W2[r])+self.b2[r]
27 |         self.top_buckets[r] = tf.nn.top_k(self.logits[r], k=topk, sorted=False)[1]
28 |     return self.top_buckets


--------------------------------------------------------------------------------
/include/bliss/query.py:
--------------------------------------------------------------------------------
  1 | from config import config
  2 | import tensorflow as tf
  3 | import time
  4 | import numpy as np
  5 | import argparse
  6 | import os, sys
  7 | from utils import *
  8 | from multiprocessing import Pool
  9 | import pdb
 10 | sys.path.append('InvertedIndex/')
 11 | import scoreAgg
 12 | from net import MyModule
 13 | from dataPrepare import *
 14 | import argparse
 15 | 
 16 | parser = argparse.ArgumentParser()
 17 | parser.add_argument("--topm", default=10, type=int)
 18 | parser.add_argument("--mf", default=1, type=int)
 19 | parser.add_argument("--gpu", default='0', type=str)
 20 | parser.add_argument("--index", default='deep-1b_epc20_K2_B65536_R4', type=str)
 21 | parser.add_argument("--memmap", default=False, type=bool)
 22 | parser.add_argument("--rerank", default=True, type=bool)
 23 | parser.add_argument("--mode", default=1, type=int)
 24 | 
 25 | args = parser.parse_args()
 26 | 
 27 | datasetName = args.index.split('_')[0]  
 28 | eval_epoch = int(args.index.split('_')[1].split('epc')[1]) 
 29 | K = int(args.index.split('_')[2].split('K')[1])  
 30 | B = int(args.index.split('_')[3].split('B')[1])
 31 | R = int(args.index.split('_')[4].split('R')[1])
 32 | feat_dim =  config.DATASET[datasetName]['d']
 33 | N = config.DATASET[datasetName]['N'] 
 34 | metric = config.DATASET[datasetName]['metric'] 
 35 | dtype = config.DATASET[datasetName]['dt'] 
 36 | model_loc = "../indices/{}/".format(datasetName+"Mode"+str(args.mode))
 37 | lookups_loc  = "../indices/{}/".format(datasetName+"Mode"+str(args.mode)) + '/epoch_'+ str(eval_epoch)
 38 | data_loc = "../../../../data/{}/".format(datasetName)
 39 | print (model_loc, lookups_loc, data_loc)
 40 | buffer = 1024*(int(2*R*N*args.topm/(B*args.mf))//1024)
 41 | 
 42 | batch_size = 32
 43 | logfile = '../logs/'+datasetName+'/'+args.index+'query.txt'
 44 | output_loc = logfile[:-3]+'npy'
 45 | 
 46 | if not args.gpu=='all':
 47 |     os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
 48 | 
 49 | ############################## load lookups ################################
 50 | Model = MyModule(R)
 51 | Model.load([model_loc+'/r_'+str(r)+'.npz' for r in range(R)])
 52 | print ("model loaded")
 53 | 
 54 | inv_lookup = np.zeros(R*N, dtype=np.int32)
 55 | counts = np.zeros(R*(B+1), dtype=np.int32)
 56 | for r in range(R):
 57 |     inv_lookup[r*N: (r +1)*N ] = np.load(lookups_loc+'/class_order_'+str(r)+'.npy')# block size 
 58 |     counts[r*(B+1) : (r +1 )*(B+1) ] = np.load(lookups_loc+'/counts_'+str(r)+'.npy')[:B+1] 
 59 | 
 60 | inv_lookup = np.ascontiguousarray(inv_lookup, dtype=np.int32) 
 61 | counts = np.ascontiguousarray(counts, dtype=np.int32)
 62 | pdb.set_trace()
 63 | fastIv = scoreAgg.PyFastIV(R, N, (B+1), args.mf, args.topm, inv_lookup, counts)
 64 | # fastIv.createIndex() # in future load this directly from a binary file. Saved by C++ code
 65 | print ("Deserialized")
 66 | 
 67 | ################# Data Loader ####################
 68 | [queries, neighbors100] = getQueries(datasetName)
 69 | queries = queries[:1000,:]
 70 | print("queries loaded ", queries.shape)
 71 | queries = tf.data.Dataset.from_tensor_slices(queries)
 72 | queries = queries.batch(batch_size = batch_size)
 73 | iterator = iter(queries)
 74 | 
 75 | if args.rerank:
 76 |     dataset = getFulldata(datasetName, data_loc)
 77 |     if metric=="L2":
 78 |         norms= np.load(data_loc +"norms.npy")
 79 |     if metric =="cosine":
 80 |         norms= np.load(data_loc +"norms.npy")
 81 |         dataset = dataset/(norms[:,None])
 82 |     print ("dense vectors loaded")
 83 | # to check these
 84 | count = 0
 85 | score_sum = [0.0,0.0,0.0]
 86 | output = -1* np.ones([10000,10])
 87 | #########################################
 88 | 
 89 | # p = Pool(config.n_cores)
 90 | fw = open(logfile, 'a', encoding='utf-8') # log file
 91 | bthN = 0
 92 | begin_time = time.time()
 93 | 
 94 | # p = Pool(Parts)
 95 | Inf = 0
 96 | RetRank = 0
 97 | 
 98 | while True:
 99 |     try:
100 |         x_batch = iterator.get_next()
101 |         x_batch = tf.cast(x_batch, tf.float32)
102 |         t1 = time.time()
103 |         top_buckets_ = Model(x_batch, args.topm) # should give topm bucket IDs, [R,batch_size,topmvals, ]
104 |         pdb.set_trace()
105 |         top_buckets_ = np.array(top_buckets_)
106 |         # top_buckets_ = np.transpose(top_buckets_, (2,0,1,3)) # bring batch_size (index 2) ahead, [batch_size,R,2,topm]
107 |         top_buckets_ = np.transpose(top_buckets_, (1,0,2)) # bring batch_size (index 1) ahead, [batch_size,R,topm]
108 | 
109 |         len_cands = np.zeros(top_buckets_.shape[0])
110 |         t2 = time.time()
111 |         Inf+= (t2-t1)
112 |         for i in range(top_buckets_.shape[0]):
113 |             candid = np.empty(buffer, dtype='int32') # does this init takes time?
114 |             candSize = np.empty(1, dtype='int32' )
115 |             fastIv.FC(np.ascontiguousarray(top_buckets_[i,:,:], dtype=np.int32).reshape(-1), buffer, candid, candSize)
116 |             candidates = (candid[0: candSize[0]])
117 |             
118 |             # candidates = (process_scores(top_buckets_[i]))
119 |             score_sum[1] += len(candidates)
120 |             if args.rerank:
121 |                 if metric == "IP":
122 |                     dists = np.dot(dataset[candidates],x_batch[i]) # or L2 dist
123 |                 if metric == "L2":
124 |                     dists = 2*np.dot(dataset[candidates],x_batch[i]) -norms[candidates]
125 |                 if metric =="cosine":
126 |                     dists = np.dot(dataset[candidates],x_batch[i]) # or L2 dist
127 |                 if len(dists)<=10:
128 |                     output[bthN*batch_size + i, :len(dists)] = candidates 
129 |                 if len(dists)>10:
130 |                     top_cands = np.argpartition(dists, -10)[-10:]
131 |                     output[bthN*batch_size + i, :10] = candidates[top_cands] 
132 |                     candidates = candidates[top_cands] 
133 |                 
134 |             score_sum[0] += len(np.intersect1d(candidates, neighbors100[bthN*batch_size + i,:10]))/10
135 | 
136 |         t3 = time.time()
137 |         RetRank+= t3-t2
138 |         bthN+=1
139 |         print (bthN)
140 |     except:
141 |         # print (bthN)
142 |         print ( " topm: ", args.topm, " mf: ", args.mf)
143 |         print('overall Recall for',count,'points:',score_sum[0]/((bthN-1)*batch_size + i))
144 |         print('Avg can. size for',count,'points:',score_sum[1]/((bthN-1)*batch_size + i))
145 |         pdb.set_trace()
146 |         print('Inf per point: ',Inf/((bthN-1)*batch_size))
147 |         print('Ret+rank per point: ',RetRank/((bthN-1)*batch_size))
148 |         print('per point to report: ',(Inf/32 + RetRank/4)/((bthN-1)*batch_size))
149 | 
150 |         print (" topm: ", args.topm, " mf: ", args.mf, file=fw)
151 |         print('overall Recall for',count,'points:',score_sum[0]/((bthN-1)*batch_size + i), file=fw)
152 |         print('Avg can. size for',count,'points:',score_sum[1]/((bthN-1)*batch_size + i), file=fw)
153 |         print('Inf per point: ',Inf/((bthN-1)*batch_size), file=fw)
154 |         print('Ret+rank  per point: ',RetRank/((bthN-1)*batch_size), file=fw)
155 |         print('per point to report: ',(Inf/32 + RetRank/4)/((bthN-1)*batch_size), file=fw)
156 |         np.save(output_loc,output)
157 |         break
158 | 
159 | # p.close()
160 | 
161 | 


--------------------------------------------------------------------------------
/include/bliss/train.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import argparse
  3 | import time
  4 | import os
  5 | import numpy as np
  6 | from binReader import *
  7 | from dataPrepare_constrained import *
  8 | from utils import *
  9 | 
 10 | def trainIndex(lookups_loc, train_data_loc, datasetName, model_save_loc, batch_size, B, vec_dim, hidden_dim,
 11 |                     r, gpu, gpu_usage, load_epoch, k2, n_epochs, mode, kn):
 12 | 
 13 |     assert (B%2==0)
 14 |     B = 2*B
 15 |     tf.compat.v1.disable_eager_execution()
 16 |     os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 17 |     # get train data
 18 |     # x_train = np.load(train_data_loc+'train.npy')
 19 |     # y_train = np.load(train_data_loc+'groundTruth.npy')
 20 |     print ("mode ",mode)
 21 |     # mode 1: Simple ANN train
 22 |     IntSave = True
 23 |     if mode==1:
 24 |         x_train, trainConst = loaddata(train_data_loc)
 25 |         y_train = ivecs_read(train_data_loc + "/BLISS_train_groundtruth.ivecs")
 26 |     # mode 2: Label with constraints
 27 |     if mode==2:
 28 |         x_train, trainConst = loaddata(train_data_loc)
 29 |         y_train = ivecs_read(train_data_loc + "/BLISS_train_3_groundtruth.ivecs")
 30 |     # mode 3: input both vector and attributes
 31 |     if mode==3:
 32 |         x_train, trainConst = loaddata(train_data_loc)
 33 |         x_train = appendVecAtt(x_train, trainConst)
 34 |         y_train = ivecs_read(train_data_loc + "/BLISS_train_3_groundtruth.ivecs")
 35 |     vec_dim = x_train.shape[1]
 36 |     N = x_train.shape[0]
 37 |     
 38 |     y_train = y_train[:,:kn]  
 39 | 
 40 |     # randIndx = np.arange(0, N)
 41 |     # np.random.shuffle(randIndx)
 42 |     # Winit = x_train[randIndx[:B],:]
 43 |     ###############
 44 |     if not os.path.exists(lookups_loc+'epoch_'+str(load_epoch)+'/'):  
 45 |         os.makedirs(lookups_loc+'epoch_'+str(load_epoch)+'/')
 46 |     create_universal_lookups(r, B, N, lookups_loc+'epoch_'+str(load_epoch)+'/')
 47 |     # randomPoints_lookups(r, B, N, x_train, lookups_loc+'epoch_'+str(load_epoch)+'/')
 48 |     
 49 |     lookup = tf.Variable(np.load(lookups_loc+'epoch_'+str(load_epoch)+'/bucket_order_'+str(r)+'.npy')[:N])
 50 |     ###############
 51 |     temp = tf.constant(np.arange(batch_size*kn)//kn)
 52 |     x = tf.compat.v1.placeholder(tf.float32, shape=[batch_size, vec_dim])
 53 |     _y = tf.compat.v1.placeholder(tf.int64, shape=[batch_size*kn])
 54 |     y_idxs = tf.stack([temp, tf.gather(lookup, _y)], axis=-1)
 55 |     y_vals = tf.ones_like(y_idxs[:,0], dtype=tf.float32)
 56 |     y = tf.compat.v1.SparseTensor(y_idxs, y_vals, [batch_size, B])
 57 |     y_ = tf.compat.v1.sparse_tensor_to_dense(y, validate_indices=False)
 58 | 
 59 |     ###############
 60 |     W1 = tf.Variable(tf.compat.v1.truncated_normal([vec_dim, hidden_dim], stddev=0.05, dtype=tf.float32))
 61 |     b1 = tf.Variable(tf.compat.v1.truncated_normal([hidden_dim], stddev=0.05, dtype=tf.float32))
 62 |     hidden_layer = tf.nn.relu(tf.matmul(x,W1)+b1)
 63 |     W2 = tf.Variable(tf.compat.v1.truncated_normal([hidden_dim, B], stddev=0.05, dtype=tf.float32))
 64 |     b2 = tf.Variable(tf.compat.v1.truncated_normal([B], stddev=0.05, dtype=tf.float32))
 65 |     logits = tf.matmul(hidden_layer,W2)+b2
 66 |     ###############
 67 |     top_buckets = tf.nn.top_k(logits, k=k2, sorted=True)[1]
 68 |     loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y_))
 69 |     train_op = tf.compat.v1.train.AdamOptimizer().minimize(loss)
 70 | 
 71 |     sess = tf.compat.v1.Session(config = tf.compat.v1.ConfigProto(
 72 |                             allow_soft_placement=True,
 73 |                             log_device_placement=False,
 74 |                             gpu_options=tf.compat.v1.GPUOptions(allow_growth=True, per_process_gpu_memory_fraction=gpu_usage)))
 75 | 
 76 |     sess.run(tf.compat.v1.global_variables_initializer())
 77 | 
 78 |     n_check=200
 79 |     n_steps_per_epoch = N//batch_size
 80 |     for curr_epoch in range(load_epoch+1,load_epoch+n_epochs+1):
 81 |         count = 0
 82 |         for j in range(n_steps_per_epoch):
 83 |             start_idx = j*batch_size
 84 |             end_idx = start_idx+batch_size
 85 |             sess.run(train_op, feed_dict={x:x_train[start_idx:end_idx], _y:y_train[start_idx:end_idx].reshape([-1])})
 86 |             
 87 |             count += 1
 88 |             if count%n_check==0:
 89 |                 _, train_loss = sess.run([train_op,loss], feed_dict={x:x_train[start_idx:end_idx], _y:y_train[start_idx:end_idx].reshape([-1])})
 90 |                 # print('train_loss: '+str(train_loss))
 91 |                 temp = tf.constant(np.arange(batch_size*kn)//kn)
 92 |                 y_idxs = tf.stack([temp, tf.gather(lookup, y_train[start_idx:end_idx].reshape([-1]) )], axis=-1)
 93 |                 y_vals = tf.ones_like(y_idxs[:,0], dtype=tf.float32)
 94 |                 y = tf.compat.v1.SparseTensor(y_idxs, y_vals, [batch_size, B])
 95 |                 y_ = tf.compat.v1.sparse_tensor_to_dense(y, validate_indices=False)
 96 |                 yout = sess.run(y_)
 97 |                 # print (np.sum(yout)/yout.shape[0])
 98 |                 count+=1
 99 | 
100 |         if curr_epoch%5==0:                
101 |             #####################################
102 |             top_preds = np.zeros([N,k2], dtype=int)
103 |             start_idx = 0
104 |             for i in range(x_train.shape[0]//batch_size):
105 |                 top_preds[start_idx:start_idx+batch_size] = sess.run(top_buckets, feed_dict={x:x_train[start_idx:start_idx+batch_size]})
106 |                 start_idx += batch_size
107 |             ##################################### 
108 |             counts = np.zeros(B+1, dtype=int)
109 |             bucket_order = np.zeros(N, dtype=int)
110 |             for i in range(N):
111 |                 bucket = (top_preds[i, counts[top_preds[i]+1] == np.min(counts[top_preds[i]+1])])[0]
112 |                 # bucket = top_preds[i, np.argmin(counts[top_preds[i]+1])] 
113 |                 bucket_order[i] = bucket
114 |                 counts[bucket+1] += 1
115 | 
116 |             nothing = sess.run(tf.compat.v1.assign(lookup,bucket_order))
117 |             print ("max bin: ", np.max(counts))
118 |             print ("load Std: ", np.std(counts))
119 |             print ("empty bins: ", np.sum(counts==0))
120 |             counts = np.cumsum(counts)
121 |             class_order = np.argsort(bucket_order)
122 |     params = sess.run([W1,b1,W2,b2])
123 |     np.savez_compressed(model_save_loc+'/r_'+str(r)+'.npz',
124 |     W1=params[0], 
125 |     b1=params[1], 
126 |     W2=params[2], 
127 |     b2=params[3])
128 | 
129 |     # keep only top B/2
130 |     cnt = counts[1:]-counts[:-1]
131 |     keep  = np.argsort(cnt)[::-1][:int(B/2)]
132 |     params[2] = params[2][:,keep]
133 |     params[3] = params[3][keep]
134 |     A1 = np.vstack((params[0], params[1])).T.flatten()
135 |     A2 = np.vstack((params[2], params[3])).T.flatten()
136 |     np.concatenate([A1, A2]).tofile(model_save_loc+'/model.bin')
137 |     del params
138 | 


--------------------------------------------------------------------------------
/include/bliss/utils.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import tensorflow as tf
  3 | import time
  4 | import numpy as np
  5 | import os, sys
  6 | import pdb
  7 | import math
  8 | import matplotlib.pyplot as plt
  9 | from tqdm import tqdm
 10 | import time
 11 | import numpy as np
 12 | from multiprocessing import Pool
 13 | from sklearn.utils import murmurhash3_32 as mmh3
 14 | 
 15 | 
 16 | def savememmap(path, ar):
 17 |     if path[-4:]!='.dat':
 18 |         path = path +'.dat'
 19 |     shape = ar.shape
 20 |     dtype = ar.dtype
 21 |     fp = np.memmap( path, dtype=dtype, mode='w+', shape=(shape))
 22 |     fp[:]= ar[:]
 23 |     fp.flush()
 24 | 
 25 | def getTrueNNS(x_train, metric, K):
 26 |     os.environ['CUDA_VISIBLE_DEVICES'] = '2'
 27 |     begin_time = time.time()
 28 |     batch_size = 1000
 29 |     output = np.zeros([x_train.shape[0], K], dtype=np.int32) # for upto 2B
 30 | 
 31 |     if metric=='IP':
 32 |         W = x_train.T
 33 |         W = tf.constant(W)
 34 |         for i in tqdm(range(x_train.shape[0]//batch_size)):
 35 |             start_idx = i*batch_size
 36 |             end_idx = start_idx+batch_size
 37 |             x_batch = x_train[start_idx:end_idx]
 38 |             # sim = x_batch@W
 39 |             sim = tf.matmul(x_batch,W)
 40 |             # top_idxs = np.argpartition(sim, -K)[:,-K:]
 41 |             top_idxs = tf.nn.top_k(sim, k=K, sorted=True)[1]
 42 |             output[start_idx:end_idx] = top_idxs
 43 | 
 44 |     elif metric=='L2':
 45 |         W = x_train.T
 46 |         W_norm = np.square(np.linalg.norm(W,axis=0))
 47 |         W = tf.constant(W)
 48 |         for i in tqdm(range(x_train.shape[0]//batch_size)):
 49 |             start_idx = i*batch_size
 50 |             end_idx = start_idx+batch_size
 51 |             x_batch = x_train[start_idx:end_idx]
 52 |             # sim = 2*x_batch@W - W_norm
 53 |             sim = 2*tf.matmul(x_batch,W)- W_norm
 54 |             # top_idxs = np.argpartition(sim, -K)[:,-K:]
 55 |             top_idxs = tf.nn.top_k(sim, k=K, sorted=True)[1]
 56 |             output[start_idx:end_idx] = top_idxs
 57 |     
 58 |     elif metric=='cosine': 
 59 |         x_train = x_train/(np.linalg.norm(x_train,axis=1)[:,None])
 60 |         W = tf.constant(x_train.T)
 61 |         for i in tqdm(range(x_train.shape[0]//batch_size)):
 62 |             start_idx = i*batch_size
 63 |             end_idx = start_idx+batch_size
 64 |             x_batch = x_train[start_idx:end_idx]
 65 |             sim = tf.matmul(x_batch,W)
 66 |             top_idxs = tf.nn.top_k(sim, k=K, sorted=True)[1]
 67 |             output[start_idx:end_idx] = top_idxs
 68 | 
 69 |     print(time.time()-begin_time)
 70 |     return output
 71 | 
 72 | def randomPoints_lookups(r, B, N, data, lookups_loc):
 73 |     c_o = lookups_loc+'class_order_'+str(r)+'.npy'
 74 |     ct = lookups_loc+'counts_'+str(r)+'.npy'
 75 |     b_o = lookups_loc+'bucket_order_'+str(r)+'.npy'
 76 |     if os.path.exists(c_o) and os.path.exists(ct) and os.path.exists(b_o):
 77 |         print ('init lookups exists')
 78 |     else:
 79 |         rind = np.arange(0, B)
 80 |         np.random.shuffle(rind)
 81 |         cents = data[rind,:]
 82 |         cents = cents.T
 83 |         scores = data @ cents
 84 |         bucket_order = np.argmin(scores, axis=1)
 85 |         class_order = np.argsort(bucket_order)
 86 |         counts = np.zeros(B+1, dtype=int)
 87 |         for i in range(N):
 88 |             counts[bucket_order[i]+1] += 1
 89 |         np.save(c_o, np.array(class_order))
 90 |         np.save(ct,np.array(counts))
 91 |         np.save(b_o, np.array(bucket_order))
 92 | 
 93 | def kmeans_lookups(r, B, N, data, lookups_loc):
 94 |     c_o = lookups_loc+'class_order_'+str(r)+'.npy'
 95 |     ct = lookups_loc+'counts_'+str(r)+'.npy'
 96 |     b_o = lookups_loc+'bucket_order_'+str(r)+'.npy'
 97 |     if os.path.exists(c_o) and os.path.exists(ct) and os.path.exists(b_o):
 98 |         print ('init lookups exists')
 99 |     else:
100 |         kmeans = KMeans(n_clusters=B, random_state=r).fit(data)
101 |         counts = np.zeros(B+1, dtype=int)
102 |         bucket_order = kmeans.labels_
103 |         class_order = np.argsort(bucket_order)
104 |         for i in range(N):
105 |             counts[bucket_order[i]+1] += 1
106 |         np.save(c_o, class_order)
107 |         np.save(ct,counts)
108 |         np.save(b_o, bucket_order)
109 | 
def create_universal_lookups(r, B, n_classes, lookups_loc):
    """Hash every class id into one of B buckets and store the lookups.

    Class ``i`` goes to bucket ``mmh3(i, seed=r) % B``. Three arrays are
    saved under the prefix *lookups_loc* unless all three files already
    exist:

    * ``bucket_order_<r>.npy`` - bucket of every class id,
    * ``counts_<r>.npy`` - cumulative bucket sizes, length ``B + 1``
      (entry ``b`` is the offset of bucket ``b`` inside ``class_order``),
    * ``class_order_<r>.npy`` - class ids grouped by bucket, ascending id
      inside each bucket.
    """
    class_order_path = lookups_loc+'class_order_'+str(r)+'.npy'
    counts_path = lookups_loc+'counts_'+str(r)+'.npy'
    bucket_order_path = lookups_loc+'bucket_order_'+str(r)+'.npy'
    cached = all(os.path.exists(p) for p in (class_order_path, counts_path, bucket_order_path))
    if cached:
        print ('init lookups exists')
        return

    bucket_order = np.array(
        [mmh3(class_id, seed=r) % B for class_id in range(n_classes)],
        dtype=int)

    # per-bucket sizes shifted by one slot, then accumulated into offsets
    sizes = np.zeros(B+1, dtype=int)
    sizes[1:] = np.bincount(bucket_order, minlength=B)
    counts = np.cumsum(sizes)

    # a stable sort by bucket yields the ids in ascending order per bucket,
    # i.e. the same layout as filling each bucket slot by slot
    class_order = np.argsort(bucket_order, kind='stable').astype(int)

    np.save(class_order_path, class_order)
    np.save(counts_path, counts)
    np.save(bucket_order_path, bucket_order)
134 | 
135 | # to do: fix this
def process_scores(inp):
    """Collect candidate keys that occur in enough of the retrieved buckets.

    ``inp[r, 1, k]`` is read as the index of the k-th retrieved bucket of
    repetition ``r`` (``inp[r, 0, k]`` holds the matching value and is not
    used). Every key stored in such a bucket gets its frequency increased;
    the frequency threshold starts at ``args.mf`` and is lowered one step at
    a time until at least 10 keys qualify.

    NOTE(review): relies on the names ``inv_lookup``, ``counts`` and ``args``
    being available at module level; none of them is defined in this file.

    Returns:
        numpy array with the qualifying keys. It can hold fewer than 10
        entries when fewer than 10 distinct keys were seen at all.
    """
    R = inp.shape[0]
    topk = inp.shape[2]
    freqs = {}
    for r in range(R):
        for k in range(topk):
            bucket = int(inp[r,1,k])
            for key in inv_lookup[r,counts[r,bucket]:counts[r,bucket+1]]:
                freqs[key] = freqs.get(key, 0) + 1
    i = 0
    while True:
        threshold = args.mf-i
        candidates = np.array([key for key in freqs if freqs[key]>=threshold])
        # Every counted key has frequency >= 1, so once the threshold is down
        # to 1 nothing more can be gained; stop instead of looping forever.
        if len(candidates)>=10 or threshold<=1:
            break
        i += 1
    return candidates
158 |     ###


--------------------------------------------------------------------------------
/include/cluster.h:
--------------------------------------------------------------------------------
  1 | // function to bins
  2 | #pragma once
  3 | #include <iostream>
  4 | #include <fstream>
  5 | #include <stdlib.h>    
  6 | #include <numeric>
  7 | #include <algorithm>
  8 | #include <string> 
  9 | #include "utils.h"
 10 | 
 11 | //faiss specific imports
 12 | #include <faiss/Clustering.h> 
 13 | #include <faiss/IndexFlat.h>
 14 | #include <bits/stdc++.h>
 15 | 
 16 | using namespace std;
 17 | class cluster{
 18 |     public:
 19 |         virtual void train(float* dataset, int nb, string modelpath) = 0;
 20 |         virtual void getscore(float* input, float* last) = 0;
 21 |         virtual uint32_t top(float* input) = 0;
 22 |         virtual void load(string modelpath) = 0;
 23 | };
 24 | 
 25 | // add any other clustering method as a separate derived class
 26 | class BLISS : public cluster{
 27 |     public:
 28 |         int s1;
 29 |         int s2;
 30 |         int s3;
 31 |         float* weights;
 32 |         
 33 |         BLISS(int s1, int s2, int s3): s1(s1), s2(s2), s3(s3){
 34 |             weights = new float[(s2*(s1+1) +s3*(s2+1))]; 
 35 |         }
 36 | 
 37 |         void train(float* dataset, int nb, string modelpath){
 38 |             // load .bin
 39 |             FILE* f = fopen((modelpath + "/model.bin").c_str(), "rb");
 40 |             fread(weights, sizeof(float), (s2*(s1+1) +s3*(s2+1)), f); 
 41 |             fclose (f);
 42 |         } 
 43 | 
 44 |         void getscore(float* input, float* last){
 45 |             float* hd = new float[s2];
 46 |             for (uint32_t id=0; id<s2; id++){
 47 |                 hd[id] = IPSIMD4ExtAVX(input, weights+ id*(s1+1), s1);
 48 |                 hd[id] += *(weights+ id*(s1+1) +s1); //bias
 49 |                 if (hd[id]<0) hd[id] =0; // Relu 0 if negative
 50 |             }
 51 |             float* L1 = weights+ (s1+1)*s2;
 52 | 
 53 |             for (uint32_t id=0; id<s3; id++){
 54 |                 float temp =0;
 55 |                 last[id] = IPSIMD4ExtAVX(hd, L1+ id*(s2+1), s2); //multiply
 56 |                 last[id] += *(L1+ id*(s2+1) +s2); //bias
 57 |             }
 58 |             // last = softmax(last);
 59 |         }
 60 | 
 61 |         uint32_t top(float* input){ 
 62 |             float* hd = new float[s2];
 63 |             for (uint32_t id=0; id<s2; id++){
 64 |                 hd[id] = IPSIMD4ExtAVX(input, weights+ id*(s1+1), s1);
 65 |                 hd[id] += *(weights+ id*(s1+1) +s1); //bias
 66 |                 if (hd[id]<0) hd[id] =0; // Relu 0 if negative
 67 |             }
 68 |             float* L1 = weights+ (s1+1)*s2;
 69 |             uint32_t bin;
 70 |             float maxscore = -1000000; 
 71 |             for (uint32_t id=0; id<s3; id++){
 72 |                 float temp =0;
 73 |                 temp = IPSIMD4ExtAVX(hd, L1+ id*(s2+1), s2); //multiply
 74 |                 temp += *(L1+ id*(s2+1) +s2); //bias
 75 |                 if (temp>maxscore) {
 76 |                 maxscore=temp;
 77 |                 bin = id;}
 78 |             }
 79 |             return bin;
 80 |         }
 81 | 
 82 |         void load(string modelpath){
 83 |             // load .bin
 84 |             FILE* f = fopen((modelpath + "/model.bin").c_str(), "rb");
 85 |             fread(weights, sizeof(float), (s2*(s1+1) +s3*(s2+1)), f); 
 86 |             fclose (f);
 87 |         } 
 88 | };
 89 | 
 90 | class Kmeans: public cluster{
 91 |     public:
 92 |         int nc;
 93 |         int d;
 94 |         float* centroids;
 95 |         float* cen_norms;
 96 |         Kmeans(int nc, int d): nc(nc), d(d){
 97 |             centroids = new float[d*nc]; //provide random nc vectors
 98 |             cen_norms = new float[nc]{0};
 99 |         }
100 | 
101 |         void train(float* dataset, int nb, string modelpath){
102 |             //centroids (nc * d) if centroids are set on input to train, they will be used as initialization
103 |             int v[nb];
104 |             randomShuffle(v , 0, nb);
105 |             faiss::Clustering clus(d, nc);
106 |             clus.centroids.resize(d*nc);
107 |             for(uint32_t i = 0; i < nc; ++i) { 
108 |                 for(uint32_t j = 0; j < d; ++j){
109 |                     clus.centroids[i*d+j] = dataset[v[i]*d +j];
110 |                 }
111 |                 // memcpy(clus.centroids + i*d, dataset +v[i]*d, sizeof(*centroids) * d);
112 |             }
113 |             clus.verbose = d * nb * nc > (1L << 30);
114 |             // display logs if > 1Gflop per iteration
115 | 
116 |             faiss::IndexFlatL2 index(d);
117 |             clus.train(nb, dataset, index);
118 | 
119 |             memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * nc);
120 |             cout<<"centroids size: "<<clus.centroids.size()<<endl; //centroids (nc * d) if centroids are set on input to train, they will be used as initialization
121 |             
122 |             // if L2 get norms as well
123 |             for(uint32_t j = 0; j < nc; ++j){ 
124 |                 cen_norms[j]=0; 
125 |                 for(uint32_t k = 0; k < d; ++k) {                 
126 |                     cen_norms[j] += centroids[j*d +k]*centroids[j*d +k];        
127 |                 } 
128 |                 cen_norms[j] = cen_norms[j]/2;
129 |             }
130 | 
131 |             FILE* f1 = fopen((modelpath+"/centroids.bin").c_str(), "wb");
132 |             fwrite(centroids, sizeof(float), nc*d, f1);
133 |             fclose (f1);
134 | 
135 |             FILE* f2 = fopen((modelpath+"/centroidsNorms.bin").c_str(), "wb");
136 |             fwrite(cen_norms, sizeof(float), nc, f2);
137 |             fclose (f2);
138 |         }
139 | 
140 |         uint32_t top(float* input){
141 |             uint32_t bin=0;
142 |             float maxscore = L2SIMD4ExtAVX(input, centroids, cen_norms[0], d);;  
143 |             float temp;
144 |             for(uint32_t j = 1; j < nc; ++j){  
145 |                 temp = L2SIMD4ExtAVX(input, centroids+j*d, cen_norms[j], d);
146 |                 // for(uint32_t k = 0; k < d; ++k) {                 
147 |                 //     temp += pow(input[k] - centroids[j*d +k], 2);   //change this to L2 dist function    
148 |                 // } 
149 |                 if (temp>maxscore) {
150 |                     maxscore=temp;
151 |                     bin = j;}    
152 |             }
153 |             return bin;
154 |         }
155 |         
156 |         void getscore(float* input, float* scores){
157 |             for (uint32_t id=0; id<nc; id++){
158 |                 scores[id] = L2SIMD4ExtAVX(input, centroids+id*d, cen_norms[id], d);
159 |             }
160 |         }
161 |         
162 |         void load(string indexpath){
163 |             FILE* f1 = fopen((indexpath+"/centroids.bin").c_str(), "rb");
164 |             fread(centroids, sizeof(float), nc*d, f1);   
165 |             fclose (f1);    
166 |             FILE* f2 = fopen((indexpath+"/centroidsNorms.bin").c_str(), "rb");
167 |             fread(cen_norms, sizeof(float), nc, f2);
168 |             fclose (f2);
169 |         } 
170 | };
171 | 
172 | // add any other clustering method as a separate derived class
173 | class BLISSmode3 : public cluster{
174 |     public:
175 |         int s1;
176 |         int sa; //size of attribute vocab
177 |         int na;
178 |         int s2;
179 |         int s3;
180 |         float* weights;
181 |         
182 |         BLISSmode3(int s1, int s2, int s3, int na): s1(s1), s2(s2), s3(s3), sa(sa), na(na){
183 |             weights = new float[(s2*(s1+sa+1) +s3*(s2+1))]; 
184 |         }
185 | 
186 |         void train(float* dataset, int nb, string modelpath){
187 |             // load .bin
188 |             FILE* f = fopen((modelpath + "/model.bin").c_str(), "rb");
189 |             fread(weights, sizeof(float), (s2*(s1+sa+1) +s3*(s2+1)), f); 
190 |             fclose (f);
191 |         } 
192 |         
193 |         void getscore(float* input, uint8_t* prop, float* last){
194 |             float* hd = new float[s2];
195 |             for (uint32_t id=0; id<s2; id++){
196 |                 hd[id] = IPSIMD4ExtAVX(input, weights+ id*(s1+sa+1), s1) + spaseMul(prop, weights+ id*(s1+sa+1),na);
197 |                 hd[id] += *(weights+ id*(s1+sa+1) +s1+sa); //bias
198 |                 if (hd[id]<0) hd[id] =0; // Relu 0 if negative
199 |             }
200 |             float* L1 = weights+ (s1+sa +1)*s2;
201 | 
202 |             for (uint32_t id=0; id<s3; id++){
203 |                 float temp =0;
204 |                 last[id] = IPSIMD4ExtAVX(hd, L1+ id*(s2+1), s2); //multiply
205 |                 last[id] += *(L1+ id*(s2+1) +s2); //bias
206 |             }
207 |             // last = softmax(last);
208 |         }
209 | 
210 |         uint32_t top(float* input, uint8_t* prop){ 
211 |             float* hd = new float[s2];
212 |             for (uint32_t id=0; id<s2; id++){
213 |                 hd[id] = IPSIMD4ExtAVX(input, weights+ id*(s1+sa+1), s1) + spaseMul(prop, weights+ id*(s1+sa+1),na);
214 |                 hd[id] += *(weights+ id*(s1+sa+1) +s1+sa); //bias
215 |                 if (hd[id]<0) hd[id] =0; // Relu 0 if negative
216 |             }
217 |             float* L1 = weights+ (s1+sa+1)*s2;
218 |             uint32_t bin;
219 |             float maxscore = -1000000; 
220 |             for (uint32_t id=0; id<s3; id++){
221 |                 float temp =0;
222 |                 temp = IPSIMD4ExtAVX(hd, L1+ id*(s2+1), s2); //multiply
223 |                 temp += *(L1+ id*(s2+1) +s2); //bias
224 |                 if (temp>maxscore) {
225 |                 maxscore=temp;
226 |                 bin = id;}
227 |             }
228 |             return bin;
229 |         }
230 | 
231 |         void load(string modelpath){
232 |             // load .bin
233 |             FILE* f = fopen((modelpath + "/model.bin").c_str(), "rb");
234 |             fread(weights, sizeof(float), (s2*(s1+sa+1) +s3*(s2+1)), f); 
235 |             fclose (f);
236 |         } 
237 | };
238 | 
239 | 
240 | 


--------------------------------------------------------------------------------
/include/readfile.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <iostream>
 3 | #include <iomanip>
 4 | #include <string>
 5 | #include <cmath>
 6 | #include <stdio.h>
 7 | #include <string.h>
 8 | #include <cstdlib>
 9 | #include <vector>
10 | #include <set>
11 | #include <iterator>
12 | using namespace std;
13 | 
14 | #include <fstream>
15 | 
16 | vector<vector<string>> getproperties(string fileName, char dlim);
17 | vector<vector<int>> coordinates(string fileName);
18 | float* fvecs_read(const char* fname, size_t* d_out, size_t* n_out);
19 | int* ivecs_read(const char* fname, size_t* d_out, size_t* n_out);
20 | 


--------------------------------------------------------------------------------
/include/utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <iostream>
 3 | #include <fstream>
 4 | #include <string>
 5 | #include <cstdlib>
 6 | #include <vector>
 7 | #include <set>
 8 | #include <iterator>
 9 | #include "readfile.h"
10 | #include <stdlib.h>     /* calloc, exit, free */
11 | #include <numeric>
12 | #include <algorithm>
13 | #include <string> 
14 | #include <cstdint>
15 | #include <map>
16 | #include <cstdio>
17 | 
18 | #ifdef __AVX__
19 |   #include <immintrin.h>
20 | #else
21 |   #warning AVX is not available. Code will not compile!
22 | #endif
23 | 
24 | #if defined(__GNUC__)
25 | #define PORTABLE_ALIGN32 __attribute__((aligned(32)))
26 | #define PORTABLE_ALIGN64 __attribute__((aligned(64)))
27 | #else
28 | #define PORTABLE_ALIGN32 __declspec(align(32))
29 | #define PORTABLE_ALIGN64 __declspec(align(64))
30 | #endif
31 | 
32 | //include something for map
33 | using namespace std;
34 | int argparser(int argc, char** argv, string* basepath, string* labelpath, string* indexpath, size_t* nc, string* algo, int* mode);
35 | int argparser(int argc, char** argv, string* basepath, string* labelpath, string* querypath, string* queryAttripath, string* indexpath, string* GTpath, size_t* nc, string* algo, int* mode, size_t* buffer_size);
36 | double computeRecall(vector<vector<int>> answer, vector<vector<int>> guess);
37 | 
38 | float IP(float* a, float* b, size_t d);
39 | double L2sim(float* a, float* b, float norm_bsq, size_t d);
40 | double L2Square(float* a, float* b, size_t d);
41 | double L2normSquare(float* a, size_t d);
42 | float IPSIMD4ExtAVX(float *pVect1, float *pVect2, size_t qty);
43 | float L2SIMD4ExtAVX(float *pVect1, float *pVect2, float norm_bsq, size_t qty);
44 | // float IPSIMD16ExtAVX512(float *pVect1v,  float *pVect2v,  float *qty_ptr);
45 | // float L2SIMD16ExtAVX512(float *pVect1, float *pVect2, float norm_bsq, size_t qty);
46 | float spaseMul(uint8_t* prop, float* weight ,int na);
47 | // to add AVXSIMD16 and SSE as well
48 | uint16_t getclusterPart(uint16_t* maxMC, uint8_t* props, int treelen);
49 | bool not_in(uint16_t x, uint16_t* a, int h);
50 | double RecallAtK(int* answer, int* guess, size_t k, size_t nq);
51 | void randomShuffle(int* v , int l, int u);
52 | vector<uint32_t> argTopK(float* query, float* vectors, uint32_t d, uint32_t N, vector<uint32_t> idx, uint32_t idxSize, uint32_t k, vector<float> topkDist);
53 | 


--------------------------------------------------------------------------------
/plots.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import os
 3 | import time
 4 | import matplotlib.pyplot as plt
 5 | from matplotlib.pyplot import cm
 6 | import sys
 7 | 
 8 | natt = 3
 9 | ncon = 27
10 | results = np.loadtxt("results/Results_sift_3_27_1024Mode1Corrected.txt", delimiter=',')
11 | color=cm.rainbow(np.linspace(0,1,5))
12 | x = results[:,2]
13 | y = results[:,3]
14 | plt.plot(x, y ,color = 'red',marker='o', label="BLISS 1 -ours")
15 | 
16 | 
17 | results = np.loadtxt("results/Results_sift_3_27_1024Mode2Corrected.txt", delimiter=',')
18 | color=cm.rainbow(np.linspace(0,1,5))
19 | x = results[:,2]
20 | y = results[:,3]
21 | plt.plot(x, y ,color = 'blue',marker='o', label="BLISS 2 -ours")
22 | 
23 | 
24 | results = np.loadtxt("results/Results_sift_3_27_1024Mode3Corrected.txt", delimiter=',')
25 | color=cm.rainbow(np.linspace(0,1,5))
26 | x = results[:,2]
27 | y = results[:,3]
28 | plt.plot(x, y ,color = 'green',marker='o', label="BLISS 3 -ours")
29 | 
30 | 
31 | results = np.loadtxt("results/Results_sift_3_27_1024v2.txt", delimiter=',')
32 | color=cm.rainbow(np.linspace(0,1,5))
33 | x = results[:,2]
34 | y = results[:,3]
35 | plt.plot(x, y ,color = 'orange',marker='+', label="faissKmeans -ours")
36 | 
37 | 
38 | results = np.loadtxt("../../ResultsNHQ_sift_3_27.txt", delimiter=' ')
39 | color=cm.rainbow(np.linspace(0,1,5))
40 | x = 10000/results[:,0]
41 | y = results[:,1]
42 | plt.plot(x, y ,color = 'black',marker='*', label="NHQ")
43 | 
44 | 
45 | plt.xlabel('QPS')
46 | plt.ylabel('Recall100@100')    
47 | plt.title('SIFT, num_attributes=3 , total_constraints='+str(ncon)+' ')
48 | # plt.xscale('log')
49 | # plt.grid()
50 | plt.minorticks_on()
51 | plt.grid(which='major', color='black', linestyle='--')
52 | # plt.grid(b=True, which='minor', color='black', linestyle=':')
53 | # plt.ylim(0.5,0.83)
54 | # plt.xlim(0,20)
55 | plt.legend(fontsize=7.5)
56 | # plt.show()
57 | plt.savefig('sift_3_'+str(ncon)+'_1024BLISSCorrectedTuned.png', dpi = 500)
58 | 
59 | 
60 | 
61 | 
62 | # results = np.loadtxt('/scratch/gg29/ResultsNHQ_sift_'+str(natt)+'_'+str(ncon)+'.txt' , delimiter=' ')
63 | 
64 | # # plt.figure()
65 | # x = 10000/results[:,0]
66 | # y = results[:,1]
67 | # plt.plot(x, y ,color = 'blue',marker='o', label="NHQ")
68 | 
69 | # plt.xlabel('QPS')
70 | # plt.ylabel('Recall100@100')    
71 | # plt.title('SIFT, num_attributes=3 , total_constraints='+str(ncon)+', NHQ ')
72 | # # plt.xscale('log')
73 | # # plt.grid()
74 | # plt.minorticks_on()
75 | # plt.grid(which='major', color='black', linestyle='-')
76 | # # plt.grid(b=True, which='minor', color='black', linestyle=':')
77 | # # plt.ylim(0.5,0.83)
78 | # # plt.xlim(0,20)
79 | # plt.legend(fontsize=7.5)
80 | # # plt.show()
81 | # plt.savefig('NHQsift_'+str(natt)+'_'+str(ncon)+'.png', dpi = 500)
82 | # plt.savefig('Comparision sift_'+str(natt)+'_'+str(ncon)+'corrected.png', dpi = 500)
83 | 
84 | 


--------------------------------------------------------------------------------
/src/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gaurav16gupta/constrainedANN/61eae4c56dc0d6c9906bee608cecbf23b11e0260/src/.DS_Store


--------------------------------------------------------------------------------
/src/FilterIndexHamming.cpp:
--------------------------------------------------------------------------------
  1 | 
#include <sys/stat.h>
#include <stdexcept>
#include <vector>
#include "FilterIndex.h"
  4 | 
  5 | using namespace std;
  6 | 
  7 | template <typename S>
  8 | // operator to print vectors, matrix
  9 | ostream& operator<<(ostream& os, const vector<S>& vector){
 10 |     // Printing vector elements using <<
 11 |     for (auto element : vector) {
 12 |         os << element << " ";
 13 |     }
 14 |     return os;
 15 | }
 16 | 
 17 | template <typename S>
 18 | ostream& operator<<(ostream& os, const vector<vector<S>>& matrix){
 19 |     // Printing matrix elements using <<
 20 |     for (const vector<S>& vector : matrix) {
 21 |         for (auto element : vector) {
 22 |             os << element << " ";
 23 |         }
 24 |     os << endl;
 25 |     }
 26 |     return os;
 27 | }
 28 | 
 29 | FilterIndex::FilterIndex(float* data, size_t d_, size_t nb_, size_t nc_, vector<vector<string>>properties_, string algo, int mode){
 30 |     dataset = data; // data
 31 |     d =d_; // dim
 32 |     nb = nb_; //num data points
 33 |     nc = nc_; // num clusters
 34 |     treelen = 20; //length of truncated huffman tree, num miniclusters= treelen+1
 35 |     // cin >> treelen;
 36 |     if (algo=="kmeans") {
 37 |         clusterAlgo = new Kmeans(nc, d); //dynamic allocation of object class
 38 |     }
 39 |     else if (algo=="bliss") {
 40 |         clusterAlgo = new BLISS(d, 256, nc);
 41 |         }
 42 |     else {
 43 |         cout<<"clustering unrecognised. Choosing Faiss-Kmeans as default"<<endl;
 44 |         clusterAlgo= new Kmeans(nc, d);
 45 |     }
 46 |     properties.resize(nb);
 47 |     uint8_t cnt=0;
 48 |     for (int i=0; i<nb; i++){
 49 |         for (string prp: properties_[i]){
 50 |             if (prLook.count(prp)==0){
 51 |                 prLook[prp]=cnt;
 52 |                 cnt++;
 53 |             }
 54 |             properties[i].push_back(prLook[prp]);
 55 |         }
 56 |         // beware:: sorting the properties will loose the position information!!
 57 |         // sort(properties[i].begin(), properties[i].end()); 
 58 |     }   
 59 |     cout<<unsigned(cnt)<<" total unique constraints"<<endl; 
 60 |     // Properties to location map
 61 |     numAttr = properties[0].size();
 62 |     for (int i=0; i<nb; i++){
 63 |         for (int j=0; j<numAttr; j++){
 64 |             PrpAtrMap[properties[i][j]] = j;
 65 |         }
 66 |     }
 67 | }
 68 | 
 69 | //NN index
 70 | void FilterIndex::get_index(string metric, string indexpath, int mode){
 71 |     //to save index files
 72 |     mkdir(indexpath.c_str(), 0777);
 73 |     data_norms = new float[nb]{0};
 74 |     Lookup= new uint32_t[nb];
 75 |     counts = new uint32_t[nc+1]{0};
 76 | 
 77 |     clusterAlgo->train(dataset, nb, indexpath); //take the properties for Mode 3 bliss
 78 |     for(uint32_t j = 0; j < nb; ++j){  
 79 |         data_norms[j]=0;
 80 |         for(uint32_t k = 0; k < d; ++k) {      
 81 |             data_norms[j] += dataset[j*d +k]*dataset[j*d +k];        
 82 |         } 
 83 |         data_norms[j]=data_norms[j]/2;
 84 |     }
 85 |     
 86 |     uint32_t* invLookup = new uint32_t[nb];
 87 |     //get best score cluster
 88 |     #pragma omp parallel for  
 89 |     for(uint32_t i = 0; i < nb; ++i) {  
 90 |         invLookup[i] = clusterAlgo->top(dataset+ i*d);   
 91 |     }
 92 |      for(uint32_t i = 0; i < nb; ++i) {
 93 |         counts[invLookup[i]+1] = counts[invLookup[i]+1]+1; 
 94 |     }
 95 |     for(uint32_t j = 1; j < nc+1; ++j) {
 96 |         counts[j] = counts[j]+ counts[j-1]; //cumsum 
 97 |     }
 98 | 
 99 |     //argsort invLookup to get the Lookup
100 |     iota(Lookup, Lookup+nb, 0);
101 |     stable_sort(Lookup, Lookup+nb, [&invLookup](size_t i1, size_t i2) {return invLookup[i1] < invLookup[i2];});
102 |     // get_mc_propertiesIndex(); // this will change counts, Lookup; and add maxMC tree
103 |     
104 |     FILE* f3 = fopen((indexpath+"/dataNorms.bin").c_str(), "wb");
105 |     fwrite(data_norms, sizeof(float), nb, f3);
106 |     fclose (f3);
107 |     FILE* f4 = fopen((indexpath+"/Lookup.bin").c_str(), "wb");
108 |     fwrite(Lookup, sizeof(uint32_t), nb, f4);
109 |     fclose (f4);
110 |     FILE* f5 = fopen((indexpath+"/counts.bin").c_str(), "wb");
111 |     fwrite(counts, sizeof(uint32_t), nc*(treelen+1)+1, f5);
112 |     fclose (f5);
113 | }   
114 | 
115 | void FilterIndex::get_mc_propertiesIndex(){
116 |     vector<vector<uint32_t>> maxMCIDs; //nested array to store the mini-clusters, change to uint32_t array later
117 |     maxMCIDs.resize((treelen+1)*nc);
118 |     maxMC = new uint16_t[3*(treelen+1)*nc]; 
119 |     for (int clID = 0; clID < nc; clID++){
120 |         if (counts[clID+1]- counts[clID]==0) continue; // what  if the cluster size is less than 4. Do something then
121 |         //get count of vector properties        
122 |         //get the max
123 |         for (int h=0; h<treelen; h++){ //iterate over tree height
124 |             unordered_map<uint16_t, int> freq; //property to frequency map
125 |             for (int i =counts[clID]; i< counts[clID+1]; i++){ // for all points in the cluster
126 |                 for (uint16_t x: properties[Lookup[i]]){
127 |                     if(not_in(x, maxMC + (treelen+1)*clID*3, h)) freq[x]++; // count number of items with the property x 
128 |                 }
129 |             }
130 |             int r = (treelen+1)*3*clID + 3*h;
131 |             //choose property with max freq
132 |             if (freq.end()== freq.begin()){
133 |                 maxMC[r+0]= 0; // property location
134 |                 maxMC[r+1]= 0; // property
135 |                 maxMC[r+2]= 0; // frequency
136 |             }
137 |             else{
138 |                 auto maxElement = max_element(freq.begin(), freq.end(),
139 |                                     [](const pair<uint16_t, int>& p1, const pair<uint16_t, int>& p2) { return p1.second < p2.second;});
140 |                 maxMC[r+0]= PrpAtrMap[maxElement->first]; // property location
141 |                 maxMC[r+1]= maxElement->first; // property
142 |                 maxMC[r+2]= maxElement->second; // frequency, do we need this in maxMC??
143 |             }
144 |         }
145 |         //maxMC serves as a node list and node data size in hamming tree, where
146 |         //node: property from maxMC
147 |         //node data: corresponding vector IDs
148 |         for (int i =counts[clID]; i< counts[clID+1]; i++){
149 |             for (int j=0; j< treelen; j++){
150 |                 int r = (treelen+1)*3*clID + 3*j;
151 |                 if((properties[Lookup[i]][maxMC[r]]==maxMC[r+1]) && (maxMC[r+2]>0)){ 
152 |                     maxMCIDs[(treelen+1)*clID +j].push_back(Lookup[i]);
153 |                     goto m_label;
154 |                 }
155 |             }
156 |             maxMCIDs[(treelen+1)*clID +treelen].push_back(Lookup[i]);
157 |             m_label:;
158 |         }
159 |     }
160 |     //need some assert statements
161 |     //update Lookup, counts. Flatten the maxMCIDs into Lookup
162 |     //each cluster now spans treelen+1 buckets
163 |     Lookup= new uint32_t[nb];
164 |     counts = new uint32_t[nc*(treelen+1)+1]{0}; 
165 |     for (int clID = 0; clID < nc; clID++){
166 |         for (int j=0; j< treelen+1; j++){
167 |             int id = clID*(treelen+1) +j;
168 |             counts[id+1] = counts[id]+ maxMCIDs[id].size();
169 |             memcpy(Lookup+ counts[id], maxMCIDs[id].data(), sizeof(*Lookup) * maxMCIDs[id].size());
170 |         }
171 |     }
172 | }
173 | 
174 | void FilterIndex::loadIndex(string indexpath){
175 |     data_norms = new float[nb]{0};
176 |     Lookup= new uint32_t[nb];
177 |     counts = new uint32_t[nc+1]; 
178 |     // counts = new uint32_t[nc*(treelen+1)+1]; 
179 |     // maxMC = new uint16_t[3*(treelen+1)*nc]; 
180 |     cout<<indexpath<<endl;
181 |     clusterAlgo->load(indexpath);
182 |     FILE* f3 = fopen((indexpath+"/dataNorms.bin").c_str(), "r");
183 |     fread(data_norms, sizeof(float), nb, f3);
184 |     FILE* f4 = fopen((indexpath+"/Lookup.bin").c_str(), "r");
185 |     fread(Lookup, sizeof(uint32_t), nb, f4);
186 |     FILE* f5 = fopen((indexpath+"/counts.bin").c_str(), "r");
187 |     fread(counts, sizeof(uint32_t), nc+1, f5);
188 |     get_mc_propertiesIndex();
189 |     //this changes Lookup
190 |     for (int i =0; i< nc*(treelen+1); i++){
191 |         int m1 = counts[i];
192 |         int m2 = counts[i+1]; // some miniclusters are empty!!
193 |         sort(Lookup+m1, Lookup+m2,
194 |             [&](uint32_t a, uint32_t b) {
195 |             return properties[a] < properties[b];
196 |             });
197 |     }
198 |     // reorder data and index
199 |     dataset_reordered = new float[nb*d];
200 |     data_norms_reordered = new float[nb];
201 |     properties_reordered = new uint8_t[nb*numAttr];
202 |     for(uint32_t i = 0; i < nb; ++i) {
203 |         copy(dataset+Lookup[i]*d, dataset+(Lookup[i]+1)*d , dataset_reordered+i*d);
204 |         data_norms_reordered[i] = data_norms[Lookup[i]];
205 |         memcpy(properties_reordered+ i*numAttr, properties[Lookup[i]].data(), sizeof(*properties_reordered) * numAttr);
206 |     }
207 |     delete dataset;
208 |     delete data_norms;
209 |     vector<vector<uint8_t>>().swap(properties);
210 | }
211 | 
212 | void FilterIndex::query(float* queryset, int nq, vector<vector<string>> queryprops, int num_results, int max_num_distances){
213 |     neighbor_set = new int32_t[nq*num_results]{-1};
214 |     cout<<"num queries: "<<nq<<", "<<"num max attributes: "<<numAttr<<endl;
215 |     for (size_t i = 0; i < nq; i++){
216 |         // run query
217 |         findNearestNeighbor(queryset+(i*d), queryprops[i], num_results, max_num_distances, i);
218 |     }
219 | }
220 | 
221 | // start from best cluster -> choose minicluster -> bruteforce search
222 | void FilterIndex::findNearestNeighbor(float* query, vector<string> Stprops, int num_results, int max_num_distances, size_t qnum)
223 | {   
224 |     chrono::time_point<chrono::high_resolution_clock> t1, t2,t2_1, t3, t4, t5, t6;
225 |     t1 = chrono::high_resolution_clock::now();
226 |     uint8_t props[numAttr];
227 |     uint8_t countX = 0; // count of X (missing attributes) in query props
228 |     string querymode = "varying";
229 |     for (size_t j =0; j<numAttr ;j++){
230 |         if (Stprops[j] !="X") props[j]= (prLook[Stprops[j]]);
231 |         else {
232 |             countX++;
233 |             props[j]=255; // this should be a key which is not seen before
234 |         }
235 |     }
236 |     if (countX==numAttr) querymode = "noAttribute";
237 |     else if (countX==0) querymode = "fixed";
238 |     t2 = chrono::high_resolution_clock::now();
239 | 
240 |     // sort(props.begin(), props.end());
241 |     priority_queue<pair<float, uint32_t> > pq;
242 |     uint32_t simid[nc];
243 |     float simv[nc];
244 |     clusterAlgo->getscore(query, simv);
245 |     t2_1 = chrono::high_resolution_clock::now();
246 |     // need argsorted IDs
247 |     iota(simid, simid+nc, 0);
248 |     // partial_sort(simid, simid+100, simid+nc, [&simv](size_t i1, size_t i2) {return simv[i1] > simv[i2];});
249 |     sort(simid, simid+nc, [&simv](size_t i1, size_t i2) {return simv[i1] > simv[i2];});
250 | 
251 |     priority_queue<pair<float, uint32_t> > Candidates_pq;
252 |     uint32_t Candidates[max_num_distances];
253 |     float score[max_num_distances];
254 |     int seen=0, seenbin=0;
255 |     float sim;
256 |     float a=0,b=0;
257 |     t3 = chrono::high_resolution_clock::now();
258 |     if (querymode == "fixed"){
259 |         while(seen<max_num_distances && seenbin<nc){ 
260 |             uint32_t bin = simid[seenbin];
261 |             seenbin++; // not if we are probing multiple subbins, in case of varying #attrs
262 |             int id = bin*(treelen+1);
263 |             bin = bin*(treelen+1);
264 | 
265 |             //get which sub-cluster query belongs to
266 |             uint16_t membership = getclusterPart(maxMC+ bin*3 , props, treelen);
267 |             bin = bin+membership;
268 |             for (int i =counts[bin]; i< counts[bin+1] && seen<max_num_distances; i++){
269 |                 // __builtin_prefetch (properties_reordered +(i+2)*numAttr, 0, 2); software prefect is not very useful here
270 |                 //check if constraint statisfies
271 |                 int j =0;
272 |                 while (j<numAttr && properties_reordered[i*numAttr +j]== props[j]) j++; // plus the number of empty attrs X
273 |                 if (j==numAttr){
274 |                     Candidates[seen]=i; 
275 |                     seen++;
276 |                 }
277 |             }
278 |         }
279 |     }
280 |     else if (querymode == "varying"){
281 |         while(seen<max_num_distances && seenbin<nc){ 
282 |             uint32_t bin = simid[seenbin];
283 |             seenbin++; // not if we are probing multiple subbins, in case of varying #attrs
284 |             bin = bin*(treelen+1);
285 |             // go through each sub partition
286 |             bool checkremaining=true;
287 |             for (uint16_t u=0;u<treelen; u++){
288 |                 if (props[maxMC[u*3+0]]==255){
289 |                     for (int i =counts[bin+u]; i< counts[bin+u+1] && seen<max_num_distances; i++){
290 |                         //check if constraint statisfies
291 |                         int j =0;
292 |                         while (j<numAttr && (properties_reordered[i*numAttr +j]== props[j] | props[j]==255)) j++; 
293 |                         if (j==numAttr){
294 |                             Candidates[seen]=i; 
295 |                             seen++;
296 |                         }
297 |                     }
298 |                 }
299 |                 else if (maxMC[u*3+1] == props[maxMC[u*3+0]]){
300 |                     for (int i =counts[bin+u]; i< counts[bin+u+1] && seen<max_num_distances; i++){
301 |                         //check if constraint statisfies
302 |                         int j =0;
303 |                         while (j<numAttr && (properties_reordered[i*numAttr +j]== props[j] | props[j]==255)) j++; 
304 |                         if (j==numAttr){
305 |                             Candidates[seen]=i; 
306 |                             seen++;
307 |                         }
308 |                     }
309 |                     checkremaining = false;
310 |                     break;
311 |                 }
312 |             }
313 |             if (checkremaining == true){
314 |                 for (int i =counts[bin+ treelen]; i< counts[bin+ treelen+1] && seen<max_num_distances; i++){
315 |                         //check if constraint statisfies
316 |                         int j =0;
317 |                        while (j<numAttr && (properties_reordered[i*numAttr +j]== props[j] | props[j]==255)) j++; 
318 |                         if (j==numAttr){
319 |                             Candidates[seen]=i; 
320 |                             seen++;
321 |                         }
322 |                 }
323 |             }
324 |         }
325 |     }
326 |     else if (querymode == "noAttribute"){
327 |         while(seen<max_num_distances && seenbin<nc){ 
328 |             uint32_t bin = simid[seenbin];
329 |             seenbin++; // not if we are probing multiple subbins, in case of varying #attrs
330 |             bin = bin*(treelen+1);
331 |             // go through each sub partition
332 |             for (uint16_t u=0;u<treelen; u++){
333 |                 for (int i =counts[bin+u]; i< counts[bin+u+1] && seen<max_num_distances; i++){
334 |                     Candidates[seen]=i; 
335 |                     seen++;
336 |                 }
337 |             }  
338 |         }
339 |     }
340 |     else cout << "querymode got a different val somehow"<<endl;
341 |     t4 = chrono::high_resolution_clock::now();
342 |     // NN distance computations
343 |     float maxk;
344 |     if (seen<num_results+1){
345 |         for (int i =0; i< seen; i++){ 
346 |             neighbor_set[qnum*num_results+ i] = Lookup[Candidates[i]];
347 |         }
348 |     }
349 |     else{
350 |         for (int i =0; i< seen; i++){
351 |             score[i] = -L2SIMD4ExtAVX(query, dataset_reordered +Candidates[i]*d, data_norms_reordered[Candidates[i]], d);
352 |         }
353 |         for (int i =0; i< num_results; i++){ 
354 |             Candidates_pq.push({score[i],Candidates[i]});
355 |         }
356 |         maxk = Candidates_pq.top().first;
357 |         for (int i =num_results; i< seen; i++){ 
358 |             if (score[i]< maxk){
359 |                 maxk = Candidates_pq.top().first;
360 |                 Candidates_pq.pop();
361 |                 Candidates_pq.push({score[i], Candidates[i]});
362 |             }
363 |         }
364 |         for (int i =0; i< num_results; i++){ 
365 |             neighbor_set[qnum*num_results+ i] = Lookup[Candidates_pq.top().second];
366 |             Candidates_pq.pop();
367 |         }
368 |     }
369 |     t5 = chrono::high_resolution_clock::now();
370 |     // cout<<"time: "<<chrono::duration_cast<chrono::nanoseconds>(t2 - t1).count()<<" ";
371 |     // cout<<chrono::duration_cast<chrono::nanoseconds>(t2_1 - t2).count()<<" ";
372 |     // cout<<chrono::duration_cast<chrono::nanoseconds>(t3 - t2_1).count()<<" ";
373 |     // cout<<chrono::duration_cast<chrono::nanoseconds>(t4 - t3).count()<<" ";
374 |     // cout<<chrono::duration_cast<chrono::nanoseconds>(t5 - t4).count()<<endl;    
375 | }
376 | 
377 | // TODO
378 | // 1) change dtype of the properties (uint16_t/uint32_t/uint8_t), based on the vocab size
379 | 


--------------------------------------------------------------------------------
/src/FilterIndexHammingThNprobe.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <sys/stat.h>
  3 | #include "FilterIndex.h"
  4 | 
  5 | using namespace std;
  6 | 
  7 | template <typename S>
  8 | // operator to print vectors, matrix
  9 | ostream& operator<<(ostream& os, const vector<S>& vector){
 10 |     // Printing vector elements using <<
 11 |     for (auto element : vector) {
 12 |         os << element << " ";
 13 |     }
 14 |     return os;
 15 | }
 16 | 
 17 | template <typename S>
 18 | ostream& operator<<(ostream& os, const vector<vector<S>>& matrix){
 19 |     // Printing matrix elements using <<
 20 |     for (const vector<S>& vector : matrix) {
 21 |         for (auto element : vector) {
 22 |             os << element << " ";
 23 |         }
 24 |     os << endl;
 25 |     }
 26 |     return os;
 27 | }
 28 | 
 29 | FilterIndex::FilterIndex(float* data, size_t d_, size_t nb_, size_t nc_, vector<vector<string>>properties_, string algo, int mode){
 30 |     dataset = data; // data
 31 |     d =d_; // dim
 32 |     nb = nb_; //num data points
 33 |     nc = nc_; // num clusters
 34 |     treelen = 20; //length of truncated huffman tree, num miniclusters= treelen+1
 35 |     // cin >> treelen;
 36 |     if (algo=="kmeans") {
 37 |         clusterAlgo = new Kmeans(nc, d); //dynamic allocation of object class
 38 |     }
 39 |     else if (algo=="bliss") {
 40 |         clusterAlgo = new BLISS(d, 256, nc);
 41 |         }
 42 |     else {
 43 |         cout<<"clustering unrecognised. Choosing Faiss-Kmeans as default"<<endl;
 44 |         clusterAlgo= new Kmeans(nc, d);
 45 |     }
 46 |     properties.resize(nb);
 47 |     uint8_t cnt=0;
 48 |     for (int i=0; i<nb; i++){
 49 |         for (string prp: properties_[i]){
 50 |             if (prLook.count(prp)==0){
 51 |                 prLook[prp]=cnt;
 52 |                 cnt++;
 53 |             }
 54 |             properties[i].push_back(prLook[prp]);
 55 |         }
 56 |         // beware:: sorting the properties will loose the position information!!
 57 |         // sort(properties[i].begin(), properties[i].end()); 
 58 |     }  
 59 |     cout<<unsigned(cnt)<<" total unique constraints"<<endl; 
 60 |     if (cnt>254) cout<<"change attribute token precision to uint16_t"<<endl;
 61 |     if (cnt>65534) cout<<"change attribute token precision to uint32_t"<<endl;
 62 |     // Properties to location map
 63 |     numAttr = properties[0].size();
 64 |     for (int i=0; i<nb; i++){
 65 |         for (int j=0; j<numAttr; j++){
 66 |             PrpAtrMap[properties[i][j]] = j;
 67 |         }
 68 |     }
 69 | }
 70 | 
 71 | //NN index
 72 | void FilterIndex::get_index(string metric, string indexpath, int mode){
 73 |     //to save index files
 74 |     mkdir(indexpath.c_str(), 0777);
 75 |     data_norms = new float[nb]{0};
 76 |     Lookup= new uint32_t[nb];
 77 |     counts = new uint32_t[nc+1]{0};
 78 | 
 79 |     clusterAlgo->train(dataset, nb, indexpath); //take the properties for Mode 3 bliss
 80 |     for(uint32_t j = 0; j < nb; ++j){  
 81 |         data_norms[j]=0;  //was a bug here, now removed         
 82 |         for(uint32_t k = 0; k < d; ++k) {   
 83 |             data_norms[j] += dataset[j*d +k]*dataset[j*d +k];        
 84 |         } 
 85 |         data_norms[j]=data_norms[j]/2;
 86 |     }
 87 |     
 88 |     uint32_t* invLookup = new uint32_t[nb];
 89 |     //get best score cluster
 90 |     #pragma omp parallel for  
 91 |     for(uint32_t i = 0; i < nb; ++i) {  
 92 |         invLookup[i] = clusterAlgo->top(dataset+ i*d);   
 93 |     }
 94 |      for(uint32_t i = 0; i < nb; ++i) {
 95 |         counts[invLookup[i]+1] = counts[invLookup[i]+1]+1; // 0 5 4 6 3
 96 |     }
 97 |     for(uint32_t j = 1; j < nc+1; ++j) {
 98 |         counts[j] = counts[j]+ counts[j-1]; //cumsum 
 99 |     }
100 | 
101 |     //argsort invLookup to get the Lookup
102 |     iota(Lookup, Lookup+nb, 0);
103 |     stable_sort(Lookup, Lookup+nb, [&invLookup](size_t i1, size_t i2) {return invLookup[i1] < invLookup[i2];});
104 |     // get_mc_propertiesIndex(); // this will change counts, Lookup; and add maxMC tree
105 |     
106 |     FILE* f3 = fopen((indexpath+"/dataNorms.bin").c_str(), "wb");
107 |     fwrite(data_norms, sizeof(float), nb, f3);
108 |     fclose (f3);
109 |     FILE* f4 = fopen((indexpath+"/Lookup.bin").c_str(), "wb");
110 |     fwrite(Lookup, sizeof(uint32_t), nb, f4);
111 |     fclose (f4);
112 |     FILE* f5 = fopen((indexpath+"/counts.bin").c_str(), "wb");
113 |     fwrite(counts, sizeof(uint32_t), nc*(treelen+1)+1, f5);
114 |     fclose (f5);
115 | }   
116 | 
117 | void FilterIndex::get_mc_propertiesIndex(){
118 |     vector<vector<uint32_t>> maxMCIDs; //nested array to store the mini-clusters, change to uint32_t array later
119 |     maxMCIDs.resize((treelen+1)*nc);
120 |     maxMC = new uint16_t[3*(treelen+1)*nc]; 
121 |     for (int clID = 0; clID < nc; clID++){
122 |         if (counts[clID+1]- counts[clID]==0) continue; // what  if the cluster size is less than 4. Do something then
123 |         //get count of vector properties        
124 |         //get the max
125 |         for (int h=0; h<treelen; h++){ //iterate over tree height
126 |             unordered_map<uint16_t, int> freq; //property to frequency map
127 |             for (int i =counts[clID]; i< counts[clID+1]; i++){ // for all points in the cluster
128 |                 for (uint16_t x: properties[Lookup[i]]){
129 |                     if(not_in(x, maxMC + (treelen+1)*clID*3, h)) freq[x]++; // count number of items with the property x 
130 |                 }
131 |             }
132 |             int r = (treelen+1)*3*clID + 3*h;
133 |             //choose property with max freq
134 |             if (freq.end()== freq.begin()){
135 |                 maxMC[r+0]= 0; // property location
136 |                 maxMC[r+1]= 0; // property
137 |                 maxMC[r+2]= 0; // frequency
138 |             }
139 |             else{
140 |                 auto maxElement = max_element(freq.begin(), freq.end(),
141 |                                     [](const pair<uint16_t, int>& p1, const pair<uint16_t, int>& p2) { return p1.second < p2.second;});
142 |                 maxMC[r+0]= PrpAtrMap[maxElement->first]; // property location
143 |                 maxMC[r+1]= maxElement->first; // property
144 |                 maxMC[r+2]= maxElement->second; // frequency, do we need this in maxMC??
145 |             }
146 |         }
147 |         //maxMC serves as a node list and node data size in hamming tree, where
148 |         //node: property from maxMC
149 |         //node data: corresponding vector IDs
150 |         for (int i =counts[clID]; i< counts[clID+1]; i++){
151 |             for (int j=0; j< treelen; j++){
152 |                 int r = (treelen+1)*3*clID + 3*j;
153 |                 if((properties[Lookup[i]][maxMC[r]]==maxMC[r+1]) && (maxMC[r+2]>0)){ 
154 |                     maxMCIDs[(treelen+1)*clID +j].push_back(Lookup[i]);
155 |                     goto m_label;
156 |                 }
157 |             }
158 |             maxMCIDs[(treelen+1)*clID +treelen].push_back(Lookup[i]);
159 |             m_label:;
160 |         }
161 |     }
162 |     //need some assert statements
163 |     //update Lookup, counts. Flatten the maxMCIDs into Lookup
164 |     //each cluster now spans treelen+1 buckets
165 |     Lookup= new uint32_t[nb];
166 |     counts = new uint32_t[nc*(treelen+1)+1]{0}; 
167 |     for (int clID = 0; clID < nc; clID++){
168 |         for (int j=0; j< treelen+1; j++){
169 |             int id = clID*(treelen+1) +j;
170 |             counts[id+1] = counts[id]+ maxMCIDs[id].size();
171 |             memcpy(Lookup+ counts[id], maxMCIDs[id].data(), sizeof(*Lookup) * maxMCIDs[id].size());
172 |         }
173 |     }
174 | }
175 | 
176 | void FilterIndex::loadIndex(string indexpath){
177 |     data_norms = new float[nb]{0};
178 |     Lookup= new uint32_t[nb];
179 |     counts = new uint32_t[nc+1]; 
180 |     // counts = new uint32_t[nc*(treelen+1)+1]; 
181 |     // maxMC = new uint16_t[3*(treelen+1)*nc]; 
182 |     cout<<indexpath<<endl;
183 |     clusterAlgo->load(indexpath);
184 |     FILE* f3 = fopen((indexpath+"/dataNorms.bin").c_str(), "r");
185 |     fread(data_norms, sizeof(float), nb, f3);
186 |     FILE* f4 = fopen((indexpath+"/Lookup.bin").c_str(), "r");
187 |     fread(Lookup, sizeof(uint32_t), nb, f4);
188 |     FILE* f5 = fopen((indexpath+"/counts.bin").c_str(), "r");
189 |     fread(counts, sizeof(uint32_t), nc+1, f5);
190 |     get_mc_propertiesIndex();
191 |     cout<<"here1"<<endl;
192 |     //this changes Lookup
193 |     for (int i =0; i< nc*(treelen+1); i++){
194 |         int m1 = counts[i];
195 |         int m2 = counts[i+1]; // some miniclusters are empty!!
196 |         sort(Lookup+m1, Lookup+m2,
197 |             [&](uint32_t a, uint32_t b) {
198 |             return properties[a] < properties[b];
199 |             });
200 |     }
201 |     cout<<"here2"<<endl;
202 |     // reorder data and index
203 |     dataset_reordered = new float[nb*d];
204 |     data_norms_reordered = new float[nb];
205 |     properties_reordered = new uint8_t[nb*numAttr];
206 |     for(uint32_t i = 0; i < nb; ++i) {
207 |         copy(dataset+Lookup[i]*d, dataset+(Lookup[i]+1)*d , dataset_reordered+i*d);
208 |         data_norms_reordered[i] = data_norms[Lookup[i]];
209 |         memcpy(properties_reordered+ i*numAttr, properties[Lookup[i]].data(), sizeof(*properties_reordered) * numAttr);
210 |     }
211 |     cout<<"here3"<<endl;
212 |     delete dataset;
213 |     delete data_norms;
214 |     vector<vector<uint8_t>>().swap(properties);
215 | }
216 | 
217 | void FilterIndex::query(float* queryset, int nq, vector<vector<string>> queryprops, int num_results, int nprobe){
218 |     neighbor_set = new int32_t[nq*num_results]{-1};
219 |     cout<<"num queries: "<<nq<<", "<<"num max attributes: "<<numAttr<<endl;
220 |     for (size_t i = 0; i < nq; i++){
221 |         // run query
222 |         cout<<"query: "<<i<<endl;
223 |         findNearestNeighbor(queryset+(i*d), queryprops[i], num_results, nprobe, i);
224 |     }
225 | }
226 | 
227 | // start from best cluster -> choose minicluster -> bruteforce search
228 | void FilterIndex::findNearestNeighbor(float* query, vector<string> Stprops, int num_results, int nprobe, size_t qnum)
229 | {   
230 |     chrono::time_point<chrono::high_resolution_clock> t1, t2,t2_1, t3, t4, t5, t6;
231 |     t1 = chrono::high_resolution_clock::now();
232 |     uint8_t props[numAttr];
233 |     uint8_t countX = 0; // count of X (missing attributes) in query props
234 |     string querymode = "varying";
235 |     for (size_t j =0; j<numAttr ;j++){
236 |         if (Stprops[j] !="X") props[j]= (prLook[Stprops[j]]);
237 |         else {
238 |             countX++;
239 |             props[j]=255; // this should be a key which is not seen before
240 |         }
241 |     }
242 |     if (countX==numAttr) querymode = "noAttribute";
243 |     else if (countX==0) querymode = "fixed";
244 |     t2 = chrono::high_resolution_clock::now();
245 |     
246 |     // sort(props.begin(), props.end());
247 |     priority_queue<pair<float, uint32_t> > pq;
248 |     uint32_t simid[nc];
249 |     float simv[nc];
250 |     clusterAlgo->getscore(query, simv);
251 |     t2_1 = chrono::high_resolution_clock::now();
252 |     // need argsorted IDs
253 |     iota(simid, simid+nc, 0);
254 |     // good if nprobe is << nc
255 |     if (nprobe< nc/10)
256 |         nth_element(simid, simid + nprobe, simid + nc, [&simv](size_t i1, size_t i2) { return simv[i1] > simv[i2]; });
257 |     else
258 |         sort(simid, simid+nc, [&simv](size_t i1, size_t i2) {return simv[i1] > simv[i2];});
259 | 
260 |     priority_queue<pair<float, uint32_t> > Candidates_pq;
261 |     cout<<querymode<<endl;
262 |     uint32_t Candidates[counts[nprobe]];
263 |     float score[counts[nprobe]];
264 |     int seen=0, seenbin=0;
265 |     float sim;
266 |     float a=0,b=0;
267 |     t3 = chrono::high_resolution_clock::now();
268 |     if (querymode == "fixed"){
269 |         while(seenbin<nprobe){ 
270 |             uint32_t bin = simid[seenbin];
271 |             seenbin++; // not if we are probing multiple subbins, in case of varying #attrs
272 |             int id = bin*(treelen+1);
273 |             bin = bin*(treelen+1);
274 | 
275 |             //get which sub-cluster query belongs to
276 |             uint16_t membership = getclusterPart(maxMC+ bin*3 , props, treelen);
277 |             bin = bin+membership;
278 |             for (int i =counts[bin]; i< counts[bin+1]; i++){
279 |                 // __builtin_prefetch (properties_reordered +(i+2)*numAttr, 0, 2); software prefect is not very useful here
280 |                 //check if constraint statisfies
281 |                 int j =0;
282 |                 while (j<numAttr && properties_reordered[i*numAttr +j]== props[j]) j++; // plus the number of empty attrs X
283 |                 if (j==numAttr){
284 |                     Candidates[seen]=i; 
285 |                     seen++;
286 |                 }
287 |             }
288 |         }
289 |     }
290 |     else if (querymode == "varying"){
291 |         cout<<nprobe<<endl;
292 |         while(seenbin<nprobe){ 
293 |             cout<<"seenbin :"<<seenbin<<endl;
294 |             uint32_t bin = simid[seenbin];
295 |             seenbin++; // not if we are probing multiple subbins, in case of varying #attrs
296 |             bin = bin*(treelen+1);
297 |             // go through each sub partition
298 |             bool checkremaining=true;
299 |             for (uint16_t u=0;u<treelen; u++){
300 |                 cout<<u<<" ";
301 |                 if (props[maxMC[u*3+0]]==255){
302 |                     for (int i =counts[bin+u]; i< counts[bin+u+1]; i++){
303 |                         //check if constraint statisfies
304 |                         int j =0;
305 |                         while (j<numAttr && (properties_reordered[i*numAttr +j]== props[j] | props[j]==255)) j++; 
306 |                         if (j==numAttr){
307 |                             Candidates[seen]=i; 
308 |                             seen++;
309 |                         }
310 |                     }
311 |                 }
312 |                 else if (maxMC[u*3+1] == props[maxMC[u*3+0]]){
313 |                     for (int i =counts[bin+u]; i< counts[bin+u+1]; i++){
314 |                         //check if constraint statisfies
315 |                         int j =0;
316 |                         while (j<numAttr && (properties_reordered[i*numAttr +j]== props[j] | props[j]==255)) j++; 
317 |                         if (j==numAttr){
318 |                             Candidates[seen]=i; 
319 |                             seen++;
320 |                         }
321 |                     }
322 |                     checkremaining = false;
323 |                     break;
324 |                 }
325 |             }
326 |             if (checkremaining == true){
327 |                 cout<<"checkremaining"<<" ";
328 |                 for (int i =counts[bin+ treelen]; i< counts[bin+ treelen+1]; i++){
329 |                         //check if constraint statisfies
330 |                         int j =0;
331 |                        while (j<numAttr && (properties_reordered[i*numAttr +j]== props[j] | props[j]==255)) j++; 
332 |                         if (j==numAttr){
333 |                             Candidates[seen]=i; 
334 |                             seen++;
335 |                         }
336 |                 }
337 |             }
338 |             cout<<endl;
339 |         }
340 |     }
341 |     else if (querymode == "noAttribute"){
342 |         while(seenbin<nprobe){ 
343 |             uint32_t bin = simid[seenbin];
344 |             seenbin++; // not if we are probing multiple subbins, in case of varying #attrs
345 |             bin = bin*(treelen+1);
346 |             // go through each sub partition
347 |             for (uint16_t u=0;u<treelen; u++){
348 |                 for (int i =counts[bin+u]; i< counts[bin+u+1]; i++){
349 |                     Candidates[seen]=i; 
350 |                     seen++;
351 |                 }
352 |             }  
353 |         }
354 |     }
355 |     else cout << "querymode got a different val somehow"<<endl;
356 |     t4 = chrono::high_resolution_clock::now();
357 |     // NN distance computations
358 |     float maxk;
359 |     cout<<"end of getting candidates "<<endl;
360 |     if (seen<num_results+1){
361 |         for (int i =0; i< seen; i++){ 
362 |             neighbor_set[qnum*num_results+ i] = Lookup[Candidates[i]];
363 |         }
364 |     }
365 |     else{
366 |         for (int i =0; i< seen; i++){
367 |             score[i] = -L2SIMD4ExtAVX(query, dataset_reordered +Candidates[i]*d, data_norms_reordered[Candidates[i]], d);
368 |         }
369 |         for (int i =0; i< num_results; i++){ 
370 |             Candidates_pq.push({score[i],Candidates[i]});
371 |         }
372 |         maxk = Candidates_pq.top().first;
373 |         for (int i =num_results; i< seen; i++){ 
374 |             if (score[i]< maxk){
375 |                 maxk = Candidates_pq.top().first;
376 |                 Candidates_pq.pop();
377 |                 Candidates_pq.push({score[i], Candidates[i]});
378 |             }
379 |         }
380 |         for (int i =0; i< num_results; i++){ 
381 |             neighbor_set[qnum*num_results+ i] = Lookup[Candidates_pq.top().second];
382 |             Candidates_pq.pop();
383 |         }
384 |     }
385 |     
386 |     t5 = chrono::high_resolution_clock::now();
387 |     // cout<<"time: "<<chrono::duration_cast<chrono::nanoseconds>(t2 - t1).count()<<" ";
388 |     // cout<<chrono::duration_cast<chrono::nanoseconds>(t2_1 - t2).count()<<" ";
389 |     // cout<<chrono::duration_cast<chrono::nanoseconds>(t3 - t2_1).count()<<" ";
390 |     // cout<<chrono::duration_cast<chrono::nanoseconds>(t4 - t3).count()<<" ";
391 |     // cout<<chrono::duration_cast<chrono::nanoseconds>(t5 - t4).count()<<endl;    
392 | }
393 | 
394 | // TODO
395 | // 1) change dtype of the properties (uint16_t/uint32_t/uint8_t), based on the vocab size
396 | 


--------------------------------------------------------------------------------
/src/index.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include "FilterIndex.h"
 4 | 
 5 | #define DATAPATH "/scratch/gg29/data/"
 6 | int main(int argc, char** argv)
 7 | {   
 8 |     string basepath, labelpath, indexpath;
 9 |     //default
10 |     string metric = "L2";
11 |     int mode = 1;
12 |     string algo ="kmeans";
13 |     size_t nc =1024;
14 | 
15 |     int success = argparser(argc, argv, &basepath, &labelpath, &indexpath, &nc, &algo, &mode);
16 | 
17 |     size_t d, nb; 
18 |     float* data = fvecs_read(basepath.c_str(), &d, &nb);
19 |     vector<vector<string>> properties = getproperties(labelpath,' ');
20 |     cout << "Data files read" << endl;
21 |     chrono::time_point<chrono::high_resolution_clock> t1, t2;
22 |     t1 = chrono::high_resolution_clock::now();
23 |     FilterIndex myFilterIndex(data, d, nb, nc, properties, algo, mode);
24 |     myFilterIndex.get_index(metric, indexpath, mode);
25 |     // t2 = chrono::high_resolution_clock::now();
26 |     // cout<<"Index time: "<<chrono::duration_cast<chrono::nanoseconds>(t2 - t1).count()/1000000000<<endl;
27 |     // cout << "Indexed at: " << indexpath << endl;
28 |     return 0;
29 | }
30 | 


--------------------------------------------------------------------------------
/src/libfaiss.a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gaurav16gupta/constrainedANN/61eae4c56dc0d6c9906bee608cecbf23b11e0260/src/libfaiss.a


--------------------------------------------------------------------------------
/src/query.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include "FilterIndex.h"
 4 | 
 5 | // #define DATAPATH "/scratch/gg29/data/"
 6 | 
 7 | int main(int argc, char** argv)
 8 | {
 9 |     //default
10 |     string metric = "L2";
11 |     int mode = 0;
12 |     string algo ="kmeans";
13 |     size_t nc =0;
14 |     // size_t buffer_size =0;
15 |     size_t nprobe =0;
16 | 
17 |     size_t d, nb, nq, num_results; 
18 |     string datapath, Attripath, querypath, queryAttripath, indexpath, GTpath;
19 |     int success = argparser(argc, argv, &datapath, &Attripath, &querypath, &queryAttripath, &indexpath, &GTpath, &nc, &algo, &mode, &nprobe);
20 | 
21 |     float* data = fvecs_read(datapath.c_str(), &d, &nb);
22 |     vector<vector<string>> properties = getproperties(Attripath,' ');
23 |     // nc = atoi(argv[2]); // num clusters
24 |     FilterIndex myFilterIndex(data, d, nb, nc, properties, algo, mode);
25 |     myFilterIndex.loadIndex(indexpath);
26 |     cout << "Loaded" << endl;
27 | 
28 |     float* queryset = fvecs_read(querypath.c_str(), &d, &nq);
29 |     vector<vector<string>> queryprops = getproperties(queryAttripath,' ');
30 |     int* queryGTlabel = ivecs_read(GTpath.c_str(), &num_results, &nq);
31 |     cout << "Query files read..." << endl;
32 |     // nq = 10000;
33 |     chrono::time_point<chrono::high_resolution_clock> t1, t2;
34 |     t1 = chrono::high_resolution_clock::now();
35 |     myFilterIndex.query(queryset, nq, queryprops, num_results, nprobe);
36 |     t2 = chrono::high_resolution_clock::now();
37 |     std::chrono::duration<double> diff = t2 - t1;
38 | 
39 |     int32_t* output = myFilterIndex.neighbor_set;
40 |     int output_[num_results*nq];
41 |     copy(output, output+num_results*nq , output_);
42 |     cout<<"numClusters, buffersize, QPS, Recall100@100 :"<<endl;
43 |     //QPS and recall
44 |     double QPS;
45 |     double recall = RecallAtK(queryGTlabel, output_, num_results, nq);
46 |     printf("%d,%d,%f,%f\n",nc, nprobe, nq/diff.count(), recall);
47 | }
48 | 
49 | 


--------------------------------------------------------------------------------
/src/readfile.cpp:
--------------------------------------------------------------------------------
  1 | #include <cassert>
  2 | #include "readfile.h"
  3 | #include <sys/stat.h>
  4 | //#include "mnist/mnist_reader.hpp"
  5 | 
  6 | using namespace std;
  7 | 
  8 | vector<vector<string>> getproperties(string fileName, char dlim)
  9 | {
 10 |     vector<vector<string>> queryproperties;
 11 |     ifstream infile(fileName);
 12 |     string s;
 13 |     getline(infile, s);
 14 |     while (infile)
 15 |     {
 16 |         if (!getline(infile, s)) break;
 17 |         istringstream ss(s);
 18 |         vector<string> property;
 19 | 
 20 |         while (ss)
 21 |         {
 22 |             string st;
 23 |             if (!getline( ss, st, dlim )) break;
 24 |             property.push_back( st );
 25 |         }
 26 | 
 27 |         queryproperties.push_back(property);
 28 |     }
 29 |     if (!infile.eof())
 30 |     {
 31 |         cout << "Something is off\n";
 32 |     }
 33 |     return queryproperties;    
 34 | }
 35 | 
 36 | vector<vector<int>> coordinates(string fileName)
 37 | {
 38 |     vector<vector<int>> coordinates;
 39 |     ifstream infile(fileName);
 40 | 
 41 |     while (infile)
 42 |     {
 43 |         string s;
 44 |         if (!getline(infile, s)) break;
 45 | 
 46 |         istringstream ss(s);
 47 |         vector<int> coord;
 48 | 
 49 |         while (ss)
 50 |         {
 51 |             string s;
 52 |             if (!getline( ss, s, ',' )) break;
 53 |             coord.push_back( stoi(s) );
 54 |         }
 55 | 
 56 |         coordinates.push_back(coord);
 57 |     }
 58 |     if (!infile.eof())
 59 |     {
 60 |         cout << "Something is off\n";
 61 |     }
 62 |     return coordinates;    
 63 | }
 64 | 
 65 | //write nested vector
 66 | // void vectorWrite(const char* fname, vector<vector<uint16_t>> vec) {
 67 | //     for (int i=0; i<vec.size(), i++){
 68 | //         uint16_t d= vec[i].size();
 69 | 
 70 | //         for (const auto& element: vec) { 
 71 | //         file << element << " "; 
 72 | //     } 
 73 | //     }
 74 | //     FILE* f1 = fopen("centroids.bin", "w");
 75 | // fwrite(centroids, sizeof(float), nc*d, f1);
 76 | 
 77 | // }
 78 | 
 79 | 
 80 | //write dict unordered_map
 81 | // void mapWrite(const char* fname, unordered_map<uint16_t, vector<uint32_t>> inverted_index) {
 82 | 
 83 | // }
 84 | 
 85 | 
 86 | /*****************************************************
 87 |  * I/O functions for fvecs and ivecs
 88 |  *****************************************************/
 89 | 
 90 | float* fvecs_read(const char* fname, size_t* d_out, size_t* n_out) {
 91 |     FILE* f = fopen(fname, "r");
 92 |     if (!f) {
 93 |         fprintf(stderr, "could not open %s\n", fname);
 94 |         perror("");
 95 |         abort();
 96 |     }
 97 |     int d;
 98 |     fread(&d, 1, sizeof(int), f);
 99 |     assert((d > 0 && d < 1000000) || !"unreasonable dimension");
100 |     fseek(f, 0, SEEK_SET);
101 |     struct stat st;
102 |     fstat(fileno(f), &st);
103 |     size_t sz = st.st_size;
104 |     assert(sz % ((d + 1) * 4) == 0 || !"weird file size");
105 |     size_t n = sz / ((d + 1) * 4);
106 | 
107 |     *d_out = d;
108 |     *n_out = n;
109 |     float* x = new float[n * (d + 1)];
110 |     size_t nr = fread(x, sizeof(float), n * (d + 1), f);
111 |     assert(nr == n * (d + 1) || !"could not read whole file");
112 | 
113 |     // shift array to remove row headers
114 |     for (size_t i = 0; i < n; i++)
115 |         memmove(x + i * d, x + 1 + i * (d + 1), d * sizeof(*x));
116 | 
117 |     fclose(f);
118 |     return x;
119 | }
120 | 
121 | // not very clean, but works as long as sizeof(int) == sizeof(float)
122 | int* ivecs_read(const char* fname, size_t* d_out, size_t* n_out) {
123 |     return (int*)fvecs_read(fname, d_out, n_out);
124 | }


--------------------------------------------------------------------------------
/src/utils.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "utils.h"
  3 | #include <bits/stdc++.h>
  4 | 
  5 | using namespace std;
  6 | 
  7 | int argparser(int argc, char** argv, string* basepath, string* labelpath, string* indexpath, size_t* nc, string* algo, int* mode){
  8 |     if (argc < 4){
  9 |         std::clog<<"Usage: "<<std::endl; 
 10 |         std::clog<<"./index <data> <properties> <outfile>";
 11 |         std::clog<<" [--Nc num_clusters] [--Algo method] [--mode method_version]"<<std::endl;
 12 | 
 13 |         std::clog<<"Positional arguments: "<<std::endl;
 14 |         std::clog<<"\t data: Filename pointing to an fvecs file (4 byte uint N, then list of  4 byte uint dim, then list of 32-bit little-endian floats)."<<std::endl;
 15 |         std::clog<<"\t properties: Filename pointing to a properties file (text file containing <num points> <num attributes> <newline> whitespace-separated property lists)"<<std::endl;
 16 |         // std::clog<<"\t space: Integer distance ID: 0 for L2 distance, 1 for inner product (angular distance)."<<std::endl;
 17 |         std::clog<<"\t outfile: folder for the index. Example : sift1024blissMode1, sift1024kmeans"<<std::endl;
 18 | 
 19 |         std::clog<<"Optional arguments: "<<std::endl;
 20 |         std::clog<<"\t [--Nc num_clusters]: (Optional, default 1024) Number of clusters/bins."<<std::endl;
 21 |         std::clog<<"\t [--Algo method]: (Optional, default kmeans) MEthod used for clustering. Use either bliss or kmeans"<<std::endl;
 22 |         std::clog<<"\t [--mode method_version]: (Optional, default 1) Only for bliss, use either 1,2 or 3. 1: embedding input - ANN labels, 2: embedding input - FilterANN labels, 3: embedding+Attribute input - FilterANN labels"<<std::endl;
 23 |         return -1;
 24 |     }
 25 | 
 26 |     // Positional arguments.
 27 |     *basepath = (string(argv[1])); 
 28 |     *labelpath = (string(argv[2]));
 29 |     *indexpath = (string(argv[3]));
 30 |     *nc = std::atoi(argv[4]);
 31 |     *algo = std::string(argv[5]);
 32 |     *mode = std::atoi(argv[6]);
 33 |     return 0;
 34 | }
 35 | int argparser(int argc, char** argv, string* basepath, string* labelpath, string* querypath, string* queryAttripath, string* indexpath, string* GTpath, size_t* nc, string* algo, int* mode, size_t* buffer_size){
 36 |     if (argc < 7){
 37 |         std::clog<<"Usage: "<<std::endl; 
 38 |         std::clog<<"./index <data> <properties> <queries> <queryProperties> <index> <groundtruth>";
 39 |         std::clog<<" [--Nc num_clusters] [--Algo method] [--mode method_version]"<<std::endl;
 40 | 
 41 |         std::clog<<"Positional arguments: "<<std::endl;
 42 |         std::clog<<"\t data: Filename pointing to an fvecs file (4 byte uint N, then list of  4 byte uint dim, then list of 32-bit little-endian floats)."<<std::endl;
 43 |         std::clog<<"\t properties: Filename pointing to a properties file (text file containing <num points> <num attributes> <newline> whitespace-separated property lists)"<<std::endl;
 44 |         std::clog<<"\t queries: file for the queries."<<std::endl;
 45 |         std::clog<<"\t queryProperties: file for the queryProperties."<<std::endl;;
 46 |         std::clog<<"\t index: folder for the index. Example : sift1024blissMode1, sift1024kmeans"<<std::endl;
 47 |         std::clog<<"\t groundtruth: file for the groundtruth."<<std::endl;
 48 | 
 49 |         std::clog<<"Optional arguments: "<<std::endl;
 50 |         std::clog<<"\t [--Nc num_clusters]: (Optional, default 1024) Number of clusters/bins."<<std::endl;
 51 |         std::clog<<"\t [--Algo method]: (Optional, default kmeans) MEthod used for clustering. Use either bliss or kmeans"<<std::endl;
 52 |         std::clog<<"\t [--mode method_version]: (Optional, default 1) Only for bliss, use either 1,2 or 3. 1: embedding input - ANN labels, 2: embedding input - FilterANN labels, 3: embedding+Attribute input - FilterANN labels"<<std::endl;
 53 |         std::clog<<"\t [--Bf BufferSize]: (Optional, default 500) Number of distance computations."<<std::endl;
 54 | 
 55 |         return -1;
 56 |     }
 57 | 
 58 |     // Positional arguments.
 59 |     *basepath = (string(argv[1])); 
 60 |     *labelpath = (string(argv[2]));
 61 |     *querypath = (string(argv[3]));
 62 |     *queryAttripath = (string(argv[4])); 
 63 |     *indexpath = (string(argv[5]));
 64 |     *GTpath = (string(argv[6]));
 65 |     *nc = std::atoi(argv[7]);
 66 |     *algo = std::string(argv[8]);
 67 |     *mode = std::atoi(argv[9]);
 68 |     *buffer_size = std::atoi(argv[10]);
 69 |     return 0;
 70 | }
 71 | 
 72 | double RecallAtK(int* answer, int* guess, size_t k, size_t nq){
 73 |     uint32_t count = 0;
 74 |     for (int i=0;i<nq;i++){
 75 |         sort(answer+ k*i, answer + (i+1)*k);
 76 |         sort(guess+ k*i, guess+ (i+1)*k);
 77 |         std::vector<int> tmp;
 78 |         std::set_intersection(answer+ k*i, answer + (i+1)*k,  // Input iterators for first range 
 79 |                             guess+ k*i, guess+ (i+1)*k, // Input iterators for second range 
 80 |                             std::back_inserter(tmp));
 81 |         count += double(tmp.size());
 82 |     }
 83 |     return (count/double(nq*k));
 84 | }
 85 | 
 86 | float IP(float* a, float* b, size_t d){
 87 |     float ip=0;
 88 |     for(uint32_t k = 0; k < d; ++k) {    
 89 |         ip += a[k]*b[k]; // one unit FLOP- mul
 90 |     } 
 91 |     return ip;
 92 | }
 93 | 
 94 | double L2sim(float* a, float* b, float norm_bsq, size_t d){
 95 |     return (IP(a, b, d) -norm_bsq);
 96 | }
 97 | 
 98 | double L2Square(float* a, float* b, size_t d){
 99 |     double dist=0;
100 |     for(uint32_t k = 0; k < d; ++k) {    
101 |         dist += pow(a[k]-b[k],2); // two units FLOPS- mul and sub
102 |     } 
103 |     return dist;
104 | }
105 | 
// Returns the sum of squares of the first d components of a. Each square is
// formed in float and then added to a double accumulator.
double L2normSquare(float* a, size_t d){
    double total = 0;
    uint32_t pos = 0;
    while (pos < d) {
        total += a[pos] * a[pos];  // one multiply and one add per component
        ++pos;
    }
    return total;
}
113 | 
114 | float IPSIMD4ExtAVX(float *pVect1, float *pVect2, size_t qty) {
115 |     float PORTABLE_ALIGN32 TmpRes[8];
116 |     size_t qty16 = qty / 16;
117 |     size_t qty4 = qty / 4;
118 |     const float *pEnd1 = pVect1 + 16 * qty16;
119 |     const float *pEnd2 = pVect1 + 4 * qty4;
120 |     __m256 sum256 = _mm256_set1_ps(0);
121 |     while (pVect1 < pEnd1) {
122 |         //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
123 |         __m256 v1 = _mm256_loadu_ps(pVect1);
124 |         pVect1 += 8;
125 |         __m256 v2 = _mm256_loadu_ps(pVect2);
126 |         pVect2 += 8;
127 |         sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
128 | 
129 |         v1 = _mm256_loadu_ps(pVect1);
130 |         pVect1 += 8;
131 |         v2 = _mm256_loadu_ps(pVect2);
132 |         pVect2 += 8;
133 |         sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
134 |     }
135 | 
136 |     __m128 v1, v2;
137 |     __m128 sum_prod = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
138 |     while (pVect1 < pEnd2) {
139 |         v1 = _mm_loadu_ps(pVect1);
140 |         pVect1 += 4;
141 |         v2 = _mm_loadu_ps(pVect2);
142 |         pVect2 += 4;
143 |         sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
144 |     }
145 |     _mm_store_ps(TmpRes, sum_prod);
146 |     return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
147 | }
148 | 
149 | float L2SIMD4ExtAVX(float *pVect1, float *pVect2, float norm_bsq, size_t qty) {
150 |     return (IPSIMD4ExtAVX(pVect1, pVect2, qty) -norm_bsq);
151 | }
152 | 
// Sums weight[prop[i]] for the first na entries of prop, i.e. each entry of
// prop selects one element of weight. Returns 0 when na is zero or negative.
float spaseMul(uint8_t* prop, float* weight ,int na){
    float sum=0;
    // Signed index: comparing an unsigned index against a negative na would
    // turn na into a huge bound and run far past the end of prop.
    for (int i=0; i<na; i++){
        sum+=weight[prop[i]];
    }
    return sum;
}
160 | 
161 | // float IPSIMD16ExtAVX512(float *pVect1,  float *pVect2,  size_t qty) {
162 | //     float PORTABLE_ALIGN64 TmpRes[16];
163 | //     size_t qty16 = qty / 16;
164 | //     const float *pEnd1 = pVect1 + 16 * qty16;
165 | //     __m512 sum512 = _mm512_set1_ps(0);
166 | //     while (pVect1 < pEnd1) {
167 | //         //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
168 | //         __m512 v1 = _mm512_loadu_ps(pVect1);
169 | //         pVect1 += 16;
170 | //         __m512 v2 = _mm512_loadu_ps(pVect2);
171 | //         pVect2 += 16;
172 | //         sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));
173 | //     }
174 | //     _mm512_store_ps(TmpRes, sum512);
175 | //     float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15];
176 | //     return sum;
177 | // }
178 | 
179 | // float L2SIMD16ExtAVX512(float *pVect1, float *pVect2, float norm_bsq, size_t qty) {
180 | //     return (IPSIMD16ExtAVX512(pVect1, pVect2, qty) -norm_bsq);
181 | // }
182 | 
183 | 
// Finds the first entry of maxMC that matches props.
// maxMC holds treelen triples laid out as (property location, property,
// frequency). Entry i matches when props[location] equals its property value.
// Returns the index of the first matching entry, or treelen when none match.
uint16_t getclusterPart(uint16_t* maxMC, uint8_t* props, int treelen){
    uint16_t level = 0;
    while (level < treelen) {
        const uint16_t* entry = maxMC + level*3;
        const uint16_t location = entry[0];
        const uint16_t wanted = entry[1];
        if (wanted == props[location]) {
            return level;
        }
        level++;
    }
    return treelen;
}
193 | 
// Reports whether property x is absent from the first h entries of a.
// a is laid out as triples (property location, property, frequency); only the
// property slot of each triple is compared. Returns true when no entry holds
// x, which is always the case for h == 0.
bool not_in(uint16_t x, uint16_t* a, int h){
    if (h == 0) {
        return true;
    }
    uint16_t entry = 0;
    while (entry < h) {
        if (a[entry*3+1] == x) {
            return false;
        }
        entry++;
    }
    return true;
}
207 | 
// Fills v with the u-l consecutive integers l, l+1, ..., u-1 (u itself is not
// included) and then shuffles them with a Mersenne Twister seeded from
// std::random_device. v must have room for u-l ints.
void randomShuffle(int* v , int l, int u){
    int* const first = v;
    int* const last = v + u - l;
    iota(first, last, l);
    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::shuffle(first, last, engine);
}
215 | 
216 | 
// Brute-force k-nearest-neighbour search by Euclidean distance.
//
//   query     d floats
//   vectors   row-major array with d floats per row
//   N         number of rows in vectors
//   idx       candidate row ids; ignored when idxSize == N
//   idxSize   number of candidates; when equal to N every row 0..N-1 is scanned
//   k         number of neighbours wanted
//   topkDist  receives the distances, but it is passed by value, so the caller
//             never sees them
//
// Returns the ids of the nearest candidates ordered from farthest to nearest.
// The result has k entries, or fewer when fewer than k candidates exist.
vector<uint32_t> argTopK(float* query, float* vectors, uint32_t d, uint32_t N, vector<uint32_t> idx, uint32_t idxSize, uint32_t k, vector<float> topkDist){
    vector<uint32_t> topk;
    if (k == 0) {
        return topk;  // nothing requested; avoids top() on an empty heap below
    }

    // Max-heap on distance: the top is the worst of the best-k kept so far.
    priority_queue<pair<float, uint32_t> > pq;
    const bool scanAll = (idxSize == N);
    for (uint32_t i = 0; i < idxSize; i++){
        const uint32_t id = scanAll ? i : idx[i];
        // Widen before multiplying so id*d cannot wrap around in 32 bits.
        const float* row = vectors + (size_t)id * d;

        //L2
        float dist = 0;
        for (size_t j = 0; j < d; j++){
            dist += pow(row[j] - query[j], 2);
        }
        dist = sqrt(dist);

        //topk
        if (pq.size() < k) {
            pq.push({dist, id});
        }
        else if (dist < pq.top().first){
            pq.pop();
            pq.push({dist, id});
        }
    }

    // Drain whatever was collected; the heap holds at most k entries and may
    // hold fewer, so it must not be popped a fixed k times.
    while (!pq.empty()){
        topk.push_back(pq.top().second);
        topkDist.push_back(pq.top().first);
        pq.pop();
    }
    return topk;
}
270 | 


--------------------------------------------------------------------------------