├── CMakeLists.txt ├── HOW_TO_INSTALL.txt ├── HOW_TO_SEARCH.txt ├── HOW_TO_TRAINING.txt ├── LICENSE.TXT ├── README.md ├── ReleaseVersion ├── CQParameters.cpp ├── CQParameters.h ├── ClosureCluster.cpp ├── ClosureCluster.h ├── Cluster.cpp ├── Cluster.h ├── ClusterCommon.cpp ├── ClusterCommon.h ├── CompositeQuantization.cpp ├── CompositeQuantization.h ├── DataUtil.h ├── Dataset.h ├── Demo.cpp ├── Distance.h ├── Kmeans.cpp ├── Kmeans.h ├── NoConstraintCompositeQuantization.cpp ├── NoConstraintCompositeQuantization.h ├── PartitioningTree.cpp ├── PartitioningTree.h ├── ProductQuantization.cpp ├── ProductQuantization.h ├── Searcher.cpp ├── Searcher.h ├── config.txt ├── lbfgs.c ├── lbfgs.h └── lbfgslib │ └── lbfgs.lib ├── build_project.bat └── lbfgslib └── lbfgs.lib /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # CMake build script for CompositeQuantization under Win32 3 | # 4 | ########################################################################### 5 | # 6 | # start description 7 | cmake_minimum_required (VERSION 2.8) 8 | project (CompositeQuantization) 9 | 10 | ######################################################################### 11 | # 12 | # set open MP property yes 13 | find_package(OpenMP) 14 | if (OPENMP_FOUND) 15 | set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") 16 | set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 17 | MESSAGE ("SET OPENMP") 18 | endif() 19 | 20 | MESSAGE("current dir: ${CMAKE_CURRENT_LIST_DIR}") 21 | SET (Source_Path ${CMAKE_CURRENT_LIST_DIR}/ReleaseVersion) 22 | SET (Current_Path ${CMAKE_CURRENT_LIST_DIR}) 23 | 24 | 25 | # add blas (this is where you need to change to your own directory) 26 | SET(BLAS_DIR "${Current_Path}/include") 27 | SET(BLAS_LIB "${Current_Path}/mkllib" 28 | "${Current_Path}/compilerlib") 29 | 30 | 
######################################################################## 31 | # 32 | # let's divide binaries in groups, for comfort navigation 33 | SOURCE_GROUP(Utility FILES ${Source_Path}/DataUtil.h 34 | ${Source_Path}/Kmeans.h 35 | ${Source_Path}/Kmeans.cpp 36 | ${Source_Path}/CQParameters.h 37 | ${Source_Path}/CQParameters.cpp 38 | ${Source_Path}/lbfgs.h) 39 | 40 | 41 | SET(UTIL ${Source_Path}/DataUtil.h 42 | ${Source_Path}/Kmeans.h 43 | ${Source_Path}/Kmeans.cpp 44 | ${Source_Path}/CQParameters.h 45 | ${Source_Path}/CQParameters.cpp 46 | ${Source_Path}/lbfgs.h) 47 | 48 | 49 | SOURCE_GROUP(ClosureCluster FILES ${Source_Path}/ClosureCluster.h 50 | ${Source_Path}/ClosureCluster.cpp 51 | ${Source_Path}/Cluster.h 52 | ${Source_Path}/Cluster.cpp 53 | ${Source_Path}/ClusterCommon.h 54 | ${Source_Path}/ClusterCommon.cpp 55 | ${Source_Path}/Dataset.h 56 | ${Source_Path}/Distance.h 57 | ${Source_Path}/PartitioningTree.h 58 | ${Source_Path}/PartitioningTree.cpp) 59 | 60 | 61 | 62 | SET(CLOSURE ${Source_Path}/ClosureCluster.h 63 | ${Source_Path}/ClosureCluster.cpp 64 | ${Source_Path}/Cluster.h 65 | ${Source_Path}/Cluster.cpp 66 | ${Source_Path}/ClusterCommon.h 67 | ${Source_Path}/ClusterCommon.cpp 68 | ${Source_Path}/Dataset.h 69 | ${Source_Path}/Distance.h 70 | ${Source_Path}/PartitioningTree.h 71 | ${Source_Path}/PartitioningTree.cpp) 72 | 73 | SOURCE_GROUP(Header FILES 74 | ${Source_Path}/CompositeQuantization.h 75 | ${Source_Path}/NoConstraintCompositeQuantization.h 76 | ${Source_Path}/ProductQuantization.h 77 | ${Source_Path}/Searcher.h) 78 | 79 | 80 | 81 | SET(HEADER ${Source_Path}/CompositeQuantization.h 82 | ${Source_Path}/NoConstraintCompositeQuantization.h 83 | ${Source_Path}/ProductQuantization.h 84 | ${Source_Path}/Searcher.h) 85 | 86 | 87 | SOURCE_GROUP(Source FILES 88 | ${Source_Path}/CompositeQuantization.cpp 89 | ${Source_Path}/NoConstraintCompositeQuantization.cpp 90 | ${Source_Path}/ProductQuantization.cpp 91 | ${Source_Path}/Searcher.cpp 92 | 
${Source_Path}/Demo.cpp) 93 | 94 | 95 | SET(SOURCE ${Source_Path}/CompositeQuantization.cpp 96 | ${Source_Path}/NoConstraintCompositeQuantization.cpp 97 | ${Source_Path}/ProductQuantization.cpp 98 | ${Source_Path}/Searcher.cpp 99 | ${Source_Path}/Demo.cpp) 100 | 101 | 102 | 103 | # let's list all CompositeQuantization's source binaries 104 | SET(CQTraining_ALL_CC ${UTIL} ${CLOSURE} ${HEADER} ${SOURCE}) 105 | 106 | ######################################################################## 107 | # 108 | INCLUDE_DIRECTORIES(${BLAS_DIR}) 109 | LINK_DIRECTORIES(${BLAS_LIB}) 110 | INCLUDE_DIRECTORIES(${Source_Path}) 111 | LINK_DIRECTORIES(${Current_Path}/lbfgslib) 112 | ADD_EXECUTABLE (CompositeQuantization ${CQTraining_ALL_CC}) 113 | TARGET_LINK_LIBRARIES (CompositeQuantization mkl_intel_lp64.lib mkl_intel_thread.lib mkl_core.lib libiomp5md.lib lbfgs.lib) 114 | 115 | 116 | -------------------------------------------------------------------------------- /HOW_TO_INSTALL.txt: -------------------------------------------------------------------------------- 1 | There are three steps to build our project. 2 | 3 | 1: Install third-part software 4 | To use our code, you should install some third-part software: 5 | (1). CMake 6 | (2). IntelMKL 7 | 8 | 2: Change directory in CMakeLists.txt file 9 | You should insert path to BLAS (MKL) sources and libraries in file CMakeLists.txt. 10 | 11 | 3: Build project 12 | (1). create folder "build" in directory with source files 13 | (2). run "build_project.bat" or you can use CMake GUI to build -------------------------------------------------------------------------------- /HOW_TO_SEARCH.txt: -------------------------------------------------------------------------------- 1 | 1. Algorithm 2 | 3 | The search process is conducted via linear scan. 4 | 5 | Given a query point, the search algorithm traverses all the data points and hold the R points whose distances to the query is R smallest. 
After linear scan, the reserved R points are sorted in the order of increasing distance and regarded as the R nearest neighbors to the query. 6 | 7 | 8 | 9 | 2. File formats 10 | 11 | Our code uses four file formats. 12 | 13 | Usually, the query points are assumed to be .fvecs or .bvecs file formats and the ground truth nearest neighbors are assumed to be .ivecs file format developed by INRIA LEAR and TEXMEX groups. 14 | 15 | The fourth file format we referred in our code is BINARY, described as follows. 16 | 17 | (1) dictionary 18 | We assume that dictionary is in the following format: 19 | 4 bytes (one int32) -- the number of dictionary elements (M*K) 20 | 4 bytes (one int32) -- the dimension of element (d) 21 | 4*M*K*d bytes (M*K*d floats) -- the dictionary element entries one after another 22 | 23 | (2) binary codes 24 | We assume that binary codes are in the following format: 25 | 4 bytes (one int32) -- the number of points (N) 26 | 4 bytes (one int32) -- the number of indexes (M) 27 | 4*N*M bytes (N*M int32) or N*M bytes (N*M unsigned char) -- binary code entries one after another 28 | 29 | (3) retrieval results 30 | We assume that retrieval results are in the following format: 31 | 4 bytes (one int32) -- the number of queries (Q) 32 | 4 bytes (one int32) -- the length of list retrieved (R) 33 | 4*Q*R bytes(Q*R int32) -- retrieval results entries one after another 34 | 35 | 36 | -------------------------------------------------------------------------------- /HOW_TO_TRAINING.txt: -------------------------------------------------------------------------------- 1 | 1. Algorithm 2 | 3 | We provide the implementation of four vector quantization methods, the process of (3) Non-constrained Composite Quantization and (4) Composite Quantization is described in our paper. 4 | 5 | (1). K-means 6 | 7 | For this classical clustering method, we offer two choices to perform k-means. 
One is the most common algorithm called Lloyd's algorithm (the centers can be initialized using random selection or k-means++). The other is Fast k-means proposed by J. Wang et al. in 2012. We suggest to use the second one to accelerate the k-means quantization process. 8 | 9 | (2). Product Quantization 10 | 11 | Product Quantization divides the data space into several subspaces and performs k-means quantization in each subspace. The way of partition is natural (i.e. successive dimensions are in the same subspace) if it is not specified from outside. 12 | 13 | (3). Non-constrained Composite Quantization 14 | 15 | Non-constrained Composite Quantization is mentioned in our paper as a initialization for Composite Quantization. It has a lower vector reconstruction error than composite quantization since the constant constraint added on the composite quantization has been ignored. 16 | 17 | (4). Composite Quantization 18 | 19 | Composite Quantization jointly optimize the dictionary and binary codes through the training process. 20 | 21 | 22 | 23 | 2. File formats 24 | 25 | Our code uses four file formats. 26 | 27 | Usually, the data points are assumed to be .fvecs or .bvecs file formats and the ground truth are assumed to be .ivecs file format developed by INRIA LEAR and TEXMEX groups. 28 | 29 | The fourth file format we referred in our code is BINARY, described as follows. 
30 | 31 | (1) dictionary 32 | We assume that dictionary is in the following format: 33 | 4 bytes (one int32) -- the number of dictionary elements (M*K) 34 | 4 bytes (one int32) -- the dimension of element (d) 35 | 4*M*K*d bytes (M*K*d floats) -- the dictionary element entries one after another 36 | 37 | (2) binary codes 38 | We assume that binary codes are in the following format: 39 | 4 bytes (one int32) -- the number of points (N) 40 | 4 bytes (one int32) -- the number of indexes (M) 41 | 4*N*M bytes (N*M int32) or N*M bytes (N*M unsigned char) -- binary code entries one after another 42 | 43 | 44 | -------------------------------------------------------------------------------- /LICENSE.TXT: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc. 5 | 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Library General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. 
Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 
55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. 
You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. 
(This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. 
These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. 
If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 
305 | 306 | You should have received a copy of the GNU General Public License 307 | along with this program; if not, write to the Free Software 308 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 309 | 310 | 311 | Also add information on how to contact you by electronic and paper mail. 312 | 313 | If the program is interactive, make it output a short notice like this 314 | when it starts in an interactive mode: 315 | 316 | Gnomovision version 69, Copyright (C) year name of author 317 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 318 | This is free software, and you are welcome to redistribute it 319 | under certain conditions; type `show c' for details. 320 | 321 | The hypothetical commands `show w' and `show c' should show the appropriate 322 | parts of the General Public License. Of course, the commands you use may 323 | be called something other than `show w' and `show c'; they could even be 324 | mouse-clicks or menu items--whatever suits your program. 325 | 326 | You should also get your employer (if you work as a programmer) or your 327 | school, if any, to sign a "copyright disclaimer" for the program, if 328 | necessary. Here is a sample; alter the names: 329 | 330 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 331 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 332 | 333 | , 1 April 1989 334 | Ty Coon, President of Vice 335 | 336 | This General Public License does not permit incorporating your program into 337 | proprietary programs. If your program is a subroutine library, you may 338 | consider it more useful to permit linking proprietary applications with the 339 | library. If this is what you want to do, use the GNU Library General 340 | Public License instead of this License. 
341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Composite Quantization 2 | 3 | ## What is it? 4 | 5 | This software library implements the composite quantization algorithm 6 | described in 7 | 8 | > Composite Quantization for approximate nearest neighbor search. Ting Zhang, Chao Du and Jingdong Wang. 9 | In International Conference on Machine Learning (ICML), 2014. 10 | 11 | If you use this software for research purposes, please cite the aforementioned paper in any resulting publication. 12 | 13 | 14 | ## The latest version 15 | Version 1.0 (2015-01-28): 16 | Initial release. 17 | 18 | 19 | ## Installation 20 | 21 | See HOW_TO_INSTALL.txt. 22 | 23 | 24 | ## Documentation 25 | 26 | See HOW_TO_TRAINING.txt and HOW_TO_SEARCH.txt. 27 | 28 | 29 | ## Example usage 30 | 31 | See demo.cpp and config.txt in the source code. 32 | 33 | 34 | 35 | ## Reference 36 | 37 | libLBFGS: 38 | http://www.chokkan.org/software/liblbfgs/index.html 39 | 40 | Fast k-means: 41 | http://research.microsoft.com/en-us/um/people/jingdw/LargeScaleClustering/index.html 42 | 43 | Product quantization: 44 | http://people.rennes.inria.fr/Herve.Jegou/projects/ann.html 45 | -------------------------------------------------------------------------------- /ReleaseVersion/CQParameters.cpp: -------------------------------------------------------------------------------- 1 | #include "CQParameters.h" 2 | 3 | CQParameters::CQParameters() 4 | {} 5 | 6 | CQParameters::~CQParameters() 7 | {} 8 | 9 | bool CQParameters::Exists(const string& key) 10 | { 11 | return parameter_set.find(key) != parameter_set.end(); 12 | } 13 | 14 | void CQParameters::WriteHelpInformation() 15 | { 16 | cout << "PQ="; 17 | cout << "NCQ="; 18 | cout << "CQ="; 19 | cout << "Search="; 20 | 21 | cout << "********* global parameters *********\n"; 22 | cout << "points_count=\n"; 23 | cout << 
"dictionaries_count=\n"; 24 | cout << "words_count=\n"; 25 | cout << "space_dimension=\n"; 26 | cout << "points_file=\n"; 27 | cout << "output_file_prefix=\n"; 28 | cout << "max_iter=\n"; 29 | 30 | cout << "********** PQ parameters *************\n"; 31 | cout << "distortion_tol=\n"; 32 | cout << "read_partition=\n"; 33 | cout << "partition_file=\n"; 34 | 35 | cout << "********** NCQ and CQ parameters **********\n"; 36 | cout << "num_sep=\n"; 37 | cout << "~~~~~~~~~~ initial from outside ~~~~~~~~~~\n"; 38 | cout << "initial_from_outside=\n"; 39 | cout << "dictionary_file=\n"; 40 | cout << "binary_codes_file=\n"; 41 | 42 | cout << "********** CQ parameters ************\n"; 43 | cout << "mu=\n"; 44 | 45 | cout << "********** Search parameters ***********\n"; 46 | cout << "queries_count=\n"; 47 | cout << "groundtruth_length=\n"; 48 | cout << "results_length=\n"; 49 | cout << "queries_file=\n"; 50 | cout << "groundtruth_file=\n"; 51 | 52 | cout << "trained_dictionary_file=\n"; 53 | cout << "trained_binary_codes_file=\n"; 54 | cout << "output_retrieved_results_file=\n"; 55 | } 56 | 57 | void CQParameters::LoadFromFile(const string parameter_file) 58 | { 59 | parameter_set.clear(); 60 | 61 | string currentLine; 62 | ifstream inputStream; 63 | inputStream.open(parameter_file); 64 | if (!inputStream.good()) 65 | { 66 | cout << "unable to open configuration file " + parameter_file << endl; 67 | throw std::logic_error("unable to open configuration file " + parameter_file); 68 | } 69 | while (!inputStream.eof()) 70 | { 71 | std::getline(inputStream, currentLine); 72 | if (currentLine.find("help") != string::npos) 73 | { 74 | WriteHelpInformation(); 75 | return; 76 | } 77 | if (currentLine.length() > 0) 78 | { 79 | if ('#' == currentLine[0]) // All lines starting with '#' are skipped as comments. 
80 | continue; 81 | size_t found = currentLine.find('='); 82 | if (found == string::npos || found != currentLine.find_last_of('=')) 83 | { 84 | cout << "Error in parsing data " + currentLine << endl; 85 | throw std::logic_error("Error in parsing data " + currentLine); 86 | } 87 | parameter_set.insert(std::pair(currentLine.substr(0, found), currentLine.substr(found + 1))); 88 | } 89 | } 90 | inputStream.close(); 91 | } -------------------------------------------------------------------------------- /ReleaseVersion/CQParameters.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using std::string; 10 | using std::ifstream; 11 | using std::map; 12 | using std::cout; 13 | using std::endl; 14 | 15 | class CQParameters 16 | { 17 | public: 18 | /** 19 | * The constructor function. 20 | */ 21 | CQParameters(); 22 | 23 | /** 24 | * The deconstructor function. 25 | */ 26 | ~CQParameters(); 27 | 28 | 29 | /** 30 | * This function tests whether the value of a parameter key exists. 31 | * @param key A parameter. 32 | */ 33 | bool Exists(const string& key); 34 | 35 | /** 36 | * This function outputs parameter information when help is used in the config.txt. 37 | */ 38 | void WriteHelpInformation(); 39 | 40 | /** 41 | * This function load the parameters from txt file. 42 | * @param parameter_file The filename that stores all the parameters. 43 | */ 44 | void LoadFromFile(const string parameter_file); 45 | 46 | 47 | /** 48 | * This template function gets the value of parameter key. 49 | * @param key The parameter. 50 | * @param val The value of the parameter key. 
51 | */ 52 | template 53 | void Get(const string& key, T& val) 54 | { 55 | map::const_iterator p = parameter_set.find(key); 56 | if (p != parameter_set.end()) 57 | { 58 | val = StringToValue(p->second); 59 | } 60 | else 61 | { 62 | cout << "Error : can not find the parameter " + key << endl; 63 | throw std::logic_error("the parameter can't be found: " + key); 64 | } 65 | } 66 | 67 | /** 68 | * This template function returns the value of parameter key. 69 | * @param key The parameter. 70 | */ 71 | template 72 | const T Get(const string& key) 73 | { 74 | T val; 75 | map::const_iterator p = parameter_set.find(key); 76 | if (p != parameter_set.end()) 77 | { 78 | val = StringToValue(p->second); 79 | } 80 | else 81 | { 82 | cout << "Error : can not find the parameter " + key << endl; 83 | throw std::logic_error("the parameter can't be found: " + key); 84 | } 85 | return val; 86 | } 87 | 88 | 89 | /** 90 | * This template function sets the value of parameter key. 91 | * @param key The parameter. 92 | * @param val The value of the parameter key. 93 | */ 94 | template 95 | void Set(const string& key, const T& val) 96 | { 97 | parameter_set[key] = ValueToString(val); 98 | } 99 | 100 | 101 | /** 102 | * This template function converts a string to a value and returns it. 103 | * @param str The value stored in string. 104 | */ 105 | template 106 | T StringToValue(const string& str); 107 | 108 | /** 109 | * string (The explicit specialization definition). 110 | */ 111 | template<> string StringToValue(const string& str) 112 | { 113 | return string(str); 114 | } 115 | 116 | /** 117 | * int (The explicit specialization definition). 118 | */ 119 | template<> int StringToValue(const string& str) 120 | { 121 | return atoi(str.c_str()); 122 | } 123 | 124 | /** 125 | * float (The explicit specialization definition). 
126 | */ 127 | template<> float StringToValue(const string& str) 128 | { 129 | return float(atof(str.c_str())); 130 | } 131 | 132 | /** 133 | * double (The explicit specialization definition). 134 | */ 135 | template<> double StringToValue(const string& str) 136 | { 137 | return atof(str.c_str()); 138 | } 139 | 140 | 141 | /** 142 | * This template function converts a value to a string and returns it. 143 | * @param val The value to be coverted. 144 | */ 145 | template 146 | string ValueToString(const T& val) 147 | { 148 | return std::to_string(T); 149 | } 150 | 151 | /** 152 | * string (The explicit specialization definition). 153 | */ 154 | template<> string ValueToString(const string& val) 155 | { 156 | return val; 157 | } 158 | 159 | private: 160 | /** 161 | * The set of parameters. 162 | */ 163 | map parameter_set; 164 | }; -------------------------------------------------------------------------------- /ReleaseVersion/ClosureCluster.cpp: -------------------------------------------------------------------------------- 1 | #include "ClosureCluster.h" 2 | 3 | namespace KMC 4 | { 5 | // Fast assignment based on cluster closures 6 | void ClosureCluster::AssignmentStep() 7 | { 8 | if (m_iNThreads > 0) omp_set_num_threads(m_iNThreads); 9 | 10 | // check array to avoid dup computation 11 | int ** pClusterCheck = new int * [m_iNThreads]; 12 | for (int i = 0; i < m_iNThreads; i++) 13 | { 14 | pClusterCheck[i] = new int [m_iNCluster]; 15 | for (int j = 0; j < m_iNCluster; j++) pClusterCheck[i][j] = -1; 16 | } 17 | 18 | double WCSSD = 0; 19 | #pragma omp parallel for reduction(+ : WCSSD) 20 | for (int i = 0; i < m_iDataSize; i++) 21 | { 22 | int iThread = omp_get_thread_num(); 23 | FloatType fMinDist = MaxDist; 24 | for (int j = 0; j < m_iCurrentTreeNum; j++) 25 | { 26 | int x = (*m_pCode)[j][i]; // the leaf node of point i in tree j 27 | for (int k = 0; k < m_pInvertedList[x].size(); k++) 28 | { 29 | int y = m_pCenterId[m_pInvertedList[x][k]]; // get the cluster id of 
the k-th member if list[x] 30 | if (pClusterCheck[iThread][y] != i) 31 | { 32 | pClusterCheck[iThread][y] = i; 33 | FloatType fDist = ComputeDistance((*m_pCenter)[y], (*m_pData)[i], m_iDataDimension); 34 | if (fDist < fMinDist) 35 | { 36 | fMinDist = fDist; 37 | m_pCenterId[i] = y; 38 | } 39 | } 40 | } 41 | } 42 | 43 | WCSSD += fMinDist; 44 | } 45 | std::cout << WCSSD / m_iDataSize << std::endl; 46 | 47 | for (int i = 0; i < m_iNThreads; i++) delete [] pClusterCheck[i]; 48 | delete [] pClusterCheck; 49 | } 50 | 51 | void ClosureCluster::Initialization() 52 | { 53 | m_pCenterId = new int [m_iDataSize]; 54 | m_pCenter = new Dataset (m_iDataSize, m_iDataDimension); 55 | 56 | // Partition Data by a Random Projection Tree, you can replace it with other patitioning methods 57 | PartitionTreeBase * pTree = NewPartitionTree(sPartitionMethod, pParams); 58 | pTree->PartitionData(m_pData, m_pCenterId, m_iNCluster); 59 | delete pTree; 60 | 61 | // initialize arrays to store forward and inverted index of multiple random partitions 62 | m_iNPartitions = m_iDataSize / m_iLeafSize; 63 | m_pCode = new Dataset (m_iMaxTreeNum, m_iDataSize); 64 | m_pInvertedList.clear(); 65 | for (int i = 0; i < m_iMaxTreeNum * m_iNPartitions; i++) 66 | { 67 | std::vector vec; 68 | vec.clear(); 69 | m_pInvertedList.push_back(vec); 70 | } 71 | m_iCurrentTreeNum = 0; 72 | 73 | GenerateNewTrees(); 74 | 75 | if (!m_bDynamicTrees) // if not dynamic, generate all trees 76 | { 77 | while (m_iCurrentTreeNum < m_iMaxTreeNum) GenerateNewTrees(); 78 | } 79 | } 80 | 81 | // generate new trees, the number accords the number of threads 82 | void ClosureCluster::GenerateNewTrees() 83 | { 84 | if (m_iNThreads > 0) omp_set_num_threads(m_iNThreads); 85 | //int iNextTreeNum = m_iCurrentTreeNum + m_iNThreads; 86 | int iNextTreeNum = m_iCurrentTreeNum + 1; 87 | if (iNextTreeNum > m_iMaxTreeNum) iNextTreeNum = m_iMaxTreeNum; 88 | 89 | #pragma omp parallel for 90 | for (int i = m_iCurrentTreeNum; i < iNextTreeNum; i++) 91 | 
{ 92 | PartitionTreeBase * pTree = NewPartitionTree(sPartitionMethod, pParams); 93 | pTree->PartitionData(m_pData, (*m_pCode)[i], m_iNPartitions); 94 | delete pTree; 95 | for (int j = 0; j < m_iDataSize; j++) 96 | { 97 | (*m_pCode)[i][j] += m_iNPartitions * i; 98 | m_pInvertedList[(*m_pCode)[i][j]].push_back(j); 99 | } 100 | } 101 | 102 | m_iCurrentTreeNum = iNextTreeNum; 103 | #ifdef ConsoleOutput 104 | std::cout << m_iCurrentTreeNum << " trees have been built" << std::endl; 105 | #endif 106 | } 107 | 108 | void ClosureCluster::RunClustering() 109 | { 110 | FloatType TotalRunTime = 0; 111 | Initialization(); 112 | FloatType LastWCSSD = UpdateStep(); 113 | FloatType LastDrop = -1; 114 | FloatType LastElapse = 0; 115 | #ifdef ConsoleOutput 116 | std::cout << "Iteration 0: WCSSD = " << LastWCSSD << std::endl; 117 | #endif 118 | for (int it = 1; it <= m_iMaxIteration; it++) 119 | { 120 | int LastClock = clock(); 121 | AssignmentStep(); 122 | FloatType WCSSD = UpdateStep(); 123 | int Elapse = clock() - LastClock; 124 | TotalRunTime += FloatType(Elapse) / 1000; 125 | #ifdef ConsoleOutput 126 | std::cout << "Iteration " << it << ": WCSSD = " << WCSSD << "\tTime cost = " << TotalRunTime << "s" << std::endl; 127 | #endif 128 | if (LastWCSSD - WCSSD < m_fEpsilon || TotalRunTime > m_iMaxRunTime) break; 129 | 130 | if (m_bDynamicTrees && m_iCurrentTreeNum < m_iMaxTreeNum) // dynamically generate new trees 131 | { 132 | FloatType Drop = LastWCSSD - WCSSD; 133 | if (LastDrop >= 0) 134 | { 135 | // Heuristic Criterion 136 | if (Drop/Elapse < 0.4 * LastDrop/LastElapse) 137 | { 138 | GenerateNewTrees(); 139 | LastDrop = -1; // we do not generate new trees in two consecutive iterations 140 | } 141 | } 142 | else 143 | { 144 | LastDrop = Drop; 145 | LastElapse = Elapse; 146 | } 147 | } 148 | 149 | LastWCSSD = WCSSD; 150 | } 151 | total_WCSSD = LastWCSSD * m_iDataSize; 152 | } 153 | } -------------------------------------------------------------------------------- 
/ReleaseVersion/ClosureCluster.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Cluster.h" 3 | #include "PartitioningTree.h" 4 | #include 5 | #include 6 | 7 | namespace KMC 8 | { 9 | class ClosureCluster: public ClusterBase 10 | { 11 | public: 12 | virtual void LoadParameters(const Parameters & params) 13 | { 14 | // Parameters for general K-Means 15 | params.Get("NCluster", m_iNCluster, 10); 16 | params.Get("MaxIteration", m_iMaxIteration, 10); 17 | params.Get("Epsilon", m_fEpsilon, FloatType(1e-3)); 18 | params.Get("PartitionMethod", sPartitionMethod, "Rptree"); 19 | params.Get("MaxRunTime", m_iMaxRunTime, 36000); 20 | params.Get("NThreads", m_iNThreads, -1); 21 | if (m_iNThreads > 0) 22 | { 23 | omp_set_num_threads(m_iNThreads); 24 | } 25 | else 26 | { 27 | m_iNThreads = omp_get_num_threads(); 28 | } 29 | 30 | // Parameters for closure algorithm 31 | params.Get("Closure_MaxTreeNum", m_iMaxTreeNum, 10); 32 | params.Get("Closure_LeafSize", m_iLeafSize, 200); 33 | params.Get("Closure_DynamicTrees", m_bDynamicTrees, 0); 34 | pParams = params; 35 | } 36 | 37 | virtual void RunClustering(); 38 | ~ClosureCluster() 39 | { 40 | if (m_pCode != NULL) delete m_pCode; 41 | } 42 | 43 | FloatType total_WCSSD; 44 | 45 | protected: 46 | virtual void Initialization(); 47 | virtual void AssignmentStep(); 48 | 49 | private: 50 | std::string sPartitionMethod; 51 | Parameters pParams; 52 | 53 | Dataset * m_pCode; 54 | std::vector> m_pInvertedList; 55 | 56 | int m_iMaxTreeNum; 57 | int m_iCurrentTreeNum; 58 | int m_iNThreads; 59 | int m_iNPartitions; 60 | int m_iLeafSize; 61 | 62 | // 0: generate all trees at first; 1: genearte trees dynamically 63 | int m_bDynamicTrees; 64 | 65 | int m_iMaxRunTime; 66 | 67 | void GenerateNewTrees(); 68 | }; 69 | } -------------------------------------------------------------------------------- /ReleaseVersion/Cluster.cpp: 
-------------------------------------------------------------------------------- 1 | #include "Cluster.h" 2 | #include 3 | #include 4 | 5 | namespace KMC 6 | { 7 | ClusterBase::ClusterBase() 8 | :m_pData(NULL), m_pCenter(NULL), m_bOwnData(false), m_pCenterId(NULL) 9 | { 10 | } 11 | 12 | ClusterBase::~ClusterBase() 13 | { 14 | if (m_pData != NULL && m_bOwnData) delete m_pData; 15 | if (m_pCenter != NULL) delete m_pCenter; 16 | if (m_pCenterId != NULL) delete [] m_pCenterId; 17 | } 18 | 19 | void ClusterBase::SetData(Dataset * pData) 20 | { 21 | if (m_pData != NULL && m_bOwnData) delete m_pData; 22 | m_pData = pData; 23 | m_bOwnData = false; 24 | m_iDataSize = m_pData->R(); 25 | m_iDataDimension = m_pData->C(); 26 | } 27 | 28 | const CenterType* ClusterBase::GetCenter() 29 | { 30 | return (*m_pCenter)[0]; 31 | } 32 | 33 | const int* ClusterBase::GetCenterId() 34 | { 35 | return m_pCenterId; 36 | } 37 | 38 | // Load in Data set 39 | void ClusterBase::LoadData(const Parameters & params) 40 | { 41 | std::string sDataPath = params.Get("DataPath"); 42 | FILE * fp; 43 | fopen_s(&fp, sDataPath.c_str(), "rb"); 44 | if (m_pData != NULL && m_bOwnData) delete m_pData; 45 | m_bOwnData = true; 46 | int R, C; 47 | fread(&R, sizeof(int), 1, fp); 48 | fread(&C, sizeof(int), 1, fp); 49 | 50 | int iPartialDataSize, iStartDimension, iEndDimension; 51 | 52 | // Check whether we only need process a subset of the data 53 | params.Get("PartialDataSize", iPartialDataSize, -1); 54 | if (iPartialDataSize > 0 && iPartialDataSize < R) 55 | { 56 | R = iPartialDataSize; 57 | } 58 | params.Get("StartDimension", iStartDimension, 0); 59 | params.Get("EndDimension", iEndDimension, C); 60 | 61 | #ifdef ConsoleOutput 62 | std::cout << "DataSize = " << R << std::endl << "DataDimension = " << iEndDimension - iStartDimension << std::endl; 63 | #endif 64 | DataType * pTemp = new DataType [C]; 65 | m_pData = new Dataset (R, iEndDimension - iStartDimension); 66 | for (int i = 0; i < R; i++) 67 | { 68 | 
#ifdef ConsoleOutput 69 | if ((i+1)*100/R != i*100/R) std::cout << "\rLoading " << (i+1)*100/R << "%" ; 70 | #endif 71 | fread(pTemp, sizeof(DataType), C, fp); 72 | for (int j = 0; j < iEndDimension - iStartDimension; j++) 73 | { 74 | (*m_pData)[i][j] = pTemp[j+iStartDimension]; 75 | } 76 | } 77 | delete [] pTemp; 78 | std::cout << std::endl; 79 | fclose(fp); 80 | 81 | m_iDataSize = m_pData->R(); 82 | m_iDataDimension = m_pData->C(); 83 | 84 | } 85 | 86 | void ClusterBase::OutputResult(const Parameters & params) const 87 | { 88 | std::string sOutputFilename = params.Get("OutputPrefix"); 89 | char sCenterFilename[255]; 90 | char sAssignFilename[255]; 91 | 92 | FILE * fp; 93 | 94 | // Output the center vectors and cluster id of each data vector to text file 95 | if (params.Get("OutputTextResult", 0) == 1) 96 | { 97 | sprintf(sCenterFilename, "%s.center.txt", sOutputFilename.c_str()); 98 | fp = fopen(sCenterFilename, "w"); 99 | fprintf(fp, "%d %d\n", m_iNCluster, m_iDataDimension); 100 | for (int i = 0; i < m_iNCluster; i++) 101 | { 102 | for (int j = 0; j < m_iDataDimension; j++) fprintf(fp, "%f ", float((*m_pCenter)[i][j])); 103 | fprintf(fp, "\n"); 104 | } 105 | fclose(fp); 106 | 107 | sprintf(sAssignFilename, "%s.assign.txt", sOutputFilename.c_str()); 108 | fp = fopen(sAssignFilename, "w"); 109 | fprintf(fp, "%d %d\n", m_iDataSize, m_iNCluster); 110 | for (int i = 0; i < m_iDataSize; i++) fprintf(fp, "%d\n", m_pCenterId[i]); 111 | fclose(fp); 112 | } 113 | 114 | // Output the center vectors and cluster id of each data vector to binary file 115 | if (params.Get("OutputBinaryResult", 0) == 1) 116 | { 117 | sprintf(sCenterFilename, "%s.center.bin", sOutputFilename.c_str()); 118 | fp = fopen(sCenterFilename, "wb"); 119 | fwrite(&m_iNCluster, sizeof(int), 1, fp); 120 | fwrite(&m_iDataDimension, sizeof(int), 1, fp); 121 | for (int i = 0; i < m_iNCluster; i++) 122 | { 123 | fwrite((*m_pData)[i], sizeof(CenterType), m_iDataDimension, fp); 124 | } 125 | fclose(fp); 126 | 
127 | sprintf(sAssignFilename, "%s.assign.bin", sOutputFilename.c_str()); 128 | fp = fopen(sAssignFilename, "wb"); 129 | fwrite(&m_iDataSize, sizeof(int), 1, fp); 130 | fwrite(&m_iNCluster, sizeof(int), 1, fp); 131 | fwrite(m_pCenterId, sizeof(int), m_iDataSize, fp); 132 | fclose(fp); 133 | } 134 | } 135 | 136 | // Update step in the Lloyd Iteration 137 | // Handle the problem of empty cluster by assigning isolated points to the empty clusters 138 | FloatType ClusterBase::UpdateStep() 139 | { 140 | // Count the size of each cluster 141 | int * pClusterSize = new int [m_iNCluster]; 142 | memset(pClusterSize, 0, sizeof(int) * m_iNCluster); 143 | for (int i = 0; i < m_iDataSize; i++) 144 | { 145 | pClusterSize[m_pCenterId[i]]++; 146 | } 147 | 148 | // Check whether empty cluster exists 149 | // Check the max and min size of a non-empty cluster 150 | int iEmptyClusterNum = 0; 151 | int iMaxClusterSize = 0; 152 | int iMinClusterSize = m_iDataSize; 153 | for (int i = 0; i < m_iNCluster; i++) 154 | { 155 | if (pClusterSize[i] > 0) 156 | { 157 | if (pClusterSize[i] > iMaxClusterSize) iMaxClusterSize = pClusterSize[i]; 158 | if (pClusterSize[i] < iMinClusterSize) iMinClusterSize = pClusterSize[i]; 159 | } 160 | else iEmptyClusterNum++; 161 | } 162 | 163 | #ifdef ConsoleOutput 164 | std::cout << "# empty clusters = " << iEmptyClusterNum << "; Max cluster size = " << iMaxClusterSize << "; Min cluster size = " << iMinClusterSize << std::endl; 165 | #endif 166 | // Handle the problem of empty clusters 167 | if (iEmptyClusterNum > 0) 168 | { 169 | #ifdef ConsoleOutput 170 | std::cout << "Fixing empty clusters... 
" << std::endl; 171 | #endif 172 | 173 | KeyScorePair * pairs = new KeyScorePair [m_iDataSize]; 174 | #pragma omp parallel for 175 | for (int i = 0; i < m_iDataSize; i++) 176 | { 177 | pairs[i].Key = i; 178 | pairs[i].Score = ComputeDistance((*m_pCenter)[m_pCenterId[i]], (*m_pData)[i], m_iDataDimension); 179 | } 180 | std::sort(pairs, pairs + m_iDataSize, KeyScorePair::Compare); 181 | int k = m_iDataSize - 1; 182 | for (int i = 0; i < m_iNCluster; i++) 183 | { 184 | if (pClusterSize[i] == 0) 185 | { 186 | while (pClusterSize[m_pCenterId[pairs[k].Key]] < 2) k--; 187 | pClusterSize[m_pCenterId[pairs[k].Key]]--; 188 | m_pCenterId[pairs[k].Key] = i; 189 | pClusterSize[i] = 1; 190 | } 191 | } 192 | delete [] pairs; 193 | } 194 | 195 | // Update center vectors 196 | for (int i = 0; i < m_iNCluster; i++) 197 | { 198 | pClusterSize[i] = 0; 199 | memset((*m_pCenter)[i], 0, sizeof(CenterType)*m_iDataDimension); 200 | } 201 | 202 | for (int i = 0; i < m_iDataSize; i++) 203 | { 204 | pClusterSize[m_pCenterId[i]]++; 205 | for (int j = 0; j < m_iDataDimension; j++) (*m_pCenter)[m_pCenterId[i]][j] += (*m_pData)[i][j]; 206 | } 207 | 208 | iEmptyClusterNum = 0; 209 | for (int i = 0; i < m_iNCluster; i++) 210 | { 211 | if (pClusterSize[i] > 0) 212 | { 213 | for (int j = 0; j < m_iDataDimension; j++) (*m_pCenter)[i][j] /= pClusterSize[i]; 214 | } 215 | else iEmptyClusterNum++; 216 | } 217 | if (iEmptyClusterNum > 0) 218 | { 219 | std::cout << "Error: found " << iEmptyClusterNum << " empty clusters after fixing" << std::endl; 220 | system("pause"); 221 | } 222 | delete [] pClusterSize; 223 | 224 | // Calculate WCSSD 225 | double WCSSD = 0.0f; 226 | 227 | #pragma omp parallel for reduction(+ : WCSSD) 228 | for (int i = 0; i < m_iDataSize; i++) 229 | { 230 | WCSSD += ComputeDistance((*m_pCenter)[m_pCenterId[i]], (*m_pData)[i], m_iDataDimension); 231 | } 232 | return FloatType(WCSSD / m_iDataSize); 233 | } 234 | 235 | 236 | 237 | } 
-------------------------------------------------------------------------------- /ReleaseVersion/Cluster.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "ClusterCommon.h" 3 | #include "Dataset.h" 4 | #include "Distance.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace KMC 13 | { 14 | class Parameters 15 | { 16 | public: 17 | void Add(const std::string& key, const std::string& val) 18 | { 19 | if(m_params.find(key) != m_params.end()) 20 | { 21 | throw KMCException(("duplicate keys in params: " + key).c_str()); 22 | } 23 | m_params[key] = val; 24 | } 25 | 26 | void Set(const std::string& key, const std::string& val) 27 | { 28 | m_params[key] = val; 29 | } 30 | 31 | template 32 | void Get(const std::string& key, T& val) const 33 | { 34 | t_params::const_iterator p = m_params.find(key); 35 | if(p != m_params.end()) 36 | { 37 | val = StringToValue(p->second); 38 | } 39 | else 40 | { 41 | throw KMCException(("the parameter can't be found: " + key).c_str()); 42 | } 43 | } 44 | 45 | template 46 | bool Get(const std::string& key, T& val, const T& defaultValue) const 47 | { 48 | t_params::const_iterator p = m_params.find(key); 49 | if(p != m_params.end()) 50 | { 51 | val = StringToValue(p->second); 52 | return true; 53 | } 54 | else 55 | { 56 | val = defaultValue; 57 | return false; 58 | } 59 | } 60 | 61 | template 62 | T Get(const std::string& key, const T& defaultValue) const 63 | { 64 | T val; 65 | t_params::const_iterator p = m_params.find(key); 66 | if(p != m_params.end()) 67 | { 68 | val = StringToValue(p->second); 69 | } 70 | else 71 | { 72 | val = defaultValue; 73 | } 74 | return val; 75 | } 76 | 77 | template 78 | T Get(const std::string& key) const 79 | { 80 | T val; 81 | t_params::const_iterator p = m_params.find(key); 82 | if(p != m_params.end()) 83 | { 84 | val = StringToValue(p->second); 85 | } 86 | else 87 | { 88 | throw KMCException(("the 
parameter can't be found: " + key).c_str()); 89 | } 90 | return val; 91 | } 92 | 93 | bool Exists(const std::string& key) 94 | { 95 | return m_params.find(key) != m_params.end(); 96 | } 97 | 98 | void LoadFromFile(const std::string fileName) 99 | { 100 | m_params.clear(); 101 | 102 | std::string currentLine; 103 | std::ifstream inputStream; 104 | inputStream.open(fileName); 105 | 106 | if(inputStream.is_open() == false) 107 | { 108 | std::string message = "unable to open configuration file " + fileName; 109 | std::cerr< 0) 117 | { 118 | if('#' == currentLine[0]) // All lines starting with '#' are skipped as comments. 119 | continue; 120 | 121 | std::vector tokens = StringSplit(currentLine, "= "); 122 | 123 | if(tokens.size() == 2) 124 | { 125 | m_params.insert(std::pair(tokens[0], tokens[1])); 126 | std::cout << tokens[0] << '=' << tokens[1] << std::endl; 127 | } 128 | else 129 | { 130 | throw std::exception(("Error in parsing data " + currentLine).c_str()); 131 | } 132 | 133 | tokens.clear(); 134 | tokens.resize(0); 135 | } 136 | } 137 | 138 | inputStream.close(); 139 | } 140 | 141 | private: 142 | typedef std::map t_params; 143 | t_params m_params; 144 | }; 145 | 146 | class ClusterBase 147 | { 148 | public: 149 | ClusterBase(); 150 | ~ClusterBase(); 151 | 152 | virtual void LoadParameters(const Parameters & params) = 0; 153 | virtual void Initialization() = 0; 154 | virtual void RunClustering() = 0; 155 | 156 | virtual void SetData(Dataset * pData); 157 | virtual void LoadData(const Parameters & params); 158 | virtual void OutputResult(const Parameters & params) const; 159 | 160 | virtual const CenterType* GetCenter(); 161 | virtual const int* GetCenterId(); 162 | 163 | protected: 164 | bool m_bOwnData; 165 | Dataset * m_pData; 166 | 167 | int m_iNCluster; 168 | int m_iDataSize; 169 | int m_iDataDimension; 170 | 171 | int m_iMaxIteration; 172 | FloatType m_fEpsilon; 173 | 174 | Dataset * m_pCenter; 175 | int * m_pCenterId; 176 | 177 | virtual void 
AssignmentStep() = 0; 178 | virtual FloatType UpdateStep(); 179 | 180 | private: 181 | 182 | ClusterBase(const ClusterBase &); 183 | ClusterBase & operator = (const ClusterBase &); 184 | }; 185 | 186 | 187 | 188 | } -------------------------------------------------------------------------------- /ReleaseVersion/ClusterCommon.cpp: -------------------------------------------------------------------------------- 1 | #include "ClusterCommon.h" 2 | 3 | namespace KMC 4 | { 5 | template<> std::string StringToValue(const std::string& str) 6 | { 7 | return str; 8 | } 9 | 10 | template<> int StringToValue(const std::string& str) 11 | { 12 | return atoi(str.c_str()); 13 | } 14 | 15 | template<> float StringToValue(const std::string& str) 16 | { 17 | return float(atof(str.c_str())); 18 | } 19 | 20 | template<> double StringToValue(const std::string& str) 21 | { 22 | return atof(str.c_str()); 23 | } 24 | 25 | std::vector StringSplit(const std::string &str,const std::string &sep) 26 | { 27 | char* cstr=const_cast(str.c_str()); 28 | char* current; 29 | char* context = NULL; 30 | 31 | std::vector arr; 32 | current=strtok_s(cstr,sep.c_str(), &context); 33 | while(current!=NULL){ 34 | arr.push_back(current); 35 | current=strtok_s(NULL,sep.c_str(), &context); 36 | } 37 | return arr; 38 | } 39 | } -------------------------------------------------------------------------------- /ReleaseVersion/ClusterCommon.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace KMC 9 | { 10 | typedef unsigned char byte; 11 | typedef float FloatType; 12 | typedef byte IntegerType; 13 | 14 | #define FloatData 15 | #define ConsoleOutput 16 | 17 | #ifdef FloatData 18 | typedef FloatType DataType; 19 | typedef FloatType CenterType; 20 | #else 21 | typedef IntegerType DataType; 22 | typedef FloatType CenterType; 23 | #endif // FloatData 24 | 25 | const FloatType MaxDist = 1e20f; 26 | 27 | 
class KMCException:public std::exception 28 | { 29 | public: 30 | KMCException() {} 31 | 32 | KMCException(const char * const & info) 33 | :exception(info) 34 | { 35 | } 36 | 37 | KMCException(const std::string& info) 38 | :exception(info.c_str()) 39 | { 40 | } 41 | }; 42 | 43 | struct KeyScorePair 44 | { 45 | int Key; 46 | FloatType Score; 47 | KeyScorePair(int _key = -1, FloatType _score = 0) 48 | :Key(_key), Score(_score) 49 | {} 50 | __forceinline static bool Compare (KeyScorePair i,KeyScorePair j) { return (i.Score < j.Score || i.Score == j.Score && i.Key < j.Key); } 51 | }; 52 | 53 | template 54 | T StringToValue(const std::string& str); 55 | 56 | template<> std::string StringToValue(const std::string& str); 57 | template<> int StringToValue(const std::string& str); 58 | template<> float StringToValue(const std::string& str); 59 | template<> double StringToValue(const std::string& str); 60 | std::vector StringSplit(const std::string &str,const std::string &sep); 61 | } -------------------------------------------------------------------------------- /ReleaseVersion/CompositeQuantization.cpp: -------------------------------------------------------------------------------- 1 | #include "CompositeQuantization.h" 2 | 3 | /***************************Implementation*******************************/ 4 | 5 | CompositeQuantization::CompositeQuantization( 6 | const int points_count, 7 | const int dictionaries_count, 8 | const int words_count, 9 | const int space_dimension, 10 | const int num_sep) 11 | :points_count_(points_count), 12 | dictionaries_count_(dictionaries_count), 13 | words_count_(words_count), 14 | space_dimension_(space_dimension), 15 | num_sep_(num_sep) 16 | { 17 | if (dictionaries_count <= 0 || words_count <= 0 || space_dimension <= 0 || points_count <= 0 || num_sep <= 0) 18 | { 19 | cout << "CQ: bad input parameters\n"; 20 | throw std::logic_error("Bad input parameters"); 21 | } 22 | InitLbfgsParam(); 23 | 24 | points_ = NULL; 25 | own_points_memory_ = 
false; 26 | dictionary_ = new DictionaryType[dictionaries_count*words_count*space_dimension]; 27 | memset(dictionary_, 0, sizeof(DictionaryType)*dictionaries_count*words_count*space_dimension); 28 | binary_codes_ = new CodeType[dictionaries_count*points_count]; 29 | memset(binary_codes_, 0, sizeof(CodeType)*dictionaries_count*points_count); 30 | 31 | distortions_ = new float[points_count]; 32 | memset(distortions_, 0, sizeof(float)*points_count); 33 | distortion_ = 0; 34 | constants_ = new float[points_count]; 35 | memset(constants_, 0, sizeof(float)*points_count); 36 | constant_ = 0; 37 | 38 | epsilon_ = 0; 39 | mu_ = 0; 40 | 41 | dictionary_gradient_sep_.resize(num_sep, vector(dictionaries_count*words_count*space_dimension)); 42 | dictionary_cross_products_.resize(dictionaries_count*words_count, vector(dictionaries_count*words_count)); 43 | } 44 | 45 | CompositeQuantization::~CompositeQuantization() 46 | { 47 | if (constants_) delete[] constants_; 48 | if (distortions_) delete[] distortions_; 49 | if (binary_codes_) delete[] binary_codes_; 50 | if (dictionary_) delete[] dictionary_; 51 | if (points_) delete[] points_; 52 | } 53 | 54 | void CompositeQuantization::SaveDictionary(const string output_file_prefix) 55 | { 56 | cout << "Saving dictionary in " + output_file_prefix + "D\n"; 57 | SaveOneDimensionalPoints(output_file_prefix + "D", dictionary_, dictionaries_count_*words_count_, space_dimension_); 58 | } 59 | 60 | void CompositeQuantization::SaveBinaryCodes(const string output_file_prefix) 61 | { 62 | cout << "Saving binary codes in " + output_file_prefix + "B\n"; 63 | SaveOneDimensionalPoints(output_file_prefix + "B", binary_codes_, points_count_, dictionaries_count_); 64 | } 65 | 66 | void CompositeQuantization::InitPoints( 67 | const string points_file, 68 | const PointStoreType point_store_type) 69 | { 70 | cout << "Reading points in " + points_file << endl; 71 | if (!own_points_memory_) 72 | { 73 | points_ = new float[space_dimension_*points_count_]; 74 
| own_points_memory_ = true; 75 | } 76 | ReadOneDimensionalPoints(points_file, point_store_type, points_, points_count_, space_dimension_); 77 | } 78 | 79 | void CompositeQuantization::InitPoints( 80 | PointType* points, 81 | const int points_count, 82 | const int space_dimension) 83 | { 84 | if (points_count != points_count_ || space_dimension != space_dimension_) 85 | { 86 | cout << "unmatched points dimension\n"; 87 | throw std::logic_error("unmatched points dimension"); 88 | } 89 | cout << "Reading points...\n"; 90 | if (own_points_memory_) 91 | memcpy(points_, points, sizeof(PointType)*space_dimension_*points_count_); 92 | else 93 | points_ = points; 94 | } 95 | 96 | void CompositeQuantization::InitDictionary( 97 | const string dictionary_file, 98 | const PointStoreType dictionary_store_type) 99 | { 100 | cout << "Reading dictionary in " + dictionary_file << endl; 101 | ReadOneDimensionalPoints(dictionary_file, dictionary_store_type, dictionary_, dictionaries_count_*words_count_, space_dimension_); 102 | } 103 | 104 | void CompositeQuantization::InitDictionary( 105 | const DictionaryType* dictionary, 106 | const int dictionaries_count, 107 | const int words_count) 108 | { 109 | if (dictionaries_count != dictionaries_count_ || words_count != words_count_) 110 | { 111 | cout << "unmatched dictionary dimension\n"; 112 | throw std::logic_error("unmatched dictionary dimension"); 113 | } 114 | cout << "Reading dictionary...\n"; 115 | memcpy(dictionary_, dictionary, sizeof(DictionaryType)*dictionaries_count_*words_count_*space_dimension_); 116 | } 117 | 118 | void CompositeQuantization::InitBinaryCodes( 119 | const string binary_codes_file, 120 | const PointStoreType binary_codes_store_type) 121 | { 122 | cout << "Reading binary codes in " + binary_codes_file << endl; 123 | ReadOneDimensionalPoints(binary_codes_file, binary_codes_store_type, binary_codes_, points_count_, dictionaries_count_); 124 | } 125 | 126 | void CompositeQuantization::InitBinaryCodes( 127 | 
const CodeType* binary_codes, 128 | const int points_count, 129 | const int dictionaries_count) 130 | { 131 | if (points_count != points_count_ || dictionaries_count != dictionaries_count_) 132 | { 133 | cout << "unmatched binary codes dimension\n"; 134 | throw std::logic_error("unmatched binary codes dimension"); 135 | } 136 | cout << "Reading binary codes...\n"; 137 | memcpy(binary_codes_, binary_codes, sizeof(CodeType)*dictionaries_count_*points_count_); 138 | } 139 | 140 | const DictionaryType* CompositeQuantization::GetDictionary() 141 | { 142 | return dictionary_; 143 | } 144 | 145 | const CodeType* CompositeQuantization::GetBinaryCodes() 146 | { 147 | return binary_codes_; 148 | } 149 | 150 | void CompositeQuantization::InitLbfgsParam() 151 | { 152 | lbfgs_parameter_init(&lbfgs_param_); 153 | lbfgs_param_.m = 5; 154 | } 155 | 156 | void CompositeQuantization::InitDictionaryBinaryCodes(const string output_file_prefix) 157 | { 158 | cout << "initial dictionary and binary codes using approximate results obtained from PQ...\n"; 159 | ProductQuantization PQ(points_count_, dictionaries_count_, words_count_, space_dimension_); 160 | PQ.InitPoints(points_, points_count_, space_dimension_); 161 | PQ.Training(30, 1e-4, Closure, output_file_prefix + "PQ.", false); 162 | 163 | InitDictionary(PQ.GetDictionary(), dictionaries_count_, words_count_); 164 | InitBinaryCodes(PQ.GetBinaryCodes(), points_count_, dictionaries_count_); 165 | } 166 | 167 | void CompositeQuantization::GetDictionaryCrossProducts(const float* dictionary) 168 | { 169 | int all_words_count = dictionaries_count_*words_count_; 170 | #pragma omp parallel for 171 | for (int word_id1 = 0; word_id1 < all_words_count; ++word_id1) 172 | { 173 | for (int word_id2 = word_id1 + 1; word_id2 < all_words_count; ++word_id2) 174 | { 175 | float product = 0; 176 | for (int dimension = 0; dimension < space_dimension_; ++dimension) 177 | product += dictionary[word_id1*space_dimension_ + dimension] 178 | * 
dictionary[word_id2*space_dimension_ + dimension]; 179 | dictionary_cross_products_[word_id1][word_id2] = product; 180 | dictionary_cross_products_[word_id2][word_id1] = product; 181 | } 182 | } 183 | } 184 | 185 | void CompositeQuantization::GetDictionaryCrossProducts(const lbfgsfloatval_t* dictionary) 186 | { 187 | int all_words_count = dictionaries_count_*words_count_; 188 | #pragma omp parallel for 189 | for (int word_id1 = 0; word_id1 < all_words_count; ++word_id1) 190 | { 191 | for (int word_id2 = word_id1 + 1; word_id2 < all_words_count; ++word_id2) 192 | { 193 | float product = 0; 194 | for (int dimension = 0; dimension < space_dimension_; ++dimension) 195 | product += dictionary[word_id1*space_dimension_ + dimension] 196 | * dictionary[word_id2*space_dimension_ + dimension]; 197 | dictionary_cross_products_[word_id1][word_id2] = product; 198 | dictionary_cross_products_[word_id2][word_id1] = product; 199 | } 200 | } 201 | } 202 | 203 | void CompositeQuantization::GetDistortionsConstants() 204 | { 205 | memset(constants_, 0, sizeof(float)*points_count_); 206 | memset(distortions_, 0, sizeof(float)*points_count_); 207 | 208 | int all_words_count = words_count_*dictionaries_count_; 209 | #pragma omp parallel for 210 | for (int point_id = 0; point_id < points_count_; ++point_id) 211 | { 212 | CodeType* point_codes = &binary_codes_[point_id*dictionaries_count_]; 213 | PointType* point = &points_[point_id*space_dimension_]; 214 | vector point_approximate_error(point, point + space_dimension_); 215 | 216 | float cross_product = 0; 217 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 218 | { 219 | int word_id1 = dictionary_id*words_count_ + point_codes[dictionary_id]; 220 | float* pWord = &dictionary_[word_id1*space_dimension_]; 221 | for (int dimension = 0; dimension < space_dimension_; ++dimension) 222 | point_approximate_error[dimension] -= pWord[dimension]; 223 | for (int dictionary_id2 = dictionary_id + 1; dictionary_id2 < 
dictionaries_count_; ++dictionary_id2) 224 | { 225 | int word_id2 = dictionary_id2*words_count_ + point_codes[dictionary_id2]; 226 | cross_product += dictionary_cross_products_[word_id1][word_id2]; 227 | } 228 | } 229 | for (int dimension = 0; dimension < space_dimension_; ++dimension) 230 | distortions_[point_id] += point_approximate_error[dimension] * point_approximate_error[dimension]; 231 | constants_[point_id] = 2 * cross_product; 232 | } 233 | 234 | distortion_ = constant_ = 0; 235 | for (int point_id = 0; point_id < points_count_; ++point_id) 236 | { 237 | distortion_ += distortions_[point_id]; 238 | constant_ += (constants_[point_id] - epsilon_) * (constants_[point_id] - epsilon_); 239 | } 240 | } 241 | 242 | void CompositeQuantization::UpdateEpsilon() 243 | { 244 | memset(constants_, 0, sizeof(float)*points_count_); 245 | 246 | #pragma omp parallel for 247 | for (int point_id = 0; point_id < points_count_; ++point_id) 248 | { 249 | CodeType* point_codes = &binary_codes_[point_id*dictionaries_count_]; 250 | for (int dictionary_id1 = 0; dictionary_id1 < dictionaries_count_; ++dictionary_id1) 251 | { 252 | int word_id1 = dictionary_id1*words_count_ + point_codes[dictionary_id1]; 253 | for (int dictionary_id2 = dictionary_id1 + 1; dictionary_id2 < dictionaries_count_; ++dictionary_id2) 254 | { 255 | int word_id2 = dictionary_id2*words_count_ + point_codes[dictionary_id2]; 256 | constants_[point_id] += dictionary_cross_products_[word_id1][word_id2]; 257 | } 258 | } 259 | constants_[point_id] = 2 * constants_[point_id]; 260 | } 261 | 262 | float sum = 0; 263 | for (int point_id = 0; point_id < points_count_; ++point_id) 264 | sum += constants_[point_id]; 265 | epsilon_ = sum / points_count_; 266 | } 267 | 268 | void CompositeQuantization::UpdateDictionary() 269 | { 270 | lbfgsfloatval_t function_value; 271 | lbfgsfloatval_t* x = lbfgs_malloc(dictionaries_count_*words_count_*space_dimension_); 272 | #pragma omp parallel for 273 | for (int i = 0; i < 
dictionaries_count_*words_count_*space_dimension_; ++i) 274 | x[i] = dictionary_[i]; 275 | 276 | lbfgs(dictionaries_count_*words_count_*space_dimension_, x, &function_value, evaluate, progress, this, &lbfgs_param_); 277 | 278 | #pragma omp parallel for 279 | for (int i = 0; i < dictionaries_count_*words_count_*space_dimension_; ++i) 280 | dictionary_[i] = x[i]; 281 | lbfgs_free(x); 282 | 283 | GetDictionaryCrossProducts(&(dictionary_[0])); 284 | GetDistortionsConstants(); 285 | } 286 | 287 | void CompositeQuantization::UpdateBinaryCodes() 288 | { 289 | #pragma omp parallel for 290 | for (int point_id = 0; point_id < points_count_; ++point_id) 291 | { 292 | CodeType* point_codes = &binary_codes_[point_id*dictionaries_count_]; 293 | PointType* point = &points_[point_id*space_dimension_]; 294 | 295 | vector point_approximate_error(point, point + space_dimension_); 296 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 297 | { 298 | DictionaryType* pWord = &(dictionary_[(dictionary_id*words_count_ + point_codes[dictionary_id])*space_dimension_]); 299 | for (int dimension = 0; dimension < space_dimension_; ++dimension) 300 | point_approximate_error[dimension] -= pWord[dimension]; 301 | //PointType and DictionaryType must be the same and be float! 
302 | //cblas_saxpy(space_dimension_, -1.0, pWord, 1, &(point_approximate_error[0]), 1); 303 | } 304 | 305 | double objective_function_value = distortions_[point_id] + mu_ 306 | * (constants_[point_id] - epsilon_)*(constants_[point_id] - epsilon_) / 4; 307 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 308 | { 309 | //int old_selected_id = dictionary_id*words_count_ + point_codes[dictionary_id]; 310 | DictionaryType* pWord = &(dictionary_[(dictionary_id*words_count_ + point_codes[dictionary_id])*space_dimension_]); 311 | for (int dimension = 0; dimension < space_dimension_; ++dimension) 312 | point_approximate_error[dimension] += pWord[dimension]; 313 | //PointType and DictionaryType must be the same and be float! 314 | //cblas_saxpy(space_dimension_, 1.0, &(dictionary_[old_selected_id*space_dimension_]), 1, &(point_approximate_error[0]), 1); 315 | double temp_distortion, temp_constant, temp_objective_function_value; 316 | for (int word_id = 0; word_id < words_count_; ++word_id) 317 | { 318 | int current_selected_id = dictionary_id*words_count_ + point_codes[dictionary_id]; 319 | int temp_selected_id = dictionary_id*words_count_ + word_id; 320 | DictionaryType* pWord_temp = &dictionary_[temp_selected_id*space_dimension_]; 321 | temp_distortion = 0; 322 | for (int dimension = 0; dimension < space_dimension_; ++dimension) 323 | { 324 | float diff = point_approximate_error[dimension] - pWord_temp[dimension]; 325 | temp_distortion += diff*diff; 326 | } 327 | temp_constant = constants_[point_id]; 328 | for (int dictionary_id2 = 0; dictionary_id2 < dictionaries_count_; ++dictionary_id2) 329 | { 330 | if (dictionary_id2 == dictionary_id) continue; 331 | int word_id2 = dictionary_id2*words_count_ + point_codes[dictionary_id2]; 332 | temp_constant = temp_constant + 2 * (dictionary_cross_products_[temp_selected_id][word_id2] 333 | - dictionary_cross_products_[current_selected_id][word_id2]); 334 | } 335 | temp_objective_function_value = 
temp_distortion + mu_*(temp_constant - epsilon_)*(temp_constant - epsilon_) / 4; 336 | if (temp_objective_function_value < objective_function_value) 337 | { 338 | objective_function_value = temp_objective_function_value; 339 | distortions_[point_id] = temp_distortion; 340 | constants_[point_id] = temp_constant; 341 | point_codes[dictionary_id] = word_id; 342 | } 343 | } 344 | //int new_selected_id = dictionary_id*words_count_ + point_codes[dictionary_id]; 345 | pWord = &(dictionary_[(dictionary_id*words_count_ + point_codes[dictionary_id])*space_dimension_]); 346 | for (int dimension = 0; dimension < space_dimension_; ++dimension) 347 | point_approximate_error[dimension] -= pWord[dimension]; 348 | //PointType and DictionaryType must be the same and be float! 349 | //cblas_saxpy(space_dimension_, -1.0, &(dictionary_[new_selected_id*space_dimension_]), 1, &point_approximate_error[0], 1); 350 | } 351 | } 352 | 353 | distortion_ = constant_ = 0; 354 | for (int point_id = 0; point_id < points_count_; ++point_id) 355 | { 356 | distortion_ += distortions_[point_id]; 357 | constant_ += (constants_[point_id] - epsilon_) * (constants_[point_id] - epsilon_); 358 | } 359 | } 360 | 361 | void CompositeQuantization::Training( 362 | const int iters, 363 | const double mu, 364 | const string output_file_prefix, 365 | const bool initial) 366 | { 367 | mu_ = mu; 368 | cout << "Composite Quantization Training...\n"; 369 | cout << "Reminder: The points, dictionary and binary codes should be initialized first! 
\n"; 370 | 371 | if (initial) 372 | InitDictionaryBinaryCodes(output_file_prefix); 373 | GetDictionaryCrossProducts(&(dictionary_[0])); 374 | GetDistortionsConstants(); 375 | 376 | ofstream out(output_file_prefix + "distor_iter.txt"); 377 | for (int iter = 0; iter < iters; ++iter) 378 | { 379 | cout << "Iteration " << iter << ": distortion = " << distortion_ << ", constant = " << constant_ << endl; 380 | out << "Iteration " << iter << ": distortion = " << distortion_ << ", constant = " << constant_ << endl; 381 | cout << "Updating epsilon: \n"; 382 | UpdateEpsilon(); 383 | cout << "epsilon = " << epsilon_ << endl; 384 | cout << "Updating dictionary: \n"; 385 | UpdateDictionary(); 386 | cout << "Updating binary codes: \n\n"; 387 | UpdateBinaryCodes(); 388 | } 389 | out.close(); 390 | 391 | SaveDictionary(output_file_prefix); 392 | SaveBinaryCodes(output_file_prefix); 393 | } 394 | 395 | /*****************************************************************************/ 396 | /*************************** friend function *********************************/ 397 | /*****************************************************************************/ 398 | lbfgsfloatval_t evaluate( 399 | void *instance, 400 | const lbfgsfloatval_t *x, 401 | lbfgsfloatval_t *g, 402 | const int n, 403 | const lbfgsfloatval_t step) 404 | { 405 | CompositeQuantization* CQ = static_cast(instance); 406 | int space_dimension = CQ->space_dimension_; 407 | CQ->GetDictionaryCrossProducts(x); 408 | 409 | #pragma omp parallel for 410 | for (int sep = 0; sep < CQ->num_sep_; ++sep) 411 | { 412 | int start_point_id = CQ->points_count_ / CQ->num_sep_ * sep; 413 | int end_point_id = CQ->points_count_ / CQ->num_sep_ * (sep + 1); 414 | vector apprvec(space_dimension); 415 | vector diffvec(space_dimension); 416 | CQ->dictionary_gradient_sep_[sep].assign(n, 0); 417 | for (int point_id = start_point_id; point_id < end_point_id; ++point_id) 418 | { 419 | CodeType* point_codes = 
&CQ->binary_codes_[point_id*CQ->dictionaries_count_]; 420 | PointType* point = &CQ->points_[point_id*space_dimension]; 421 | apprvec.assign(space_dimension, 0); 422 | diffvec.assign(space_dimension, 0); 423 | 424 | float constant = 0; 425 | float distortion = 0; 426 | for (int dictionary_id = 0; dictionary_id < CQ->dictionaries_count_; ++dictionary_id) 427 | { 428 | int word_id = dictionary_id*CQ->words_count_ + point_codes[dictionary_id]; 429 | for (int dimension = 0; dimension < space_dimension; ++dimension) 430 | apprvec[dimension] += x[word_id*space_dimension + dimension]; 431 | for (int dictionary_id2 = dictionary_id + 1; dictionary_id2 < CQ->dictionaries_count_; ++dictionary_id2) 432 | constant += CQ->dictionary_cross_products_[word_id][dictionary_id2*CQ->words_count_ + point_codes[dictionary_id2]]; 433 | } 434 | for (int dimension = 0; dimension < space_dimension; ++dimension) 435 | { 436 | float diff = apprvec[dimension] - point[dimension]; 437 | diffvec[dimension] = diff; 438 | distortion += diff*diff; 439 | } 440 | CQ->distortions_[point_id] = distortion; 441 | CQ->constants_[point_id] = 2 * constant; 442 | 443 | float coeff = CQ->mu_ * (CQ->constants_[point_id] - CQ->epsilon_); 444 | for (int dictionary_id = 0; dictionary_id < CQ->dictionaries_count_; ++dictionary_id) 445 | { 446 | int word_id = dictionary_id*CQ->words_count_ + point_codes[dictionary_id]; 447 | float* dictionary_gradient_column = &(CQ->dictionary_gradient_sep_[sep][word_id*space_dimension]); 448 | for (int dimension = 0; dimension < space_dimension; ++dimension) 449 | dictionary_gradient_column[dimension] += diffvec[dimension] * 2 + coeff*(apprvec[dimension] - x[word_id*space_dimension + dimension]); 450 | } 451 | } 452 | } 453 | 454 | memset(g, 0, sizeof(lbfgsfloatval_t)*n); 455 | for (int sep = 0; sep < CQ->num_sep_; ++sep) 456 | { 457 | #pragma omp parallel for 458 | for (int entry_id = 0; entry_id < n; ++entry_id) 459 | g[entry_id] += CQ->dictionary_gradient_sep_[sep][entry_id]; 460 
| } 461 | 462 | CQ->distortion_ = CQ->constant_ = 0; 463 | for (int point_id = 0; point_id < CQ->points_count_; ++point_id) 464 | { 465 | CQ->distortion_ += CQ->distortions_[point_id]; 466 | CQ->constant_ += (CQ->constants_[point_id] - CQ->epsilon_)*(CQ->constants_[point_id] - CQ->epsilon_); 467 | } 468 | 469 | return CQ->distortion_ + CQ->mu_ * CQ->constant_ / 4; 470 | } 471 | 472 | int progress( 473 | void *instance, 474 | const lbfgsfloatval_t *x, 475 | const lbfgsfloatval_t *g, 476 | const lbfgsfloatval_t fx, 477 | const lbfgsfloatval_t xnorm, 478 | const lbfgsfloatval_t gnorm, 479 | const lbfgsfloatval_t step, 480 | int n, 481 | int k, 482 | int ls 483 | ) 484 | { 485 | cout << "Lbfgs Iteration " << k << ":\n"; 486 | cout << " objective function value = " << fx << endl; 487 | cout << " xnorm = " << xnorm << ", gnorm = " << gnorm << ", step = " << step << endl << endl; 488 | return 0; 489 | } -------------------------------------------------------------------------------- /ReleaseVersion/CompositeQuantization.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "lbfgs.h" 5 | #include "time.h" 6 | #include "DataUtil.h" 7 | #include "ProductQuantization.h" 8 | #include "NoConstraintCompositeQuantization.h" 9 | //#pragma comment(lib,"lbfgs.lib") 10 | 11 | 12 | class CompositeQuantization{ 13 | public: 14 | /** 15 | * The constructor function. 16 | * @param points_count The number of points in the dataset. 17 | * @param dictionaries_count The number of dictionaries (M). 18 | * @param words_count the The number of words in each dictionary (K). 19 | * @param space_dimension The dimension of database vector. 20 | * @param num_sep The number of partitions of the points to accelerate the gradient computation (default 20). 
21 | */ 22 | CompositeQuantization( 23 | const int points_count, 24 | const int dictionaries_count, 25 | const int words_count, 26 | const int space_dimension, 27 | const int num_sep = 20); 28 | 29 | /** 30 | * The deconstructor function. 31 | */ 32 | ~CompositeQuantization(); 33 | 34 | 35 | /** 36 | * The initial function for points. 37 | * @param points_file The filename with points in .fvecs format or binary format. 38 | * @param point_store_type The type of points, should be FVEC, IVEC or BINARY. 39 | */ 40 | void InitPoints( 41 | const string points_file, 42 | const PointStoreType point_store_type); 43 | 44 | /** 45 | * The initial function for points. 46 | * @param points The array that stores the points. 47 | * @param points_count The number of points in the dataset. 48 | * @param space_dimension The dimension of database vector. 49 | */ 50 | void InitPoints( 51 | PointType* points, 52 | const int points_count, 53 | const int space_dimension); 54 | 55 | /** 56 | * The initial function for dictionary. 57 | * @param dictionary_file The filename with dictionary in binary format. 58 | * @param dictionary_store_type The type of dictionary, should be BINARY. 59 | */ 60 | void InitDictionary( 61 | const string dictionary_file, 62 | const PointStoreType dictionary_store_type); 63 | 64 | /** 65 | * The initial function for points. 66 | * @param dictionary The array that stores the dictionary. 67 | * @param dictionaries_count The number of dictionaries (M). 68 | * @param words_count the The number of words in each dictionary (K). 69 | */ 70 | void InitDictionary( 71 | const DictionaryType* dictionary, 72 | const int dictionaries_count, 73 | const int words_count); 74 | 75 | /** 76 | * The initial function for dictionary. 77 | * @param binary_codes_file The filename with binary codes in binary format. 78 | * @param binary_codes_store_type The type of binary codes, should be BINARY. 
79 | */ 80 | void InitBinaryCodes( 81 | const string binary_codes_file, 82 | const PointStoreType binary_codes_store_type); 83 | 84 | /** 85 | * The initial function for points. 86 | * @param binary_codes The array that stores the binary codes. 87 | * @param points_count The number of points in the dataset. 88 | * @param dictionaries_count The number of dictionaries (M). 89 | */ 90 | void InitBinaryCodes( 91 | const CodeType* binary_codes, 92 | const int points_count, 93 | const int dictionaries_count); 94 | 95 | 96 | /** 97 | * This function returns the trained dictionary. 98 | */ 99 | const DictionaryType* GetDictionary(); 100 | 101 | /** 102 | * This function returns the trained binary codes. 103 | */ 104 | const CodeType* GetBinaryCodes(); 105 | 106 | 107 | /** 108 | * The main function that trains the dictionary and the binary codes initialized by solving a simple problem. 109 | * @param iters The iterations of alternating update the three groups of variables. 110 | * @param mu The penalty parameter (0.0004 for SIFT, 100 for GIST, 0.00001 for MNIST). 111 | * @param output_file_prefix The prefix of the output file. 112 | * @param initial The flag to indicate whether to initial dictionary and binary codes, 113 | * false -> already initialed from outside 114 | * true -> initial inside using results obtained from PQ. 115 | */ 116 | void Training( 117 | const int iters, 118 | const double mu, 119 | const string output_file_prefix, 120 | const bool initial); 121 | 122 | /** 123 | * This function gets the binary codes for points (database vectors as learning vectors are used for training) 124 | with the trained dictionary fixed. 
125 | */ 126 | void GetBinaryCodes( 127 | const PointType* points, 128 | const DictionaryType* dictionary, 129 | CodeType* binary_codes, 130 | const int iters); 131 | 132 | private: 133 | /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private member functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ 134 | /** 135 | * This function disallows the use of compiler-generated copy constructor function 136 | */ 137 | CompositeQuantization(const CompositeQuantization&); 138 | /** 139 | * This function disallows the use of compiler-generated copy assignment function 140 | */ 141 | CompositeQuantization& operator=(const CompositeQuantization&); 142 | 143 | 144 | /** 145 | * This function intialize the lbfgs parameter for usage in LBFGS method. 146 | */ 147 | void InitLbfgsParam(); 148 | /** 149 | * The initialization function for dictionary and binary codes. 150 | * @param output_file_prefix The prefix of the output file. 151 | */ 152 | void InitDictionaryBinaryCodes(const string output_file_prefix); 153 | 154 | 155 | /** 156 | * This function conducts the update epsilon step. 157 | */ 158 | void UpdateEpsilon(); 159 | /** 160 | * This function conducts the update dictionary step. 161 | */ 162 | void UpdateDictionary(); 163 | /** 164 | * This function conducts the update binary codes step. 165 | */ 166 | void UpdateBinaryCodes(); 167 | 168 | 169 | /** 170 | * This function computes the inner products between dictionary words from different dictionaries. 171 | * @param dictionary A one-dimensional array (of length space_dimension_*words_count_*dictionaries_count_) 172 | * that the first (second) space_dimension_ data is the first (second) word in the first dictionary. 173 | */ 174 | void GetDictionaryCrossProducts(const DictionaryType* dictionary); 175 | /** 176 | * This function computes the inner products between dictionary words from different dictionaries 177 | (called by lbfgs evaluate function). 
178 | * @param dictionary A one-dimensional array (of length space_dimension_*words_count_*dictionaries_count_) 179 | * that the first (second) space_dimension_ data is the first (second) word in the first dictionary. 180 | */ 181 | void GetDictionaryCrossProducts(const lbfgsfloatval_t* dictionary); 182 | /** 183 | * This function computes the distortions and constants for each point. 184 | */ 185 | void GetDistortionsConstants(); 186 | 187 | 188 | /** 189 | * This function output dictionary in a binary format. 190 | */ 191 | void SaveDictionary(const string output_file_prefix); 192 | /** 193 | * This function output binary codes in a binary format. 194 | */ 195 | void SaveBinaryCodes(const string output_file_prefix); 196 | 197 | 198 | /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ friend functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ 199 | /** 200 | * Callback interface to provide objective function and gradient evaluations. 201 | * 202 | * The lbfgs() function call this function to obtain the values of objective 203 | * function and its gradients when needed. A client program must implement 204 | * this function to evaluate the values of the objective function and its 205 | * gradients, given current values of variables. 206 | * 207 | * @param instance The user data sent for lbfgs() function by the client. 208 | * @param x The current values of variables. 209 | * @param g The gradient vector. The callback function must compute 210 | * the gradient values for the current variables. 211 | * @param n The number of variables 212 | (equals the number of entries in dictionary, i.e., space_dimension_*words_count_*dictionaries_count_). 213 | * @param step The current step of the line search routine. 214 | * @retval lbfgsfloatval_t The value of the objective function for the current 215 | * variables. 
216 | */ 217 | friend static lbfgsfloatval_t evaluate( 218 | void *instance, 219 | const lbfgsfloatval_t *x, 220 | lbfgsfloatval_t *g, 221 | const int n, 222 | const lbfgsfloatval_t step 223 | ); 224 | /** 225 | * Callback interface to receive the progress of the optimization process. 226 | * 227 | * The lbfgs() function call this function for each iteration. Implementing 228 | * this function, a client program can store or display the current progress 229 | * of the optimization process. 230 | * 231 | * @param instance The user data sent for lbfgs() function by the client. 232 | * @param x The current values of variables. 233 | * @param g The current gradient values of variables. 234 | * @param fx The current value of the objective function. 235 | * @param xnorm The Euclidean norm of the variables. 236 | * @param gnorm The Euclidean norm of the gradients. 237 | * @param step The line-search step used for this iteration. 238 | * @param n The number of variables 239 | (equals the number of entries in dictionary, i.e., space_dimension_*words_count_*dictionaries_count_). 240 | * @param k The iteration count. 241 | * @param ls The number of evaluations called for this iteration. 242 | * @retval int Zero to continue the optimization process. Returning a 243 | * non-zero value will cancel the optimization process. 244 | */ 245 | friend static int progress( 246 | void *instance, 247 | const lbfgsfloatval_t *x, 248 | const lbfgsfloatval_t *g, 249 | const lbfgsfloatval_t fx, 250 | const lbfgsfloatval_t xnorm, 251 | const lbfgsfloatval_t gnorm, 252 | const lbfgsfloatval_t step, 253 | int n, 254 | int k, 255 | int ls 256 | ); 257 | 258 | 259 | /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private member variables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ 260 | /** 261 | * The number of points in the dataset. 262 | */ 263 | int points_count_; 264 | /** 265 | * The number of dictionaries (M). 
266 | */ 267 | int dictionaries_count_; 268 | /** 269 | * The number of words in each dictionary (K). 270 | */ 271 | int words_count_; 272 | /** 273 | * The dimension of database vector. 274 | */ 275 | int space_dimension_; 276 | /** 277 | * The number of partitions of the points to accelerate the gradient computation (default 20). 278 | */ 279 | int num_sep_; 280 | /** 281 | * The LBFGS parameter, its property can be found in the document of lib-lbfgs. 282 | */ 283 | lbfgs_parameter_t lbfgs_param_; 284 | 285 | 286 | /** 287 | * A one-dimensional array (of length space_dimension_*points_count_) 288 | * that the first space_dimension_ data is the first point. 289 | */ 290 | PointType* points_; 291 | /** 292 | * A flag to indicate whether to manage the points memory. 293 | */ 294 | bool own_points_memory_; 295 | /** 296 | * A one-dimensional array (of length space_dimension_*words_count_*dictionaries_count_) 297 | * that the first (second) space_dimension_ data is the first (second) word in the first dictionary. 298 | */ 299 | DictionaryType* dictionary_; 300 | /** 301 | * A one-dimensional array (of length dictionaries_count_*points_count_) 302 | * that the frist dictionaries_count_ data is the binary codes for the first point. 303 | */ 304 | CodeType* binary_codes_; 305 | 306 | 307 | /** 308 | * Stores the distortion for each point. 309 | */ 310 | float* distortions_; 311 | /** 312 | * Stores the distortion for all points. 313 | */ 314 | float distortion_; 315 | /** 316 | * Stores the inter-dictionary-element-product for each point. 317 | */ 318 | float* constants_; 319 | /** 320 | * Stores the inter-dictionary-element-product for all points. 321 | */ 322 | float constant_; 323 | 324 | 325 | /** 326 | * The introduced constant inter-dictionary-element-product initialized by 0. 327 | */ 328 | double epsilon_; 329 | /** 330 | * The penalty parameter selected by validation (equals 4*mu in the paper). 
331 | * The value choosed in the experiment is 332 | * 1MSIFT 0.0004 333 | * 1MGIST 100 334 | * 1BSIFT 0.0004 335 | * MNIST 0.00001 336 | */ 337 | double mu_; 338 | 339 | 340 | /** 341 | * The temporary variable (of length space_dimension_*words_count_*dictionaries_count_) 342 | * to accelerate the gradient computation. 343 | */ 344 | vector> dictionary_gradient_sep_; 345 | /** 346 | * The temporary variable storing the inner products between dictionary words from different dictionaries. 347 | */ 348 | vector> dictionary_cross_products_; 349 | }; -------------------------------------------------------------------------------- /ReleaseVersion/DataUtil.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using std::string; 11 | using std::ifstream; 12 | using std::ofstream; 13 | using std::vector; 14 | using std::ios; 15 | using std::cout; 16 | using std::endl; 17 | using std::pair; 18 | 19 | // PointType and DcitionaryType must be the same and be float! 20 | typedef float FloatType; 21 | typedef FloatType PointType; 22 | typedef FloatType DictionaryType; 23 | 24 | typedef float QueryType; 25 | 26 | typedef float DistanceType; 27 | typedef int PointIdType; 28 | typedef int CodeType; 29 | typedef pair DistanceToQueryType; 30 | 31 | /** 32 | * Compare function for DistanceToQuery. 33 | */ 34 | struct CompDistanceToQuery 35 | { 36 | bool operator()(const DistanceToQueryType& lhs, const DistanceToQueryType& rhs) 37 | { 38 | return lhs.first < rhs.first; 39 | } 40 | }; 41 | 42 | /** 43 | * This enumeration presents different methods of kmeans algorithm. 44 | */ 45 | enum KmeansMethod 46 | { 47 | Lloyd = 100, 48 | Closure 49 | }; 50 | 51 | /** 52 | * This enumeration presents different store types of input point coordinate. 
53 | */ 54 | enum PointStoreType 55 | { 56 | FVEC, 57 | BVEC, 58 | IVEC, 59 | BINARY 60 | }; 61 | 62 | /** 63 | * This function read training points from point_file 64 | * @param points_file The filename with points in .fvecs format or binary float format. 65 | * @param point_type The type of points, should be FVECS or FLOAT. 66 | * @param points A one-dimensional array data (of dimension*points_count). 67 | * @param points_count The number of points. 68 | * @param dimension The dimension of points. 69 | */ 70 | template 71 | void ReadOneDimensionalPoints( 72 | const string point_file, 73 | PointStoreType point_sotre_type, 74 | vector& points, 75 | const int points_count, 76 | const int dimension) 77 | { 78 | ifstream point_stream; 79 | point_stream.open(point_file.c_str(), ios::binary); 80 | if (!point_stream.good()) 81 | { 82 | cout << "Error in open " + point_file << endl; 83 | throw std::logic_error("Bad input points stream" + point_file); 84 | } 85 | int dim = 0, count = 0; 86 | switch (point_sotre_type) 87 | { 88 | case FVEC: 89 | point_stream.read((char *)&dim, sizeof(int)); 90 | cout << "Dimension of the vector set:" << dim << endl; 91 | point_stream.seekg(0, point_stream.end); 92 | count = point_stream.tellg() / ((dim + 1) * 4); 93 | cout << "Number of the vector set:" << count << endl; 94 | if (dim != dimension || count != points_count) 95 | { 96 | cout << "unmatched dimension!\n"; 97 | throw std::logic_error("unmatched dimension!"); 98 | } 99 | point_stream.seekg(0, point_stream.beg); 100 | for (int count_id = 0; count_id < count; ++count_id) 101 | { 102 | float vector_dimension = 0; 103 | point_stream.read(reinterpret_cast(&vector_dimension), sizeof(vector_dimension)); 104 | point_stream.read(reinterpret_cast(&points[count_id*dim]), sizeof(float)*dim); 105 | } 106 | break; 107 | case BVEC: 108 | point_stream.read((char *)&dim, sizeof(int)); 109 | cout << "Dimension of the vector set:" << dim << endl; 110 | point_stream.seekg(0, point_stream.end); 111 | 
count = point_stream.tellg() / (dim + 4); 112 | cout << "Number of the vector set:" << count << endl; 113 | if (dim != dimension || count != points_count) 114 | { 115 | cout << "unmatched dimension!\n"; 116 | throw std::logic_error("unmatched dimension!"); 117 | } 118 | point_stream.seekg(0, point_stream.beg); 119 | for (int count_id = 0; count_id < count; ++count_id) 120 | { 121 | int vector_dimension = 0; 122 | point_stream.read(reinterpret_cast(&vector_dimension), sizeof(vector_dimension)); 123 | point_stream.read(reinterpret_cast(&points[count_id*dim]), sizeof(unsigned char)*dim); 124 | } 125 | break; 126 | case IVEC: 127 | point_stream.read((char *)&dim, sizeof(int)); 128 | cout << "Dimension of the vector set:" << dim << endl; 129 | point_stream.seekg(0, point_stream.end); 130 | count = point_stream.tellg() / ((dim + 1) * 4); 131 | cout << "Number of the vector set:" << count << endl; 132 | if (dim != dimension || count != points_count) 133 | { 134 | cout << "unmatched dimension!\n"; 135 | throw std::logic_error("unmatched dimension!"); 136 | } 137 | point_stream.seekg(0, point_stream.beg); 138 | for (int count_id = 0; count_id < count; ++count_id) 139 | { 140 | int vector_dimension = 0; 141 | point_stream.read(reinterpret_cast(&vector_dimension), sizeof(vector_dimension)); 142 | point_stream.read(reinterpret_cast(&points[count_id*dim]), sizeof(int)*dim); 143 | } 144 | break; 145 | case BINARY: 146 | point_stream.read((char *)&count, sizeof(int)); 147 | point_stream.read((char *)&dim, sizeof(int)); 148 | if (dim != dimension || count != points_count) 149 | { 150 | cout << "unmatched dimension!\n"; 151 | throw std::logic_error("unmatched dimension!"); 152 | } 153 | cout << "Dimension of the vector set:" << dim << endl; 154 | cout << "Number of the vector set:" << count << endl; 155 | point_stream.read(reinterpret_cast(&(points[0])), sizeof(T)*dim*count); 156 | break; 157 | } 158 | point_stream.close(); 159 | } 160 | 161 | template 162 | void 
ReadOneDimensionalPoints( 163 | const string point_file, 164 | PointStoreType point_sotre_type, 165 | T* points, 166 | const int points_count, 167 | const int dimension) 168 | { 169 | ifstream point_stream; 170 | point_stream.open(point_file.c_str(), ios::binary); 171 | if (!point_stream.good()) { 172 | cout << "Error in open " + point_file << endl; 173 | throw std::logic_error("Bad input points stream: " + point_file); 174 | } 175 | int dim = 0, count = 0; 176 | switch (point_sotre_type) 177 | { 178 | case FVEC: 179 | point_stream.read((char *)&dim, sizeof(int)); 180 | cout << "Dimension of the vector set:" << dim << endl; 181 | point_stream.seekg(0, point_stream.end); 182 | count = point_stream.tellg() / ((dim + 1) * 4); 183 | cout << "Number of the vector set:" << count << endl; 184 | if (dim != dimension || count != points_count) 185 | { 186 | cout << "unmatched dimension!\n"; 187 | throw std::logic_error("unmatched dimension!"); 188 | } 189 | point_stream.seekg(0, point_stream.beg); 190 | for (int count_id = 0; count_id < count; ++count_id) 191 | { 192 | float vector_dimension = 0; 193 | point_stream.read(reinterpret_cast(&vector_dimension), sizeof(vector_dimension)); 194 | point_stream.read(reinterpret_cast(&points[count_id*dim]), sizeof(float)*dim); 195 | } 196 | break; 197 | case BVEC: 198 | point_stream.read((char *)&dim, sizeof(int)); 199 | cout << "Dimension of the vector set:" << dim << endl; 200 | point_stream.seekg(0, point_stream.end); 201 | count = point_stream.tellg() / (dim + 4); 202 | cout << "Number of the vector set:" << count << endl; 203 | if (dim != dimension || count != points_count) 204 | { 205 | cout << "unmatched dimension!\n"; 206 | throw std::logic_error("unmatched dimension!"); 207 | } 208 | point_stream.seekg(0, point_stream.beg); 209 | for (int count_id = 0; count_id < count; ++count_id) 210 | { 211 | int vector_dimension = 0; 212 | point_stream.read(reinterpret_cast(&vector_dimension), sizeof(vector_dimension)); 213 | 
point_stream.read(reinterpret_cast(&points[count_id*dim]), sizeof(unsigned char)*dim); 214 | } 215 | break; 216 | case IVEC: 217 | point_stream.read((char *)&dim, sizeof(int)); 218 | cout << "Dimension of the vector set:" << dim << endl; 219 | point_stream.seekg(0, point_stream.end); 220 | count = point_stream.tellg() / ((dim + 1) * 4); 221 | cout << "Number of the vector set:" << count << endl; 222 | if (dim != dimension || count != points_count) 223 | { 224 | cout << "unmatched dimension!\n"; 225 | throw std::logic_error("unmatched dimension!"); 226 | } 227 | point_stream.seekg(0, point_stream.beg); 228 | for (int count_id = 0; count_id < count; ++count_id) 229 | { 230 | int vector_dimension = 0; 231 | point_stream.read(reinterpret_cast(&vector_dimension), sizeof(vector_dimension)); 232 | point_stream.read(reinterpret_cast(&points[count_id*dim]), sizeof(int)*dim); 233 | } 234 | break; 235 | case BINARY: 236 | point_stream.read((char *)&count, sizeof(int)); 237 | point_stream.read((char *)&dim, sizeof(int)); 238 | if (dim != dimension || count != points_count) 239 | { 240 | cout << "unmatched dimension!\n"; 241 | throw std::logic_error("unmatched dimension!"); 242 | } 243 | cout << "Dimension of the vector set:" << dim << endl; 244 | cout << "Number of the vector set:" << count << endl; 245 | point_stream.read(reinterpret_cast(points), sizeof(T)*dim*count); 246 | break; 247 | } 248 | point_stream.close(); 249 | } 250 | 251 | /** 252 | * This function read training points from point_file 253 | * @param points_file The filename with points in .fvecs format or binary float format. 254 | * @param points A one-dimensional array data (of dimension*points_count). 255 | * @param points_count The number of points. 256 | * @param dimension The dimension of points. 
/**
 * Writes points to point_file in the BINARY layout:
 * int32 count, int32 dimension, then points_count*dimension raw T values.
 * @param point_file   The output filename.
 * @param points       The values to write (row-major, dimension per point).
 * @param points_count The number of points.
 * @param dimension    The dimension of points.
 * @throws std::logic_error on open failure or if the vector is too small.
 */
template <typename T>
void SaveOneDimensionalPoints(
    const std::string point_file,
    std::vector<T>& points,
    const int points_count,
    const int dimension)
{
    std::ofstream point_stream;
    point_stream.open(point_file.c_str(), std::ios::binary);
    if (!point_stream.good())
    {
        std::cout << "Error in write " + point_file << std::endl;
        throw std::logic_error("Bad output points stream" + point_file);
    }
    point_stream.write(reinterpret_cast<const char*>(&points_count), sizeof(int));
    point_stream.write(reinterpret_cast<const char*>(&dimension), sizeof(int));
    if (points_count > 0 && dimension > 0)
    {
        /* Guard against writing past the end of the vector (was silent UB). */
        if (points.size() < static_cast<size_t>(points_count) * dimension)
            throw std::logic_error("SaveOneDimensionalPoints: vector smaller than points_count*dimension");
        point_stream.write(reinterpret_cast<const char*>(&(points[0])), sizeof(T) * dimension * points_count);
    }
    point_stream.close();
}

/**
 * Array overload of SaveOneDimensionalPoints; same layout as above.
 * The caller guarantees `points` holds points_count*dimension values.
 */
template <typename T>
void SaveOneDimensionalPoints(
    const std::string point_file,
    T* points,
    const int points_count,
    const int dimension)
{
    std::ofstream point_stream;
    point_stream.open(point_file.c_str(), std::ios::binary);
    if (!point_stream.good())
    {
        std::cout << "Error in write " + point_file << std::endl;
        throw std::logic_error("Bad output points stream" + point_file);
    }
    point_stream.write(reinterpret_cast<const char*>(&points_count), sizeof(int));
    point_stream.write(reinterpret_cast<const char*>(&dimension), sizeof(int));
    if (points_count > 0 && dimension > 0)
        point_stream.write(reinterpret_cast<const char*>(points), sizeof(T) * dimension * points_count);
    point_stream.close();
}
// ------------------------------------------------------------------------
// /ReleaseVersion/Dataset.h
// ------------------------------------------------------------------------
#pragma once
#include <stdint.h>  // NOTE(review): original #include target lost in reformatting; uint32_t needs it -- confirm

namespace KMC
{
    // Row-major data matrix. By default a Dataset that allocates its own
    // storage owns it; copies and assignments are SHALLOW aliases that do
    // not own the data (so the source must outlive them).
    template <typename T>
    class Dataset
    {
        bool ownData;  // Flag showing if the class owns its data storage.

        // Aliases rhs's storage without taking ownership.
        void shallow_copy(const Dataset& rhs)
        {
            data = rhs.data;
            rows = rhs.rows;
            cols = rhs.cols;
            ownData = false;
        }
        uint32_t rows;
        uint32_t cols;
        T* data;

    public:

        // FIX: members were left uninitialized, making the destructor's
        // `if (ownData) delete[] data` undefined behavior.
        Dataset() : ownData(false), rows(0), cols(0), data(NULL)
        {
        }

        // Wraps caller storage, or allocates (and owns) rows_*cols_ values
        // when data_ is NULL. (Note: long -> uint32_t narrowing as before.)
        Dataset(long rows_, long cols_, T* data_ = NULL) :
            ownData(false), rows(rows_), cols(cols_), data(data_)
        {
            if (data_ == NULL)
            {
                data = new T[rows * cols];
                ownData = true;
            }
        }

        // Shallow, non-owning copy.
        Dataset(const Dataset& d)
        {
            shallow_copy(d);
        }

        const Dataset& operator=(const Dataset& rhs)
        {
            if (this != &rhs)
            {
                // FIX: previously leaked owned storage on reassignment.
                if (ownData)
                    delete[] data;
                shallow_copy(rhs);
            }
            return *this;
        }

        ~Dataset()
        {
            if (ownData)
            {
                delete[] data;
            }
        }

        /**
         * Returns a pointer to row `index` of the data.
         */
        T* operator[](size_t index)
        {
            return data + index * cols;
        }

        T* operator[](size_t index) const
        {
            return data + index * cols;
        }

        uint32_t R() const
        {
            return rows;
        }

        uint32_t C() const
        {
            return cols;
        }

    };

}
// ------------------------------------------------------------------------
// /ReleaseVersion/Demo.cpp
// ------------------------------------------------------------------------
// CompositeQuantizationTraining.cpp : Defines the entry point for the console application.
2 | // 3 | #define _CRTDBG_MAP_ALLOC 4 | #include "CompositeQuantization.h" 5 | #include "NoConstraintCompositeQuantization.h" 6 | #include "ProductQuantization.h" 7 | #include "Searcher.h" 8 | #include "CQParameters.h" 9 | #include 10 | 11 | 12 | 13 | void ProductQuantizationDemo(CQParameters& param) 14 | { 15 | ProductQuantization PQ( 16 | param.Get("points_count"), 17 | param.Get("dictionaries_count"), 18 | param.Get("words_count"), 19 | param.Get("space_dimension")); 20 | 21 | PQ.InitPoints(param.Get("points_file"), FVEC); 22 | 23 | KmeansMethod kmeans_method = Lloyd; 24 | if (param.Get("kmeans_method") == 101) 25 | kmeans_method = Closure; 26 | 27 | PQ.Training( 28 | param.Get("max_iter"), 29 | param.Get("distortion_tol"), 30 | kmeans_method, 31 | param.Get("output_file_prefix"), 32 | param.Get("read_partition"), 33 | param.Get("partition_file")); 34 | } 35 | 36 | void NoConstraintCompositeQuantizationDemo(CQParameters& param) 37 | { 38 | NoConstraintCompositeQuantization NCQ( 39 | param.Get("points_count"), 40 | param.Get("dictionaries_count"), 41 | param.Get("words_count"), 42 | param.Get("space_dimension"), 43 | param.Get("num_sep")); 44 | 45 | NCQ.InitPoints(param.Get("points_file"), FVEC); 46 | 47 | if (param.Get("initial_from_outside") == 1) 48 | { 49 | NCQ.InitDictionary(param.Get("dictionary_file"), BINARY); 50 | NCQ.InitBinaryCodes(param.Get("binary_codes_file"), BINARY); 51 | NCQ.Training( 52 | param.Get("max_iter"), 53 | param.Get("output_file_prefix"), 54 | false); 55 | } 56 | else 57 | { 58 | NCQ.Training( 59 | param.Get("max_iter"), 60 | param.Get("output_file_prefix"), 61 | true); 62 | } 63 | } 64 | 65 | void CompositeQuantizationDemo(CQParameters& param) 66 | { 67 | CompositeQuantization CQ( 68 | param.Get("points_count"), 69 | param.Get("dictionaries_count"), 70 | param.Get("words_count"), 71 | param.Get("space_dimension"), 72 | param.Get("num_sep")); 73 | 74 | CQ.InitPoints(param.Get("points_file"), FVEC); 75 | 76 | if 
(param.Get("initial_from_outside") == 1) 77 | { 78 | CQ.InitDictionary(param.Get("dictionary_file"), BINARY); 79 | CQ.InitBinaryCodes(param.Get("binary_codes_file"), BINARY); 80 | CQ.Training( 81 | param.Get("max_iter"), 82 | param.Get("mu"), 83 | param.Get("output_file_prefix"), 84 | false); 85 | } 86 | else 87 | { 88 | CQ.Training( 89 | param.Get("max_iter"), 90 | param.Get("mu"), 91 | param.Get("output_file_prefix"), 92 | true); 93 | } 94 | } 95 | 96 | void SearchDemo(CQParameters& param) 97 | { 98 | Searcher Search( 99 | param.Get("points_count"), 100 | param.Get("dictionaries_count"), 101 | param.Get("words_count"), 102 | param.Get("space_dimension"), 103 | param.Get("queries_count"), 104 | param.Get("groundtruth_length"), 105 | param.Get("result_length")); 106 | 107 | Search.InitQueries(param.Get("queries_file"), FVEC); 108 | Search.InitGroundtruth(param.Get("groundtruth_file"), IVEC); 109 | Search.InitDictionary(param.Get("trained_dictionary_file"), BINARY); 110 | Search.InitBinaryCodes(param.Get("trained_binary_codes_file"), BINARY); 111 | 112 | Search.GetNearestNeighbors(param.Get("output_retrieved_results_file")); 113 | 114 | vector R; 115 | R.push_back(1); 116 | R.push_back(10); 117 | R.push_back(100); 118 | Search.GetRecall(R, 1); 119 | } 120 | 121 | int main(int argc, char** argv) 122 | { 123 | _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); 124 | 125 | omp_set_num_threads(omp_get_num_procs()); 126 | cout << "Set threads: " << omp_get_num_procs() << endl; 127 | 128 | //an example of running different quantization methods on 1MSIFT 129 | 130 | { 131 | CQParameters param; 132 | param.LoadFromFile("config.txt"); 133 | 134 | if (param.Get("PQ") == 1) 135 | { 136 | ProductQuantizationDemo(param); 137 | } 138 | 139 | if (param.Get("NCQ") == 1) 140 | { 141 | NoConstraintCompositeQuantizationDemo(param); 142 | } 143 | 144 | if (param.Get("CQ") == 1) 145 | { 146 | CompositeQuantizationDemo(param); 147 | } 148 | 149 | if (param.Get("Search") == 1) 
150 | { 151 | SearchDemo(param); 152 | } 153 | } 154 | 155 | 156 | _CrtDumpMemoryLeaks(); 157 | return 0; 158 | } -------------------------------------------------------------------------------- /ReleaseVersion/Distance.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "ClusterCommon.h" 5 | 6 | namespace KMC 7 | { 8 | inline FloatType ComputeDistance(const FloatType *p1, const FloatType *p2, size_t length = 100) 9 | { 10 | FloatType distance = 0; 11 | for (int i = 0; i < length; i++) 12 | { 13 | FloatType temp = p1[i] - p2[i]; 14 | distance += temp * temp; 15 | } 16 | 17 | return distance; 18 | } 19 | 20 | inline FloatType ComputeDistance(const FloatType *p1, const IntegerType *p2, size_t length = 100) 21 | { 22 | FloatType distance = 0; 23 | for (int i = 0; i < length; i++) 24 | { 25 | FloatType temp = p1[i] - FloatType(p2[i]); 26 | distance += temp * temp; 27 | } 28 | 29 | return distance; 30 | } 31 | } -------------------------------------------------------------------------------- /ReleaseVersion/Kmeans.cpp: -------------------------------------------------------------------------------- 1 | #include "Kmeans.h" 2 | 3 | void Kmeans_Reset( 4 | Kmeans* self, 5 | const int points_count, 6 | const int clusters_count, 7 | const int dimension, 8 | const float* points, 9 | const bool verbosity) 10 | { 11 | if (points_count*dimension > self->points_count_*self->dimension_) 12 | { 13 | if (self->points_) free(self->points_); 14 | self->points_ = (float*)malloc(sizeof(float)*dimension*points_count); 15 | } 16 | if (points_count > self->points_count_) 17 | { 18 | if (self->assignments_) free(self->assignments_); 19 | if (self->distances_) free(self->distances_); 20 | self->assignments_ = (int*)malloc(sizeof(int)*points_count); 21 | self->distances_ = (float*)malloc(sizeof(float)*points_count); 22 | } 23 | if (dimension*clusters_count > self->dimension_*self->clusters_count_) 24 | { 25 | if 
(self->centers_) free(self->centers_); 26 | self->centers_ = (float*)malloc(sizeof(float)*dimension*clusters_count); 27 | } 28 | self->points_count_ = points_count; 29 | self->clusters_count_ = clusters_count; 30 | self->dimension_ = dimension; 31 | self->verbosity_ = verbosity; 32 | if (points) 33 | memcpy(self->points_, points, sizeof(float)*dimension*points_count); 34 | } 35 | 36 | Kmeans* Kmeans_New( 37 | const int points_count, 38 | const int clusters_count, 39 | const int dimension, 40 | const float* points, 41 | const bool verborsity) 42 | { 43 | Kmeans* kmeans = (Kmeans*)malloc(sizeof(Kmeans)); 44 | kmeans->points_ = (float*)malloc(sizeof(float)*dimension*points_count); 45 | kmeans->centers_ = (float*)malloc(sizeof(float)*dimension*clusters_count); 46 | kmeans->assignments_ = (int*)malloc(sizeof(int)*points_count); 47 | kmeans->distances_ = (float*)malloc(sizeof(float)*points_count); 48 | Kmeans_Reset(kmeans, points_count, clusters_count, dimension, points, verborsity); 49 | return kmeans; 50 | } 51 | 52 | void Kmeans_Delete(Kmeans* self) 53 | { 54 | if (self->points_) 55 | free(self->points_); 56 | if (self->centers_) 57 | free(self->centers_); 58 | if (self->assignments_) 59 | free(self->assignments_); 60 | if (self->distances_) 61 | free(self->distances_); 62 | free(self); 63 | } 64 | 65 | void Kmeans_Initialize(Kmeans* self, const KmeansInitialType initial_type) 66 | { 67 | switch (initial_type) 68 | { 69 | case KmeansInitial_RANDOM: 70 | Kmeans_RandomInitialize(self->points_count_, self->clusters_count_, self->dimension_, self->points_, self->centers_); 71 | break; 72 | case KmeansInitial_KmeansPlusPlus: 73 | Kmeans_KmeansPlusPlusInitialize(self->points_count_, self->clusters_count_, self->dimension_, self->points_, self->centers_); 74 | } 75 | } 76 | 77 | void Kmeans_RandomInitialize( 78 | const int points_count, 79 | const int clusters_count, 80 | const int dimension, 81 | const float* points, 82 | float* centers) 83 | { 84 | vector perm; 85 | for 
(int id = 0; id < points_count; ++id) 86 | perm.push_back(id); 87 | std::random_shuffle(perm.begin(), perm.end()); 88 | for (int cluster_id = 0; cluster_id < clusters_count; ++cluster_id) 89 | { 90 | memcpy(¢ers[cluster_id*dimension], &points[perm[cluster_id] * dimension], sizeof(float)*dimension); 91 | } 92 | } 93 | 94 | void Kmeans_KmeansPlusPlusInitialize( 95 | const int points_count, 96 | const int clusters_count, 97 | const int dimension, 98 | const float* points, 99 | float* centers) 100 | { 101 | float* min_distances = new float[points_count]; 102 | memset(min_distances, FLT_MAX, sizeof(float)*points_count); 103 | 104 | /* select the first point at random */ 105 | int selected_id = rand() % points_count; 106 | int selected_ids_count = 0; 107 | while (true) 108 | { 109 | memcpy(¢ers[selected_ids_count*dimension], &points[selected_id*dimension], sizeof(float)*dimension); 110 | selected_ids_count++; 111 | if (selected_ids_count == clusters_count) break; 112 | double distortion = 0; 113 | for (int point_id = 0; point_id < points_count; ++point_id) 114 | { 115 | float dist = 0; 116 | for (int dim = 0; dim < dimension; ++dim) 117 | { 118 | float diff = points[point_id*dimension + dim] - centers[(selected_ids_count - 1)*dimension + dim]; 119 | dist += diff*diff; 120 | } 121 | if (dist < min_distances[point_id]) 122 | min_distances[point_id] = dist; 123 | distortion += min_distances[point_id]; 124 | } 125 | double thresh = rand() / RAND_MAX * distortion; 126 | double probability = 0; 127 | for (selected_id = 0; selected_id < points_count - 1; ++selected_id) 128 | { 129 | probability += min_distances[selected_id]; 130 | if (probability >= thresh) break; 131 | } 132 | } 133 | delete[] min_distances; 134 | } 135 | 136 | void Kmeans_LloydQuantization( 137 | Kmeans* self, 138 | const int max_iters, 139 | const double distortion_tol) 140 | { 141 | self->max_iteration_ = max_iters; 142 | self->distortion_tol_ = distortion_tol; 143 | double distortion, previous_distortion; 
144 | int* cluster_masses = new int[self->clusters_count_]; 145 | for (int iteration = 0; true; ++iteration) 146 | { 147 | clock_t start = clock(); 148 | /* assign point to clusters */ 149 | #pragma omp parallel for 150 | for (int point_id = 0; point_id < self->points_count_; ++point_id) 151 | { 152 | float min_dist = FLT_MAX; 153 | int selected_id = 0; 154 | for (int cluster_id = 0; cluster_id < self->clusters_count_; ++cluster_id) 155 | { 156 | float dist = 0; 157 | for (int dim = 0; dim < self->dimension_; ++dim) 158 | { 159 | float diff = self->points_[point_id*self->dimension_ + dim] - self->centers_[cluster_id*self->dimension_ + dim]; 160 | dist += diff*diff; 161 | } 162 | if (dist < min_dist) 163 | { 164 | min_dist = dist; 165 | selected_id = cluster_id; 166 | } 167 | } 168 | self->distances_[point_id] = min_dist; 169 | self->assignments_[point_id] = selected_id; 170 | } 171 | 172 | /* compute distortion*/ 173 | distortion = 0; 174 | for (int point_id = 0; point_id < self->points_count_; ++point_id) 175 | distortion += self->distances_[point_id]; 176 | if (self->verbosity_) 177 | cout << " kmeans: Lloyd iter " << iteration << " : distortion = " << distortion << endl; 178 | 179 | /* check termination conditions */ 180 | if (iteration >= self->max_iteration_) 181 | { 182 | if (self->verbosity_) 183 | cout << "kmeans: Lloyd terminating because maximum number of iterations reached\n"; 184 | break; 185 | } 186 | if (iteration == 0) 187 | { 188 | previous_distortion = distortion; 189 | } 190 | else 191 | { 192 | double eps = (previous_distortion - distortion) / previous_distortion; 193 | if (eps < self->distortion_tol_) 194 | { 195 | if (self->verbosity_) 196 | cout << "kmeans: Lloyd terminating because the distortion relative variation was less than " << self->distortion_tol_ << endl; 197 | break; 198 | } 199 | } 200 | 201 | /* begin next iteration */ 202 | previous_distortion = distortion; 203 | 204 | /* update centers */ 205 | memset(cluster_masses, 0, 
sizeof(int)*self->clusters_count_); 206 | for (int point_id = 0; point_id < self->points_count_; ++point_id) 207 | cluster_masses[self->assignments_[point_id]]++; 208 | 209 | int restarted_centers_count = 0; 210 | memset(self->centers_, 0, sizeof(float)*self->dimension_*self->clusters_count_); 211 | for (int point_id = 0; point_id < self->points_count_; ++point_id) 212 | { 213 | float* center = &self->centers_[self->assignments_[point_id] * self->dimension_]; 214 | const float* point = &self->points_[point_id*self->dimension_]; 215 | for (int dim = 0; dim < self->dimension_; ++dim) 216 | { 217 | center[dim] += point[dim]; 218 | } 219 | } 220 | for (int cluster_id = 0; cluster_id < self->clusters_count_; ++cluster_id) 221 | { 222 | float* center = &self->centers_[cluster_id*self->dimension_]; 223 | if (cluster_masses[cluster_id] > 0) 224 | { 225 | for (int dim = 0; dim < self->dimension_; ++dim) 226 | center[dim] /= cluster_masses[cluster_id]; 227 | } 228 | else 229 | { 230 | restarted_centers_count++; 231 | int rand_id = rand() % self->points_count_; 232 | for (int dim = 0; dim < self->dimension_; ++dim) 233 | center[dim] = self->points_[rand_id*self->dimension_ + dim]; 234 | } 235 | } 236 | clock_t finish = clock(); 237 | if (self->verbosity_) 238 | cout << " cost = " << finish - start << " milliseconds " << endl; 239 | } 240 | self->distortion_ = distortion; 241 | delete[] cluster_masses; 242 | } -------------------------------------------------------------------------------- /ReleaseVersion/Kmeans.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "time.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using std::string; 11 | using std::vector; 12 | using std::cout; 13 | using std::endl; 14 | 15 | enum KmeansInitialType { 16 | KmeansInitial_RANDOM, 17 | KmeansInitial_KmeansPlusPlus 18 | }; 19 | 20 | /* 21 | * Kmeans quantizer 22 | */ 23 | typedef struct _Kmeans 24 | { 
25 | int points_count_; /* The number of points. */ 26 | int clusters_count_; /* The number of clusters. */ 27 | int dimension_; /* The dimension of points. */ 28 | 29 | float* points_; /* A one-dimensional array (of length dimension_*points_count_) 30 | that the first dimension_ data is the first point. */ 31 | float* centers_; /* A one-dimensional array (of length dimension_*clusters_count_) 32 | that the first (second) dimension_ data is the first (second) center. */ 33 | int* assignments_; /* A one-dimensional array (of length points_count_) 34 | that indicate the cluster assignment of each point. */ 35 | 36 | float* distances_; /* Stores the distance from each point to its assigned cluster center. */ 37 | float distortion_; /* The total distortion error. */ 38 | 39 | int max_iteration_; /* The maximum number of iteration. */ 40 | double distortion_tol_; /* The parameter to test the distortion relative variation. */ 41 | bool verbosity_; /* The flag of whether to display the current progress of the optimization process. */ 42 | }Kmeans; 43 | 44 | 45 | /** 46 | * This function reset the kmeans quantizer parameters. 47 | * @param self The Kmeans object. 48 | * @param points_count The number of points. 49 | * @param clusters_count The number of clusters. 50 | * @param dimension The dimension of points. 51 | * @param points A one-dimensional array (of length dimension_*points_count_) 52 | * that the first dimension_ data is the first point. 53 | * @param verbosity The flag of whether to display the current progress of the optimization process. 54 | */ 55 | void Kmeans_Reset( 56 | Kmeans* self, 57 | const int points_count, 58 | const int clusters_count, 59 | const int dimension, 60 | const float* points, 61 | const bool verbosity = true); 62 | 63 | /** 64 | * This function creat a new kmeans quantizer and return it. 65 | * @param points_count The number of points. 66 | * @param clusters_count The number of clusters. 67 | * @param dimension The dimension of points. 
68 | * @param points A one-dimensional array (of length dimension_*points_count_) 69 | * that the first dimension_ data is the first point. 70 | * @param verbosity The flag of whether to display the current progress of the optimization process. 71 | */ 72 | Kmeans * Kmeans_New( 73 | const int points_count, 74 | const int clusters_count, 75 | const int dimension, 76 | const float* points, 77 | const bool verbosity = true); 78 | 79 | /** 80 | * This function delete the kmeans quantizer. 81 | * @param self The Kmeans object. 82 | */ 83 | void Kmeans_Delete(Kmeans * self); 84 | 85 | /** 86 | * This function is the main function that perfrom Lloyd's algorithm. 87 | * @param self The Kmeans object. 88 | * @param max_iters The maximum iteration of the algorithm. 89 | * @param distortion_tol The parameter to test the distortion relative variation. 90 | */ 91 | void Kmeans_LloydQuantization( 92 | Kmeans* self, 93 | const int max_iters, 94 | const double distortion_tol); 95 | 96 | /** 97 | * This function initials the kmeans centers. 98 | * @param self The Kmeans object. 99 | * @param initial_type The way of initialize, should be RANDOM or KmeansPlusPlus. 100 | */ 101 | void Kmeans_Initialize(Kmeans* self, const KmeansInitialType initial_type); 102 | 103 | /** 104 | * This function performs random initialization. 105 | * @param points_count The number of points. 106 | * @param clusters_count The number of clusters. 107 | * @param dimension The dimension of points. 108 | * @param points A one-dimensional array (of length dimension_*points_count_) 109 | * that the first dimension_ data is the first point. 110 | * @param centers A one-dimensional array (of length dimension_*clusters_count_) 111 | * that the first (second) dimension_ data is the first (second) center. 
112 | */ 113 | void Kmeans_RandomInitialize( 114 | const int points_count, 115 | const int clusters_count, 116 | const int dimension, 117 | const float* points, 118 | float* centers); 119 | 120 | /** 121 | * This function performs kmeans++ initialization. 122 | * @param points_count The number of points. 123 | * @param clusters_count The number of clusters. 124 | * @param dimension The dimension of points. 125 | * @param points A one-dimensional array (of length dimension_*points_count_) 126 | * that the first dimension_ data is the first point. 127 | * @param centers A one-dimensional array (of length dimension_*clusters_count_) 128 | * that the first (second) dimension_ data is the first (second) center. 129 | */ 130 | void Kmeans_KmeansPlusPlusInitialize( 131 | const int points_count, 132 | const int clusters_count, 133 | const int dimension, 134 | const float* points, 135 | float* centers); -------------------------------------------------------------------------------- /ReleaseVersion/NoConstraintCompositeQuantization.cpp: -------------------------------------------------------------------------------- 1 | #include "NoConstraintCompositeQuantization.h" 2 | 3 | /***************************Implementation*******************************/ 4 | 5 | NoConstraintCompositeQuantization::NoConstraintCompositeQuantization( 6 | const int points_count, 7 | const int dictionaries_count, 8 | const int words_count, 9 | const int space_dimension, 10 | const int num_sep) 11 | : points_count_(points_count), 12 | dictionaries_count_(dictionaries_count), 13 | words_count_(words_count), 14 | space_dimension_(space_dimension), 15 | num_sep_(num_sep) 16 | { 17 | if (dictionaries_count <= 0 || words_count <= 0 || space_dimension <= 0 || points_count <= 0 || num_sep <= 0) 18 | { 19 | cout << "NCQ: bad input parameters\n"; 20 | throw std::logic_error("Bad input parameters"); 21 | } 22 | points_ = NULL; 23 | own_points_memory_ = false; 24 | dictionary_ = new 
DictionaryType[dictionaries_count*words_count*space_dimension]; 25 | memset(dictionary_, 0, sizeof(DictionaryType)*dictionaries_count*words_count*space_dimension); 26 | binary_codes_ = new CodeType[dictionaries_count*points_count]; 27 | memset(binary_codes_, 0, sizeof(CodeType)*dictionaries_count*points_count); 28 | 29 | distortions_ = new float[points_count]; 30 | memset(distortions_, 0, sizeof(float)*points_count); 31 | distortion_ = 0; 32 | 33 | binary_multi_binaryTranspose_ = new float[dictionaries_count*words_count*dictionaries_count*words_count]; 34 | memset(binary_multi_binaryTranspose_, 0, sizeof(float)*dictionaries_count*words_count*dictionaries_count*words_count); 35 | binary_multi_binaryTranspose_sep_.resize(num_sep, vector(dictionaries_count*words_count*dictionaries_count*words_count)); 36 | points_multi_binaryTranspose_ = new float[dictionaries_count*words_count*space_dimension]; 37 | memset(points_multi_binaryTranspose_, 0, sizeof(float)*dictionaries_count*words_count*space_dimension); 38 | points_multi_binaryTranspose_sep_.resize(num_sep, vector(dictionaries_count*words_count*space_dimension)); 39 | 40 | u_matrix = new float[dictionaries_count*words_count*dictionaries_count*words_count]; 41 | s_vector = new float[dictionaries_count*words_count]; 42 | vt_matrix = new float[dictionaries_count*words_count*dictionaries_count*words_count]; 43 | superb = new float[dictionaries_count*words_count]; 44 | } 45 | 46 | NoConstraintCompositeQuantization::~NoConstraintCompositeQuantization() 47 | { 48 | if (superb) delete[] superb; 49 | if (vt_matrix) delete[] vt_matrix; 50 | if (s_vector) delete[] s_vector; 51 | if (u_matrix) delete[] u_matrix; 52 | 53 | if (points_multi_binaryTranspose_) delete[] points_multi_binaryTranspose_; 54 | if (binary_multi_binaryTranspose_) delete[] binary_multi_binaryTranspose_; 55 | if (distortions_) delete[] distortions_; 56 | if (binary_codes_) delete[] binary_codes_; 57 | if (dictionary_) delete[] dictionary_; 58 | if 
(own_points_memory_ && points_) delete[] points_; 59 | } 60 | 61 | void NoConstraintCompositeQuantization::SaveDictionary(const string output_file_prefix) 62 | { 63 | cout << "Saving dictionary in " + output_file_prefix + "D\n"; 64 | SaveOneDimensionalPoints(output_file_prefix + "D", dictionary_, dictionaries_count_*words_count_, space_dimension_); 65 | } 66 | 67 | void NoConstraintCompositeQuantization::SaveBinaryCodes(const string output_file_prefix) 68 | { 69 | cout << "Saving binary codes in " + output_file_prefix + "B\n"; 70 | SaveOneDimensionalPoints(output_file_prefix + "B", binary_codes_, points_count_, dictionaries_count_); 71 | } 72 | 73 | void NoConstraintCompositeQuantization::GetDistortions() 74 | { 75 | memset(distortions_, 0, sizeof(float)*points_count_); 76 | int all_words_count = words_count_*dictionaries_count_; 77 | #pragma omp parallel for 78 | for (int point_id = 0; point_id < points_count_; ++point_id) 79 | { 80 | CodeType* point_codes = &binary_codes_[point_id*dictionaries_count_]; 81 | PointType* point = &points_[point_id*space_dimension_]; 82 | vector point_approximate_error(point, point + space_dimension_); 83 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 84 | { 85 | DictionaryType* pWord = &dictionary_[(dictionary_id*words_count_ + point_codes[dictionary_id])*space_dimension_]; 86 | //PointType and DictionaryType must be the same and be float! 
87 | cblas_saxpy(space_dimension_, -1.0, pWord, 1, &point_approximate_error[0], 1); 88 | } 89 | for (int dimension = 0; dimension < space_dimension_; ++dimension) 90 | distortions_[point_id] += point_approximate_error[dimension] * point_approximate_error[dimension]; 91 | } 92 | distortion_ = cblas_sasum(points_count_, distortions_, 1); 93 | } 94 | 95 | void NoConstraintCompositeQuantization::InitPoints( 96 | const string points_file, 97 | const PointStoreType point_store_type) 98 | { 99 | cout << "Reading points in " + points_file << endl; 100 | if (!own_points_memory_) 101 | { 102 | points_ = new PointType[space_dimension_*points_count_]; 103 | own_points_memory_ = true; 104 | } 105 | ReadOneDimensionalPoints(points_file, point_store_type, points_, points_count_, space_dimension_); 106 | } 107 | 108 | void NoConstraintCompositeQuantization::InitPoints( 109 | float* points, 110 | const int points_count, 111 | const int space_dimension) 112 | { 113 | if (points_count != points_count_ || space_dimension != space_dimension_) 114 | { 115 | cout << "unmatched points dimension\n"; 116 | throw std::logic_error("unmatched points dimension"); 117 | } 118 | cout << "Reading points...\n"; 119 | if (own_points_memory_) 120 | memcpy(points_, points, sizeof(PointType)*points_count_*space_dimension_); 121 | else 122 | points_ = points; 123 | } 124 | 125 | void NoConstraintCompositeQuantization::InitDictionary( 126 | const string dictionary_file, 127 | const PointStoreType dictionary_store_type) 128 | { 129 | cout << "Reading dictionary in " + dictionary_file << endl; 130 | ReadOneDimensionalPoints(dictionary_file, dictionary_store_type, dictionary_, dictionaries_count_*words_count_, space_dimension_); 131 | } 132 | 133 | void NoConstraintCompositeQuantization::InitDictionary( 134 | const float* dictionary, 135 | const int dictionaries_count, 136 | const int words_count) 137 | { 138 | if (dictionaries_count != dictionaries_count_ || words_count != words_count_) 139 | { 140 | cout 
<< "unmatched dictionary dimension\n"; 141 | throw std::logic_error("unmatched dictionary dimension"); 142 | } 143 | cout << "Reading dictionary...\n"; 144 | memcpy(dictionary_, dictionary, sizeof(DictionaryType)*dictionaries_count_*words_count_*space_dimension_); 145 | } 146 | 147 | void NoConstraintCompositeQuantization::InitBinaryCodes( 148 | const string binary_codes_file, 149 | const PointStoreType binary_codes_store_type) 150 | { 151 | cout << "Reading binary codes in " + binary_codes_file << endl; 152 | ReadOneDimensionalPoints(binary_codes_file, binary_codes_store_type, binary_codes_, points_count_, dictionaries_count_); 153 | } 154 | 155 | void NoConstraintCompositeQuantization::InitBinaryCodes( 156 | const CodeType* binary_codes, 157 | const int points_count, 158 | const int dictionaries_count) 159 | { 160 | if (points_count != points_count_ || dictionaries_count != dictionaries_count_) 161 | { 162 | cout << "unmatched binary codes dimension\n"; 163 | throw std::logic_error("unmatched binary codes dimension"); 164 | } 165 | cout << "Reading binary codes...\n"; 166 | memcpy(binary_codes_, binary_codes, sizeof(CodeType)*dictionaries_count_*points_count_); 167 | } 168 | 169 | const DictionaryType* NoConstraintCompositeQuantization::GetDictionary() 170 | { 171 | return dictionary_; 172 | } 173 | 174 | const CodeType* NoConstraintCompositeQuantization::GetBinaryCodes() 175 | { 176 | return binary_codes_; 177 | } 178 | 179 | void NoConstraintCompositeQuantization::InitDictionaryBinaryCodes(const string output_file_prefix) 180 | { 181 | cout << "initial dictionary and binary codes using approximate results obtained from PQ...\n"; 182 | ProductQuantization PQ(points_count_, dictionaries_count_, words_count_, space_dimension_); 183 | PQ.InitPoints(points_, points_count_, space_dimension_); 184 | PQ.Training(5, 1e-4, Lloyd, output_file_prefix + "PQ.", false); 185 | InitDictionary(PQ.GetDictionary(), dictionaries_count_, words_count_); 186 | 
InitBinaryCodes(PQ.GetBinaryCodes(), points_count_, dictionaries_count_); 187 | } 188 | 189 | void NoConstraintCompositeQuantization::Training( 190 | const int iters, 191 | const string output_file_prefix, 192 | const bool initial) 193 | { 194 | cout << "No Constraint Composite Quantization Training...\n"; 195 | cout << "Reminder: The points should be initialized first! \n"; 196 | if (initial) 197 | InitDictionaryBinaryCodes(output_file_prefix); 198 | GetDistortions(); 199 | ofstream out(output_file_prefix + "distor_iter.txt"); 200 | if (!out.good()) 201 | { 202 | cout << "Bad directory: " + output_file_prefix << endl; 203 | throw std::logic_error("Bad directory: " + output_file_prefix); 204 | } 205 | for (int iter = 0; iter < iters; ++iter) 206 | { 207 | cout << "Iteration " << iter << ": \n"; 208 | out << "Iteration " << iter << ": distortion = " << distortion_ << endl; 209 | cout << "Updating dictionary: "; 210 | UpdateDictionary(); 211 | cout << " distortion = " << distortion_ << endl; 212 | cout << "Updating binary codes: "; 213 | UpdateBinaryCodes(); 214 | cout << " distortion = " << distortion_ << endl << endl; 215 | } 216 | out.close(); 217 | SaveDictionary(output_file_prefix); 218 | SaveBinaryCodes(output_file_prefix); 219 | } 220 | 221 | void NoConstraintCompositeQuantization::UpdateDictionary() 222 | { 223 | clock_t start = clock(); 224 | int all_words_count = dictionaries_count_*words_count_; 225 | 226 | /* compute XB^T */ 227 | #pragma omp parallel for 228 | for (int sep = 0; sep < num_sep_; ++sep) 229 | { 230 | int start_point_id = points_count_ / num_sep_ * sep; 231 | int end_point_id = points_count_ / num_sep_ * (sep + 1); 232 | points_multi_binaryTranspose_sep_[sep].assign(space_dimension_*all_words_count, 0); 233 | for (int point_id = start_point_id; point_id < end_point_id; ++point_id) 234 | { 235 | CodeType* point_codes = &binary_codes_[point_id*dictionaries_count_]; 236 | PointType* point = &points_[point_id*space_dimension_]; 237 | for (int 
dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 238 | { 239 | float* points_multi_binaryTranspose_column = &(points_multi_binaryTranspose_sep_[sep][(dictionary_id*words_count_ + point_codes[dictionary_id])*space_dimension_]); 240 | // PointType must be float! 241 | cblas_saxpy(space_dimension_, 1.0, point, 1, points_multi_binaryTranspose_column, 1); 242 | } 243 | } 244 | } 245 | memset(points_multi_binaryTranspose_, 0, sizeof(float)*space_dimension_*all_words_count); 246 | for (int sep = 0; sep < num_sep_; ++sep) 247 | { 248 | cblas_saxpy(space_dimension_*all_words_count, 1.0, &(points_multi_binaryTranspose_sep_[sep][0]), 1, points_multi_binaryTranspose_, 1); 249 | } 250 | 251 | /* compute BB^T */ 252 | #pragma omp parallel for 253 | for (int sep = 0; sep < num_sep_; ++sep) 254 | { 255 | int start_point_id = points_count_ / num_sep_ * sep; 256 | int end_point_id = points_count_ / num_sep_ * (sep + 1); 257 | binary_multi_binaryTranspose_sep_[sep].assign(all_words_count*all_words_count, 0); 258 | for (int point_id = start_point_id; point_id < end_point_id; ++point_id) 259 | { 260 | CodeType* point_codes = &binary_codes_[point_id*dictionaries_count_]; 261 | PointType* point = &points_[point_id*space_dimension_]; 262 | for (int dictionary_id_row = 0; dictionary_id_row < dictionaries_count_; ++dictionary_id_row) 263 | { 264 | int row = dictionary_id_row*words_count_ + point_codes[dictionary_id_row]; 265 | for (int dictionary_id_col = 0; dictionary_id_col < dictionaries_count_; ++dictionary_id_col) 266 | { 267 | int col = dictionary_id_col*words_count_ + point_codes[dictionary_id_col]; 268 | ++binary_multi_binaryTranspose_sep_[sep][row*all_words_count + col]; 269 | } 270 | } 271 | } 272 | } 273 | memset(binary_multi_binaryTranspose_, 0, sizeof(float)*all_words_count*all_words_count); 274 | for (int sep = 0; sep < num_sep_; ++sep) 275 | { 276 | cblas_saxpy(all_words_count*all_words_count, 1.0, &binary_multi_binaryTranspose_sep_[sep][0], 1, 
binary_multi_binaryTranspose_, 1); 277 | } 278 | 279 | /* singular value decomposition of (BB^T) */ 280 | LAPACKE_sgesvd(LAPACK_ROW_MAJOR, 'A', 'A', all_words_count, all_words_count, 281 | binary_multi_binaryTranspose_, all_words_count, 282 | s_vector, u_matrix, all_words_count, vt_matrix, all_words_count, 283 | &superb[0]); 284 | 285 | /* compute vs^(-1) stored in vt_matrix*/ 286 | bool zero = false; 287 | for (int col = 0; col < all_words_count; ++col) 288 | { 289 | if (zero || s_vector[col] < 1e-3) 290 | { 291 | zero = true; 292 | memset(&vt_matrix[col*all_words_count], 0, sizeof(float)*all_words_count); 293 | continue; 294 | } 295 | for (int row = 0; row < all_words_count; ++row) 296 | { 297 | vt_matrix[col*all_words_count + row] = vt_matrix[col*all_words_count + row] / s_vector[col]; 298 | } 299 | } 300 | 301 | /* compute vs^(-1)u^T stored in binary_multi_binaryTranspose_ */ 302 | cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, all_words_count, all_words_count, all_words_count, 303 | 1.0, vt_matrix, all_words_count, u_matrix, all_words_count, 0, binary_multi_binaryTranspose_, all_words_count); 304 | 305 | /* compute XB^T(BB^T)^(-1) stored in dictionary_*/ 306 | memset(dictionary_, 0, sizeof(DictionaryType)*space_dimension_*all_words_count); 307 | //DictionaryType must be float! 
308 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, all_words_count, space_dimension_, all_words_count, 309 | 1.0, binary_multi_binaryTranspose_, all_words_count, points_multi_binaryTranspose_, space_dimension_, 310 | 0, dictionary_, space_dimension_); 311 | 312 | mkl_free_buffers(); 313 | 314 | GetDistortions(); 315 | 316 | clock_t finish = clock(); 317 | cout << " cost = " << finish - start << " milliseconds \n"; 318 | } 319 | 320 | void NoConstraintCompositeQuantization::UpdateBinaryCodes() 321 | { 322 | clock_t start = clock(); 323 | #pragma omp parallel for 324 | for (int point_id = 0; point_id < points_count_; ++point_id) 325 | { 326 | CodeType* point_codes = &binary_codes_[point_id*dictionaries_count_]; 327 | PointType* point = &points_[point_id*space_dimension_]; 328 | 329 | vector point_approximate_error(point, point + space_dimension_); 330 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 331 | { 332 | DictionaryType* pWord = &(dictionary_[(dictionary_id*words_count_ + point_codes[dictionary_id])*space_dimension_]); 333 | //PointType and DictioniaryType must be the same and be float! 334 | cblas_saxpy(space_dimension_, -1.0, pWord, 1, &point_approximate_error[0], 1); 335 | } 336 | 337 | float distortion = distortions_[point_id]; 338 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 339 | { 340 | int old_selected_id = dictionary_id*words_count_ + point_codes[dictionary_id]; 341 | //PointType and DictioniaryType must be the same and be float! 
342 | cblas_saxpy(space_dimension_, 1.0, &dictionary_[old_selected_id*space_dimension_], 1, &point_approximate_error[0], 1); 343 | float temp_distortion; 344 | for (int word_id = 0; word_id < words_count_; ++word_id) 345 | { 346 | DictionaryType* pWord_temp = &dictionary_[(dictionary_id*words_count_ + word_id)*space_dimension_]; 347 | temp_distortion = 0; 348 | for (int dimension = 0; dimension < space_dimension_; ++dimension) 349 | { 350 | float diff = point_approximate_error[dimension] - pWord_temp[dimension]; 351 | temp_distortion += diff*diff; 352 | } 353 | if (temp_distortion < distortion) 354 | { 355 | distortion = temp_distortion; 356 | distortions_[point_id] = temp_distortion; 357 | point_codes[dictionary_id] = word_id; 358 | } 359 | } 360 | int new_selected_id = dictionary_id*words_count_ + point_codes[dictionary_id]; 361 | //PointType and DictioniaryType must be the same and be float! 362 | cblas_saxpy(space_dimension_, -1.0, &dictionary_[new_selected_id*space_dimension_], 1, &point_approximate_error[0], 1); 363 | } 364 | } 365 | distortion_ = cblas_sasum(points_count_, distortions_, 1); 366 | clock_t finish = clock(); 367 | cout << " cost = " << finish - start << " milliseconds "; 368 | } -------------------------------------------------------------------------------- /ReleaseVersion/NoConstraintCompositeQuantization.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "time.h" 3 | #include "DataUtil.h" 4 | #include "ProductQuantization.h" 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | class NoConstraintCompositeQuantization 11 | { 12 | public: 13 | /** 14 | * The constructor function. 15 | * @param points_count The number of points in the dataset. 16 | * @param dictionaries_count The number of dictionaries (M). 17 | * @param words_count the The number of words in each dictionary (K). 18 | * @param space_dimension The dimension of database vector. 
19 | * @param num_sep The number of partitions of the points to accelerate the computation. 20 | */ 21 | NoConstraintCompositeQuantization( 22 | const int points_count, 23 | const int dictionaries_count, 24 | const int words_count, 25 | const int space_dimension, 26 | const int num_sep = 20); 27 | 28 | /** 29 | * The deconstructor function. 30 | */ 31 | ~NoConstraintCompositeQuantization(); 32 | 33 | 34 | /** 35 | * The initial function for points. 36 | * @param points_file The filename with points in .fvecs format or binary format. 37 | * @param point_store_type The type of points, should be FVEC, IVEC or BINARY. 38 | */ 39 | void InitPoints( 40 | const string points_file, 41 | const PointStoreType point_store_type); 42 | 43 | /** 44 | * The initial function for points. 45 | * @param points The array that stores the points. 46 | * @param points_count The number of points in the dataset. 47 | * @param space_dimension The dimension of database vector. 48 | */ 49 | void InitPoints( 50 | PointType* points, 51 | const int points_count, 52 | const int space_dimension); 53 | 54 | /** 55 | * The initial function for dictionary. 56 | * @param dictionary_file The filename with dictionary in binary format. 57 | * @param dictionary_sotre_type The type of dictionary, should be BINARY. 58 | */ 59 | void InitDictionary( 60 | const string dictionary_file, 61 | const PointStoreType dictionary_sotre_type); 62 | 63 | /** 64 | * The initial function for points. 65 | * @param dictionary The array that stores the dictionary. 66 | * @param dictionaries_count The number of dictionaries (M). 67 | * @param words_count the The number of words in each dictionary (K). 68 | */ 69 | void InitDictionary( 70 | const DictionaryType* dictionary, 71 | const int dictionaries_count, 72 | const int words_count); 73 | 74 | /** 75 | * The initial function for dictionary. 76 | * @param binary_codes_file The filename with binary codes in binary format. 
77 | * @param binary_codes_store_type The type of binary codes, should be BINARY. 78 | */ 79 | void InitBinaryCodes( 80 | const string binary_codes_file, 81 | const PointStoreType binary_codes_store_type); 82 | 83 | /** 84 | * The initial function for points. 85 | * @param binary_codes The array that stores the binary codes. 86 | * @param points_count The number of points in the dataset. 87 | * @param dictionaries_count The number of dictionaries (M). 88 | */ 89 | void InitBinaryCodes( 90 | const CodeType* binary_codes, 91 | const int points_count, 92 | const int dictionaries_count); 93 | 94 | 95 | /** 96 | * This function returns the trained dictionary. 97 | */ 98 | const DictionaryType* GetDictionary(); 99 | 100 | /** 101 | * This function returns the trained binary codes. 102 | */ 103 | const CodeType* GetBinaryCodes(); 104 | 105 | 106 | /** 107 | * The main function that trains the dictionary and the binary codes. 108 | * @param iters The iterations of alternating update the three groups of variables. 109 | * @param output_file_prefix The prefix of the output file. 110 | * @param initial The flag to indicate whether to initial dictionary and binary codes, 111 | * false -> already initialed from outside 112 | * true -> initial inside using results obtained from PQ. 
113 | */ 114 | void Training( 115 | const int iters, 116 | const string output_file_prefix, 117 | const bool initial); 118 | private: 119 | /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private member functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ 120 | /** 121 | * This function disallows the use of compiler-generated copy constructor function 122 | */ 123 | NoConstraintCompositeQuantization(const NoConstraintCompositeQuantization&); 124 | /** 125 | * This function disallows the use of compiler-generated copy assignment function 126 | */ 127 | NoConstraintCompositeQuantization& operator=(const NoConstraintCompositeQuantization&); 128 | 129 | 130 | /** 131 | * The initialization function for dictionary and binary codes. 132 | * @param output_file_prefix The prefix of the output file. 133 | */ 134 | void InitDictionaryBinaryCodes(const string output_file_prefix); 135 | /** 136 | * This function conducts the update dictionary step. 137 | */ 138 | void UpdateDictionary(); 139 | /** 140 | * This function conducts the update binary codes step. 141 | */ 142 | void UpdateBinaryCodes(); 143 | /** 144 | * This function computes the distortions and constants for each point. 145 | */ 146 | void GetDistortions(); 147 | /** 148 | * This function output dictionary in a binary format. 149 | */ 150 | void SaveDictionary(const string output_file_prefix); 151 | /** 152 | * This function output binary codes in a binary format. 153 | */ 154 | void SaveBinaryCodes(const string output_file_prefix); 155 | 156 | 157 | /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private member variables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ 158 | /** 159 | * The number of points in the dataset. 160 | */ 161 | int points_count_; 162 | /** 163 | * The number of dictionaries (M). 164 | */ 165 | int dictionaries_count_; 166 | /** 167 | * The number of words in each dictionary (K). 168 | */ 169 | int words_count_; 170 | /** 171 | * The dimension of database vector. 
172 | */ 173 | int space_dimension_; 174 | /** 175 | * The number of partitions of the points to accelerate the computation. 176 | */ 177 | int num_sep_; 178 | 179 | 180 | /** 181 | * A one-dimensional array (of length space_dimension_*points_count_) 182 | * that the first space_dimension_ data is the first point. 183 | */ 184 | PointType* points_; 185 | /** 186 | * A flag to indicate whether to manage the points memory. 187 | */ 188 | bool own_points_memory_; 189 | /** 190 | * A one-dimensional array (of length space_dimension_*words_count_*dictionaries_count_) 191 | * that the first (second) space_dimension_ data is the first (second) word in the first dictionary. 192 | */ 193 | DictionaryType* dictionary_; 194 | /** 195 | * A one-dimensional array (of length dictionaries_count_*points_count_) 196 | * that the frist dictionaries_count_ data is the binary codes for the first point. 197 | */ 198 | CodeType* binary_codes_; 199 | 200 | 201 | /** 202 | * Stores the distortion for each point. 203 | */ 204 | float* distortions_; 205 | /** 206 | * Stores the distortion for all points. 207 | */ 208 | float distortion_; 209 | 210 | 211 | /** 212 | * The BB^T of one-dimensional array (of length dictionaries_count*words_count_*dictionaries_count_*words_count). 213 | */ 214 | float* binary_multi_binaryTranspose_; 215 | /** 216 | * The BB^T of one-dimensional array (of length dictionaries_count*words_count_*dictionaries_count_*words_count) 217 | * in binary_muti_binaryTranspose_sep_[i] (i is in [0,num_sep_ - 1]) 218 | * to accelerate the computation. 219 | */ 220 | vector> binary_multi_binaryTranspose_sep_; 221 | /** 222 | * The XB^T of one-dimensional array (of length dictionaries_count*words_count_*space_dimension_). 
223 | */ 224 | float* points_multi_binaryTranspose_; 225 | /** 226 | * The XB^T of one-dimensional array (of length dictionaries_count*words_count_*space_dimension_) 227 | * in points_multi_binaryTranspose_sep_[i] (i is in [0,num_sep_ - 1]) 228 | * to accelerate the computation. 229 | */ 230 | vector> points_multi_binaryTranspose_sep_; 231 | 232 | 233 | /** 234 | * Temporary variable to store the u matrix of the svd result. 235 | */ 236 | float* u_matrix; 237 | /** 238 | * Temporary variable to store the s vector (eigenvalues) of the svd result. 239 | */ 240 | float* s_vector; 241 | /** 242 | * Temporary variable to store the vt matrix of the svd result. 243 | */ 244 | float* vt_matrix; 245 | /** 246 | * Temporary variable to store the work of the sgesvd function. 247 | */ 248 | float* superb; 249 | }; -------------------------------------------------------------------------------- /ReleaseVersion/PartitioningTree.cpp: -------------------------------------------------------------------------------- 1 | #include "PartitioningTree.h" 2 | #include 3 | #include 4 | #include 5 | 6 | namespace KMC 7 | { 8 | PartitionTreeBase * NewPartitionTree(std::string sTreeName) 9 | { 10 | if (sTreeName == "Rptree") return new RptreePartition(); 11 | return NULL; 12 | } 13 | 14 | PartitionTreeBase * NewPartitionTree(std::string sTreeName, const Parameters & params) 15 | { 16 | if (sTreeName == "Rptree") return new RptreePartition(params); 17 | return NULL; 18 | } 19 | 20 | void RptreePartition::PartitionData(Dataset * m_pData, int * pPartitionId, int nPartition) 21 | { 22 | int N = m_pData->R(); 23 | 24 | // initialize a random id sequence for partition 25 | int * pIndex = new int [N]; 26 | for (int i = 0; i < N; i++) 27 | { 28 | pIndex[i] = i; 29 | } 30 | 31 | for (int i = N - 1; i > 0; i--) 32 | { 33 | int k = (rand()%10000*rand())%(i+1); 34 | std::swap(pIndex[k], pIndex[i]); 35 | } 36 | 37 | int CurrentPartitionId = 0; 38 | // start recursive partitioning 39 | 
PartitionDataByRpTree(m_pData, pIndex, pPartitionId, nPartition, CurrentPartitionId, 0, N); 40 | 41 | delete [] pIndex; 42 | } 43 | 44 | // Partition data into K divisions by recursively random projection partitioning 45 | void RptreePartition::PartitionDataByRpTree(Dataset * m_pData, int * pIndex, int * pPartitionId, int K, 46 | int & CurrentPartitionId, int iStartIndex, int iEndIndex) 47 | { 48 | if (K == 1) 49 | { 50 | for (int i = iStartIndex; i < iEndIndex; i++) pPartitionId[pIndex[i]] = CurrentPartitionId; 51 | CurrentPartitionId++; 52 | } 53 | else 54 | { 55 | // Partition into 2 parts, "iMidIndex" is the first index of the second part 56 | int iMidIndex = ChooseDivisionRPTree(m_pData, pIndex, iStartIndex, iEndIndex); 57 | 58 | // Calculate the number of partitions, proportional to its size, to be allocated to each part 59 | int leftK = int(floor(float(iMidIndex-iStartIndex)*K/(iEndIndex-iStartIndex)+0.5)); 60 | int rightK = K-leftK; 61 | 62 | // Handle the extreme case 63 | if (leftK == 0) { leftK = 1; rightK--; } 64 | if (rightK == 0) { rightK = 1; leftK--; } 65 | 66 | PartitionDataByRpTree(m_pData, pIndex, pPartitionId, leftK, CurrentPartitionId, iStartIndex, iMidIndex); 67 | PartitionDataByRpTree(m_pData, pIndex, pPartitionId, rightK, CurrentPartitionId, iMidIndex, iEndIndex); 68 | } 69 | } 70 | 71 | int RptreePartition::ChooseDivisionRPTree(Dataset * m_pData, int * pIndex, int iStartIndex, int iEndIndex) 72 | { 73 | int Dim = m_pData->C(); 74 | FloatType * Mean = new FloatType [Dim]; 75 | memset(Mean, 0, Dim*sizeof(FloatType)); 76 | 77 | // Some fixed parameters, work for almost all cases 78 | 79 | 80 | // We evaluate the quality of a partition plane by a subset of the data 81 | int iSampleEndIndex = std::min(iStartIndex+nMaxSample, iEndIndex); 82 | int nSample = iSampleEndIndex - iStartIndex; 83 | 84 | // Calculate the mean of each dimension 85 | for (int j = iStartIndex; j < iSampleEndIndex; j++) 86 | { 87 | DataType * v = (*m_pData)[pIndex[j]]; 88 | 
for (int k = 0; k < Dim; k++) Mean[k] += v[k]; 89 | } 90 | 91 | for (int k = 0; k < Dim; k++) Mean[k] /= nSample; 92 | 93 | std::vector Variance; 94 | Variance.clear(); 95 | for (int j = 0; j < Dim; j++) 96 | { 97 | Variance.push_back(KeyScorePair(j, 0)); 98 | } 99 | 100 | // Calculate the variance of each dimension 101 | for (int j = iStartIndex; j < iSampleEndIndex; j++) 102 | { 103 | DataType * v = (*m_pData)[pIndex[j]]; 104 | for (int k = 0; k < Dim; k++) 105 | { 106 | FloatType dist = v[k] - Mean[k]; 107 | Variance[k].Score += dist*dist; 108 | } 109 | } 110 | 111 | // Sort the axis by their variance and pick out the top "nAxis" ones 112 | std::sort(Variance.begin(), Variance.end(), KeyScorePair::Compare); 113 | int * AxisIndex = new int [nAxis]; 114 | float * Weight = new float [nAxis]; 115 | float * BestWeight = new float [nAxis]; 116 | float BestVariance = Variance[Dim-1].Score; 117 | for (int i = 0; i < nAxis; i++) 118 | { 119 | AxisIndex[i] = Variance[Dim-1-i].Key; 120 | BestWeight[i] = 0; 121 | } 122 | 123 | // Initial best partition plane is set to be the plane perpendicular to the axis with the max variance 124 | BestWeight[0] = 1; 125 | float BestMean = Mean[AxisIndex[0]]; 126 | 127 | float * Val = new float [nSample]; 128 | // Generate random weights to combine top "nAxis" axis to find better partition plane 129 | for (int i = 0; i < nIteration; i++) 130 | { 131 | // Generate random plane 132 | float sumweight = 0; 133 | for (int j = 0; j < nAxis; j++) 134 | { 135 | Weight[j] = float(rand()%10000)/5000.0f - 1.0f; 136 | sumweight += Weight[j] * Weight[j]; 137 | } 138 | sumweight = sqrt(sumweight); 139 | for (int j = 0; j < nAxis; j++) Weight[j] /= sumweight; 140 | 141 | // Calculate the mean of the projection 142 | float mean = 0; 143 | for (int j = 0; j < nSample; j++) 144 | { 145 | Val[j] = 0; 146 | for (int k = 0; k < nAxis; k++) Val[j] += Weight[k] * (*m_pData)[pIndex[iStartIndex+j]][AxisIndex[k]]; 147 | mean += Val[j]; 148 | } 149 | mean /= 
nSample; 150 | 151 | // Calculate the variance of the projection 152 | float var = 0; 153 | for (int j = 0; j < nSample; j++) 154 | { 155 | float dist = Val[j] - mean; 156 | var += dist * dist; 157 | } 158 | 159 | if (var > BestVariance) 160 | { 161 | BestVariance = var; 162 | BestMean = mean; 163 | for (int j = 0; j < nAxis; j++) BestWeight[j] = Weight[j]; 164 | } 165 | } 166 | 167 | delete [] Mean; 168 | 169 | int iLeft = iStartIndex; 170 | int iRight = iEndIndex-1; 171 | 172 | // decide which child one point belongs 173 | while (iLeft <= iRight) 174 | { 175 | float val = 0; 176 | for (int k = 0; k < nAxis; k++) val += BestWeight[k] * (*m_pData)[pIndex[iLeft]][AxisIndex[k]]; 177 | if (val < BestMean) 178 | { 179 | iLeft++; 180 | } 181 | else 182 | { 183 | std::swap(pIndex[iLeft], pIndex[iRight]); 184 | iRight--; 185 | } 186 | } 187 | 188 | // if all the points in the node are equal,equally split the node into 2 evenly 189 | if ((iLeft==iStartIndex) || (iLeft==iEndIndex)) 190 | { 191 | iLeft = (iStartIndex + iEndIndex)/2; 192 | } 193 | 194 | delete [] Val; 195 | delete [] AxisIndex; 196 | delete [] Weight; 197 | delete [] BestWeight; 198 | 199 | return iLeft; 200 | } 201 | } -------------------------------------------------------------------------------- /ReleaseVersion/PartitioningTree.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Cluster.h" 4 | 5 | namespace KMC 6 | { 7 | // Base class 8 | class PartitionTreeBase 9 | { 10 | public: 11 | virtual void PartitionData(Dataset * m_pData, int * pPartitionId, int nPartition) = 0; 12 | }; 13 | 14 | // Use Rptree to partition data 15 | class RptreePartition: public PartitionTreeBase 16 | { 17 | public: 18 | virtual void PartitionData(Dataset * m_pData, int * pPartitionId, int nPartition); 19 | 20 | RptreePartition() : nMaxSample(1000), nIteration(100), nAxis(5) 21 | { 22 | } 23 | 24 | RptreePartition(const Parameters & params) 25 | { 26 | 
params.Get("Rptree_nMaxSample", nMaxSample, 1000); 27 | params.Get("Rptree_nIteration", nIteration, 100); 28 | params.Get("Rptree_nAxis", nAxis, 5); 29 | //std::cout << nMaxSample << ' ' << nIteration << ' ' << nAxis << std::endl; 30 | } 31 | 32 | private: 33 | void PartitionDataByRpTree(Dataset * m_pData, int * pIndex, int * pPartitionId, int K, int & CurrentPartitionId, int iStartIndex, int iEndIndex); 34 | int ChooseDivisionRPTree(Dataset * m_pData, int * pIndex, int iStartIndex, int iEndIndex); 35 | 36 | 37 | int nMaxSample; 38 | int nIteration; 39 | int nAxis; 40 | }; 41 | 42 | // New a partition tree pointer according to its name 43 | PartitionTreeBase * NewPartitionTree(std::string sTreeName); 44 | PartitionTreeBase * NewPartitionTree(std::string sTreeName, const Parameters & params); 45 | } -------------------------------------------------------------------------------- /ReleaseVersion/ProductQuantization.cpp: -------------------------------------------------------------------------------- 1 | #include "ProductQuantization.h" 2 | 3 | ProductQuantization::ProductQuantization( 4 | const int points_count, 5 | const int dictionaries_count, 6 | const int words_count, 7 | const int space_dimension) 8 | :points_count_(points_count), 9 | dictionaries_count_(dictionaries_count), 10 | words_count_(words_count), 11 | space_dimension_(space_dimension) 12 | { 13 | if (points_count <= 0 || dictionaries_count <= 0 || words_count <= 0 || space_dimension <= 0 || space_dimension % dictionaries_count != 0) 14 | { 15 | cout << "PQ: bad input parameters\n"; 16 | throw std::logic_error("Bad input parameters"); 17 | } 18 | subspace_dimension_ = space_dimension / dictionaries_count; 19 | partition_.resize(dictionaries_count, vector(subspace_dimension_)); 20 | 21 | points_ = NULL; 22 | own_points_memory_ = false; 23 | dictionary_ = new DictionaryType[dictionaries_count*words_count*space_dimension]; 24 | memset(dictionary_, 0, 
sizeof(DictionaryType)*dictionaries_count*words_count*space_dimension); 25 | binary_codes_ = new CodeType[dictionaries_count*points_count]; 26 | memset(binary_codes_, 0, sizeof(CodeType)*dictionaries_count*points_count); 27 | 28 | distortion_ = 0; 29 | } 30 | 31 | ProductQuantization::~ProductQuantization() 32 | { 33 | if (binary_codes_) delete[] binary_codes_; 34 | if (dictionary_) delete[] dictionary_; 35 | if (own_points_memory_ && points_) delete[] points_; 36 | } 37 | 38 | void ProductQuantization::InitPoints( 39 | const string points_file, 40 | const PointStoreType point_sotre_type) 41 | { 42 | cout << "Reading points...\n"; 43 | if (!own_points_memory_) 44 | { 45 | points_ = new PointType[space_dimension_*points_count_]; 46 | own_points_memory_ = true; 47 | } 48 | ReadOneDimensionalPoints(points_file, point_sotre_type, points_, points_count_, space_dimension_); 49 | } 50 | 51 | void ProductQuantization::InitPoints( 52 | PointType* points, 53 | const int points_count, 54 | const int space_dimension) 55 | { 56 | if (points_count != points_count_ || space_dimension != space_dimension_) 57 | { 58 | cout << "unmatched points dimension\n"; 59 | throw std::logic_error("unmatched points dimension"); 60 | } 61 | cout << "Reading points...\n"; 62 | if (own_points_memory_) 63 | memcpy(points_, points, sizeof(PointType)*points_count_*space_dimension_); 64 | else 65 | points_ = points; 66 | } 67 | 68 | const DictionaryType* ProductQuantization::GetDictionary() 69 | { 70 | return dictionary_; 71 | } 72 | 73 | const CodeType* ProductQuantization::GetBinaryCodes() 74 | { 75 | return binary_codes_; 76 | } 77 | 78 | void ProductQuantization::SaveDictionary(const string output_file_prefix) 79 | { 80 | cout << "Saving dictionary in " + output_file_prefix + "D\n"; 81 | SaveOneDimensionalPoints(output_file_prefix + "D", dictionary_, dictionaries_count_*words_count_, space_dimension_); 82 | } 83 | 84 | void ProductQuantization::SaveBinaryCodes(const string output_file_prefix) 85 | 
{ 86 | cout << "Saving binary codes in " + output_file_prefix + "B\n"; 87 | SaveOneDimensionalPoints(output_file_prefix + "B", binary_codes_, points_count_, dictionaries_count_); 88 | } 89 | 90 | void ProductQuantization::SavePartition(const string output_file_prefix) 91 | { 92 | cout << "Saving partition in " + output_file_prefix + "partition\n"; 93 | ofstream partition_stream; 94 | string partition_file = output_file_prefix + "partition"; 95 | partition_stream.open(partition_file.c_str(), ios::binary); 96 | if (!partition_stream.good()) 97 | { 98 | cout << "Bad output points stream : " + output_file_prefix + "partition\n"; 99 | throw std::logic_error("Bad output partition stream"); 100 | } 101 | partition_stream.write((char *)&dictionaries_count_, sizeof(int)); 102 | partition_stream.write((char *)&subspace_dimension_, sizeof(int)); 103 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 104 | partition_stream.write(reinterpret_cast(&(partition_[dictionary_id][0])), sizeof(int)*subspace_dimension_); 105 | partition_stream.close(); 106 | } 107 | 108 | void ProductQuantization::ReadPartition(const string partition_file) 109 | { 110 | cout << "Reading partition in " + partition_file; 111 | ifstream partition_stream; 112 | partition_stream.open(partition_file.c_str(), ios::binary); 113 | if (!partition_stream.good()) 114 | { 115 | cout << "Bad input partition stream : " + partition_file << endl; 116 | throw std::logic_error("Bad input partition stream"); 117 | } 118 | int count = 0, dim = 0; 119 | partition_stream.read((char *)&count, sizeof(int)); 120 | partition_stream.read((char *)&dim, sizeof(int)); 121 | if (count != dictionaries_count_ || dim != subspace_dimension_) 122 | { 123 | cout << "unmatched partition dimension\n"; 124 | throw std::logic_error("unmatched dimension!"); 125 | } 126 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 127 | 
partition_stream.read(reinterpret_cast(&(partition_[dictionary_id][0])), sizeof(int)*subspace_dimension_); 128 | partition_stream.close(); 129 | } 130 | 131 | void ProductQuantization::IniNaturalPartition() 132 | { 133 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 134 | { 135 | for (int dim = 0; dim < subspace_dimension_; ++dim) 136 | partition_[dictionary_id][dim] = dictionary_id*subspace_dimension_ + dim; 137 | } 138 | } 139 | 140 | void ProductQuantization::IniStructurePartition() 141 | { 142 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 143 | { 144 | for (int dim = 0; dim < subspace_dimension_; ++dim) 145 | partition_[dictionary_id][dim] = dictionary_id + dim*dictionaries_count_; 146 | } 147 | } 148 | 149 | void ProductQuantization::Training( 150 | const int max_iters, 151 | const double distortion_tol, 152 | const KmeansMethod kmeans_method, 153 | const string output_file_prefix, 154 | const bool read_partition, 155 | const string partition_file) 156 | { 157 | cout << "Product Quantization Training...\n"; 158 | if (read_partition) 159 | ReadPartition(partition_file); 160 | else 161 | IniNaturalPartition(); 162 | switch (kmeans_method) 163 | { 164 | case Lloyd: 165 | LloydTraining(max_iters, distortion_tol); 166 | break; 167 | case Closure: 168 | ClosureTraining(max_iters, distortion_tol); 169 | break; 170 | } 171 | 172 | SaveDictionary(output_file_prefix); 173 | SaveBinaryCodes(output_file_prefix); 174 | SavePartition(output_file_prefix); 175 | 176 | cout << "Total distortion = " << distortion_ << endl; 177 | } 178 | 179 | void ProductQuantization::LloydTraining( 180 | const int max_iters, 181 | const double distortion_tol) 182 | { 183 | distortion_ = 0; 184 | Kmeans* kmeans = Kmeans_New(points_count_, words_count_, subspace_dimension_, NULL); 185 | 186 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 187 | { 188 | vector 
subpoints(subspace_dimension_*points_count_); 189 | #pragma omp parallel for 190 | for (int point_id = 0; point_id < points_count_; ++point_id) 191 | { 192 | for (int dim = 0; dim < subspace_dimension_; ++dim) 193 | { 194 | subpoints[point_id*subspace_dimension_ + dim] = points_[point_id*space_dimension_ + partition_[dictionary_id][dim]]; 195 | } 196 | } 197 | 198 | Kmeans_Reset(kmeans, points_count_, words_count_, subspace_dimension_, &subpoints[0]); 199 | Kmeans_Initialize(kmeans, KmeansInitial_KmeansPlusPlus); 200 | Kmeans_LloydQuantization(kmeans, max_iters, distortion_tol); 201 | 202 | DictionaryType* current_dictionary = &dictionary_[dictionary_id*words_count_*space_dimension_]; 203 | for (int word_id = 0; word_id < words_count_; ++word_id) 204 | { 205 | for (int dim = 0; dim < subspace_dimension_; ++dim) 206 | { 207 | current_dictionary[word_id*space_dimension_ + partition_[dictionary_id][dim]] = kmeans->centers_[word_id*subspace_dimension_ + dim]; 208 | } 209 | } 210 | for (int point_id = 0; point_id < points_count_; ++point_id) 211 | { 212 | binary_codes_[point_id*dictionaries_count_ + dictionary_id] = kmeans->assignments_[point_id]; 213 | } 214 | distortion_ += kmeans->distortion_; 215 | } 216 | Kmeans_Delete(kmeans); 217 | } 218 | 219 | void ProductQuantization::ClosureTraining( 220 | const int max_iters, 221 | const double distortion_tol) 222 | { 223 | distortion_ = 0; 224 | 225 | Parameters params; 226 | params.Set("NCluster", std::to_string(words_count_)); 227 | params.Set("MaxIteration", std::to_string(max_iters)); 228 | params.Set("PartitionMethod", "Rptree"); 229 | params.Set("Rptree_nMaxSample", std::to_string(1000)); 230 | params.Set("Rptree_nIteration", std::to_string(100)); 231 | params.Set("Rptree_nAxis", std::to_string(5)); 232 | 233 | params.Set("NThreads", std::to_string(omp_get_num_procs())); 234 | 235 | params.Set("Closure_MaxTreeNum", std::to_string(10)); 236 | params.Set("Closure_LeafSize", std::to_string(int(words_count_ / 10))); 237 | 
params.Set("Closure_DynamicTrees", std::to_string(1)); 238 | 239 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 240 | { 241 | vector subpoints(subspace_dimension_*points_count_); 242 | #pragma omp parallel for 243 | for (int point_id = 0; point_id < points_count_; ++point_id) 244 | { 245 | for (int dim = 0; dim < subspace_dimension_; ++dim) 246 | { 247 | subpoints[point_id*subspace_dimension_ + dim] = points_[point_id*space_dimension_ + partition_[dictionary_id][dim]]; 248 | } 249 | } 250 | Dataset subpoints_(points_count_, subspace_dimension_, &subpoints[0]); 251 | 252 | ClosureCluster CC; 253 | CC.SetData(&subpoints_); 254 | CC.LoadParameters(params); 255 | CC.RunClustering(); 256 | 257 | DictionaryType* current_dictionary = &dictionary_[dictionary_id*words_count_*space_dimension_]; 258 | for (int word_id = 0; word_id < words_count_; ++word_id) 259 | { 260 | for (int dim = 0; dim < subspace_dimension_; ++dim) 261 | { 262 | current_dictionary[word_id*space_dimension_ + partition_[dictionary_id][dim]] = (CC.GetCenter())[word_id*subspace_dimension_ + dim]; 263 | } 264 | } 265 | for (int point_id = 0; point_id < points_count_; ++point_id) 266 | { 267 | binary_codes_[point_id*dictionaries_count_ + dictionary_id] = (CC.GetCenterId())[point_id]; 268 | } 269 | distortion_ += CC.total_WCSSD; 270 | } 271 | } -------------------------------------------------------------------------------- /ReleaseVersion/ProductQuantization.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "DataUtil.h" 4 | #include "Kmeans.h" 5 | #include "time.h" 6 | #include "ClosureCluster.h" 7 | #include "Cluster.h" 8 | #include "ClusterCommon.h" 9 | #include 10 | 11 | using namespace KMC; 12 | 13 | class ProductQuantization 14 | { 15 | public: 16 | /** 17 | * The constructor function. 18 | * @param points_count The number of points in the dataset. 
19 | * @param dictionaries_count The number of dictionaries (M). 20 | * @param words_count The number of words in each dictionary (K). 21 | * @param space_dimension The dimension of database vector. 22 | */ 23 | ProductQuantization( 24 | const int points_count, 25 | const int dictionaries_count, 26 | const int words_count, 27 | const int space_dimension); 28 | 29 | /** 30 | * The deconstructor function. 31 | */ 32 | ~ProductQuantization(); 33 | 34 | 35 | /** 36 | * The initial function for points. 37 | * @param points_file The filename with points in .fvecs format or binary format. 38 | * @param point_store_type The type of points, should be FVEC, IVEC or BINARY. 39 | */ 40 | void InitPoints( 41 | const string points_file, 42 | const PointStoreType point_store_type); 43 | 44 | /** 45 | * The initial function for points. 46 | * @param points The array that stores the points. 47 | * @param points_count The number of points in the dataset. 48 | * @param space_dimension The dimension of database vector. 49 | */ 50 | void InitPoints( 51 | PointType* points, 52 | const int points_count, 53 | const int space_dimension); 54 | 55 | /** 56 | * This function returns the trained dictionary. 57 | */ 58 | const DictionaryType* GetDictionary(); 59 | 60 | /** 61 | * This function returns the trained binary codes. 62 | */ 63 | const CodeType* GetBinaryCodes(); 64 | 65 | 66 | /** 67 | * The main function that performs product quantization. 68 | * @param max_iters The maximum iteration of the algorithm. 69 | * @param distortion_tol The parameter to test the distortion relative variation in consecutive iterations. 70 | * @param kmeans_method The method of kmeans clustering adopted 71 | * @param output_file_prefix The prefix of the output file. 72 | * @param read_partition The flag that indicates whether to read partition outside. 73 | * @param partition_file The filename with partition in binary format. 
74 | */ 75 | void Training( 76 | const int max_iters, 77 | const double distortion_tol, 78 | const KmeansMethod kmeans_method, 79 | const string output_file_prefix, 80 | const bool read_partition, 81 | const string partition_file = ""); 82 | private: 83 | /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private member functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ 84 | /** 85 | * This function disallows the use of compiler-generated copy constructor function 86 | */ 87 | ProductQuantization(const ProductQuantization&); 88 | /** 89 | * This function disallows the use of compiler-generated copy assignment function 90 | */ 91 | ProductQuantization& operator=(const ProductQuantization&); 92 | 93 | 94 | /** 95 | * This function output dictionary in a binary format. 96 | */ 97 | void SaveDictionary(const string output_file_prefix); 98 | /** 99 | * This function output binary codes in a binary format. 100 | */ 101 | void SaveBinaryCodes(const string output_file_prefix); 102 | /** 103 | * This function output partition in a binary format. 104 | */ 105 | void SavePartition(const string output_file_prefix); 106 | /** 107 | * This function read partition in a binary format. 108 | */ 109 | void ReadPartition(const string partition_file); 110 | /** 111 | * This function initial partition in a natural order (for SIFT). 112 | */ 113 | void IniNaturalPartition(); 114 | /** 115 | * This function initial partition in a structure order (for GIST). 116 | */ 117 | void IniStructurePartition(); 118 | 119 | 120 | /** 121 | * This function performs product quantization training using Lloyd kmeans algorithm. 122 | * @param max_iters The maximum iteration of the algorithm. 123 | * @param distortion_tol The parameter to test the distortion relative variation in consecutive iterations. 124 | */ 125 | void LloydTraining(const int max_iters, const double distortion_tol); 126 | /** 127 | * This function performs product quantization training using Closure cluster algorithm (fast kmeans). 
128 | * @param max_iters The maximum iteration of the algorithm. 129 | * @param distortion_tol The parameter to test the distortion relative variation in consecutive iterations. 130 | */ 131 | void ClosureTraining(const int max_iters, const double distortion_tol); 132 | 133 | /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private member variables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ 134 | /** 135 | * The number of points in the dataset. 136 | */ 137 | int points_count_; 138 | /** 139 | * The number of dictionaries (M). 140 | */ 141 | int dictionaries_count_; 142 | /** 143 | * The number of words in each dictionary (K). 144 | */ 145 | int words_count_; 146 | /** 147 | * The dimension of database vector. 148 | */ 149 | int space_dimension_; 150 | /** 151 | * The dimension of subspace. 152 | */ 153 | int subspace_dimension_; 154 | 155 | 156 | /** 157 | * A two-dimensional array (of length dictionaries_count_*subspace_dimension_) 158 | * that partition_[0] containes the indexes of subspace_dimension_ that are divided into the 0th partition. 159 | */ 160 | vector> partition_; 161 | /** 162 | * A one-dimensional array (of length space_dimension_*points_count_) 163 | * that the first space_dimension_ data is the first point. 164 | */ 165 | PointType* points_; 166 | /** 167 | * A flag to indicate whether to manage the points memory. 168 | */ 169 | bool own_points_memory_; 170 | /** 171 | * A one-dimensional array (of length space_dimension_*words_count_*dictionaries_count_) 172 | * that the first (second) space_dimension_ data is the first (second) word in the first dictionary. 173 | */ 174 | DictionaryType* dictionary_; 175 | /** 176 | * A one-dimensional array (of length dictionaries_count_*points_count_) 177 | * that the frist dictionaries_count_ data is the binary codes for the first point. 178 | */ 179 | CodeType* binary_codes_; 180 | 181 | 182 | /** 183 | * Stores the distortion for all points. 
184 | */ 185 | float distortion_; 186 | }; -------------------------------------------------------------------------------- /ReleaseVersion/Searcher.cpp: -------------------------------------------------------------------------------- 1 | #include "Searcher.h" 2 | 3 | 4 | Searcher::Searcher( 5 | const int points_count, 6 | const int dictionaries_count, 7 | const int words_count, 8 | const int space_dimension, 9 | const int queries_count, 10 | const int groundtruth_length, 11 | const int result_length) 12 | :points_count_(points_count), 13 | dictionaries_count_(dictionaries_count), 14 | words_count_(words_count), 15 | space_dimension_(space_dimension), 16 | queries_count_(queries_count), 17 | groundtruth_length_(groundtruth_length), 18 | result_length_(result_length) 19 | { 20 | if (dictionaries_count <= 0 || words_count <= 0 || space_dimension <= 0 || points_count <= 0 || 21 | queries_count_ <= 0 || groundtruth_length_ <= 0 || result_length_ <= 0) 22 | { 23 | cout << "Search:: bad input parameters\n"; 24 | throw std::logic_error("Bad input parameters"); 25 | } 26 | 27 | queries_ = new QueryType[space_dimension*queries_count]; 28 | memset(queries_, 0, sizeof(QueryType)*space_dimension*queries_count); 29 | dictionary_ = new DictionaryType[dictionaries_count*words_count*space_dimension]; 30 | memset(dictionary_, 0, sizeof(DictionaryType)*dictionaries_count*words_count*space_dimension); 31 | binary_codes_ = new CodeType[dictionaries_count*points_count]; 32 | memset(binary_codes_, 0, sizeof(CodeType)*dictionaries_count*points_count); 33 | 34 | groundtruth_ = new PointIdType[queries_count*groundtruth_length]; 35 | memset(groundtruth_, 0, sizeof(PointIdType)*queries_count*groundtruth_length); 36 | 37 | results_.resize(queries_count); 38 | distance_table_.resize(queries_count, vector(dictionaries_count*words_count)); 39 | } 40 | 41 | Searcher::~Searcher() 42 | { 43 | if (groundtruth_) delete[] groundtruth_; 44 | if (binary_codes_) delete[] binary_codes_; 45 | if 
(dictionary_) delete[] dictionary_; 46 | if (queries_) delete[] queries_; 47 | } 48 | 49 | void Searcher::InitQueries( 50 | const string queries_file, 51 | const PointStoreType queries_store_type) 52 | { 53 | cout << "Reading queries in " + queries_file << endl; 54 | ReadOneDimensionalPoints(queries_file, queries_store_type, queries_, queries_count_, space_dimension_); 55 | } 56 | 57 | void Searcher::InitQueries(const QueryType* queries) 58 | { 59 | cout << "Reading queries...\n"; 60 | memcpy(queries_, queries, sizeof(QueryType)*queries_count_*space_dimension_); 61 | } 62 | 63 | void Searcher::InitGroundtruth( 64 | const string groundtruth_file, 65 | const PointStoreType groundtruth_store_type) 66 | { 67 | cout << "Reading groundtruth in " + groundtruth_file << endl; 68 | ReadOneDimensionalPoints(groundtruth_file, groundtruth_store_type, groundtruth_, queries_count_, groundtruth_length_); 69 | } 70 | 71 | void Searcher::InitGroundtruth(const PointIdType* groundtruth) 72 | { 73 | cout << "Reading groundtruth...\n"; 74 | memcpy(groundtruth_, groundtruth, sizeof(PointIdType)*queries_count_*groundtruth_length_); 75 | } 76 | 77 | void Searcher::InitDictionary( 78 | const string dictionary_file, 79 | const PointStoreType dictionary_store_type) 80 | { 81 | cout << "Reading dictionaries in " + dictionary_file << endl; 82 | ReadOneDimensionalPoints(dictionary_file, dictionary_store_type, dictionary_, dictionaries_count_*words_count_, space_dimension_); 83 | } 84 | 85 | void Searcher::InitDictionary(const DictionaryType* dictionary) 86 | { 87 | cout << "Reading dictionaries...\n"; 88 | memcpy(dictionary_, dictionary, sizeof(DictionaryType)*dictionaries_count_*words_count_*space_dimension_); 89 | } 90 | 91 | void Searcher::InitBinaryCodes( 92 | const string binary_codes_file, 93 | const PointStoreType binary_codes_store_type) 94 | { 95 | cout << "Reading binary codes in " + binary_codes_file << endl; 96 | ReadOneDimensionalPoints(binary_codes_file, binary_codes_store_type, 
binary_codes_, points_count_, dictionaries_count_); 97 | } 98 | 99 | void Searcher::InitBinaryCodes(const CodeType* binary_codes) 100 | { 101 | cout << "Reading binary codes...\n"; 102 | memcpy(binary_codes_, binary_codes, sizeof(CodeType)*dictionaries_count_*points_count_); 103 | } 104 | 105 | void Searcher::SaveNearestNeighborsId(const string output_retrieved_results_file) 106 | { 107 | cout << "Saving retrieved results in " + output_retrieved_results_file << endl; 108 | ofstream out_results(output_retrieved_results_file, ios::binary); 109 | if (!out_results.good()) 110 | { 111 | cout << "Bad output retrieved results file stream : " + output_retrieved_results_file << endl; 112 | throw std::logic_error("Bad output file stream"); 113 | } 114 | out_results.write((char*)&queries_count_, sizeof(int)); 115 | out_results.write((char*)&result_length_, sizeof(int)); 116 | for (int query_id = 0; query_id < queries_count_; ++query_id) 117 | { 118 | for (int length = 0; length < result_length_; ++length) 119 | { 120 | out_results.write(reinterpret_cast(&(results_[query_id][length].second)), sizeof(PointIdType)); 121 | } 122 | } 123 | out_results.close(); 124 | } 125 | 126 | void Searcher::GetDistanceTable(const int query_id) 127 | { 128 | QueryType* query = &(queries_[query_id * space_dimension_]); 129 | DistanceType* distance_table_for_current_query = &(distance_table_[query_id][0]); 130 | 131 | for (int word_id = 0; word_id < dictionaries_count_*words_count_; ++word_id) 132 | { 133 | float distance = 0; 134 | DictionaryType* word = &(dictionary_[word_id*space_dimension_]); 135 | for (int dimension = 0; dimension < space_dimension_; ++dimension) 136 | { 137 | distance += (query[dimension] - word[dimension])*(query[dimension] - word[dimension]); 138 | } 139 | distance_table_for_current_query[word_id] = distance; 140 | } 141 | } 142 | 143 | void Searcher::GetNearestNeighborsForEachQuery(const int query_id) 144 | { 145 | GetDistanceTable(query_id); 146 | 147 | DistanceType* 
distance_table_for_current_query = &(distance_table_[query_id][0]); 148 | results_[query_id].resize(result_length_, std::make_pair(FLT_MAX, -1)); 149 | vector * result_list = &(results_[query_id]); 150 | 151 | std::make_heap(result_list->begin(), result_list->end()); 152 | for (int point_id = 0; point_id < points_count_; ++point_id) 153 | { 154 | CodeType* point_codes = &binary_codes_[point_id*dictionaries_count_]; 155 | float distance = 0; 156 | for (int dictionary_id = 0; dictionary_id < dictionaries_count_; ++dictionary_id) 157 | { 158 | distance += distance_table_for_current_query[dictionary_id*words_count_ + point_codes[dictionary_id]]; 159 | } 160 | if (distance < result_list->front().first) 161 | { 162 | std::pop_heap(result_list->begin(), result_list->end()); 163 | result_list->pop_back(); 164 | result_list->push_back(std::make_pair(distance, point_id)); 165 | std::push_heap(result_list->begin(), result_list->end()); 166 | } 167 | } 168 | std::sort(result_list->begin(), result_list->end()); 169 | } 170 | 171 | void Searcher::GetNearestNeighbors(const string output_retrieved_results_file) 172 | { 173 | cout << "Searching (after read queries, dictionary, binary codes)...\n"; 174 | for (int query_id = 0; query_id < queries_count_; ++query_id) 175 | { 176 | cout << query_id << endl; 177 | GetNearestNeighborsForEachQuery(query_id); 178 | } 179 | SaveNearestNeighborsId(output_retrieved_results_file); 180 | } 181 | 182 | void Searcher::GetRecall( 183 | const vector & retrieved_lengths_considered, 184 | const int n_nearest_groundturths) 185 | { 186 | if (n_nearest_groundturths > groundtruth_length_) 187 | { 188 | cout << "too large number of nearest groundtruth neighbors (" << n_nearest_groundturths << ") considered, should be 1 to " << groundtruth_length_ << endl; 189 | throw std::logic_error("too large number of nearest groundtruth neighbors considered"); 190 | } 191 | for (int r_id = 0; r_id < retrieved_lengths_considered.size(); ++r_id) 192 | { 193 | float 
recall = GetRecallAt(groundtruth_, groundtruth_length_, retrieved_lengths_considered[r_id], n_nearest_groundturths); 194 | cout << "recall@" << retrieved_lengths_considered[r_id] << " (T=" << n_nearest_groundturths << "): " << recall << endl; 195 | } 196 | } 197 | 198 | float Searcher::GetRecallAt( 199 | const PointIdType* groundtruth, 200 | const int groundtruth_length, 201 | const int retrieved_length_considered, 202 | const int n_nearest_groundturths) 203 | { 204 | if (groundtruth == NULL) { 205 | cout << "Groundtruth is empty!" << endl; 206 | return 0; 207 | } 208 | float recall = 0; 209 | for (int query_id = 0; query_id < queries_count_; ++query_id) 210 | { 211 | int count = 0; 212 | for (int index = 0; index < retrieved_length_considered && index < results_.size(); ++index) 213 | { 214 | for (int nearest_id = 0; nearest_id < n_nearest_groundturths; ++nearest_id) 215 | { 216 | if (results_[query_id][index].second == groundtruth[query_id*groundtruth_length + nearest_id]) 217 | { 218 | count++; 219 | } 220 | } 221 | } 222 | recall += count * 1.0 / n_nearest_groundturths; 223 | } 224 | return recall / queries_count_; 225 | } 226 | 227 | void Searcher::ReadResults(const string results_file, const int queries_count, const int result_length) 228 | { 229 | cout << "Reading retrieved results in " + results_file << endl; 230 | queries_count_ = queries_count; 231 | result_length_ = result_length; 232 | 233 | ifstream results_stream; 234 | results_stream.open(results_file.c_str(), ios::binary); 235 | if (!results_stream.good()) 236 | { 237 | cout << "Bad results stream: " + results_file << endl; 238 | throw std::logic_error("Bad input points stream"); 239 | } 240 | 241 | int dim = 0, count = 0; 242 | results_stream.read((char *)&count, sizeof(int)); 243 | results_stream.read((char *)&dim, sizeof(int)); 244 | if (dim != result_length || count != queries_count) 245 | { 246 | cout << "unmatched retrieved results dimension\n"; 247 | throw std::logic_error("unmatched 
dimension!"); 248 | } 249 | cout << "Dimension of the vector set:" << dim << endl; 250 | cout << "Number of the vector set:" << count << endl; 251 | PointIdType id = 0; 252 | for (int query_id = 0; query_id < queries_count; ++query_id) 253 | { 254 | results_[query_id].resize(result_length); 255 | for (int length = 0; length < result_length; ++length) 256 | { 257 | results_stream.read(reinterpret_cast(&id), sizeof(PointIdType)); 258 | results_[query_id][length] = std::make_pair(1.0, id); 259 | } 260 | } 261 | 262 | results_stream.close(); 263 | } -------------------------------------------------------------------------------- /ReleaseVersion/Searcher.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "DataUtil.h" 4 | #include 5 | 6 | 7 | class Searcher 8 | { 9 | public: 10 | /** 11 | * The constructor function. 12 | * @param points_count The number of points in the dataset. 13 | * @param dictionaries_count The number of dictionaries (M). 14 | * @param words_count the The number of words in each dictionary (K). 15 | * @param space_dimension The dimension of database vector. 16 | * @param queries_count The number of queries. 17 | * @param groundtruth_length The length of groundtruth neighbors. 18 | * @param result_length The number of list length retrived from dataset. 19 | */ 20 | Searcher( 21 | const int points_count, 22 | const int dictionaries_count, 23 | const int words_count, 24 | const int space_dimension, 25 | const int queries_count, 26 | const int groundtruth_length, 27 | const int result_length = 1000); 28 | 29 | /** 30 | * The deconstructor function. 31 | */ 32 | ~Searcher(); 33 | 34 | 35 | /** 36 | * The initial function for points. 37 | * @param queries_file The filename with queries in .fvecs format or binary format. 38 | * @param queries_store_type The type of queries, should be FVEC, IVEC or BINARY. 
39 | */ 40 | void InitQueries( 41 | const string queries_file, 42 | const PointStoreType queries_store_type); 43 | 44 | /** 45 | * The initial function for points. 46 | * @param queries A one-dimensional array (of length space_dimension_*queries_count_) 47 | * that the first space_dimension_ data is the first query. 48 | */ 49 | void InitQueries(const QueryType* queries); 50 | 51 | /** 52 | * The initial function for points. 53 | * @param groundtruth_file The filename with groundtruth in .fvecs format or binary format. 54 | * @param groundtruth_store_type The type of groundtruth, should be IVEC or BINARY. 55 | */ 56 | void InitGroundtruth( 57 | const string groundtruth_file, 58 | const PointStoreType groundtruth_store_type); 59 | 60 | /** 61 | * The initial function for points. 62 | * @param groundtruth A one-dimensional array (of length queries_count_*groundtruth_length_) 63 | * that the first groundtruth_length_ data is the nearest neighbors of the first query. 64 | */ 65 | void InitGroundtruth(const PointIdType* groundtruth); 66 | 67 | 68 | /** 69 | * The initial function for dictionary. 70 | * @param dictionary_file The filename with dictionary in binary format. 71 | * @param dictionary_store_type The type of dictionary, should be BINARY. 72 | */ 73 | void InitDictionary( 74 | const string dictionary_file, 75 | const PointStoreType dictionary_store_type); 76 | 77 | /** 78 | * The initial function for points. 79 | * @param dictionary The array that stores the dictionary. 80 | */ 81 | void InitDictionary(const DictionaryType* dictionary); 82 | 83 | /** 84 | * The initial function for dictionary. 85 | * @param binary_codes_file The filename with binary codes in binary format. 86 | * @param binary_codes_store_type The type of binary codes, should be BINARY. 87 | */ 88 | void InitBinaryCodes( 89 | const string binary_codes_file, 90 | const PointStoreType binary_codes_store_type); 91 | 92 | /** 93 | * The initial function for points. 
94 | * @param binary_codes The array that stores the binary codes. 95 | */ 96 | void InitBinaryCodes(const CodeType* binary_codes); 97 | 98 | 99 | /** 100 | * This function read results from outside through results_file, after read the GetRecall function can be called to compute recall. 101 | * @param results_file The filename with results in binary format. 102 | * @param quereis_count The number of queries. 103 | * @param result_length The number of list length retrieved from the dataset. 104 | */ 105 | void ReadResults( 106 | const string results_file, 107 | const int queries_count, 108 | const int result_length); 109 | 110 | 111 | /** 112 | * The main function that retrieve the nearest neighbors of queries given the dictionay and binary codes. 113 | * @param output_retrieved_results_file The filename that will be used to save the retrieval results. 114 | */ 115 | void GetNearestNeighbors(const string output_retrieved_results_file); 116 | 117 | /** 118 | * This function computes the performance in terms of recall@R with R being e.g., 1, 10, 100. 119 | * @param retrieved_lengths_considered The set of R parameters. 120 | * @param n_nearest_groundturths The number of nearest groundtruth neighbors considered 121 | */ 122 | void GetRecall( 123 | const vector & retrieved_lengths_considered, 124 | const int n_nearest_groundturths); 125 | 126 | /** 127 | * This function computes the performance in terms of recall@R with R being e.g., 1, 10, 100. 128 | * @param groundtruth The two-dimensional arrays with groudtruth nearest neighbors for all the queries. 129 | * @param groundtruth_length The length of groundtruth neighbors. 130 | * @param retrieved_length_considered The specific R parameters. 
131 | * @param n_nearest_groundturths The number of nearest groundtruth neighbors considered 132 | */ 133 | float GetRecallAt( 134 | const PointIdType* groundtruth, 135 | const int groundtruth_length, 136 | const int retrieved_length_considered, 137 | const int n_nearest_groundturths); 138 | 139 | private: 140 | /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private member functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ 141 | /** 142 | * This function disallows the use of compiler-generated copy constructor function 143 | */ 144 | Searcher(const Searcher&); 145 | /** 146 | * This function disallows the use of compiler-generated copy assignment function 147 | */ 148 | Searcher& operator=(const Searcher&); 149 | 150 | 151 | /** 152 | * This function computes the nearest neighbors for the current query. 153 | * @param query_id The id of the current query. 154 | */ 155 | void GetNearestNeighborsForEachQuery(const int query_id); 156 | 157 | /** 158 | * This function compuste the distance lookup table for the current query. 159 | * @param query_id The id of the current query. 160 | */ 161 | void GetDistanceTable(const int query_id); 162 | 163 | /** 164 | * This function saves the retrieval results in the output_file. 165 | * @param output_retrieved_results_file The filename that will be used to save the retrieval results. 166 | */ 167 | void SaveNearestNeighborsId(const string output_retrieved_results_file); 168 | 169 | 170 | /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ private member variables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ 171 | /** 172 | * The number of points in the dataset. 173 | */ 174 | int points_count_; 175 | /** 176 | * The number of dictionaries (M). 177 | */ 178 | int dictionaries_count_; 179 | /** 180 | * The number of words in each dictionary (K). 181 | */ 182 | int words_count_; 183 | /** 184 | * The dimension of database vector. 185 | */ 186 | int space_dimension_; 187 | /** 188 | * The number of queries. 
189 | */ 190 | int queries_count_; 191 | /** 192 | * The number of list length of the groundturth nearest neighbors. 193 | */ 194 | int groundtruth_length_; 195 | /** 196 | * The number of list length retrived from the dataset. 197 | */ 198 | int result_length_; 199 | 200 | 201 | /** 202 | * A one-dimensional array (of length space_dimension_*queries_count_) 203 | * that the first space_dimension_ data is the first query. 204 | */ 205 | QueryType* queries_; 206 | /** 207 | * A one-dimensional array (of length space_dimension_*words_count_*dictionaries_count_) 208 | * that the first (second) space_dimension_ data is the first (second) word in the first dictionary. 209 | */ 210 | DictionaryType* dictionary_; 211 | /** 212 | * A one-dimensional array (of length dictionaries_count_*points_count_) 213 | * that the frist dictionaries_count_ data is the binary codes for the first point. 214 | */ 215 | CodeType* binary_codes_; 216 | 217 | 218 | /** 219 | * A one-dimensional array (of length queries_count_*groundtruth_length_) 220 | * that the first groundtruth_length_ data is the nearest neighbors of the first query. 221 | */ 222 | PointIdType* groundtruth_; 223 | /** 224 | * A two-dimensional array (of queries_count_*result_length_) 225 | * that the first (second) result_length_ data is the retrive results for the first (second) query vector. 226 | */ 227 | vector> results_; 228 | 229 | 230 | /** 231 | * Temporary variable: a two-dimensional array (of length queries_count_ * (words_count_*dictionaries_count_)) 232 | * that the first (words_count_*dictionaries_count_) data is the distance from each word to the first query. 
233 | */ 234 | vector> distance_table_; 235 | }; 236 | -------------------------------------------------------------------------------- /ReleaseVersion/config.txt: -------------------------------------------------------------------------------- 1 | PQ=0 2 | NCQ=0 3 | CQ=0 4 | Search=1 5 | 6 | # global parameters 7 | points_count=1000000 8 | dictionaries_count=8 9 | words_count=256 10 | space_dimension=128 11 | points_file=\\4wzh122\d$\code\MATLAB\sift\sift_base.fvecs 12 | output_file_prefix=\\4wzh122\d$\temp\ 13 | max_iter=30 14 | 15 | # PQ parameters 16 | distortion_tol=0.0001 17 | read_partition=0 18 | partition_file= 19 | # if 101 then using closure cluster, else lloyd kmeans 20 | kmeans_method=101 21 | 22 | # NCQ and CQ parameters 23 | num_sep=20 24 | # initial from outside, if 1 then set the file name of dictinary and codes 25 | initial_from_outside=0 26 | dictionary_file= 27 | binary_codes_file= 28 | 29 | # CQ parameters 30 | mu=0.0004 31 | 32 | # Search parameters 33 | queries_count=10000 34 | groundtruth_length=100 35 | result_length=100 36 | queries_file=\\4wzh122\d$\code\MATLAB\sift\sift_query.fvecs 37 | groundtruth_file=\\4wzh122\d$\code\MATLAB\sift\sift_groundtruth.ivecs 38 | trained_dictionary_file=D:\temp\D 39 | trained_binary_codes_file=D:\temp\B 40 | output_retrieved_results_file=\\4wzh122\d$\temp\results 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /ReleaseVersion/lbfgs.h: -------------------------------------------------------------------------------- 1 | /* 2 | * C library of Limited memory BFGS (L-BFGS). 3 | * 4 | * Copyright (c) 1990, Jorge Nocedal 5 | * Copyright (c) 2007-2010 Naoaki Okazaki 6 | * All rights reserved. 
7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 | * THE SOFTWARE. 25 | */ 26 | 27 | /* $Id$ */ 28 | 29 | #ifndef __LBFGS_H__ 30 | #define __LBFGS_H__ 31 | 32 | #ifdef __cplusplus 33 | extern "C" { 34 | #endif/*__cplusplus*/ 35 | 36 | /* 37 | * The default precision of floating point values is 64bit (double). 38 | */ 39 | #ifndef LBFGS_FLOAT 40 | #define LBFGS_FLOAT 64 41 | #endif/*LBFGS_FLOAT*/ 42 | 43 | /* 44 | * Activate optimization routines for IEEE754 floating point values. 45 | */ 46 | #ifndef LBFGS_IEEE_FLOAT 47 | #define LBFGS_IEEE_FLOAT 1 48 | #endif/*LBFGS_IEEE_FLOAT*/ 49 | 50 | #if LBFGS_FLOAT == 32 51 | typedef float lbfgsfloatval_t; 52 | 53 | #elif LBFGS_FLOAT == 64 54 | typedef double lbfgsfloatval_t; 55 | 56 | #else 57 | #error "libLBFGS supports single (float; LBFGS_FLOAT = 32) or double (double; LBFGS_FLOAT=64) precision only." 
58 | 59 | #endif 60 | 61 | 62 | /** 63 | * \addtogroup liblbfgs_api libLBFGS API 64 | * @{ 65 | * 66 | * The libLBFGS API. 67 | */ 68 | 69 | /** 70 | * Return values of lbfgs(). 71 | * 72 | * Roughly speaking, a negative value indicates an error. 73 | */ 74 | enum { 75 | /** L-BFGS reaches convergence. */ 76 | LBFGS_SUCCESS = 0, 77 | LBFGS_CONVERGENCE = 0, 78 | LBFGS_STOP, 79 | /** The initial variables already minimize the objective function. */ 80 | LBFGS_ALREADY_MINIMIZED, 81 | 82 | /** Unknown error. */ 83 | LBFGSERR_UNKNOWNERROR = -1024, 84 | /** Logic error. */ 85 | LBFGSERR_LOGICERROR, 86 | /** Insufficient memory. */ 87 | LBFGSERR_OUTOFMEMORY, 88 | /** The minimization process has been canceled. */ 89 | LBFGSERR_CANCELED, 90 | /** Invalid number of variables specified. */ 91 | LBFGSERR_INVALID_N, 92 | /** Invalid number of variables (for SSE) specified. */ 93 | LBFGSERR_INVALID_N_SSE, 94 | /** The array x must be aligned to 16 (for SSE). */ 95 | LBFGSERR_INVALID_X_SSE, 96 | /** Invalid parameter lbfgs_parameter_t::epsilon specified. */ 97 | LBFGSERR_INVALID_EPSILON, 98 | /** Invalid parameter lbfgs_parameter_t::past specified. */ 99 | LBFGSERR_INVALID_TESTPERIOD, 100 | /** Invalid parameter lbfgs_parameter_t::delta specified. */ 101 | LBFGSERR_INVALID_DELTA, 102 | /** Invalid parameter lbfgs_parameter_t::linesearch specified. */ 103 | LBFGSERR_INVALID_LINESEARCH, 104 | /** Invalid parameter lbfgs_parameter_t::max_step specified. */ 105 | LBFGSERR_INVALID_MINSTEP, 106 | /** Invalid parameter lbfgs_parameter_t::max_step specified. */ 107 | LBFGSERR_INVALID_MAXSTEP, 108 | /** Invalid parameter lbfgs_parameter_t::ftol specified. */ 109 | LBFGSERR_INVALID_FTOL, 110 | /** Invalid parameter lbfgs_parameter_t::wolfe specified. */ 111 | LBFGSERR_INVALID_WOLFE, 112 | /** Invalid parameter lbfgs_parameter_t::gtol specified. */ 113 | LBFGSERR_INVALID_GTOL, 114 | /** Invalid parameter lbfgs_parameter_t::xtol specified. 
*/ 115 | LBFGSERR_INVALID_XTOL, 116 | /** Invalid parameter lbfgs_parameter_t::max_linesearch specified. */ 117 | LBFGSERR_INVALID_MAXLINESEARCH, 118 | /** Invalid parameter lbfgs_parameter_t::orthantwise_c specified. */ 119 | LBFGSERR_INVALID_ORTHANTWISE, 120 | /** Invalid parameter lbfgs_parameter_t::orthantwise_start specified. */ 121 | LBFGSERR_INVALID_ORTHANTWISE_START, 122 | /** Invalid parameter lbfgs_parameter_t::orthantwise_end specified. */ 123 | LBFGSERR_INVALID_ORTHANTWISE_END, 124 | /** The line-search step went out of the interval of uncertainty. */ 125 | LBFGSERR_OUTOFINTERVAL, 126 | /** A logic error occurred; alternatively, the interval of uncertainty 127 | became too small. */ 128 | LBFGSERR_INCORRECT_TMINMAX, 129 | /** A rounding error occurred; alternatively, no line-search step 130 | satisfies the sufficient decrease and curvature conditions. */ 131 | LBFGSERR_ROUNDING_ERROR, 132 | /** The line-search step became smaller than lbfgs_parameter_t::min_step. */ 133 | LBFGSERR_MINIMUMSTEP, 134 | /** The line-search step became larger than lbfgs_parameter_t::max_step. */ 135 | LBFGSERR_MAXIMUMSTEP, 136 | /** The line-search routine reaches the maximum number of evaluations. */ 137 | LBFGSERR_MAXIMUMLINESEARCH, 138 | /** The algorithm routine reaches the maximum number of iterations. */ 139 | LBFGSERR_MAXIMUMITERATION, 140 | /** Relative width of the interval of uncertainty is at most 141 | lbfgs_parameter_t::xtol. */ 142 | LBFGSERR_WIDTHTOOSMALL, 143 | /** A logic error (negative line-search step) occurred. */ 144 | LBFGSERR_INVALIDPARAMETERS, 145 | /** The current search direction increases the objective function value. */ 146 | LBFGSERR_INCREASEGRADIENT, 147 | }; 148 | 149 | /** 150 | * Line search algorithms. 151 | */ 152 | enum { 153 | /** The default algorithm (MoreThuente method). */ 154 | LBFGS_LINESEARCH_DEFAULT = 0, 155 | /** MoreThuente method proposed by More and Thuente.
*/ 156 | LBFGS_LINESEARCH_MORETHUENTE = 0, 157 | /** 158 | * Backtracking method with the Armijo condition. 159 | * The backtracking method finds the step length such that it satisfies 160 | * the sufficient decrease (Armijo) condition, 161 | * - f(x + a * d) <= f(x) + lbfgs_parameter_t::ftol * a * g(x)^T d, 162 | * 163 | * where x is the current point, d is the current search direction, and 164 | * a is the step length. 165 | */ 166 | LBFGS_LINESEARCH_BACKTRACKING_ARMIJO = 1, 167 | /** The backtracking method with the defualt (regular Wolfe) condition. */ 168 | LBFGS_LINESEARCH_BACKTRACKING = 2, 169 | /** 170 | * Backtracking method with regular Wolfe condition. 171 | * The backtracking method finds the step length such that it satisfies 172 | * both the Armijo condition (LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) 173 | * and the curvature condition, 174 | * - g(x + a * d)^T d >= lbfgs_parameter_t::wolfe * g(x)^T d, 175 | * 176 | * where x is the current point, d is the current search direction, and 177 | * a is the step length. 178 | */ 179 | LBFGS_LINESEARCH_BACKTRACKING_WOLFE = 2, 180 | /** 181 | * Backtracking method with strong Wolfe condition. 182 | * The backtracking method finds the step length such that it satisfies 183 | * both the Armijo condition (LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) 184 | * and the following condition, 185 | * - |g(x + a * d)^T d| <= lbfgs_parameter_t::wolfe * |g(x)^T d|, 186 | * 187 | * where x is the current point, d is the current search direction, and 188 | * a is the step length. 189 | */ 190 | LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 3, 191 | }; 192 | 193 | /** 194 | * L-BFGS optimization parameters. 195 | * Call lbfgs_parameter_init() function to initialize parameters to the 196 | * default values. 197 | */ 198 | typedef struct { 199 | /** 200 | * The number of corrections to approximate the inverse hessian matrix. 
201 | * The L-BFGS routine stores the computation results of previous \ref m 202 | * iterations to approximate the inverse hessian matrix of the current 203 | * iteration. This parameter controls the size of the limited memories 204 | * (corrections). The default value is \c 6. Values less than \c 3 are 205 | * not recommended. Large values will result in excessive computing time. 206 | */ 207 | int m; 208 | 209 | /** 210 | * Epsilon for convergence test. 211 | * This parameter determines the accuracy with which the solution is to 212 | * be found. A minimization terminates when 213 | * ||g|| < \ref epsilon * max(1, ||x||), 214 | * where ||.|| denotes the Euclidean (L2) norm. The default value is 215 | * \c 1e-5. 216 | */ 217 | lbfgsfloatval_t epsilon; 218 | 219 | /** 220 | * Distance for delta-based convergence test. 221 | * This parameter determines the distance, in iterations, to compute 222 | * the rate of decrease of the objective function. If the value of this 223 | * parameter is zero, the library does not perform the delta-based 224 | * convergence test. The default value is \c 0. 225 | */ 226 | int past; 227 | 228 | /** 229 | * Delta for convergence test. 230 | * This parameter determines the minimum rate of decrease of the 231 | * objective function. The library stops iterations when the 232 | * following condition is met: 233 | * (f' - f) / f < \ref delta, 234 | * where f' is the objective value of \ref past iterations ago, and f is 235 | * the objective value of the current iteration. 236 | * The default value is \c 0. 237 | */ 238 | lbfgsfloatval_t delta; 239 | 240 | /** 241 | * The maximum number of iterations. 242 | * The lbfgs() function terminates an optimization process with 243 | * ::LBFGSERR_MAXIMUMITERATION status code when the iteration count 244 | * exceedes this parameter. Setting this parameter to zero continues an 245 | * optimization process until a convergence or error. The default value 246 | * is \c 0. 
247 | */ 248 | int max_iterations; 249 | 250 | /** 251 | * The line search algorithm. 252 | * This parameter specifies a line search algorithm to be used by the 253 | * L-BFGS routine. 254 | */ 255 | int linesearch; 256 | 257 | /** 258 | * The maximum number of trials for the line search. 259 | * This parameter controls the number of function and gradients evaluations 260 | * per iteration for the line search routine. The default value is \c 20. 261 | */ 262 | int max_linesearch; 263 | 264 | /** 265 | * The minimum step of the line search routine. 266 | * The default value is \c 1e-20. This value need not be modified unless 267 | * the exponents are too large for the machine being used, or unless the 268 | * problem is extremely badly scaled (in which case the exponents should 269 | * be increased). 270 | */ 271 | lbfgsfloatval_t min_step; 272 | 273 | /** 274 | * The maximum step of the line search. 275 | * The default value is \c 1e+20. This value need not be modified unless 276 | * the exponents are too large for the machine being used, or unless the 277 | * problem is extremely badly scaled (in which case the exponents should 278 | * be increased). 279 | */ 280 | lbfgsfloatval_t max_step; 281 | 282 | /** 283 | * A parameter to control the accuracy of the line search routine. 284 | * The default value is \c 1e-4. This parameter should be greater 285 | * than zero and smaller than \c 0.5. 286 | */ 287 | lbfgsfloatval_t ftol; 288 | 289 | /** 290 | * A coefficient for the Wolfe condition. 291 | * This parameter is valid only when the backtracking line-search 292 | * algorithm is used with the Wolfe condition, 293 | * ::LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE or 294 | * ::LBFGS_LINESEARCH_BACKTRACKING_WOLFE . 295 | * The default value is \c 0.9. This parameter should be greater 296 | * the \ref ftol parameter and smaller than \c 1.0. 297 | */ 298 | lbfgsfloatval_t wolfe; 299 | 300 | /** 301 | * A parameter to control the accuracy of the line search routine. 
302 | * The default value is \c 0.9. If the function and gradient 303 | * evaluations are inexpensive with respect to the cost of the 304 | * iteration (which is sometimes the case when solving very large 305 | * problems) it may be advantageous to set this parameter to a small 306 | * value. A typical small value is \c 0.1. This parameter should be 307 | * greater than the \ref ftol parameter (\c 1e-4) and smaller than 308 | * \c 1.0. 309 | */ 310 | lbfgsfloatval_t gtol; 311 | 312 | /** 313 | * The machine precision for floating-point values. 314 | * This parameter must be a positive value set by a client program to 315 | * estimate the machine precision. The line search routine will terminate 316 | * with the status code (::LBFGSERR_ROUNDING_ERROR) if the relative width 317 | * of the interval of uncertainty is less than this parameter. 318 | */ 319 | lbfgsfloatval_t xtol; 320 | 321 | /** 322 | * Coefficient for the L1 norm of variables. 323 | * This parameter should be set to zero for standard minimization 324 | * problems. Setting this parameter to a positive value activates 325 | * Orthant-Wise Limited-memory Quasi-Newton (OWL-QN) method, which 326 | * minimizes the objective function F(x) combined with the L1 norm |x| 327 | * of the variables, {F(x) + C |x|}. This parameter is the coefficient 328 | * for the |x|, i.e., C. As the L1 norm |x| is not differentiable at 329 | * zero, the library modifies function and gradient evaluations from 330 | * a client program suitably; a client program thus has only to return 331 | * the function value F(x) and gradients G(x) as usual. The default value 332 | * is zero. 333 | */ 334 | lbfgsfloatval_t orthantwise_c; 335 | 336 | /** 337 | * Start index for computing L1 norm of the variables. 338 | * This parameter is valid only for OWL-QN method 339 | * (i.e., \ref orthantwise_c != 0).
This parameter b (0 <= b < N) 340 | * specifies the index number from which the library computes the 341 | * L1 norm of the variables x, 342 | * |x| := |x_{b}| + |x_{b+1}| + ... + |x_{N}| . 343 | * In other words, variables x_1, ..., x_{b-1} are not used for 344 | * computing the L1 norm. Setting b (0 < b < N), one can protect 345 | * variables, x_1, ..., x_{b-1} (e.g., a bias term of logistic 346 | * regression) from being regularized. The default value is zero. 347 | */ 348 | int orthantwise_start; 349 | 350 | /** 351 | * End index for computing L1 norm of the variables. 352 | * This parameter is valid only for OWL-QN method 353 | * (i.e., \ref orthantwise_c != 0). This parameter e (0 < e <= N) 354 | * specifies the index number at which the library stops computing the 355 | * L1 norm of the variables x, 356 | */ 357 | int orthantwise_end; 358 | } lbfgs_parameter_t; 359 | 360 | 361 | /** 362 | * Callback interface to provide objective function and gradient evaluations. 363 | * 364 | * The lbfgs() function call this function to obtain the values of objective 365 | * function and its gradients when needed. A client program must implement 366 | * this function to evaluate the values of the objective function and its 367 | * gradients, given current values of variables. 368 | * 369 | * @param instance The user data sent for lbfgs() function by the client. 370 | * @param x The current values of variables. 371 | * @param g The gradient vector. The callback function must compute 372 | * the gradient values for the current variables. 373 | * @param n The number of variables. 374 | * @param step The current step of the line search routine. 375 | * @retval lbfgsfloatval_t The value of the objective function for the current 376 | * variables. 
377 | */ 378 | typedef lbfgsfloatval_t (*lbfgs_evaluate_t)( 379 | void *instance, 380 | const lbfgsfloatval_t *x, 381 | lbfgsfloatval_t *g, 382 | const int n, 383 | const lbfgsfloatval_t step 384 | ); 385 | 386 | /** 387 | * Callback interface to receive the progress of the optimization process. 388 | * 389 | * The lbfgs() function call this function for each iteration. Implementing 390 | * this function, a client program can store or display the current progress 391 | * of the optimization process. 392 | * 393 | * @param instance The user data sent for lbfgs() function by the client. 394 | * @param x The current values of variables. 395 | * @param g The current gradient values of variables. 396 | * @param fx The current value of the objective function. 397 | * @param xnorm The Euclidean norm of the variables. 398 | * @param gnorm The Euclidean norm of the gradients. 399 | * @param step The line-search step used for this iteration. 400 | * @param n The number of variables. 401 | * @param k The iteration count. 402 | * @param ls The number of evaluations called for this iteration. 403 | * @retval int Zero to continue the optimization process. Returning a 404 | * non-zero value will cancel the optimization process. 405 | */ 406 | typedef int (*lbfgs_progress_t)( 407 | void *instance, 408 | const lbfgsfloatval_t *x, 409 | const lbfgsfloatval_t *g, 410 | const lbfgsfloatval_t fx, 411 | const lbfgsfloatval_t xnorm, 412 | const lbfgsfloatval_t gnorm, 413 | const lbfgsfloatval_t step, 414 | int n, 415 | int k, 416 | int ls 417 | ); 418 | 419 | /* 420 | A user must implement a function compatible with ::lbfgs_evaluate_t (evaluation 421 | callback) and pass the pointer to the callback function to lbfgs() arguments. 422 | Similarly, a user can implement a function compatible with ::lbfgs_progress_t 423 | (progress callback) to obtain the current progress (e.g., variables, function 424 | value, ||G||, etc) and to cancel the iteration process if necessary. 
425 | Implementation of a progress callback is optional: a user can pass \c NULL if 426 | progress notification is not necessary. 427 | 428 | In addition, a user must preserve two requirements: 429 | - The number of variables must be multiples of 16 (this is not 4). 430 | - The memory block of variable array ::x must be aligned to 16. 431 | 432 | This algorithm terminates an optimization 433 | when: 434 | 435 | ||G|| < \epsilon \cdot \max(1, ||x||) . 436 | 437 | In this formula, ||.|| denotes the Euclidean norm. 438 | */ 439 | 440 | /** 441 | * Start a L-BFGS optimization. 442 | * 443 | * @param n The number of variables. 444 | * @param x The array of variables. A client program can set 445 | * default values for the optimization and receive the 446 | * optimization result through this array. This array 447 | * must be allocated by ::lbfgs_malloc function 448 | * for libLBFGS built with SSE/SSE2 optimization routine 449 | * enabled. The library built without SSE/SSE2 450 | * optimization does not have such a requirement. 451 | * @param ptr_fx The pointer to the variable that receives the final 452 | * value of the objective function for the variables. 453 | * This argument can be set to \c NULL if the final 454 | * value of the objective function is unnecessary. 455 | * @param proc_evaluate The callback function to provide function and 456 | * gradient evaluations given a current values of 457 | * variables. A client program must implement a 458 | * callback function compatible with \ref 459 | * lbfgs_evaluate_t and pass the pointer to the 460 | * callback function. 461 | * @param proc_progress The callback function to receive the progress 462 | * (the number of iterations, the current value of 463 | * the objective function) of the minimization 464 | * process. This argument can be set to \c NULL if 465 | * a progress report is unnecessary. 466 | * @param instance A user data for the client program. 
The callback 467 | * functions will receive the value of this argument. 468 | * @param param The pointer to a structure representing parameters for 469 | * L-BFGS optimization. A client program can set this 470 | * parameter to \c NULL to use the default parameters. 471 | * Call lbfgs_parameter_init() function to fill a 472 | * structure with the default values. 473 | * @retval int The status code. This function returns zero if the 474 | * minimization process terminates without an error. A 475 | * non-zero value indicates an error. 476 | */ 477 | int lbfgs( 478 | int n, 479 | lbfgsfloatval_t *x, 480 | lbfgsfloatval_t *ptr_fx, 481 | lbfgs_evaluate_t proc_evaluate, 482 | lbfgs_progress_t proc_progress, 483 | void *instance, 484 | lbfgs_parameter_t *param 485 | ); 486 | 487 | /** 488 | * Initialize L-BFGS parameters to the default values. 489 | * 490 | * Call this function to fill a parameter structure with the default values 491 | * and overwrite parameter values if necessary. 492 | * 493 | * @param param The pointer to the parameter structure. 494 | */ 495 | void lbfgs_parameter_init(lbfgs_parameter_t *param); 496 | 497 | /** 498 | * Allocate an array for variables. 499 | * 500 | * This function allocates an array of variables for the convenience of 501 | * ::lbfgs function; the function has a requirement for a variable array 502 | * when libLBFGS is built with SSE/SSE2 optimization routines. A user does 503 | * not have to use this function for libLBFGS built without SSE/SSE2 504 | * optimization. 505 | * 506 | * @param n The number of variables. 507 | */ 508 | lbfgsfloatval_t* lbfgs_malloc(int n); 509 | 510 | /** 511 | * Free an array of variables. 512 | * 513 | * @param x The array of variables allocated by ::lbfgs_malloc 514 | * function.
515 | */ 516 | void lbfgs_free(lbfgsfloatval_t *x); 517 | 518 | /** @} */ 519 | 520 | #ifdef __cplusplus 521 | } 522 | #endif/*__cplusplus*/ 523 | 524 | 525 | 526 | /** 527 | @mainpage libLBFGS: a library of Limited-memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) 528 | 529 | @section intro Introduction 530 | 531 | This library is a C port of the implementation of Limited-memory 532 | Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal. 533 | The original FORTRAN source code is available at: 534 | http://www.ece.northwestern.edu/~nocedal/lbfgs.html 535 | 536 | The L-BFGS method solves the unconstrainted minimization problem, 537 | 538 |
539 |     minimize F(x), x = (x1, x2, ..., xN),
540 | 
541 | 542 | only if the objective function F(x) and its gradient G(x) are computable. The 543 | well-known Newton's method requires computation of the inverse of the hessian 544 | matrix of the objective function. However, the computational cost for the 545 | inverse hessian matrix is expensive especially when the objective function 546 | takes a large number of variables. The L-BFGS method iteratively finds a 547 | minimizer by approximating the inverse hessian matrix by information from last 548 | m iterations. This innovation saves the memory storage and computational time 549 | drastically for large-scaled problems. 550 | 551 | Among the various ports of L-BFGS, this library provides several features: 552 | - Optimization with L1-norm (Orthant-Wise Limited-memory Quasi-Newton 553 | (OWL-QN) method): 554 | In addition to standard minimization problems, the library can minimize 555 | a function F(x) combined with L1-norm |x| of the variables, 556 | {F(x) + C |x|}, where C is a constant scalar parameter. This feature is 557 | useful for estimating parameters of sparse log-linear models (e.g., 558 | logistic regression and maximum entropy) with L1-regularization (or 559 | Laplacian prior). 560 | - Clean C code: 561 | Unlike C codes generated automatically by f2c (Fortran 77 into C converter), 562 | this port includes changes based on my interpretations, improvements, 563 | optimizations, and clean-ups so that the ported code would be well-suited 564 | for a C code. In addition to comments inherited from the original code, 565 | a number of comments were added through my interpretations. 566 | - Callback interface: 567 | The library receives function and gradient values via a callback interface. 568 | The library also notifies the progress of the optimization by invoking a 569 | callback function. In the original implementation, a user had to set 570 | function and gradient values every time the function returns for obtaining 571 | updated values. 
572 | - Thread safe: 573 | The library is thread-safe, which is the secondary gain from the callback 574 | interface. 575 | - Cross platform. The source code can be compiled on Microsoft Visual 576 | Studio 2010, GNU C Compiler (gcc), etc. 577 | - Configurable precision: A user can choose single-precision (float) 578 | or double-precision (double) accuracy by changing ::LBFGS_FLOAT macro. 579 | - SSE/SSE2 optimization: 580 | This library includes SSE/SSE2 optimization (written in compiler intrinsics) 581 | for vector arithmetic operations on Intel/AMD processors. The library uses 582 | SSE for float values and SSE2 for double values. The SSE/SSE2 optimization 583 | routine is disabled by default. 584 | 585 | This library is used by: 586 | - CRFsuite: A fast implementation of Conditional Random Fields (CRFs) 587 | - Classias: A collection of machine-learning algorithms for classification 588 | - mlegp: an R package for maximum likelihood estimates for Gaussian processes 589 | - imaging2: the imaging2 class library 590 | - Algorithm::LBFGS - Perl extension for L-BFGS 591 | - YAP-LBFGS (an interface to call libLBFGS from YAP Prolog) 592 | 593 | @section download Download 594 | 595 | - Source code 596 | - GitHub repository 597 | 598 | libLBFGS is distributed under the term of the 599 | MIT license. 600 | 601 | @section changelog History 602 | - Version 1.10 (2010-12-22): 603 | - Fixed compiling errors on Mac OS X; this patch was kindly submitted by 604 | Nic Schraudolph. 605 | - Reduced compiling warnings on Mac OS X; this patch was kindly submitted 606 | by Tamas Nepusz. 607 | - Replaced memalign() with posix_memalign(). 608 | - Updated solution and project files for Microsoft Visual Studio 2010. 609 | - Version 1.9 (2010-01-29): 610 | - Fixed a mistake in checking the validity of the parameters "ftol" and 611 | "wolfe"; this was discovered by Kevin S. Van Horn. 
612 | - Version 1.8 (2009-07-13): 613 | - Accepted the patch submitted by Takashi Imamichi; 614 | the backtracking method now has three criteria for choosing the step 615 | length: 616 | - ::LBFGS_LINESEARCH_BACKTRACKING_ARMIJO: sufficient decrease (Armijo) 617 | condition only 618 | - ::LBFGS_LINESEARCH_BACKTRACKING_WOLFE: regular Wolfe condition 619 | (sufficient decrease condition + curvature condition) 620 | - ::LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE: strong Wolfe condition 621 | - Updated the documentation to explain the above three criteria. 622 | - Version 1.7 (2009-02-28): 623 | - Improved OWL-QN routines for stability. 624 | - Removed the support of OWL-QN method in MoreThuente algorithm because 625 | it accidentally fails in early stages of iterations for some objectives. 626 | Because of this change, the OW-LQN method must be used with the 627 | backtracking algorithm (::LBFGS_LINESEARCH_BACKTRACKING), or the 628 | library returns ::LBFGSERR_INVALID_LINESEARCH. 629 | - Renamed line search algorithms as follows: 630 | - ::LBFGS_LINESEARCH_BACKTRACKING: regular Wolfe condition. 631 | - ::LBFGS_LINESEARCH_BACKTRACKING_LOOSE: regular Wolfe condition. 632 | - ::LBFGS_LINESEARCH_BACKTRACKING_STRONG: strong Wolfe condition. 633 | - Source code clean-up. 634 | - Version 1.6 (2008-11-02): 635 | - Improved line-search algorithm with strong Wolfe condition, which was 636 | contributed by Takashi Imamichi. This routine is now default for 637 | ::LBFGS_LINESEARCH_BACKTRACKING. The previous line search algorithm 638 | with regular Wolfe condition is still available as 639 | ::LBFGS_LINESEARCH_BACKTRACKING_LOOSE. 640 | - Configurable stop index for L1-norm computation. A member variable 641 | ::lbfgs_parameter_t::orthantwise_end was added to specify the index 642 | number at which the library stops computing the L1 norm of the 643 | variables. This is useful to prevent some variables from being 644 | regularized by the OW-LQN method. 
645 | - A sample program written in C++ (sample/sample.cpp). 646 | - Version 1.5 (2008-07-10): 647 | - Configurable starting index for L1-norm computation. A member variable 648 | ::lbfgs_parameter_t::orthantwise_start was added to specify the index 649 | number from which the library computes the L1 norm of the variables. 650 | This is useful to prevent some variables from being regularized by the 651 | OWL-QN method. 652 | - Fixed a zero-division error when the initial variables have already 653 | been a minimizer (reported by Takashi Imamichi). In this case, the 654 | library returns ::LBFGS_ALREADY_MINIMIZED status code. 655 | - Defined ::LBFGS_SUCCESS status code as zero; removed unused constants, 656 | LBFGSFALSE and LBFGSTRUE. 657 | - Fixed a compile error in an implicit down-cast. 658 | - Version 1.4 (2008-04-25): 659 | - Configurable line search algorithms. A member variable 660 | ::lbfgs_parameter_t::linesearch was added to choose either MoreThuente 661 | method (::LBFGS_LINESEARCH_MORETHUENTE) or backtracking algorithm 662 | (::LBFGS_LINESEARCH_BACKTRACKING). 663 | - Fixed a bug: the previous version did not compute pseudo-gradients 664 | properly in the line search routines for OWL-QN. This bug might quit 665 | an iteration process too early when the OWL-QN routine was activated 666 | (0 < ::lbfgs_parameter_t::orthantwise_c). 667 | - Configure script for POSIX environments. 668 | - SSE/SSE2 optimizations with GCC. 669 | - New functions ::lbfgs_malloc and ::lbfgs_free to use SSE/SSE2 routines 670 | transparently. It is unnecessary to use these functions for libLBFGS built 671 | without SSE/SSE2 routines; you can still use any memory allocators if 672 | SSE/SSE2 routines are disabled in libLBFGS. 673 | - Version 1.3 (2007-12-16): 674 | - An API change. An argument was added to lbfgs() function to receive the 675 | final value of the objective function. This argument can be set to 676 | \c NULL if the final value is unnecessary.
677 | - Fixed a null-pointer bug in the sample code (reported by Takashi Imamichi). 678 | - Added build scripts for Microsoft Visual Studio 2005 and GCC. 679 | - Added README file. 680 | - Version 1.2 (2007-12-13): 681 | - Fixed a serious bug in orthant-wise L-BFGS. 682 | An important variable was used without initialization. 683 | - Version 1.1 (2007-12-01): 684 | - Implemented orthant-wise L-BFGS. 685 | - Implemented lbfgs_parameter_init() function. 686 | - Fixed several bugs. 687 | - API documentation. 688 | - Version 1.0 (2007-09-20): 689 | - Initial release. 690 | 691 | @section api Documentation 692 | 693 | - @ref liblbfgs_api "libLBFGS API" 694 | 695 | @section sample Sample code 696 | 697 | @include sample.c 698 | 699 | @section ack Acknowledgements 700 | 701 | The L-BFGS algorithm is described in: 702 | - Jorge Nocedal. 703 | Updating Quasi-Newton Matrices with Limited Storage. 704 | Mathematics of Computation, Vol. 35, No. 151, pp. 773--782, 1980. 705 | - Dong C. Liu and Jorge Nocedal. 706 | On the limited memory BFGS method for large scale optimization. 707 | Mathematical Programming B, Vol. 45, No. 3, pp. 503-528, 1989. 708 | 709 | The line search algorithms used in this implementation are described in: 710 | - John E. Dennis and Robert B. Schnabel. 711 | Numerical Methods for Unconstrained Optimization and Nonlinear 712 | Equations, Englewood Cliffs, 1983. 713 | - Jorge J. More and David J. Thuente. 714 | Line search algorithm with guaranteed sufficient decrease. 715 | ACM Transactions on Mathematical Software (TOMS), Vol. 20, No. 3, 716 | pp. 286-307, 1994. 717 | 718 | This library also implements Orthant-Wise Limited-memory Quasi-Newton (OWL-QN) 719 | method presented in: 720 | - Galen Andrew and Jianfeng Gao. 721 | Scalable training of L1-regularized log-linear models. 722 | In Proceedings of the 24th International Conference on Machine 723 | Learning (ICML 2007), pp. 33-40, 2007. 
724 | 725 | Special thanks go to: 726 | - Yoshimasa Tsuruoka and Daisuke Okanohara for technical information about 727 | OWL-QN 728 | - Takashi Imamichi for the useful enhancements of the backtracking method 729 | - Kevin S. Van Horn, Nic Schraudolph, and Tamas Nepusz for bug fixes 730 | 731 | Finally I would like to thank the original author, Jorge Nocedal, who has been 732 | distributing the effieicnt and explanatory implementation in an open source 733 | licence. 734 | 735 | @section reference Reference 736 | 737 | - L-BFGS by Jorge Nocedal. 738 | - Orthant-Wise Limited-memory Quasi-Newton Optimizer for L1-regularized Objectives by Galen Andrew. 739 | - C port (via f2c) by Taku Kudo. 740 | - C#/C++/Delphi/VisualBasic6 port in ALGLIB. 741 | - Computational Crystallography Toolbox includes 742 | scitbx::lbfgs. 743 | */ 744 | 745 | #endif/*__LBFGS_H__*/ 746 | -------------------------------------------------------------------------------- /ReleaseVersion/lbfgslib/lbfgs.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellozting/CompositeQuantization/1b363094b28839ada34299ee179cb99cf0414a20/ReleaseVersion/lbfgslib/lbfgs.lib -------------------------------------------------------------------------------- /build_project.bat: -------------------------------------------------------------------------------- 1 | cd build 2 | 3 | del CMakeCache.txt 4 | 5 | cmake -DMAKE_ONLY=BUILD_ALL -G "Visual Studio 12 Win64" .. 6 | 7 | pause 8 | -------------------------------------------------------------------------------- /lbfgslib/lbfgs.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellozting/CompositeQuantization/1b363094b28839ada34299ee179cb99cf0414a20/lbfgslib/lbfgs.lib --------------------------------------------------------------------------------