├── .gitignore
├── LICENSE
├── README.md
├── bibtomarkdown.py
├── build.sh
└── class_notes
    ├── Class_01_introduction.pdf
    ├── Class_02_text_embeddings.pdf
    ├── Class_03_image_embeddings.pdf
    ├── Class_03_image_embeddings.tex
    ├── Class_04_low_dimensional_vector_search.pdf
    ├── Class_04_low_dimensional_vector_search.tex
    ├── Class_05_dimensionality_reduction.pdf
    ├── Class_05_dimensionality_reduction.tex
    ├── Class_06_aproximate_nearest_neighbor_search.pdf
    ├── Class_06_aproximate_nearest_neighbor_search.tex
    ├── Class_07_clustering.pdf
    ├── Class_07_clustering.tex
    ├── Class_08_quantization.pdf
    ├── Class_08_runbook_for_students.ipynb
    ├── Class_09_graph_indexes.pdf
    ├── README.md
    ├── images
    │   ├── chernoff-exp-bounds.png
    │   ├── dragon_diff_dup.jpg
    │   ├── kdtrees-construction.excalidraw
    │   ├── kdtrees-construction.png
    │   ├── kdtrees-proof.excalidraw
    │   ├── kdtrees-proof.png
    │   ├── kdtrees-search.excalidraw
    │   ├── kdtrees-search.png
    │   ├── kmeans-proj.excalidraw
    │   ├── kmeans-proj.png
    │   ├── nnsearch.png
    │   ├── pca.excalidraw
    │   ├── pca.png
    │   ├── vectorsearch.excalidraw
    │   └── vectorsearch.png
    ├── vs.bib
    └── vs.sty

/.gitignore:
--------------------------------------------------------------------------------
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb

## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf

## Generated if empty string is given at "Please type another file name for output:"
.pdf

## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml

## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync

## Build tool directories for auxiliary files
# latexrun
latex.out/

## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa

# achemso
acs-*.bib

# amsthm
*.thm

# beamer
*.nav
*.pre
*.snm
*.vrb

# changes
*.soc

# comment
*.cut

# cprotect
*.cpt

# elsarticle (documentclass of Elsevier journals)
*.spl

# endnotes
*.ent

# fixme
*.lox

# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm

#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R

# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs

# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist

# gnuplottex
*-gnuplottex-*

# gregoriotex
*.gaux
*.gtex

# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref

# hyperref
*.brf

# knitr
*-concordance.tex
# TODO Comment the next line if you want to keep your tikz graphics files
*.tikz
*-tikzDictionary

# listings
*.lol

# luatexja-ruby
*.ltjruby

# makeidx
*.idx
*.ilg
*.ind

# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*

# minted
_minted*
*.pyg

# morewrites
*.mw

# nomencl
*.nlg
*.nlo
*.nls

# pax
*.pax

# pdfpcnotes
*.pdfpc

# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd

# scrwfile
*.wrt

# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/

# pdfcomment
*.upa
*.upb

# pythontex
*.pytxcode
pythontex-files-*/

# tcolorbox
*.listing

# thmtools
*.loe

# TikZ & PGF
*.dpth
*.md5
*.auxlock

# todonotes
*.tdo

# vhistory
*.hst
*.ver

# easy-todo
*.lod

# xcolor
*.xcp

# xmpincl
*.xmpi

# xindy
*.xdy

# xypic precompiled matrices and outlines
*.xyc
*.xyd

# endfloat
*.ttt
*.fff

# Latexian
TSWLatexianTemp*

## Editors:
# WinEdt
*.bak
*.sav

# Texpad
.texpadtmp

# LyX
*.lyx~

# Kile
*.backup

# gummi
.*.swp

# KBibTeX
*~[0-9]*

# TeXnicCenter
*.tps

# auto folder when using emacs and auctex
./auto/*
*.el

# expex forward references with \gathertags
*-tags.tex

# standalone packages
*.sta

# Makeindex log files
*.lpz

# mac files
.DS_Store

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Edo Liberty

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Long Term Memory in AI - Vector Search and Databases

**NOTE:** COS 597A class times changed for the Fall 2023 semester. Classes will be held **9am-12noon**.

## Instructors

* [Edo Liberty](https://edoliberty.github.io), Founder and CEO of [Pinecone](https://www.pinecone.io), the world's leading Vector Database. [Publications](https://scholar.google.com/citations?user=QHS_pZAAAAAJ&hl=en).

* [Matthijs Douze](https://ai.meta.com/people/matthijs-douze/), Research Scientist at Meta. Architect and main developer of [FAISS](https://github.com/facebookresearch/faiss), the most popular and advanced open source library for vector search. [Publications](https://scholar.google.com/citations?user=0eFZtREAAAAJ&hl=en).

* Teaching assistant: [Nataly Brukhim](https://www.cs.princeton.edu/~nbrukhim/), PhD student working with Prof. Elad Hazan and researcher at Google AI Princeton. email: . [Publications](https://scholar.google.com/citations?user=jZwEDZoAAAAJ&hl=en).

* Guest lecture by [Harsha Vardhan Simhadri](https://harsha-simhadri.org/), Senior Principal Researcher at Microsoft Research. Creator of [DiskANN](https://github.com/Microsoft/DiskANN). [Publications](https://scholar.google.com/citations?user=bW65tuAAAAAJ&hl=en)


## Overview
Long Term Memory is a foundational capability in the modern AI stack. At their core, these systems use vector search. Vector search is also a basic tool for systems that manipulate large collections of media, such as search engines, knowledge bases, content moderation tools, and recommendation systems. As such, the discipline lies at the intersection of Artificial Intelligence and Database Management Systems. This course will cover the theoretical foundations and practical implementation of vector search applications, algorithms, and systems. The course will be evaluated with a project and an in-class presentation.

## Contribute

All class materials are intended to be used freely by academics anywhere, students and professors alike. Please contribute in the form of pull requests or by opening issues.

```
https://github.com/edoliberty/vector-search-class-notes
```

On unix-like systems (e.g. macOS) with bibtex and pdflatex available you should be able to run this:

```
git clone git@github.com:edoliberty/vector-search-class-notes.git
cd vector-search-class-notes
./build.sh
```

## Syllabus

* 9/8 - [Class 1 - Introduction to Vector Search](class_notes/Class_01_introduction.pdf) [Matthijs + Edo + Nataly]
  * Intro to the course: Topic, Schedule, Project, Grading, ...
  * Embeddings as an information bottleneck. Instead of learning end-to-end, use embeddings as an intermediate representation
  * Advantages: scalability, instant updates, and explainability
  * Typical volumes of data and scalability. Embeddings are the only way to manage / access large databases
  * The embedding contract: the embedding extractor and embedding indexer agree on the meaning of the distance. Separation of concerns.
  * The vector space model in information retrieval
  * Vector embeddings in machine learning
  * Define vector, vector search, ranking, retrieval, recall


* 9/15 - [Class 2 - Text embeddings](class_notes/Class_02_text_embeddings.pdf) [Matthijs]
  * 2-layer word embeddings. Word2vec and fastText, obtained via a factorization of a co-occurrence matrix. Embedding arithmetic: king + woman - man = queen (already based on similarity search)
  * Sentence embeddings: How to train, masked LM. Properties of sentence embeddings.
  * Large Language Models: reasoning as an emerging property of an LM. What happens when the training set = the whole web


* 9/22 - [Class 3 - Image embeddings](class_notes/Class_03_image_embeddings.pdf) [Matthijs]
  * Pixel structures of images. Early works on direct pixel indexing
  * Traditional CV models. Global descriptors (GIST). Local descriptors (SIFT and friends). Direct indexing of local descriptors for image matching, local descriptor pooling (Fisher, VLAD)
  * Convolutional Neural Nets. Off-the-shelf models. Trained specifically (contrastive learning, self-supervised learning)
  * Modern Computer Vision models


* 9/29 - [Class 4 - Low Dimensional Vector Search](class_notes/Class_04_low_dimensional_vector_search.pdf) [Edo]
  * Vector search problem definition
  * k-d tree, space partitioning data structures
  * Worst case proof for kd-trees
  * Probabilistic inequalities. Recap of basic inequalities: Markov, Chernoff, Hoeffding
  * Concentration of Measure phenomena. Orthogonality of random vectors in high dimensions
  * Curse of dimensionality and the failure of space partitioning

* 10/6 - [Class 5 - Dimensionality Reduction](class_notes/Class_05_dimensionality_reduction.pdf) [Edo]
  * Singular Value Decomposition (SVD)
  * Applications of the SVD
  * Rank-k approximation in the spectral norm
  * Rank-k approximation in the Frobenius norm
  * Linear regression in the least-squares loss
  * PCA, Optimal squared loss dimension reduction
  * Closest orthogonal matrix
  * Computing the SVD: The power method
  * Random projection
  * Matrices with normally distributed independent entries
  * Fast Random Projections

* 10/13 - No Class - Midterm Examination Week

* 10/20 - No Class - Fall Recess

* 10/27 - [Class 6 - Approximate Nearest Neighbor Search](class_notes/Class_06_aproximate_nearest_neighbor_search.pdf) [Edo]
  * Definition of Approximate Nearest Neighbor Search (ANNS)
  * Criteria: Speed / accuracy / memory usage / updateability / index construction time
  * Definition of Locality Sensitive Hashing and examples
  * The LSH Algorithm
  * LSH Analysis, proof of correctness, and asymptotics

* 11/3 - [Class 7 - Clustering](class_notes/Class_07_clustering.pdf) [Edo]
  * K-means clustering - mean squared error criterion.
  * Lloyd’s Algorithm
  * k-means and PCA
  * ε-net argument for fixed dimensions
  * Sampling based seeding for k-means
  * k-means++
  * The Inverted File Model (IVF)

* 11/10 - [Class 8 - Quantization for lossy vector compression](class_notes/Class_08_quantization.pdf) **This class will take place remotely via Zoom, see the edstem message to get the link** [Matthijs]
  * Python notebook corresponding to the class: [Class_08_runbook_for_students.ipynb](class_notes/Class_08_runbook_for_students.ipynb)
  * Vector quantization is a topline (directly optimizes the objective)
  * Binary quantization and Hamming comparison
  * Product quantization. Chunked vector quantization. Optimized vector quantization
  * Additive quantization. Extension of product quantization. Difficulty in training approximations (Residual quantization, CQ, TQ, LSQ, etc.)
  * Cost of coarse quantization vs. inverted list scanning

* 11/17 - [Class 9 - Graph based indexes](class_notes/Class_09_graph_indexes.pdf) by guest lecturer [Harsha Vardhan Simhadri](https://harsha-simhadri.org/)
  * Early works: hierarchical k-means
  * Neighborhood graphs. How to construct them. Nearest Neighbor Descent
  * Greedy search in neighborhood graphs. That does not work -- need long jumps
  * HNSW. A practical hierarchical graph-based index
  * NSG. Evolving a k-NN graph


* 11/24 - No Class - Thanksgiving Recess

* 12/1 - Class 10 - Student project and paper presentations [Edo + Nataly]


## Project

Class work includes a final project. It will be graded based on

1. 50% - Project submission
1. 50% - In-class presentation

**Projects can be in three different flavors**

* _Theory/Research_: propose a new algorithm for a problem we explored in class (or modify an existing one), explain what it achieves, and give experimental evidence or a proof for its behavior. If you choose this kind of project you are expected to submit a write-up.
* _Data Science/AI_: create an interesting use case for vector search using Pinecone, explain what data you used, what value your application brings, and what insights you gained. If you choose this kind of project you are expected to submit code (e.g. Jupyter Notebooks) and a write-up of your results and insights.
* _Engineering/HPC_: adapt or add to FAISS, explain your improvements, and show experimental results. If you choose this kind of project you are expected to submit a branch of FAISS for review along with a short write-up of your suggested improvement and experiments.


**Project schedule**

* 11/24 - One-page project proposal approved by the instructors
* 12/1 - Final project submission
* 12/1 - In-class presentation


**Some more details**

* Project Instructor: Nataly
* Projects can be worked on individually, or in teams of two or at most three students.
* Expect to spend a few hours over the semester on the project proposal. Try to get it approved well ahead of the deadline.
* Expect to spend 3-5 _full days_ on the project itself (on par with preparing for a final exam).
* In-class project presentations are 5 minutes _per student_ (teams of two students present for 10 minutes; teams of three, 15 minutes).

## Selected Literature

* [A fast random sampling algorithm for sparsifying matrices](http://dx.doi.org/10.1007/11830924_26) - Arora, Sanjeev and Hazan, Elad and Kale, Satyen - 2006
* [A Randomized Algorithm for Principal Component Analysis](http://dx.doi.org/10.1137/080736417) - Vladimir Rokhlin and Arthur Szlam and Mark Tygert - 2009
* A search structure based on kd trees for efficient ray tracing - Subramanian, KR and Fussel, DS - 1990
* A Short Proof for Gap Independence of Simultaneous Iteration - Edo Liberty - 2016
* Accelerating Large-Scale Inference with Anisotropic Vector Quantization - Ruiqi Guo and Philip Sun and Erik Lindgren and Quan Geng and David Simcha and Felix Chern and Sanjiv Kumar - 2020
* [Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada](http://papers.nips.cc/book/advances-in-neural-information-processing-systems-28-2015) - 2015
* [An Algorithm for Online K-Means Clustering](https://epubs.siam.org/doi/abs/10.1137/1.9781611974317.7) - Edo Liberty and Ram Sriharsha and Maxim Sviridenko
* An Almost Optimal Unrestricted Fast Johnson-Lindenstrauss Transform - Nir Ailon and Edo Liberty - 2011
* An elementary proof of the Johnson-Lindenstrauss lemma - S. DasGupta and A. Gupta - 1999
* Approximate nearest neighbors and the fast Johnson-Lindenstrauss transform - Nir Ailon and Bernard Chazelle - 2006
* Billion-scale similarity search with GPUs - Jeff Johnson and Matthijs Douze and Hervé Jégou - 2017
* Clustering Data Streams: Theory and Practice - Sudipto Guha and Adam Meyerson and Nina Mishra and Rajeev Motwani and Liadan O'Callaghan - 2003
* [DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node](https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf) - Jayaram Subramanya, Suhas and Devvrit, Fnu and Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Kadekodi, Rohan - 2019
* Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs - Yu. A. Malkov and D. A. Yashunin - 2018
* [Efficient K-Nearest Neighbor Graph Construction for Generic Similarity Measures](https://doi.org/10.1145/1963405.1963487) - Dong, Wei and Moses, Charikar and Li, Kai - 2011
* Even Simpler Deterministic Matrix Sketching - Edo Liberty - 2022
* Extensions of Lipschitz mappings into a Hilbert space - W. B. Johnson and J. Lindenstrauss - 1984
* Fast Approximate Nearest Neighbor Search With The Navigating Spreading-out Graph - Cong Fu and Chao Xiang and Changxu Wang and Deng Cai - 2018
* [Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions](http://dx.doi.org/10.1137/090771806) - Halko, N. and Martinsson, P. G. and Tropp, J. A. - 2011
* Invertibility of random matrices: norm of the inverse - Mark Rudelson - 2008
* K-means clustering via principal component analysis - Chris H. Q. Ding and Xiaofeng He - 2004
* k-means++: the advantages of careful seeding - David Arthur and Sergei Vassilvitskii - 2007
* Least squares quantization in PCM - Stuart P. Lloyd - 1982
* [LSQ++: Lower Running Time and Higher Recall in Multi-Codebook Quantization](https://doi.org/10.1007/978-3-030-01270-0_30) - Martinez, Julieta and Zakhmi, Shobhit and Hoos, Holger H. and Little, James J. - 2018
* [Multidimensional binary search trees used for associative searching](http://doi.acm.org/10.1145/361002.361007) - Bentley, Jon Louis - 1975
* [Near-Optimal Entrywise Sampling for Data Matrices](https://proceedings.neurips.cc/paper_files/paper/2013/file/6e0721b2c6977135b916ef286bcb49ec-Paper.pdf) - Achlioptas, Dimitris and Karnin, Zohar S and Liberty, Edo - 2013
* Pass Efficient Algorithms for Approximating Large Matrices - Petros Drineas and Ravi Kannan - 2003
* Product Quantization for Nearest Neighbor Search - Jegou, Herve and Douze, Matthijs and Schmid, Cordelia - 2011
* QuickCSG: Arbitrary and faster boolean combinations of n solids - Douze, Matthijs and Franco, Jean-Sébastien and Raffin, Bruno - 2015
* [Quicker ADC: Unlocking the Hidden Potential of Product Quantization With SIMD](https://doi.org/10.1109%2Ftpami.2019.2952606) - Fabien Andre and Anne-Marie Kermarrec and Nicolas Le Scouarnec - 2021
* [Random Projection Trees and Low Dimensional Manifolds](https://doi.org/10.1145/1374376.1374452) - Dasgupta, Sanjoy and Freund, Yoav - 2008
* [Randomized Algorithms for Low-Rank Matrix Factorizations: Sharp Performance Bounds](http://dx.doi.org/10.1007/s00453-014-9891-7) - Witten, Rafi and Candès, Emmanuel - 2015
* [Randomized Block Krylov Methods for Stronger and Faster Approximate Singular Value Decomposition](http://papers.nips.cc/paper/5735-randomized-block-krylov-methods-for-stronger-and-faster-approximate-singular-value-decomposition) - Cameron Musco and Christopher Musco - 2015
* [Revisiting Additive Quantization](https://api.semanticscholar.org/CorpusID:7340738) - Julieta Martinez and Joris Clement and Holger H. Hoos and J. Little - 2016
* [Sampling from large matrices: An approach through geometric functional analysis](http://doi.acm.org/10.1145/1255443.1255449) - Rudelson, Mark and Vershynin, Roman - 2007
* Similarity estimation techniques from rounding algorithms - Moses Charikar - 2002
* Similarity Search in High Dimensions via Hashing - Aristides Gionis and Piotr Indyk and Rajeev Motwani - 1999
* Simple and Deterministic Matrix Sketching - Edo Liberty - 2012
* Smaller Coresets for k-Median and k-Means Clustering - S. Har-Peled and A. Kushal - 2005
* Sparser Johnson-Lindenstrauss transforms - Daniel M. Kane and Jelani Nelson - 2012
* Sparsity Lower Bounds for Dimensionality Reducing Maps - Jelani Nelson and Huy L. Nguyen - 2012
* Spectral Relaxation for K-means Clustering - Hongyuan Zha and Xiaofeng He and Chris H. Q. Ding and Ming Gu and Horst D. Simon - 2001
* Streaming k-means approximation - Nir Ailon and Ragesh Jaiswal and Claire Monteleoni - 2009
* Strong converse for identification via quantum channels - Rudolf Ahlswede and Andreas Winter - 2002
* Transformer Memory as a Differentiable Search Index - Yi Tay and Vinh Q. Tran and Mostafa Dehghani and Jianmo Ni and Dara Bahri and Harsh Mehta and Zhen Qin and Kai Hui and Zhe Zhao and Jai Gupta and Tal Schuster and William W. Cohen and Donald Metzler - 2022
* [Unsupervised Neural Quantization for Compressed-Domain Similarity Search](https://doi.ieeecomputersociety.org/10.1109/ICCV.2019.00313) - S. Morozov and A. Babenko - 2019
* [Worst-Case Analysis for Region and Partial Region Searches in Multidimensional Binary Search Trees and Balanced Quad Trees](https://doi.org/10.1007/BF00263763) - Lee, D. T. and Wong, C. K. - 1977
* [A Comprehensive Survey and Experimental Comparison of Graph-Based Approximate Nearest Neighbor Search](http://www.vldb.org/pvldb/vol14/p1964-wang.pdf) - Mengzhao Wang and Xiaoliang Xu and Qiang Yue and Yuxiang Wang - 2021
* [Approximate Nearest Neighbor Search on High Dimensional Data - Experiments, Analyses, and Improvement](https://doi.org/10.1109/TKDE.2019.2909204) - Wen Li and Ying Zhang and Yifang Sun and Wei Wang and Mingjie Li and Wenjie Zhang and Xuemin Lin - 2020
* [Survey of vector database management systems](https://doi.org/10.1007/s00778-024-00864-x) - James Jie Pan and Jianguo Wang and Guoliang Li - 2024

--------------------------------------------------------------------------------
/bibtomarkdown.py:
--------------------------------------------------------------------------------
## This is a hacky util script to create markdown from the class bib file.
## you can run it like this
## >> python bibtomarkdown.py class_notes/vs.bib > bib.md
## For convenience, please add urls and notes to the bib file and
## regenerate the markdown instead of changing it directly in the README


def bibtokvdict(bib):
    lines = [line.strip() for line in bib.split('\n') if "=" in line]
    pairs = [line.split('=', 1) for line in lines]
    return dict([(pair[0].strip(), pair[1].strip('{}, ')) for pair in pairs])

def kvdicttomarkdown(kvdict):
    if 'title' not in kvdict:
        return ''
    s = '*'
    if 'url' in kvdict:
        s += f" [{kvdict['title']}]({kvdict['url']})"
    else:
        s += f" {kvdict.get('title','')}"
    if 'author' in kvdict:
        s += f" - {kvdict['author']}"
    if 'year' in kvdict:
        s += f" - {kvdict['year']}"
    return s

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("bib_file_name", help="path to a *.bib file")
    args = parser.parse_args()

    with open(args.bib_file_name) as f:
        text = f.read()

    bibs = [s.strip() for s in text.split('\n\n') if s.startswith('@')]
    kvdicts = [bibtokvdict(bib) for bib in bibs]
    # use .get so entries without a title sort first instead of crashing
    kvdicts.sort(key=lambda m: m.get('title', '').lower())
    mds = [kvdicttomarkdown(kvdict) for kvdict in kvdicts]
    for md in mds:
        print(md)

--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
pushd class_notes
rm -f *.aux *.bbl *.blg *.dvi *.log
for FILE in *.tex; do
    pdflatex $FILE;
    bibtex "${FILE%.*}" ;
    pdflatex $FILE;
    pdflatex $FILE;
done
rm -f *.aux *.bbl *.blg *.dvi *.log
popd

--------------------------------------------------------------------------------
/class_notes/Class_01_introduction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_01_introduction.pdf
--------------------------------------------------------------------------------
/class_notes/Class_02_text_embeddings.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_02_text_embeddings.pdf
--------------------------------------------------------------------------------
/class_notes/Class_03_image_embeddings.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_03_image_embeddings.pdf
--------------------------------------------------------------------------------
/class_notes/Class_03_image_embeddings.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{vs}
\begin{document}

\lecturetitle{Class 3 - Image Embeddings}

\bibliographystyle{plain}
\bibliography{vs}

\end{document}
%%%%%%%%

--------------------------------------------------------------------------------
/class_notes/Class_04_low_dimensional_vector_search.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_04_low_dimensional_vector_search.pdf
--------------------------------------------------------------------------------
/class_notes/Class_04_low_dimensional_vector_search.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{vs}

\begin{document}

\lecturetitle{Class 4 - Low Dimensional Vector Search}
\section{Nearest Neighbor Search Definition}

As we have seen, nearest neighbor search is a fundamental computational building block in computer vision, semantic search, data mining, machine learning, and many other applications.

\begin{definition}{\bf Nearest Neighbor Search:} Given a set of points $X = \{x_1,\ldots,x_n\} \subset \R^{d}$,
preprocess them into a data structure of size $\poly(n,d)$ in time $\poly(n,d)$ such that nearest neighbor queries can
be performed. Given a query point $q$ and a radius $r$, the data structure should
return $X_{q,r} = \{i \;\; | \;\; \|q - x_i \| \le r \}$.
\end{definition}
We will later extend this definition in the obvious way to max-inner-product search (MIPS) and cosine similarity search.
Improving the practical and theoretical asymptotic time complexity of computing $X_{q,r}$ is the topic of this class.

\begin{center}
\includegraphics[width=1.0\textwidth]{images/vectorsearch.png}
\end{center}

One option we have is to do nothing at the preprocessing stage. Then, at query time, scan the data points and find those which minimize $\|q - x_i\|$.
This would give a query time of $\Omega(nd)$. But, of course, it would be significantly better if we could reduce the dependence on $n$, or in other words, avoid scanning the entire data set for every query point.

\section{kd-trees}
First, we shall review a well known and widely used algorithm for this problem called kd-trees \cite{Bentley75}.
The data structure holds the points in a geometrically partitioned tree.
Each subtree contains all the points in an axis-aligned bounding box.
At each depth in the tree, the bounding boxes are split along an axis.
There are many different heuristics for splitting the boxes. For didactic reasons, let's consider a simple construction that lends itself to easy analysis.

In our setting, we assume the number of points is exactly a power of $2$ and that coordinate values are distinct.
Both assumptions are relatively harmless and should not change the asymptotic behavior.

We will organize the points in a perfectly balanced binary tree. Points will be associated only with leaves.
The construction of the tree is simple. We start with all points in a single box (a node in the tree).
We split the box in two along the first coordinate such that exactly half the points lie on each side.
The splitting offset is given by the median of these coordinates.
Then, we do the same for each of the two resulting boxes with respect to the second coordinate.
Then, again, for the four resulting boxes with respect to the third coordinate, and so on.
Splitting stops when there is exactly one point in each leaf.

\begin{center}
\includegraphics[width=0.9\textwidth]{images/kdtrees-construction.png}
\end{center}

\noindent When searching the tree, we consider only points whose bounding boxes touch the query region (a ball in this case).

\begin{center}
\includegraphics[width=0.9\textwidth]{images/kdtrees-search.png}
\end{center}

\begin{fact}
Any axis aligned hyperplane touches at most $n^{1-1/d}$ boxes.
\end{fact}
\begin{proof}
Assume the hyperplane is aligned with the $i$'th coordinate. That is, it consists of the points $x\in\R^d$ such that $x_i = c$.
When splitting according to coordinate $i$, the split value is either larger or smaller than $c$.
When that happens, the hyperplane touches only one of the two boxes one level lower in the tree.
The result is that each split according to the $i$'th coordinate cuts the number of touched leaf boxes (points) in half.
Finally, since we cycle through the coordinates in a round robin fashion, each coordinate is used for splitting $\log_2(n)/d$ times.
Note that $\log_2(n)$ is the depth of the tree. We can now calculate the number of touched leaf boxes: $n (1/2)^{\log_2(n)/d} = n^{1-1/d}$.
\end{proof}

\begin{center}
\includegraphics[width=0.9\textwidth]{images/kdtrees-proof.png}
\end{center}

To complete the analysis, note that the axis-aligned bounding box of the query has $2d$ facets. Therefore, the number of touched cells is $O(dn^{1-1/d})$.
For each touched cell we need to decide whether the associated point is a valid search result, which takes $O(d)$ operations.
The resulting search complexity of the algorithm is $O(d^2n^{1-1/d})$, which is sub-linear in $n$.
While this analysis is rather simple, the bound cannot be made much better \cite{kdtree-worstcase}. There are many variants of this algorithm \cite{rptrees}, and practical observations claim that kd-trees perform well in low dimensions.

Kd-trees are used extensively in 3D libraries to organize objects, lights, etc.
They are data adaptive, meaning that they can concentrate on areas where there is data, and keep large boxes when there is not much detail~\cite{subramanian1990search,douze2015quickcsg}.
In addition to nearest neighbor search they support following rays, organizing data for collision detection, etc.

\begin{center}
\includegraphics[width=0.9\textwidth]{images/dragon_diff_dup.jpg}
\end{center}

Nevertheless, this runtime is only better than the brute force solution of $O(nd)$ when $n > d^d$, which already hints that there might be a problem with high dimensions.
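As an illustrative aside (not part of the original notes), here is a minimal Python sketch of the construction and search just described. It assumes \texttt{numpy} and the same simplifying assumptions: a power-of-two number of points and distinct coordinate values.

\begin{verbatim}
# Minimal kd-tree sketch matching the construction in this section:
# split on coordinates in round-robin order at the median; points in leaves.
import numpy as np

def build(points, depth=0):
    # points: (n, d) array, n a power of two, distinct coordinate values
    if len(points) == 1:
        return points[0]                      # leaf holds a single point
    axis = depth % points.shape[1]            # round-robin splitting axis
    order = np.argsort(points[:, axis])
    half = len(points) // 2
    split = points[order[half - 1], axis]     # median offset along the axis
    return (axis, split,
            build(points[order[:half]], depth + 1),
            build(points[order[half:]], depth + 1))

def search(node, q, r, out):
    # visit only subtrees whose bounding region can touch the ball B(q, r)
    if not isinstance(node, tuple):
        if np.linalg.norm(q - node) <= r:
            out.append(node)
        return
    axis, split, left, right = node
    if q[axis] - r <= split:                  # ball reaches the left half
        search(left, q, r, out)
    if q[axis] + r > split:                   # ball reaches the right half
        search(right, q, r, out)

rng = np.random.default_rng(0)
X = rng.random((1024, 2))                     # n = 2^10 points in [0,1]^2
tree = build(X)
hits = []
search(tree, rng.random(2), 0.05, hits)
print(len(hits), "points within radius 0.05")
\end{verbatim}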
Before introducing the source of the problem, we should cover some basic facts about random variables --- facts that will serve us for the rest of the course.

\section{Probability Tools and Facts Recap}
A variable $X$ is a random variable if it assumes different values
according to a probability distribution. For example, $X$ can
denote the outcome of a three-sided die throw.
The variable $X$ takes the values $x = 1,2,3$ with equal probabilities.

The expectation of $X$ is the sum over the possible values times the probability of the events.
\begin{equation}
\E[X] = \sum_{x=1}^{3}x \Pr(X = x)=
1\cdot\frac{1}{3}+2\cdot\frac{1}{3}+3\cdot\frac{1}{3} = 2
\end{equation}

Continuous scalar random variables are described by their distribution function $\varphi$.
$$
\Pr[Y \in [a,b]] = \int_{a}^{b}\varphi (t) dt.
$$
For a function $\varphi$ to be a valid distribution we must have:
\begin{eqnarray}
\forall \;t, \;\; \varphi(t) &\ge& 0  \mbox{\;\;\; (where it is defined)}\\
\int_{a}^{b}\varphi (t) dt && \mbox{is well defined for all $a$ and $b$}\\
\int_{-\infty}^{\infty}\varphi (t) dt &=& 1
\end{eqnarray}

For example, consider the continuous variable $Y$ taking values in
$[0,1]$ uniformly. That means $\varphi(t) = 1$ if $t \in [0,1]$ and zero otherwise.
This means that the probability of $Y$ being in the interval $[t,t + dt]$ is exactly $dt$. And so the expectation of $Y$ is:
\begin{equation}
\E[Y] = \int_{t=0}^{1}t \varphi(t) dt = \int_{t=0}^{1}t \cdot dt = \frac{1}{2}t^2\Big|_{0}^{1} = 1/2
\end{equation}

\begin{remark}
Strictly speaking, distributions are not necessarily continuous or bounded functions.
In fact, they need not be functions at all.
For example, the distribution of $X$ above includes three Dirac $\delta$-functions, which are not, strictly speaking, functions.
In this class, though, we will see only well behaved distributions.
\end{remark}

\subsection{Dependence and Independence}
A variable $X$ is said to be {\it dependent} on $Y$ if the distribution of $X$ given $Y$ is different from the distribution of $X$.
For example, assume the variable $X$ takes the value $1$ if $Y$ takes a
value of less than $1/3$, and the values $2$ or $3$ with equal probability otherwise ($1/2$ each).
%
Clearly, the probability of $X$ assuming each of its values is still
$1/3$. However, if we know that $Y$ is $0.7234$, the probability of
$X$ assuming the value $1$ is zero. Let us calculate the expectation of $X$ given $Y$ as an exercise.
\begin{eqnarray}
\E(X | Y) = \sum_{x=1}^{3} x \Pr(X = x | Y \le 1/3) = 1\cdot 1\\
\E(X | Y) = \sum_{x=1}^{3} x \Pr(X = x | Y > 1/3) = 1\cdot 0 + 2\cdot
\frac{1}{2} + 3\cdot\frac{1}{2}  = 2.5
\end{eqnarray}
$\E(X | Y) = 1$ for $y \in [0,1/3]$ and $\E(X | Y) = 2.5$ for $y \in (1/3,1]$.\\
Remember: $\E(X | Y)$ is a function of $y$!

\begin{definition}[Independence]
Two variables are said to be {\it independent} if:
\[
\forall x, y,\;\;\Pr[ X=x | Y = y] = \Pr[X=x].
\]
They are {\it dependent} otherwise.
\end{definition}

\begin{fact}
If two variables $X$ and $Y$ are {\it independent}, then so are $f(X)$ and $g(Y)$ for any functions $f$ and $g$.
\end{fact}

\begin{fact}[Linearity of expectation 1]%
For any random variable $X$ and any constant $\alpha$:
\begin{equation}
\E[\alpha X] = \alpha \E[X]
\end{equation}
\end{fact}

\begin{fact}[Linearity of expectation 2]%
For any two random variables
\begin{equation}
\E_{X,Y}[X+Y] = \E[X] + \E[Y]
\end{equation}
even if they are dependent.
\end{fact}

\begin{fact}[Multiplication of random variables]%
For any two {\bf independent} random variables
\begin{equation}
\E_{X,Y}[XY] = \E[X]\E[Y]
\end{equation}
This does not necessarily hold if they are dependent.
\end{fact}

\begin{definition}[Variance]%
For a random variable $X$ we have
\begin{equation}
\Var[X] = \E[(X - \E[X])^2] = \E[X^2] - (\E[X])^2
\end{equation}
The standard deviation $\sigma$ of $X$ is defined to be $\sigma(X) \equiv \sqrt{\Var[X]}$.
\end{definition}

\begin{fact}[Additivity of variances]%
For any two {\bf independent} variables $X$ and $Y$ we have
\begin{equation}
\Var[X + Y] = \Var[X] + \Var[Y]
\end{equation}
\end{fact}

\begin{fact}[Markov's inequality]%
For any {\it non-negative} random variable $X$ and any $t > 0$:
\begin{equation}
\Pr(X > t) \le \frac{\E[X]}{t}
\end{equation}
\end{fact}
\begin{proof}
Let $\psi$ denote the distribution of $X$. Then
\[
\E_\psi[X] = \int_0^{\infty} z\psi(z)dz = \int_0^{t} z\psi(z)dz + \int_t^{\infty} z\psi(z)dz \ge 0 + \int_t^{\infty} t\psi(z)dz = t \Pr[X>t]
\]
\end{proof}

\begin{fact}[Chebyshev's inequality]%
For any random variable $X$
\begin{equation}
\Pr[|X-\E[X]| > t] \le \frac{\sigma^2(X)}{t^2}
\end{equation}
\end{fact}
\begin{proof}
\[
\Pr[|X-\E[X]| > t]  = \Pr[(X-\E[X])^2 > t^2] \le \frac{\E[(X-\E[X])^2]}{t^2} = \frac{\sigma^2(X)}{t^2}
\]
\end{proof}

\begin{lemma}[The union bound]
For any set of $m$ events $A_1,\ldots,A_m$:
\[
\Pr[\cup_{i=1}^{m}A_i] \le \sum_{i=1}^{m}\Pr[A_i].
\]
\end{lemma}
In words, the probability that one or more events happen is at most the sum of the
individual event probabilities.

\begin{theorem}[Chernoff's bound]
Let $X_i$ be a set of {\bf independent} random variables such that $\E[X_i] = 0$ and $|X_i| \le 1$ almost surely.
Also define $\sigma_i^2 = \E[X_i^2]$ and $\sigma^2 = \sum_i \sigma_i^2$. Then:
\[
\Pr[ \sum_i X_i \ge t ] \le \max(e^{-t^2/4\sigma^2} , e^{-t/2})
\]
\end{theorem}
\begin{proof}
\begin{eqnarray}
\Pr[ \sum_i X_i \ge t ] &=& \Pr[ \lambda \sum_i X_i \ge \lambda  t ]  \mbox{\;\; (for $\lambda \ge 0$)} \\
&= &\Pr[ e^{\lambda \sum_i X_i} \ge e^{\lambda  t} ]   \mbox{\;\; (because $e^x$ is monotone)} \\
&\le &\E[e^{\lambda \sum_i X_i}] /e^{\lambda  t} \mbox{\;\; (by Markov)} \\
&=& \Pi_i \E[e^{\lambda X_i}] /e^{\lambda  t} \mbox{\;\; (by independence of the $X_i$)}
\end{eqnarray}

\begin{center}
\includegraphics[width=0.5\textwidth]{images/chernoff-exp-bounds.png}
\end{center}

Now, for $x \in [-1,1]$ we have that $e^x \le 1 + x + x^2$, so $\E[e^{\lambda X_i}] \le 1 + \E[\lambda X_i] + \lambda^2 \E[X_i^2] \le 1 + \lambda^2 \sigma^2_i$.
Now, since $1+x \le e^x$ we have that $1 + \lambda^2 \sigma^2_i \le e^{\lambda^2 \sigma_i^2}$.
Combining the above, we have that $\E[e^{\lambda X_i}] \le e^{\lambda^2 \sigma_i^2}$.

\begin{eqnarray}
\Pi_i \E[e^{\lambda X_i}] /e^{\lambda  t} &\le& \Pi_i e^{\lambda^2 \sigma_i^2} /e^{\lambda  t}\\
&= & e^{\lambda^2 \sigma^2 - \lambda t}
\end{eqnarray}
Now, optimizing over $\lambda \in [0,1]$ we get $\lambda = \min(1,t/2\sigma^2)$, which completes the proof.
\end{proof}
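As a quick illustrative example (an aside, not in the original notes): let $X_1,\ldots,X_n$ be independent and uniform on $\{-1,+1\}$, so that $\E[X_i]=0$, $|X_i| \le 1$, $\sigma_i^2 = 1$ and $\sigma^2 = n$. Taking $t = 2\sqrt{n\log(1/\delta)}$ (and assuming $\delta \ge e^{-n}$ so that the first term in the $\max$ dominates), the theorem gives
\[
\Pr\Big[\sum_i X_i \ge 2\sqrt{n\log(1/\delta)}\Big] \le e^{-t^2/4\sigma^2} = \delta.
\]
That is, a sum of $n$ independent $\pm 1$ signs deviates from zero only at the scale $\sqrt{n}$; this is the kind of concentration we will use repeatedly below.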
\begin{theorem}[Chernoff's bound; another useful form]
Let $X_1,\ldots,X_n$ be independent
$\{0,1\}$ valued random variables. Each $X_i$ takes the value $1$
with probability $p_i$ and $0$ otherwise. Let $X = \sum_{i=1}^{n}X_i$ and
let $\mu = \E[X] = \sum_{i=1}^{n}p_i$. Then:
\begin{eqnarray*}
\Pr[X > (1+\eps)\mu] &\le& e^{-\mu \eps^2/4}\\
\Pr[X < (1-\eps)\mu] &\le& e^{-\mu \eps^2/2}
\end{eqnarray*}
Or, using the union bound:
\[
\Pr[|X - \mu| > \eps\mu] \le 2e^{-\mu \eps^2/4}
\]
\end{theorem}

\section{Curse of Dimensionality}
A prime example of the curse of dimensionality is that a random point in $[0,1]^d$ is likely to be far from any set of $n$ points in the unit cube.
Consider the distance between the query point $q$ and an input data vector $x$.
We want to show that $\|x_i-q\|^2 \in \Omega(d)$ for all $i$.

First, notice that $\Pr[|x(j)- q(j)| \ge 1/4] \ge 1/2$, so the expected squared distance between $x$ and $q$ is at least $d/32$.
Since the coordinates $q(j)$ are drawn independently, we can apply the Chernoff bound (and a union bound over the points) and get that
$\|x_i-q\|^2 \ge d/64$ for all $n$ points in the data set if $d \ge \const\cdot\log(n)$.

Now, consider the kd-tree data structure and algorithm run on a random query.
If the squared radius of the ball around $q$ is less than $d/64$, the query is ``uninteresting'' since it is likely to return no results at all.
On the other hand, if the squared radius is greater than $d/64$ (a radius greater than $\sqrt{d}/8$), then the ball around $q$ will cross all the major partitions
along one of the axes. That means that the algorithm will visit at least $2^d$ partitions.

\section{Volumes of balls and cubes}
Another interesting phenomenon that occurs in high dimensions is the fact that unit spheres
are exponentially smaller (in volume) than their containing boxes.
Let us see this without using the explicit formulas for the volume of $d$-dimensional spheres.

To compute the volume of a unit sphere, we perform a thought experiment.
First, bound the sphere in a box (with sides of length $2$).
Then, pick a point in the box uniformly at random. What is the probability $p$ that
the point is also in the sphere? This is exactly the ratio between the volume of the ball and that of the box ($2^d$).
More accurately, $V = p2^d$ where $V$ is the volume of the sphere.

Now, we can bound $p$ from above.
A uniformly chosen random point from the cube is a vector $x \in \R^d$ such that each coordinate $x(i)$
is chosen uniformly from $[-1,1]$. The event that $x$ is in the unit sphere is the event that $\|x\|^2 = \sum_{i=1}^{d}x(i)^2 \le 1$.
Let $z_i = x(i)^2$, and note that
$\E[z_i] = \int_{-1}^{1}\frac{1}{2}t^2 dt = 1/3$. Therefore, $\E[\|x\|^2] = d/3$.
Also,
\[
\var(z_i)  = \int_{-1}^{1}\frac{1}{2}t^4 dt  - (1/3)^2  = 1/5 - 1/9 \le 1/10
\]
so by Chernoff's inequality
$p = \Pr[\sum_{i=1}^{d}x(i)^2 \le 1]  = \Pr[\sum_{i=1}^{d}(z_i -\E[z_i] ) \le 1-d/3] \le e^{-\frac{(d/3)^2}{4d/10}} \le e^{-d/4}$.
This concludes the observation that the fraction of the volume which is inside the sphere is
exponentially small compared to the cube.
A counter-intuitive way of viewing this is that almost the entire volume of the cube is concentrated at the ``corners''.
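This exponential collapse is easy to observe numerically. As an illustrative aside (not part of the original notes, assuming \texttt{numpy}), the following Monte Carlo sketch estimates $p$ for growing $d$:

\begin{verbatim}
# Monte Carlo estimate of p = Vol(unit ball) / Vol([-1,1]^d):
# sample points uniformly from the cube, count how many land in the ball.
import numpy as np

rng = np.random.default_rng(0)
samples = 200_000
for d in (2, 5, 10, 15, 20):
    x = rng.uniform(-1.0, 1.0, size=(samples, d))
    inside = (x ** 2).sum(axis=1) <= 1.0
    print(f"d={d:2d}  estimated p = {inside.mean():.2e}")
\end{verbatim}

Already at $d = 20$ essentially no sample lands inside the ball, matching the $e^{-d/4}$ bound above.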
\section{Orthogonality of random vectors}

A uniformly chosen random vector from the unit sphere ($q$) is almost orthogonal to any fixed vector ($x$) with high probability.
We can see this in two ways. First, we can see that the expected dot product of any vector $x$ with the random vector $q$ is small.
It is trivial that $\E[\langle x,q \rangle] = \langle x,\E[q] \rangle = 0$.

Moreover, $\E[\langle x,q \rangle^2] = \|x\|^2/d$.
To see this, consider $q_1,q_2,\ldots,q_d$ where $q_1 = q$ and $q_2,\ldots,q_d$ complete $q$ to an orthonormal basis.
Clearly, the distributions of all the $q_i$ are identical (but not independent), so
$\E[\langle x, q \rangle^2] = \E[\langle x,q_1\rangle^2] = \E[\frac{1}{d}\sum_{i=1}^{d}\langle x,q_i\rangle^2] = \frac{1}{d}\|x\|^2$.

It is not hard to show that, in fact, for any unit vector $x$, if $q$ is chosen uniformly at random from the unit sphere
then $\Pr[ \langle x, q \rangle  \ge \frac{t}{\sqrt{d}}] \le e^{-t^2/2}$.
First, replace the uniform distribution over the unit sphere with an i.i.d. distribution of Gaussians $q(i)\sim \N(0,\frac{1}{\sqrt{d}})$.
Note that $\E[\|q\|^2] = 1$; moreover, from the sharp concentration of the $\chi^2$ distribution we know that $\|q\|^2 \approx 1$ with high probability.
For convenience we will assume that $\|q\|^2 = 1$ and will ignore the small inaccuracy.
Moreover, due to the rotational invariance of the Gaussian distribution we have that any direction is equally likely, and thus this
new distribution approximates the uniform distribution over the sphere.
Next, notice that due to the rotational invariance $\langle x,q \rangle \sim \N(0,\frac{\|x\|}{\sqrt{d}})$
and $\Pr[ \langle x, q \rangle  \ge \frac{t\|x\|}{\sqrt{d}}] \le e^{-t^2/2}$.

\section{The failure of space partitioning schemes}
As we've seen, a ``typical'' distance from a random query to its nearest neighbor in a $d$-dimensional unit box is at the scale of $\sqrt{d}$.
Note that the side of any box in any space partitioning scheme (including kd-trees, of course) is at most $1$.
As a result, we are guaranteed to scan at least $\Omega(d)$ levels of the tree, which amounts to $\exp(d)$ points.
Note that this is worse than the brute force solution of $O(nd)$ when $d > \log(n)$.
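The near-orthogonality above is also easy to see empirically. As an illustrative aside (not part of the original notes, assuming \texttt{numpy}), the following sketch samples random unit vectors and shows that their dot product with a fixed unit vector concentrates at the scale $1/\sqrt{d}$:

\begin{verbatim}
# Random unit vectors in high dimension are nearly orthogonal to any
# fixed vector: the dot product concentrates at the scale 1/sqrt(d).
import numpy as np

rng = np.random.default_rng(0)
for d in (10, 100, 1000, 10000):
    q = rng.standard_normal((1000, d))
    q /= np.linalg.norm(q, axis=1, keepdims=True)   # ~uniform on the sphere
    x = np.zeros(d); x[0] = 1.0                     # any fixed unit vector
    dots = q @ x
    print(f"d={d:5d}  mean |<x,q>| = {np.abs(dots).mean():.4f}  "
          f"1/sqrt(d) = {d**-0.5:.4f}")
\end{verbatim}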
\section{Bonus section: solution of the ``riddle'' from the first class}
You were asked to prove that the $\ell_\infty$ metric is the most general metric possible for any finite collection of objects.

Consider $n$ objects $a_1,\ldots,a_n$ and a general metric $d(\cdot,\cdot)$.
Consider the mapping $\psi$ of each item to an $n$-dimensional vector space, $x_i = [d(a_i,a_1),d(a_i,a_2),\ldots,d(a_i,a_n)]$.
It is now easy to see that $ \| x_i - x_j \|_\infty = d(a_i,a_j)$.
\[
\|x_i -x_j\|_\infty = \max_k \; | x_i(k) - x_j(k)| = \max_k |d(a_i,a_k) - d(a_j,a_k) | = |d(a_i,a_j) - d(a_j,a_j) | = d(a_i,a_j)
\]
The maximum is attained at $k = j$ (or, symmetrically, $k = i$). Note that if $k\ne i,j$ then $ |d(a_i,a_k) - d(a_j,a_k)| \le d(a_i,a_j)$ because of the triangle inequalities $d(a_i,a_k) \le d(a_i,a_j) + d(a_j,a_k)$ and $d(a_j,a_k) \le d(a_i,a_j) + d(a_i,a_k)$.




\bibliographystyle{plain}
\bibliography{vs}

\end{document}
%%%%%%%%

--------------------------------------------------------------------------------
/class_notes/Class_05_dimensionality_reduction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_05_dimensionality_reduction.pdf
--------------------------------------------------------------------------------
/class_notes/Class_05_dimensionality_reduction.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{vs}
\begin{document}

\lecturetitle{Class 5 - Dimensionality Reduction}



\section{Singular Value Decomposition (SVD)}


\noindent We will see that any matrix $A \in \R^{m \times n}$ (w.l.o.g. $m \le n$) can be written as
\begin{eqnarray}
A &=& \sum_{\ell=1}^{m} \sigma_{\ell} u_{\ell} v_{\ell}^{T}\\
&\forall \;\; \ell& \sigma_\ell \in \R, \;\; \sigma_\ell \ge 0\\
&\forall \;\; \ell, \ell'& \langle u_{\ell}, u_{\ell'} \rangle= \langle v_{\ell}, v_{\ell'} \rangle = \delta(\ell,\ell')
\end{eqnarray}
%
To prove this, consider the matrix $AA^{T} \in \R^{m\times m}$.
Set $u_\ell$ to be the $\ell$'th eigenvector of $AA^{T}$.
By definition we have that $AA^{T}u_\ell = \lambda_\ell u_\ell$.
Since $AA^{T}$ is positive semidefinite we have $\lambda_\ell \ge 0$.
Since $AA^{T}$ is symmetric we have that $\forall \;\; \ell, \ell' \; \langle u_{\ell}, u_{\ell'} \rangle = \delta(\ell,\ell')$.
Set $\sigma_\ell = \sqrt{\lambda_\ell}$ and $v_\ell = \frac{1}{\sigma_\ell}A^{T}u_{\ell}$.
Now we can compute the following:
\[
\langle v_{\ell}, v_{\ell'} \rangle = \frac{1}{\sigma^{2}_\ell}u_{\ell}^{T}AA^{T}u_{\ell'} = \frac{1}{\sigma_{\ell}^{2}}\lambda_\ell \langle u_{\ell}, u_{\ell'} \rangle = \delta(\ell,\ell')
\]
%
We are only left to show that $A = \sum_{\ell=1}^{m} \sigma_{\ell} u_{\ell} v_{\ell}^{T}$.
To do that, consider the test vector $w = \sum_{i=1}^{m} \alpha_i u_i$.
\begin{eqnarray*}
w^TA = \sum_{i=1}^{m} \alpha_i u_i^TA = \sum_{i=1}^{m} \alpha_i \sigma_i v_i^T = \sum_{i=1}^{m}\sum_{j=1}^{m}\alpha_i \sigma_j (u_i^Tu_j) v_j^T = (\sum_{i=1}^{m}\alpha_i u_i^T)(\sum_{j=1}^{m}\sigma_j u_j v_j^T) = w^T(\sum_{j=1}^{m}\sigma_j u_j v_j^T)\\
\end{eqnarray*}
%
The vectors $u_\ell$ and $v_{\ell}$ are called the left and right singular vectors of $A$, and the $\sigma_\ell$ are the singular values of $A$.
It is customary to order the singular values in descending order, $\sigma_1 \ge \sigma_2 \ge \cdots \ge \sigma_m \ge 0$.
Also, we will denote by $r$ the rank of $A$.
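As a quick illustrative aside (not part of the original notes), the decomposition and the properties above are easy to verify numerically with \texttt{numpy}:

\begin{verbatim}
# Sanity check of the SVD: A = U @ diag(s) @ Vt with orthonormal
# singular vectors and descending singular values.
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 8))              # m <= n as in the text
U, s, Vt = np.linalg.svd(A, full_matrices=False)

print(np.allclose(A, U @ np.diag(s) @ Vt))   # A equals the sum of rank-1 terms
print(np.allclose(U.T @ U, np.eye(5)))       # left singular vectors orthonormal
print(np.allclose(Vt @ Vt.T, np.eye(5)))     # right singular vectors orthonormal
print(np.all(s[:-1] >= s[1:]))               # singular values in descending order
\end{verbatim}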
Here is another very convenient way to write the fact that $A = \sum_{\ell=1}^{m} \sigma_{\ell} u_{\ell} v_{\ell}^{T}$:
\begin{itemize}
\item Let $\Sigma \in \R^{r \times r}$ be a diagonal matrix whose entries are $\Sigma(i,i) = \sigma_i$ and $\sigma_1 \ge \sigma_2 \ge \ldots \ge \sigma_r$.
\item Let $U \in \R^{m \times r}$ be the matrix whose $i$'th column is the left singular vector of $A$ corresponding to the singular value $\sigma_i$.
\item Let $V \in \R^{n \times r}$ be the matrix whose $i$'th column is the right singular vector of $A$ corresponding to the singular value $\sigma_i$.
\end{itemize}
We have that $A = U\Sigma V^T$ and that $U^{T}U = V^{T}V = I_r$. Note that the sum goes only up to $r$, which is the rank of $A$. Clearly, not summing over zero-valued singular values does not change the sum.

\subsection*{Applications of the SVD}
\begin{enumerate}
\item Determining range, null space and rank (also numerical rank).
\item Matrix approximation.
\item Inverse and pseudo-inverse: If $A=U \Sigma V^{T}$ and $\Sigma$
is full rank, then $A^{-1}=V \Sigma^{-1} U^{T}$. If $\Sigma$ is
singular, then its pseudo-inverse is given by $A^{\dagger}=V
\Sigma^{\dagger} U^{T}$, where $\Sigma^{\dagger}$ is formed by
replacing every nonzero entry by its reciprocal.
\item Least squares: If we need to solve $Ax=b$ in the least-squares
sense, then $x_{LS}=V \Sigma^{\dagger} U^{T} b$.
\item De-noising -- Small singular values typically correspond to
noise. Take the matrix whose columns are the signals, compute the SVD,
zero out the small singular values, and reconstruct.
\item Compression -- We have signals as the columns of the matrix
$S$, that is, the $i$'th signal is given by
\begin{equation*}
S_{i} = \sum_{j=1}^{r} \left ( \sigma_{j} v_{ij} \right ) u_{j}.
\end{equation*}
If some of the $\sigma_{j}$ are small, we can discard them with
small error, thus obtaining a compressed representation of each
signal. We have to keep the coefficients $\sigma_{j} v_{ij}$ for
each signal and the dictionary, that is, the vectors $u_{j}$ that
correspond to the retained coefficients.
\end{enumerate}

\noindent The SVD and the eigen-decomposition are related, but there are quite a few differences between them.
\begin{enumerate}
\item Not every matrix has an eigen-decomposition (not even every
square matrix). Any matrix (even a rectangular one) has an SVD.
\item In the eigen-decomposition $A=X \Lambda X^{-1}$, that is, the
eigen-basis is not always orthogonal. The basis of singular vectors
is always orthogonal.
\item In the SVD we have two singular-spaces (right and left).
\item Computing the SVD of a matrix is more numerically stable.
%\item Relation to condition number; the numerical problems with eigen-decomposition; multiplication by an orthogonal matrix is perfectly conditioned.
\end{enumerate}




\subsection*{Rank-k approximation in the spectral norm}
The following fact shows that the best approximation of $A$ by a rank deficient
matrix is obtained from the top singular values and vectors of $A$. More accurately:
\begin{fact}
Set
\begin{equation*}
A_{k} = \sum_{j=1}^{k} \sigma_{j} u_{j} v_{j}^{T}.
\end{equation*}
Then,
\begin{equation*}
\min_{\substack{B \in \mathbb{R}^{m \times n} \\
\operatorname{rank}(B) \leq k}} \norm{A-B}_{2} = \norm{A-A_{k}}_{2}
= \sigma_{k+1}.
\end{equation*}
\end{fact}


\begin{proof}
\begin{equation*}
\norm{A-A_{k}}_{2} = \norm{\sum_{j=1}^{r} \sigma_{j} u_{j} v_{j}^{T} - \sum_{j=1}^{k}
\sigma_{j} u_{j} v_{j}^{T}}_{2} = \norm{\sum_{j=k+1}^{r} \sigma_{j} u_{j}
v_{j}^{T}}_{2} = \sigma_{k+1}
\end{equation*}
and thus $\sigma_{k+1}$ is the largest singular value of $A-A_{k}$.
Alternatively, look at $U^{T} A_{k} V =
\operatorname{diag}(\sigma_{1},\ldots,\sigma_{k},0,\ldots,0)$, which
means that $\operatorname{rank}(A_{k}) = k$, and that
\begin{equation*}
\norm{A-A_{k}}_{2} = \norm{U^{T} (A-A_{k}) V}_{2} =
\norm{\operatorname{diag}(0,\ldots,0,\sigma_{k+1},\ldots,\sigma_{r})}_{2}
= \sigma_{k+1}.
\end{equation*}

Let $B$ be an arbitrary matrix with $\operatorname{rank}(B) =
k$. Then, it has a null space of dimension $n-k$, that is,
\begin{equation*}
\operatorname{null}(B) = \operatorname{span}(w_{1},\ldots,w_{n-k}).
\end{equation*}
A dimension argument shows that
\begin{equation*}
\operatorname{span}(w_{1},\ldots,w_{n-k}) \cap
\operatorname{span}(v_{1},\ldots,v_{k+1}) \ne \{ 0 \}.
\end{equation*}
Let $w$ be a unit vector from the intersection. Since
\begin{equation*}
Aw = \sum_{j=1}^{k+1} \sigma_{j} (v_{j}^{T}w) u_{j},
\end{equation*}
we have
\begin{equation*}
\norm{A-B}_{2}^{2} \ge \norm{(A-B)w}_{2}^{2} = \norm{Aw}_{2}^{2} =
\sum_{j=1}^{k+1} \sigma_{j}^{2} \abs{v_{j}^{T}w}^{2} \ge
\sigma_{k+1}^{2} \sum_{j=1}^{k+1} \abs{v_{j}^{T}w}^{2} =
\sigma_{k+1}^{2},
\end{equation*}
since $w \in \operatorname{span}\{v_{1},\ldots,v_{k+1}\}$, and the
$v_{j}$ are orthonormal.
\end{proof}

\subsection*{Rank-k approximation in the Frobenius norm}

The same theorem holds for the Frobenius norm.
\begin{theorem} Set
\begin{equation*}
A_{k} = \sum_{j=1}^{k} \sigma_{j} u_{j} v_{j}^{T}.
\end{equation*}
Then,
\begin{equation*}
\min_{\substack{B \in \mathbb{R}^{m \times n} \\
\operatorname{rank}(B) \leq k}} \norm{A-B}_{F} = \norm{A-A_{k}}_{F}
= \sqrt{\sum_{i=k+1}^{m} \sigma_{i}^{2}}.
\end{equation*}
\end{theorem}
\begin{proof}
Suppose $A=U \Sigma V^{T}$. Then
\begin{equation*}
\min_{\operatorname{rank}(B) \leq k} \norm{A-B}^{2}_{F} =
\min_{\operatorname{rank}(B) \leq k} \norm{U \Sigma V^{T} - UU^{T} B
VV^{T}}^{2}_{F} = \min_{\operatorname{rank}(B) \leq k} \norm{\Sigma
- U^{T} B V}^{2}_{F}.
\end{equation*}
Now,
\begin{equation*}
\norm{\Sigma - U^{T} B V}^{2}_{F} = \sum_{i=1}^{n} \left (
\Sigma_{ii} - \left (U^{T}B V \right )_{ii} \right )^{2} +
\text{off-diagonal terms}.
\end{equation*}
If $B$ is the best approximating matrix and $U^{T}B V$ is not
diagonal, then write $U^{T}B V=D+O$, where $D$ is diagonal and $O$
contains the off-diagonal elements. Then the matrix $B' = U D V^{T}$
is a better approximation, which is a contradiction.

Thus, $U^{T}B V$ must be diagonal. Hence,
\begin{equation*}
\norm{\Sigma - D}^{2}_{F} = \sum_{i=1}^{n} \left (\sigma_{i} - d_{i}
\right )^{2} = \sum_{i=1}^{k} \left (\sigma_{i} - d_{i} \right )^{2}
+ \sum_{i=k+1}^{n} \sigma_{i}^{2},
\end{equation*}
and this is minimal when $d_{i}=\sigma_{i}$, $i=1,\ldots,k$. The
best approximating matrix is $A_{k} = U D V^{T}$, and the
approximation error is $\sqrt{\sum_{i=k+1}^{m} \sigma_{i}^{2}}$.
\end{proof}
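In code, the optimal rank-$k$ approximation is exactly the truncated SVD. As an illustrative aside (not part of the original notes, assuming \texttt{numpy}), the following sketch checks both norms against the tail of the singular values:

\begin{verbatim}
# Truncated SVD gives the best rank-k approximation (in both norms):
# Frobenius error = sqrt(sum of squared tail singular values),
# spectral error  = sigma_{k+1}.
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((20, 30))
U, s, Vt = np.linalg.svd(A, full_matrices=False)

k = 5
A_k = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]   # sum of the top-k rank-1 terms
err = np.linalg.norm(A - A_k, "fro")
print(np.isclose(err, np.sqrt((s[k:] ** 2).sum())))   # True
print(np.isclose(np.linalg.norm(A - A_k, 2), s[k]))   # spectral error, True
\end{verbatim}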
\section{PCA, Optimal squared loss dimension reduction}

Given a set of $n$ vectors $x_1,\ldots,x_n$ in $\R^{m}$, we look for a rank $k$ projection matrix $P \in \R^{m \times m}$ that minimizes:
\[
\sum_{i=1}^{n} ||Px_{i} - x_{i}||_{2}^{2}
\]
If we denote by $A$ the matrix whose $i$'th column is $x_i$, then this is equivalent to minimizing $||PA - A||_{F}^{2}$.
Since the best possible rank $k$ approximation to the matrix $A$ is $A_{k} = \sum_{i=1}^{k}\sigma_{i}u_{i}v_{i}^{T}$, the best possible solution is a projection $P$ for which $PA = A_{k}$. This is achieved by $P = U_{k}U_{k}^{T}$ where $U_{k}$ is the matrix whose columns are the first $k$ left singular vectors of $A$.

If we define $y_i = U_{k}^{T}x_{i}$ we see that the values $y_i \in \R^{k}$ are optimally fitted to the set of points $x_i$ in the sense that they minimize:
\[
\min_{y_1,\ldots,y_n } \min_{\Psi \in \R^{m \times k}}\sum_{i=1}^{n} ||\Psi y_i - x_{i}||_{2}^{2}
\]
The mapping $x_i \rightarrow U_{k}^{T}x_i = y_i $ thus reduces the dimension of any set of points $x_1,\ldots,x_n$ in $\R^{m}$ to a set of points $y_1,\ldots,y_n$ in $\R^{k}$ optimally in the squared loss sense. This is commonly referred to as Principal Component Analysis (PCA).

\begin{center}
\includegraphics[width=0.6\textwidth]{images/pca.png}
\end{center}
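\noindent Here is a short \texttt{numpy} sketch of the above (the synthetic data, with an assumed approximate rank $k$ plus noise, is an arbitrary choice for illustration). It forms $y_i = U_k^T x_i$ and checks that the squared loss equals the sum of the squared tail singular values.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
m, n, k = 20, 200, 3
# columns of A are the points x_i: approximately rank-k data plus small noise
A = rng.standard_normal((m, k)) @ rng.standard_normal((k, n)) \
    + 0.01 * rng.standard_normal((m, n))

U, s, Vt = np.linalg.svd(A, full_matrices=False)
U_k = U[:, :k]
Y = U_k.T @ A                                  # the k-dimensional points y_i
err = np.linalg.norm(A - U_k @ Y, "fro") ** 2  # squared loss of the projection
print(err, np.sum(s[k:] ** 2))                 # the two quantities coincide
\end{verbatim}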
\section{Closest orthogonal matrix}
The SVD also allows us to find the orthogonal matrix that is closest to a given matrix. Again, suppose that $A = U \Sigma V^{T}$ and $W$ is an orthogonal matrix that minimizes $\norm{A-W}^{2}_{F}$ among all orthogonal matrices. Now,
\begin{equation*}
\norm{U \Sigma V^{T} - W}_{F}^{2} = \norm{U \Sigma V^{T} - UU^{T} W VV^{T}}_{F}^{2} = \norm{\Sigma - \tilde{W}}_{F}^{2},
\end{equation*}
where $\tilde{W}=U^{T} W V$ is another orthogonal matrix. We need to find the orthogonal matrix $\tilde{W}$ that is closest to $\Sigma$. Alternatively, we need to minimize $\norm{\tilde{W}^{T} \Sigma - I}_{F}^{2}$.

If $U$ is orthogonal and $D$ is diagonal and positive, then by Cauchy--Schwarz
\begin{equation}\label{eq1}
\begin{aligned}
\operatorname{trace} (UD) &= \sum_{i,k} u_{ik} d_{ki} \leq \sum _{i}
\left ( \left ( \sum_{k} u_{ik}^{2} \right )^{1/2} \left ( \sum_{k}
d_{ki}^{2} \right )^{1/2} \right ) \\
&= \sum_{i} \left ( \sum_{k} d_{ki}^{2} \right )^{1/2} = \sum_{i}
\left ( d_{ii}^{2} \right )^{1/2} = \sum_{i} d_{ii} =
\operatorname{trace}(D).
\end{aligned}
\end{equation}
Now
\begin{align*}
\norm{\tilde{W}^{T} \Sigma - I}_{F}^{2} &= \operatorname{trace}
\left ( \left( \tilde{W}^{T} \Sigma - I \right ) \left(
\tilde{W}^{T} \Sigma - I \right )^{T} \right ) \\
&= \operatorname{trace} \left ( \left( \tilde{W}^{T} \Sigma - I
\right ) \left( \Sigma \tilde{W} - I \right ) \right ) \\
&= \operatorname{trace} \left ( \tilde{W}^{T} \Sigma^{2} \tilde{W}
\right ) - \operatorname{trace} \left ( \tilde{W}^{T} \Sigma \right
) - \operatorname{trace} \left ( \Sigma \tilde{W} \right ) + n \\
&= \operatorname{trace} \left ( \left ( \Sigma \tilde{W} \right
)^{T} \left ( \Sigma \tilde{W} \right ) \right ) - 2
\operatorname{trace} \left (\Sigma \tilde{W} \right ) + n \\
&= \norm{\Sigma \tilde{W}}_{F}^{2} - 2 \operatorname{trace} \left
(\Sigma \tilde{W} \right ) + n \\
&= \norm{\Sigma }_{F}^{2} - 2 \operatorname{trace} \left (\Sigma
\tilde{W} \right ) + n.
\end{align*}
Thus, we need to maximize $\operatorname{trace} \left (\Sigma \tilde{W} \right )$. But this is maximized by $\tilde{W} = I$ by \eqref{eq1}. Thus, the best approximating matrix is $W=UV^{T}$.
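\noindent A minimal \texttt{numpy} sketch of this result (dimensions and the number of random trials are arbitrary): it computes $W = UV^{T}$ and, as a crude sanity check, compares its error to that of randomly drawn orthogonal matrices.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 5))
U, s, Vt = np.linalg.svd(A)
W = U @ Vt                                   # closest orthogonal matrix

print(np.allclose(W.T @ W, np.eye(5)))       # W is indeed orthogonal
# brute-force check: W beats random orthogonal matrices (QR of a Gaussian)
best_random = min(
    np.linalg.norm(A - np.linalg.qr(rng.standard_normal((5, 5)))[0], "fro")
    for _ in range(2000))
print(np.linalg.norm(A - W, "fro"), best_random)
\end{verbatim}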
\section{Computing the SVD: The power method}

We give a simple algorithm for computing the Singular Value Decomposition of a matrix $A \in \R^{m \times n}$. We start by computing the first singular value $\sigma_1$ and the corresponding left and right singular vectors $u_1$ and $v_1$ of $A$, assuming a spectral gap $\lambda$, that is, $\log(\sigma_1/\sigma_i) \ge \lambda$ for all $\sigma_i < \sigma_1$. The algorithm picks a random vector $x^{0}$ with i.i.d.\ entries $x^{0}(i) \sim \N(0,1)$ and repeatedly multiplies it by $A^{T}A$, so that $x^{s} = (A^{T}A)^{s}x^{0}$; it then returns $v_1 = x^{s}/\norm{x^{s}}$, $\sigma_1 = \norm{Av_1}$, and $u_1 = Av_1/\sigma_1$. To see why this works, write $x^{0}$ in the basis of right singular vectors, $x^{0} = \sum_{i}\alpha^{0}_{i}v_{i}$, and note that $x^{s} = \sum_{i}\alpha^{0}_{i}\sigma_{i}^{2s}v_{i}$. The coefficient of $v_1$ in $x^{s}$ is therefore larger than that of $v_i$ by a factor of
\[
\frac{|\alpha^{0}_{1}|}{|\alpha^{0}_{i}|}\left(\frac{\sigma_1}{\sigma_i}\right)^{2s}.
\]
Demanding that the error in the estimation of $\sigma_1$ is less than $\eps$ gives the requirement on $s$:
\begin{eqnarray}
\frac{|\alpha^{0}_{1}|}{|\alpha^{0}_{i}|}\left(\frac{\sigma_1}{\sigma_i}\right)^{2s} &\ge& \frac{n}{\eps}\\
s &\ge& \frac{\log(n|\alpha^{0}_{i}|/\eps|\alpha^{0}_{1}|)}{2\log(\sigma_1/\sigma_i)}
\end{eqnarray}
From the two-stability of the Gaussian distribution we have that $\alpha^{0}_{i} \sim \N(0,1)$. Therefore, $\Pr[|\alpha^{0}_{i}| > t] \le 2e^{-t^2/2}$, which gives that with probability at least $1-\delta/2$ we have, for all $i$, $|\alpha^{0}_{i}| \le \sqrt{2\log(4n/\delta)}$. Also, $\Pr[|\alpha^{0}_{1}| \le \delta/4 ] \le \delta/2$ (this is because $\Pr[|z| < t] \le 2t\cdot\max_{r}\psi_{z}(r)$ for any distribution, and the density $\psi$ of the normal distribution attains its maximum at zero, where it is less than $1/2$). Thus, with probability at least $1-\delta$ we have, for all $i$, $\frac{|\alpha^{0}_{i}|}{|\alpha^{0}_{1}|} \le \frac{\sqrt{2\log(4n/\delta)}}{\delta/4}$. Combining all of the above, it is sufficient to set $s = \log\left(4n\sqrt{2\log(4n/\delta)}/\eps\delta\right)/2\lambda = O(\log(n/\eps\delta)/\lambda)$ in order to get $\eps$ precision with probability at least $1-\delta$.

We now describe how to extend this to a full SVD of $A$. Since we have computed $(\sigma_1,u_1,v_1)$, we can repeat this procedure for $A - \sigma_{1}u_{1}v_{1}^{T} = \sum_{i=2}^{r}{\sigma_{i}u_{i}v_{i}^{T}}$, whose top singular value and vectors are $(\sigma_2,u_2,v_2)$. Thus, computing the rank-k approximation of $A$ requires $O(mnks) = O(mnk\log(n/\eps\delta)/\lambda)$ operations. This is because computing $A^{T}Ax$ requires $O(mn)$ operations, and this is performed $s$ times for each of the first $k$ singular values and vectors.

The main problem with this algorithm is that its running time is heavily influenced by the value of the gap $\lambda$. This is, in fact, an artifact of the analysis rather than of the algorithm. Next, we see a gap independent analysis.
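\noindent A minimal sketch of the power method described above (the iteration count and seeds are arbitrary; a real implementation would choose $s$ according to the analysis, and the per-step normalization is only there to avoid numerical overflow).
\begin{verbatim}
import numpy as np

def power_method(A, s, seed=0):
    """Estimate the top singular triple of A with s power iterations."""
    rng = np.random.default_rng(seed)
    x = rng.standard_normal(A.shape[1])       # x^0 with i.i.d. N(0,1) entries
    for _ in range(s):
        x = A.T @ (A @ x)                     # multiply by A^T A
        x /= np.linalg.norm(x)                # renormalize for stability
    v1 = x
    sigma1 = np.linalg.norm(A @ v1)
    u1 = A @ v1 / sigma1
    return sigma1, u1, v1

A = np.random.default_rng(1).standard_normal((100, 50))
sigma1, u1, v1 = power_method(A, 500)         # many iterations: the gap is small
print(sigma1, np.linalg.svd(A, compute_uv=False)[0])  # should nearly match
\end{verbatim}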
\section{Gap independent analysis}
We show a short proof from \cite{liberty2016short} of a spectral gap independent property of simultaneous iterations. It follows similar analyses in \cite{RokhlinST09,HalkoMT2011,MuscoM15,WittenE15}.

\begin{lemma} Let $A \in \R^{n \times m}$ be an arbitrary matrix and let $G \in \R^{m \times k}$ be a matrix of i.i.d.\ random Gaussian entries.
Let $t = c\cdot \log(n/\eps)/\eps$ and $Z = \operatorname{span}((AA^T)^t A G)$. Then
\[
||A - ZZ^TA|| \le (1+\eps)\sigma_{k+1}
\]
with high probability, depending only on the universal constant $c$.
\end{lemma}
\begin{proof}
$||A - ZZ^TA|| = \max_{x :\|x\|=1} \|x^T A\|$ such that $\|x^TZ\| = 0$.
We change variables $A = USV^T$, $x = Uy$, and $G' = V^TG$.
Note that $G'$ is also a matrix of i.i.d.\ Gaussian entries because $V$ is orthogonal.
This reduces the problem to $\max_{y:\|y\|=1} \|y^TS\|$ such that $y^TS^{2t+1}G' = 0$.
We now break $y$, $S$, and $G'$ into two blocks each such that
\[
y =
\left(\begin{array}{c}
y_1 \\ \hline
y_2 \\
\end{array}\right)
%
\mbox{,\;\;}
%
S = \left(\begin{array}{c|c}
S_1 & 0 \\ \hline
0 & S_2 \\
\end{array}\right)
%
\mbox{,\;\;}
%
G' = \left(\begin{array}{c}
G'_1 \\ \hline
G'_2 \\
\end{array}\right)
\]
and $y_1 \in \R^{k}$, $y_2 \in \R^{n-k}$, $S_1 \in \R^{k \times k}$, $S_2 \in \R^{(n-k) \times (n-k)}$, $G'_1 \in \R^{k \times k}$, and $G'_2 \in \R^{(n-k) \times k}$.
\begin{eqnarray*}
0 &=& \|y^T S^{2t+1} G'\| = \|y_1^T S^{2t+1}_1 G'_1+ y_2^T S^{2t+1}_2 G'_2\| \\
&\ge& \|y_1^T S^{2t+1}_1 G'_1\| - \|y_2^T S^{2t+1}_2 G'_2\| \\
&\ge& \|y_1^T S^{2t+1}_1\|/\|G'^{-1}_1\| - \|y_2^T\| \cdot \|S^{2t+1}_2 \| \cdot \|G'_2\| \\
&\ge& |y_1(i)| \sigma_{i}^{2t+1}/\|G'^{-1}_1\| - \sigma_{k+1}^{2t+1} \cdot \|G'_2\| \ .
\end{eqnarray*}
\noindent This gives that $|y_1(i)| \le (\sigma_{k+1}/\sigma_i)^{2t+1}\|G'_2\| \|G'^{-1}_1\|$. Equipped with this inequality we bound the expression $\|y^TS\|$.
Let $k' \le k$ be such that $\sigma_{k'} \ge (1+\eps)\sigma_{k+1}$ and $\sigma_{k'+1} < (1+\eps)\sigma_{k+1}$.
\begin{eqnarray}
||A - ZZ^TA||^2 &=& \|y^TS\|^2 = \sum_{i=1}^{k'}y^2_i \sigma_i^2 + \sum_{i=k'+1}^{n}y^2_i \sigma_i^2 \\
&\le& \left( \|G'_2\|^2 \|G'^{-1}_1\|^2 \sum_{i=1}^{k'}(\sigma_{k+1}/\sigma_i)^{4t} \sigma_{k+1}^2 \right) + (1+\eps)\sigma_{k+1}^2 \\
&\le& \left[ \|G'_2\|^2 \|G'^{-1}_1\|^2 k (1/(1+\eps))^{4t} + (1+\eps)\right]\sigma_{k+1}^2 \le (1+2\eps)\sigma_{k+1}^2
\end{eqnarray}
The last step is correct as long as $\|G'_2\|^2 \|G'^{-1}_1\|^2 k (1/(1+\eps))^{4t} \le \eps$, which holds for $t \ge \log(\|G'_2\|^2 \|G'^{-1}_1\|^2 k/\eps) /4\log(1+\eps) = O(\log(n/\eps)/\eps)$. The last inequality uses the fact that $G'_1$ and $G'_2$ are random Gaussian matrices, due to the rotational invariance of the Gaussian distribution; this gives $\|G'_2\|^2 \|G'^{-1}_1\|^2 = O(\operatorname{poly}(n))$ with high probability \cite{Rudelson08}.
Finally, $||A - ZZ^TA|| \le \sqrt{1+2\eps}\cdot\sigma_{k+1} \le (1+\eps)\sigma_{k+1}$.
\end{proof}
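\noindent Here is a minimal sketch of simultaneous (subspace) iteration as analyzed above. The QR re-orthonormalization between multiplications is a standard numerical-stability choice and does not change the span; sizes and the iteration count are arbitrary.
\begin{verbatim}
import numpy as np

def subspace_iteration(A, k, t, seed=0):
    """Return Z, an orthonormal basis of span((A A^T)^t A G), G Gaussian."""
    rng = np.random.default_rng(seed)
    G = rng.standard_normal((A.shape[1], k))
    Y = A @ G
    for _ in range(t):
        Y, _ = np.linalg.qr(Y)        # keep the columns well conditioned
        Y = A @ (A.T @ Y)             # one (A A^T) multiplication
    Z, _ = np.linalg.qr(Y)
    return Z

A = np.random.default_rng(1).standard_normal((200, 100))
k = 10
Z = subspace_iteration(A, k, 8)
err = np.linalg.norm(A - Z @ (Z.T @ A), 2)
print(err, np.linalg.svd(A, compute_uv=False)[k])  # err close to sigma_{k+1}
\end{verbatim}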
\section{Random-projection}

We will give a simple proof of the following, rather amazing, fact. Every set of $n$ points in a Euclidean space (say of dimension $d$) can be embedded into a Euclidean space of dimension $k = O(\log(n)/\eps^2)$ such that all pairwise distances are preserved up to distortion $1\pm \eps$. We will prove the construction of \cite{DasGuptaGupta99}, which is simpler than the one in \cite{JL84}.

We will argue that a certain distribution over the choice of a matrix $R \in \R^{k \times d}$ gives that:
\begin{equation}
\label{e1}
\forall x \in \Sph^{d-1} \;\; \Pr\left[ \left| ||\frac{1}{\sqrt{k}}Rx|| - 1 \right| > \eps \right] \le \frac{1}{n^2}
\end{equation}
Before we pick this distribution and show that Equation~\ref{e1} holds for it, let us first see why it implies the opening statement.

Consider a set of $n$ points $x_1,\ldots, x_n$ in Euclidean space $\R^d$. Embedding these points into a lower dimension while preserving all distances between them up to distortion $1\pm \eps$ means approximately preserving the norms of all ${n \choose 2}$ vectors $x_i - x_j$. Assuming Equation~\ref{e1} holds, the union bound gives that this property fails for at least one $x_i - x_j$ pair with probability at most ${n \choose 2}\frac{1}{n^2} \le 1/2$. This means that all ${n \choose 2}$ pairwise distances are preserved up to distortion $1 \pm \eps$ with probability at least $1/2$.

\section{Matrices with normally distributed independent entries}
We consider the distribution of matrices $R$ such that each $R(i,j)$ is drawn independently from a normal distribution with mean zero and variance $1$, $R(i,j) \sim \N(0,1)$. We show that for this distribution Equation~\ref{e1} holds for some $k \in O(\log(n)/\eps^2)$.

First consider the random variable $z = \sum_{j=1}^{d}r(j)x(j)$ where $r(j) \sim \N(0,1)$. To understand how the variable $z$ is distributed we recall the two-stability of the normal distribution. Namely, if $z_3 = z_1 + z_2$, $z_1 \sim \N(\mu_1,\sigma_{1})$ and $z_2 \sim \N(\mu_2,\sigma_{2})$, then $$z_3 \sim \N(\mu_1 + \mu_2,\sqrt{\sigma^{2}_{1} + \sigma^{2}_{2}}).$$
In our case, $r(j)x(j) \sim \N(0,|x(j)|)$ and therefore $z = \sum_{j=1}^{d}r(j)x(j) \sim \N(0,\sqrt{\sum_{j=1}^{d}x^{2}(j)}) = \N(0,1)$.
%
Now, note that each entry of the vector $Rx$ is distributed exactly like $z$. Taking $k$ independent copies $z_1,\ldots,z_k$ of $z$, we get that $||\frac{1}{\sqrt{k}}Rx||$ is distributed exactly like $\sqrt{\frac{1}{k}\sum_{i=1}^{k} z^{2}_{i}}$. Thus, proving Equation~\ref{e1} reduces to showing that
\begin{equation}
\Pr\left[ \left| \sqrt{\frac{1}{k}\sum_{i=1}^{k} z^{2}_{i}} - 1 \right| > \eps \right] \le \frac{1}{n^2}
\end{equation}
for a set of independent normal random variables $z_1,\ldots,z_k \sim \N(0,1)$.
It is sufficient to demand that $\Pr[\sum_{i=1}^{k} z^{2}_{i} \ge k(1+\eps)^2]$ and $\Pr[\sum_{i=1}^{k} z^{2}_{i} \le k(1-\eps)^2]$ are both smaller than $1/2n^2$.
We start by bounding the probability that $\sum_{i=1}^{k} z^{2}_{i} \ge k(1+\eps)$ (this is enough because $k(1+\eps) < k(1+\eps)^2$). For any $\lambda > 0$, by Markov's inequality,
\[
\Pr[\sum z^{2}_{i} \ge k(1+\eps)] = \Pr[e^{\lambda \sum z^{2}_{i}} \ge e^{\lambda k (1+\eps)}] \le (\E[e^{\lambda z^2}])^k/e^{\lambda k (1+\eps)}
\]
Since $z \sim \N(0,1)$ we can compute $\E[e^{\lambda z^2}]$ exactly:
\[
\E [e^{\lambda z^{2}}] = \frac{1}{\sqrt{2\pi}}\int_{-\infty}^{\infty} e^{\lambda t^{2}} e^{-\frac{t^{2}}{2}} dt =\frac{1}{\sqrt{2\pi}}\int_{-\infty}^{\infty} e^{-\frac{(t\sqrt{1-2\lambda})^{2}}{2}}dt = \frac{1}{\sqrt{1-2\lambda}}
\]
The final step is by substituting $t' = t\sqrt{1-2\lambda}$ and recalling that $\frac{1}{\sqrt{2\pi}}\int_{-\infty}^{\infty} e^{-\frac{t'^{2}}{2}}dt' = 1$.
Finally, using the fact that $\frac{1}{\sqrt{1-2\lambda}} \le e^{\lambda + 2\lambda^2}$ for $\lambda \in [0,1/4]$, we have:
\[
\E [e^{\lambda z^{2}}] \le e^{\lambda + 2\lambda^2}
\]
Substituting this into the bound above, with $\lambda = \eps/4$, we have:
\[
\Pr[\sum_{i=1}^{k} z^{2}_{i} \ge k(1+\eps)] \le e^{k(\lambda + 2\lambda^2) - k\lambda (1+\eps)} = e^{ 2k\lambda^2 - k\lambda\eps} = e^{ - k\eps^2/8}
\]
Finally, our condition that
\[
\Pr[\sum_{i=1}^{k} z^{2}_{i} \ge k(1+\eps)] \le e^{ - k\eps^2/8} \le 1/2n^2
\]
is achieved by $k = c\log(n)/\eps^2$. Calculating for $\Pr[\sum_{i=1}^{k} z^{2}_{i} \le k(1-\eps)]$ in the same manner shows that $k = c\log(n)/\eps^2$ is also sufficient for this case. This completes the proof.
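\noindent The construction is a few lines of \texttt{numpy}. The constant $8$ in the choice of $k$ below stands in for the unspecified constant $c$, and the sizes are arbitrary; this is a demonstration, not a proof.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, d, eps = 100, 1000, 0.5
k = int(np.ceil(8 * np.log(n) / eps**2))   # k = O(log(n)/eps^2)

X = rng.standard_normal((n, d))            # n points in R^d (rows)
R = rng.standard_normal((k, d))            # Gaussian projection matrix
Y = (X @ R.T) / np.sqrt(k)                 # projected points in R^k

# check distortion over all n-choose-2 pairwise distances
i, j = np.triu_indices(n, 1)
orig = np.linalg.norm(X[i] - X[j], axis=1)
proj = np.linalg.norm(Y[i] - Y[j], axis=1)
print((proj / orig).min(), (proj / orig).max())  # within [1-eps, 1+eps] w.h.p.
\end{verbatim}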
\section{Fast Random Projections}
We discussed in class the fact that random projection matrices cannot, in general, be made sparse. This is because projecting sparse vectors while preserving their norms requires the projecting matrix to be almost fully dense; see also \cite{JelaniH2012} and \cite{KaneN12}.

But, the question is, can we actively make sure that $x$ is not sparse? And if so, can we achieve a sparse random projection for non-sparse vectors? These two questions received a positive answer in the seminal work of Ailon and Chazelle \cite{AilonCh06}. The results of \cite{AilonCh06} were improved and simplified over the years; see \cite{AilonL11} for the latest result and an overview.

In this lesson we will produce a very simple algorithm based on the ideas in \cite{AilonCh06}. This algorithm requires a target dimension of $O(\log^2(n)/\eps^2)$ instead of $O(\log(n)/\eps^2)$ but is much simpler to analyze.

\subsection{Fast vector $\ell_4$ norm reduction}
The goal of this subsection is to devise a linear mapping which preserves vectors' $\ell_2$ norms but reduces their $\ell_4$ norms with high probability. This will work to our advantage because, intuitively, vectors whose $\ell_4$ norm is small cannot be too sparse. For this we will need to learn what Hadamard matrices are.

Hadamard matrices are commonly used in coding theory and are conceptually close to Fourier matrices. We assume for convenience that $d$ is a power of $2$ (otherwise we can pad our vectors with zeros). The Walsh--Hadamard transform of a vector $x \in \R^{d}$ is the result of the matrix-vector multiplication $Hx$, where $H$ is a $d \times d$ matrix whose entries are $H(i,j) = \frac{1}{\sqrt{d}}(-1)^{\langle i,j\rangle}$. Here ${\langle i,j\rangle}$ denotes the dot product over $F_2$ of the bit representations of $i$ and $j$ as binary vectors of length $\log(d)$.
Another way to view this is to define Hadamard matrices recursively:
\begin{equation*} %
H_{1} = \frac{1}{\sqrt{2}}\left(
\begin{array}{rr}
1 & 1 \\
1 & -1\\
\end{array}
\right)
,\;\;
H_{d} = \frac{1}{\sqrt{2}}\left(
\begin{array}{r:r}
H_{d/2} & H_{d/2} \\ \hdashline
H_{d/2} & -H_{d/2}\\
\end{array}
\right)
\end{equation*} %
Here are a few interesting (and easy to show) facts about Hadamard matrices.
\begin{enumerate}
\item $H_d$ is a unitary matrix: $\|Hx\| = \|x\|$ for any vector $x\in \R^d$.
%\item $H_{d}(i,j) \in \{ \frac{1}{\sqrt{d}},- \frac{1}{\sqrt{d}}\}$
\item Computing $x \mapsto Hx$ requires $O(d\log(d))$ operations.
\end{enumerate}
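\noindent The second fact in code: a short (and naive, but still $O(d\log d)$) sketch of the fast Walsh--Hadamard transform. The normalization by $\sqrt{d}$ at the end makes the transform unitary, matching the definition above.
\begin{verbatim}
import numpy as np

def fwht(x):
    """Fast Walsh-Hadamard transform, O(d log d), normalized to be unitary."""
    x = np.asarray(x, dtype=float).copy()
    d = len(x)                      # assumed to be a power of 2
    h = 1
    while h < d:
        for i in range(0, d, 2 * h):
            a = x[i:i + h].copy()
            b = x[i + h:i + 2 * h].copy()
            x[i:i + h] = a + b      # butterfly step
            x[i + h:i + 2 * h] = a - b
        h *= 2
    return x / np.sqrt(d)

x = np.random.default_rng(0).standard_normal(8)
print(np.linalg.norm(fwht(x)), np.linalg.norm(x))  # equal norms: H is unitary
\end{verbatim}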
We also define a diagonal matrix $D$ such that $D(i,i) \in \{1,-1\}$ uniformly at random. Clearly, we have that $\|HDx\|_2 = \|x\|_2$ since both $H$ and $D$ are isometries. Let us now bound $\|HDx\|_\infty$. Note that $(HDx)(1) = \sum_{i=1}^{d}H(1,i)D(i,i) x_i = \sum_{i=1}^{d}\frac{x_i}{\sqrt{d}}s_i$, where $s_i \in \{-1,1\}$ uniformly (and the same holds for every other coordinate). To bound this we recap Hoeffding's inequality.
\begin{fact}[Hoeffding's inequality]
Let $X_1,\ldots,X_n$ be independent random variables s.t.\ $X_i \in [a_i,b_i]$. Let $X = \sum_{i=1}^{n} X_i$. Then
\begin{equation}
\Pr[|X - \E[X]| \ge t] \le 2e^{-\frac{2 t^2}{\sum_{i=1}^{n} (b_i -a_i)^2}}
\end{equation}
\end{fact}
Invoking Hoeffding's inequality with $t = \sqrt{\frac{c \log(n)}{d}}$, and then taking the union bound over all coordinates and all points, we get that $\|HDx\|_\infty \le \sqrt{\frac{c \log(n)}{d}}$ for all $n$ points $x$ simultaneously, with high probability. Remark: for this we assumed $\log(d) = O(\log(n))$; otherwise we should have had $\log(nd)$ in the bound. The situation where the dimension is super-polynomial in the number of points is, however, unlikely; usually we have $n > d$.

\begin{lemma}
Let $x \in \R^d$ be such that $\|x\|=1$. Then
\[
\|HDx\|^4_4 = O(\log(n)/d)
\]
with probability at least $1-1/\poly(n)$.
\end{lemma}
\begin{proof}
Let us define $y = HDx$ and $z_i = y_i^2$.
From the above we have that $z_i \le \frac{c \log(n)}{d} = \eta$ for all $i$, with probability at least $1-1/\poly(n)$.
The quantity $\|HDx\|^4_4 = \|y\|_{4}^{4} = \sum_{i}z_i^2$ is a convex function of the $z$ variables, defined over the polytope $z_i \in [0,\eta]$, $\sum_{i} z_i = 1$ (the latter because $\|y\|_2^2 = 1$).
Its maximal value is therefore attained at an extreme point of this polytope, for example the point with $1/\eta$ coordinates equal to $\eta$ and the rest zero, $z = [\eta,\eta,\ldots,\eta,0,0,\ldots,0]$.
Computing the value of the function at this point gives $\sum_{i}z_i^2 \le (1/\eta)\cdot (\eta^2) = \eta$. Recalling that $\eta = \frac{c \log(n)}{d}$ completes the proof.
\end{proof}

\subsection{Sampling from vectors with low $\ell_4$ norms}
Here we prove a very simple fact: for vectors whose $\ell_4$ norm is low, dimensionality reduction can be obtained by sampling coordinates.

Let $y$ be a vector such that $\|y\|_2 = 1$. Let $z$ be a sampled version of $y$ such that $z_i = y_i/\sqrt{p}$ with probability $p$ and $z_i = 0$ otherwise. This is akin to sampling, in expectation, $d\cdot p$ coordinates from $y$ (and scaling them up by $1/\sqrt{p}$). Note that $\E[\|z\|^2] = \|y\|^2 = 1$. Moreover,
\[
\Pr[|\|z\|^2 - 1| > \eps] = \Pr[|\sum z_i^2 - 1| > \eps] = \Pr[|\sum b_i y_i^2/p - 1| > \eps]
\]
where the $b_i$ are independent indicator variables taking $b_i = 1$ with probability $p$ and $b_i = 0$ otherwise.
To apply Chernoff's bound we must assert that $y_i^2/p \le 1$. Indeed, from the previous section $y_i^2 \le c\log(n)/d$ with high probability, while we will take $p = c\log^2(n)/d\eps^2$, so $y_i^2/p \le \eps^2/\log(n) \le 1$.
Applying Chernoff's bound we get
\[
\Pr[|\sum b_i y_i^2/p - 1| > \eps] \le e^{-\frac{c\eps^2}{\sigma^2}}
\]
where $\sigma^2 = \sum_{i} \E[(b_i y_i^2/p)^2] = \|y\|_{4}^{4}/p$.
Concluding,
\[
\Pr[|\|z\|^2 - 1| > \eps] \le e^{-\frac{cp\eps^2}{\|y\|_4^4}}.
\]
This shows that the concentration of the sampling procedure depends directly on the $\ell_4$ norm of the sampled vector.
If we plug in the bound $\|y\|_4^4 = \|HDx\|_4^4 = O(\log(n)/d)$ from the previous section we get
\[
\Pr[|\|z\|^2 - 1| > \eps] \le e^{-\frac{cp\eps^2 d}{\log(n)}} \le \frac{1}{\poly(n)}
\]
for some $p \in O(\log^2(n)/d\eps^2)$.
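\noindent A sketch of the $HD$ spreading step followed by coordinate sampling, on a maximally sparse unit vector. It uses \texttt{scipy.linalg.hadamard} (an assumed dependency; the dense matrix is for clarity only) and a fixed sampling rate $p$ chosen for illustration rather than by the formula above.
\begin{verbatim}
import numpy as np
from scipy.linalg import hadamard

rng = np.random.default_rng(0)
d = 1024
x = np.zeros(d); x[0] = 1.0            # the worst case: a 1-sparse unit vector

H = hadamard(d) / np.sqrt(d)           # normalized Walsh-Hadamard matrix
D = rng.choice([-1.0, 1.0], size=d)    # random signs
y = H @ (D * x)                        # y = HDx

print(np.abs(y).max())                 # mass is spread: |y_i| = 1/sqrt(d) here

p = 0.25                               # illustrative; notes: p = c log^2(n)/(d eps^2)
keep = rng.random(d) < p
z = np.where(keep, y / np.sqrt(p), 0.0)
print(np.linalg.norm(z))               # concentrates around ||x|| = 1
\end{verbatim}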
\subsection{Random Projection by Sampling}
Putting it all together we obtain the following.
\begin{lemma}
Define the following matrices:
\begin{itemize}
\item $D$: a diagonal matrix such that $D_{i,i} \in \{+1,-1\}$ uniformly at random.
\item $H$: the $d\times d$ Walsh--Hadamard transform matrix.
\item $P$: a `sampling matrix' which contains each row of the matrix $\frac{1}{\sqrt{p}}\cdot I_d$ independently with probability $p= c\log^2(n)/d\eps^2$.
\end{itemize}
Then, with at least constant probability the following holds.
\begin{enumerate}
\item The target dimension of the mapping is $k = c\log^2(n)/\eps^2$ (a factor $\log(n)$ worse than optimal).
\item The mapping $x \mapsto PHDx$ is a $(1\pm\eps)$-distortion mapping for any set of $n$ points.
That is, for any set $x_1,\ldots,x_n \in \R^d$ we have
\[
\|x_i -x_j\|(1-\eps) \le \|PHDx_i - PHDx_j\| \le \|x_i -x_j\|(1+\eps)
\]
\item Storing $PHD$ requires at most $O(d + k\log(d))$ space.
\item Applying the mapping $x \mapsto PHDx$ requires at most $O(d\log(d))$ floating point operations.
\end{enumerate}
\end{lemma}


\bibliographystyle{plain}
\bibliography{vs}

\end{document}
%%%%%%%%
-------------------------------------------------------------------------------- /class_notes/Class_06_aproximate_nearest_neighbor_search.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_06_aproximate_nearest_neighbor_search.pdf -------------------------------------------------------------------------------- /class_notes/Class_06_aproximate_nearest_neighbor_search.tex: --------------------------------------------------------------------------------
\documentclass{article}
\usepackage{vs}
\begin{document}

\lecturetitle{Class 6 - Approximate Nearest Neighbor Search}

\section{Locality Sensitive Hashing}

In this section we will review ideas from \cite{Charikar02} and \cite{GionisIM99}.
We define a family $\mathcal{H}$ of functions as $(r_1,r_2,p_1,p_2)$-sensitive if:
\begin{eqnarray*}
|| x- y || < r_1 &\rightarrow& \Pr_{h \sim \mathcal{H}}(h(x)=h(y)) > p_1\\
|| x- y || > r_2 &\rightarrow& \Pr_{h \sim \mathcal{H}}(h(x)=h(y)) < p_2
\end{eqnarray*}
This is only meaningful if $r_1 < r_2$ and $p_1 > p_2$. This means that if $x$ and $y$ are ``close'' then the probability that they hash to the same value is at least $p_1$, but if they are far apart then it is at most $p_2$. In other words, the probability of two points being hashed to the same value decreases with their distance.

Let us assume such functions exist and give some intuition on how to use them.
First we concatenate $k$ different hash functions from $\mathcal{H}$ to construct a new hash function $g(x) = [h_1(x),\ldots,h_k(x)]$.
We choose $k$ such that $\Pr(g(x)=g(y)) \le 1/n$ if $||x-y|| > r_2$.
Using the $(r_1,r_2,p_1,p_2)$-sensitivity of $\mathcal{H}$ we will get that if $||x-y|| < r_1$ then $\Pr(g(x)=g(y)) \ge 1/n^{\rho}$ for some $\rho<1$.

Now, if we generate $\ell = n^{\rho}$ independent copies of $g$, namely $g_1,\ldots,g_\ell$, and consider every $x$ in the data for which $g_i(x)=g_i(q)$ for some $i$, we will find every close point $x$ with constant probability while considering only $O(n^\rho)$ far points.

Let us make this statement more precise. The preprocessing step is as follows.
\begin{enumerate}
\item $\rho \leftarrow \log(1/p_1)/\log(1/p_2)$
\item $\ell \leftarrow n^{\rho}$
\item $k \leftarrow \log(n)/\log(1/p_2)$
\item for $\ell' \in \{1,\ldots,\ell\}$
\item \tab $g_{\ell'} \leftarrow [h^{\ell'}_1,\ldots,h^{\ell'}_k]$, where the $h^{\ell'}_j$ are drawn independently from $\mathcal{H}$
\item for $x \in X$
\item \tab for $\ell' \in \{1,\ldots,\ell\} $
\item \tab \tab add $x$ to $T_{\ell'}(g_{\ell'}(x))$
\end{enumerate}

The search stage is as follows:
\begin{enumerate}
\item $S \leftarrow \emptyset$
\item for $\ell' \in \{1,\ldots,\ell\} $
\item \tab add $T_{\ell'}(g_{\ell'}(q))$ to $S$
\item if $|S| \le 2n^{\rho}$
\item \tab for $x' \in S$
\item \tab \tab if $||x' - q|| \le r_2$
\item \tab \tab \tab return $x'$
\end{enumerate}
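\noindent Before analyzing this scheme, here is a compact Python sketch of the two stages. The callable \texttt{hash\_family} is hypothetical: it is assumed to draw a fresh random $h \sim \mathcal{H}$ on each call (concrete families appear in the next section).
\begin{verbatim}
import random
from collections import defaultdict

def build_lsh(points, hash_family, k, ell):
    """Preprocessing: ell tables, each keyed by a concatenation of k hashes."""
    gs, tables = [], []
    for _ in range(ell):
        g = [hash_family() for _ in range(k)]            # g = (h_1, ..., h_k)
        table = defaultdict(list)
        for idx, x in enumerate(points):
            table[tuple(h(x) for h in g)].append(idx)
        gs.append(g)
        tables.append(table)
    return gs, tables

def query_lsh(q, gs, tables):
    """Search: collect the candidates in q's bucket across all ell tables."""
    S = set()
    for g, table in zip(gs, tables):
        S.update(table.get(tuple(h(q) for h in g), []))
    return S
\end{verbatim}
A real implementation would also cap the number of inspected candidates at $2n^{\rho}$ and verify distances, as in the pseudocode above.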
\begin{fact}
The number of points $x$ such that $||x-q|| \ge r_2$ and $x \in S$ is smaller than $2\cdot n^{\rho}$ with probability at least $1/2$.
\end{fact}
\begin{proof}
A point $x$ with $||x-q||>r_2$ enters $S$ if for some $\ell'$ we have $g_{\ell'}(q) = g_{\ell'}(x)$; for a single table this happens with probability $p_{2}^{\log(n)/\log(1/p_2)} = 1/n$. Thus, the expected number of such points per table is at most $1$. Since we have $\ell = n^{\rho}$ different $g$ functions, the total expected number of such points is at most $n^{\rho}$. Due to the above and Markov's inequality, $\Pr[|S| > 2n^{\rho}] \le \Pr[|S| > 2\E[|S|]] \le 1/2$.
\end{proof}

\begin{fact}
If $||x-q|| \le r_1$ then $x \in S$ with constant probability.
\end{fact}
\begin{proof}
By the $(r_1,r_2,p_1,p_2)$-sensitivity of $\mathcal{H}$,
\[
\Pr[g(x) = g(q)] \ge p_{1}^{k} = p_{1}^{\log(n)/\log(1/p_2)} = n^{-\log(1/p_1)/\log(1/p_2)} = n^{-\rho}.
\]
Since we repeat this $\ell = n^{\rho}$ times independently, we have that $g_{\ell'}(x) \not = g_{\ell'}(q)$ for all $\ell'$ with probability at most $(1-n^{-\rho})^{n^{\rho}} < e^{-1}$.
\end{proof}

Thus, both events happen simultaneously with probability at least $1 - 1/2 - e^{-1}$, a positive constant.
We can duplicate the entire data structure $O(\log(1/\delta))$ times to achieve success probability $1-\delta$, at the cost of an $O(\log(1/\delta))$ factor in data storage and search time.
This means that the search running time is $O(dn^{\rho})$.

\section{LSH functions}
\subsection{$\{0,1\}^d$ with the Hamming distance}
The Hamming distance between points $x,y\in \{0,1\}^d$ is defined as the number of coordinates in which $x$ and $y$ differ. We claim that choosing a random coordinate from each vector is a locality sensitive function and examine its parameters.
\begin{fact}
Let $\mathcal{H}$ be the family of $d$ functions $h_i(x) = x_i$.
Then, $\mathcal{H}$ is $(r,(1+\eps)r,1-\frac{r}{d},1-\frac{(1+\eps)r}{d})$-sensitive.
\end{fact}
\begin{fact}
If $r \le d/\log(n)$ then $\rho = \log(1/p_1)/\log(1/p_2) \le 1/(1+\eps)$.
\end{fact}
\begin{proof}
See Fact 3 in \cite{GionisIM99}. Moreover, assuming $r \le d/\log(n)$ is harmless since we can always extend each vector by $d\log(n)$ zeros, which does not change the distances and guarantees that $r \le d/\log(n)$.
\end{proof}

\begin{remark}
This result is also applicable to the Euclidean distance setting because it is possible to map $\ell_{2}^{d}$ into $\ell_{1}^{O(d)}$, and it is also trivially possible to map $\ell_{1}^{d}$ into $\{0,1\}^{O(d/\eps)}$ with distortion $\eps$ for bounded valued vectors.
\end{remark}

Thus, the running time of $O(n^{\rho})$ is in fact $O(n^{1/(1+\eps)})$. In other words, finding the closest neighbor up to a factor of $2$ in this distance is possible while examining only $O(\sqrt{n})$ data points. This, however, does not achieve the bound of $O(\poly(d,\log(n)))$.
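\noindent The bit-sampling family in code, a small sketch (dimension and the number of sampled hash functions are arbitrary). It estimates the collision probabilities empirically and matches the $1-\frac{\text{dist}}{d}$ behavior of the fact above.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
d = 64

def random_bit_hash():
    i = rng.integers(d)                   # h_i(x) = x_i for a random coordinate
    return lambda x: x[i]

x = rng.integers(0, 2, d)
y = x.copy(); y[:4] ^= 1                  # Hamming distance 4 from x
far = rng.integers(0, 2, d)               # distance roughly d/2 from x

hs = [random_bit_hash() for _ in range(10000)]
print(np.mean([h(x) == h(y) for h in hs]))    # ~ 1 - 4/64
print(np.mean([h(x) == h(far) for h in hs]))  # ~ 1/2
\end{verbatim}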
\subsection{Searching with similarities}
Note that in the above we never used the fact that the distance function is a metric. Indeed, it is possible to search through items as long as we can produce a locality sensitive hashing scheme for them. In \cite{Charikar02} Charikar defined locality sensitive hashing as:
\[
\Pr_{h}[h(x)=h(y)] = sim(x,y)
\]
For example, let $x$ and $y$ be sets of items. Their set similarity can be defined as $\frac{| x \cap y|}{|x \cup y|}$.
Here we can use a famous trick, known as min-hashing. We define $h(x) = \arg \min_{x_i \in x} g(x_i)$, where $g$ is a random permutation of the entire universe (or, for example, a random function into $[0,1]$). The reason this works is that $h(x) = h(y)$ exactly when the minimizer of $g$ over $x \cup y$ happens to lie in $x \cap y$, and since the distribution is uniform, the probability of this event is $\frac{| x \cap y|}{|x \cup y|}$.

\subsection{LSH for points in $\Sph^{d-1}$}
The set of unit length vectors in $\R^{d}$ is called the unit sphere and is denoted by $\Sph^{d-1}$ (the exponent is $d-1$ because it is actually a $(d-1)$-dimensional manifold; do not be confused, the points are still in $\R^d$).
For these points, we can define the distance as the angle between the vectors, $d(x,y) = \cos^{-1}(x^{T}y)$.
We can thus define a hash function $h(x) = \operatorname{sign}(u^{T}x)$ for a vector $u$ chosen uniformly at random from $\Sph^{d-1}$.
It is immediate to show that $h$ is locality sensitive with respect to the angular distance, with parameters similar to those in the previous subsection.
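\noindent A sketch of this sign-of-random-projection hash (often called SimHash). It relies on the well known fact that for two unit vectors the collision probability is $1 - \theta(x,y)/\pi$, where $\theta$ is the angle between them; the dimensions and sample counts below are arbitrary.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
d = 50

def simhash():
    u = rng.standard_normal(d)
    u /= np.linalg.norm(u)                 # uniform random direction
    return lambda x: np.sign(u @ x)

x = rng.standard_normal(d); x /= np.linalg.norm(x)
y = x + 0.1 * rng.standard_normal(d); y /= np.linalg.norm(y)

hs = [simhash() for _ in range(20000)]
collision = np.mean([h(x) == h(y) for h in hs])
theta = np.arccos(np.clip(x @ y, -1, 1))
print(collision, 1 - theta / np.pi)        # empirical vs. exact probability
\end{verbatim}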
\bibliographystyle{plain}
\bibliography{vs}

\end{document}
%%%%%%%%
-------------------------------------------------------------------------------- /class_notes/Class_07_clustering.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_07_clustering.pdf -------------------------------------------------------------------------------- /class_notes/Class_07_clustering.tex: --------------------------------------------------------------------------------
\documentclass{article}
\usepackage{vs}
\begin{document}

\lecturetitle{Class 7 - Clustering}

\section{K-means clustering}
\begin{definition}[$k$-means]
Given $n$ vectors $x_1,\ldots,x_n\in \R^d$ and an integer $k$, find $k$ points $c_1,\ldots,c_k \in \R^d$ which minimize the expression:
\[
f = \sum_{i \in [n]} \min_{j \in [k]} \|x_i - c_j \|^2
\]
\end{definition}
In words, we aim to find $k$ cluster centers. The cost is the sum of squared distances from each point to its closest cluster center.
$k$-means clustering and Lloyd's algorithm \cite{Lloyd82leastsquares} are probably the most widely used clustering objective and algorithm.
This is for three main reasons:
\begin{itemize}
\item The objective function is simple and natural.
\item Lloyd's algorithm (which we see below) is simple, efficient in practice, and often results in optimal or close to optimal results.
\item The results are easily interpretable and are often quite descriptive for real data sets.
\end{itemize}
In 1957 Stuart Lloyd suggested a simple alternating minimization algorithm which efficiently finds a local minimum for this problem.
This algorithm (a.k.a.\ Lloyd's algorithm) seems to work so well in practice that it is sometimes referred to as $k$-means or the $k$-means algorithm. A code sketch of it appears after the analysis below.

\begin{algorithm}
\caption{Lloyd's Algorithm}
\begin{algorithmic}
\STATE $c_1,\ldots,c_k \leftarrow$ randomly chosen centers
\WHILE {Objective function still improves}
\STATE $S_1,\ldots,S_k \leftarrow \emptyset$
\FOR {$i \in 1,\ldots,n$}
\STATE $j \leftarrow \arg\min_{j'}\|x_i- c_{j'}\|^2$
\STATE add $i$ to $S_j$
\ENDFOR
\FOR {$j \in 1,\ldots,k$}
\STATE $c_j \leftarrow \frac{1}{|S_j|}\sum_{i \in S_j} x_i$
\ENDFOR
\ENDWHILE
\end{algorithmic}
\end{algorithm}
\noindent This algorithm can be thought of as a potential function reducing algorithm.
The potential function is our objective function from above:
\[
f = \sum_{j \in [k]} \sum_{i \in S_j} \|x_i - c_j\|^2,
\]
where the sets $S_j$ are the sets of points to which $c_j$ is the closest center.
In each step of the algorithm the potential function is reduced. Let's examine that.
First, if the centers $c_j$ are fixed, the best assignment is clearly the one which assigns each data point to its closest center.
Then, if the sets $S_j$ are fixed, the optimal center is $c_j = \frac{1}{|S_j|}\sum_{i \in S_j} x_i$ (this can easily be seen by differentiating the cost function).
Therefore, moving $c_j$ to its optimal position can only reduce the potential function.
The algorithm therefore terminates in a local minimum.
There are only two remaining questions: one, whether the number of iterations until convergence is bounded; and two, whether we can guarantee that the solution is close to optimal.
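\noindent Here is a plain \texttt{numpy} sketch of Lloyd's algorithm (random initialization; a practical implementation would use a seeding procedure such as the ones discussed later in these notes).
\begin{verbatim}
import numpy as np

def lloyd(X, k, iters=100, seed=0):
    """Lloyd's algorithm on the rows of X; returns centers, labels, cost."""
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), k, replace=False)]
    for _ in range(iters):
        # assignment step: each point goes to its closest center
        d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1)
        labels = d2.argmin(1)
        # update step: each center moves to the mean of its cluster
        new = np.array([X[labels == j].mean(0) if np.any(labels == j)
                        else centers[j] for j in range(k)])
        if np.allclose(new, centers):
            break                          # objective stopped improving
        centers = new
    d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1)
    return centers, d2.argmin(1), d2.min(1).sum()
\end{verbatim}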
\section{k-means and PCA}
This section presents a simple connection between $k$-means and PCA (similar ideas are given in \cite{DingH04a}).
First, consider the similarity between the $k$-means cost function and that of PCA. Let $C_k = \{c_1,\ldots,c_k\}$; then
\[
f_{k-means} = \min_{C_k} \sum_{i \in [n]} \min_{c \in C_k} \|x_i - c\|^2
\]
while
\[
f_{PCA} = \min_{P_k} \sum_{i \in [n]} \min_{z \in P_k} \|x_i - z\|^2
\]
where $P_k$ is a projection onto a $k$ dimensional subspace and $z \in P_k$ means that $P_k z = z$.
Now, think about the subspace $P^{*}_{k}$ which contains the $k$ optimal centers $C^{*}_{k}$.
Since $C^{*}_{k} \subset P^{*}_{k}$ we have that:
\[
f_{k-means} = \sum_{i \in [n]} \min_{c \in C^{*}_k} \|x_i - c\|^2 \ge \sum_{i \in [n]} \min_{z \in P^{*}_k} \|x_i - z\|^2 \ge f_{PCA}
\]

\noindent For PCA, we conveniently have a closed form expression $ \min_{z \in P_k} \|x_i - z\|^2 = \|x_i - P_{k} x_i \|^2$.
The equality stems from the fact that for any point $x$ and any projection operation $P$ we have that $P(x) = \arg\min_{z \in P} \|x - z\|$.
Now, consider solving $k$-means on the points $y_i = P_k x_i$ instead. Intuitively, this is an easier task because the $y_i$ are embedded in a lower dimension, namely $k$ (by the projection $P_k$).

\begin{center}
\includegraphics[width=0.6\textwidth]{images/kmeans-proj.png}
\end{center}

\noindent Before we do that, though, we should argue that a good clustering for the $y_i$ results in a good clustering for the $x_i$.
Note that all the $y_i$ lie in the subspace $P$. If we project the optimal centers $C^*$ onto $P$ as well, we get a solution for the projected points with cost at most $f_{k-means}$.
The optimal solution for the $y_i$ will clearly be even better (or at least, not worse). Therefore $\hat{f}_{k-means}\le f_{k-means}$, where $\hat{f}_{k-means} = f_{k-means}(y_1,\ldots,y_n)$.

The following gives us a simple algorithm. Compute the PCA of the points $x_i$ into dimension $k$.
Solve $k$-means on the points $y_i$ in dimension $k$. Output the resulting clusters and centers. Then
\[
f_{alg} = f_{PCA} + \hat{f}_{k-means} \le 2f_{k-means}
\]
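\noindent A sketch of this two-step algorithm, reusing the \texttt{lloyd} sketch above (so it is not fully self-contained). Note two assumptions: rows are used as points here, and, following the notes, the data is not mean-centered as classical PCA would do.
\begin{verbatim}
import numpy as np

def kmeans_via_pca(X, k, seed=0):
    """Project to the top-k PCA subspace, cluster there, lift centers back."""
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    Y = X @ Vt[:k].T                       # points y_i in R^k
    centers_k, labels, _ = lloyd(Y, k, seed=seed)
    centers = centers_k @ Vt[:k]           # lift centers back to R^d
    return centers, labels
\end{verbatim}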
\section{$\eps$-net argument for fixed dimensions}
Since computing the SVD of a matrix (and hence PCA) is well understood, we get that computing a $2$-approximation to the $k$-means problem in dimension $d$ is possible if it can be done in dimension $k$.

To solve the problem in dimension $k$ we adopt a brute force approach.
Let $Q_{\eps}$ be a set of points inside the unit ball $B^{k}_{1}$ such that:
\[
\forall z \in B^{k}_{1} \;\; \exists \; q \in Q_{\eps} \; s.t. \;\; \|z - q\| \le \eps
\]
Such sets of points exist with $|Q_{\eps}| \le (\frac{c}{\eps})^k$. There are probabilistic constructions for such sets as well, but we will not go into that.
Assuming w.l.o.g.\ that $\|x_i\| \le 1$, we can constrain the centers of the clusters to points in the $\eps$-net $Q_{\eps}$.
Let $q_j$ be the closest point in $Q_{\eps}$ to $c_j$ (so $\|c_j - q_j \| \le \eps$).
A simple calculation gives:
\[
\sum_{j \in [k]} \sum_{i \in S_j} \|x_i - q_j \|^2 \le \sum_{j \in [k]} \sum_{i \in S_j} \|x_i - c_j \|^2 + 5\eps n.
\]

To find the best clustering we can exhaustively search through every set of $k$ points from $Q_{\eps}$.
For each such set, compute the cost of the induced assignment on the original points and return the one minimizing the cost.
This requires ${(\frac{c}{\eps})^k \choose k}$ iterations over candidate solutions, each of which requires $O(ndk)$ time.
The final running time we achieve is $2^{O(k^2\log(1/\eps))}nd$.


\section{Sampling based seeding for k-means}
Another simple idea is to sample sufficiently many points from the input as candidate centers.
Ideas similar to the ones described here can be found in \cite{ZhaHDGS01}.

First, assume we have only one set of points $S$ with $|S|=n$.
Denote by $c$ the centroid of $S$, $c = \frac{1}{n}\sum_{i \in S} x_i$, and assume w.l.o.g.\ that $c=0$.
We claim that picking a random member of $S$ as the center is not much worse than picking $c=0$.
Let $q$ be a member of $S$ chosen uniformly at random, and let us compute the expectation of the cost function:
\begin{eqnarray}
\E[\sum_{i \in S} \|x_i - q\|^2] &=& \sum_{i \in S} \sum_{j \in S} \frac{1}{n}\|x_i - x_j\|^2 \\
&=& \sum_{i \in S} \|x_i\|^2 - \frac{2}{n}(\sum_{i \in S} x_i)^T(\sum_{j \in S} x_j) + \sum_{j \in S} \|x_j\|^2 \\
&\le& 2 \sum_{i \in S} \|x_i - c\|^2.
\end{eqnarray}

\noindent Using Markov's inequality we get that
\[
\Pr[\sum_{i \in S} \|x_i - q\|^2 \le 4\sum_{i \in S} \|x_i - c\|^2] \ge 1/2.
\]
If this happens we say that $q$ is a good representative for $S$ (at least half the points of $S$ are good representatives!).
Now consider again the situation where we have $k$ clusters $S_1,\dots,S_k$.
If we are given a set $Q$ which contains a good representative for each of the sets, then restricting ourselves to picking centers from $Q$ results in at most a multiplicative factor of $4$ in the cost.

The set $Q$ can be quite small if the sets are roughly balanced.
Let the smallest set contain $n_s$ points.
A single uniformly sampled point is a good representative for a given set with probability at least $\frac{1}{2}\frac{n_s}{n}$.
The probability that some set has no good representative in $Q$ is thus bounded by $k (1 - \frac{n_s}{2n})^{|Q|}$.
Therefore $|Q| = O(k \log(k))$ suffices if $n_s \in \Omega(n/k)$.

Again, iterating over all subsets of $Q$ of size $k$, we can find an approximate solution in time $O({ck \log(k) \choose k}knd) = 2^{O(k \log(k))}nd$.

\section{k-means++}

In the above, we gave approximation algorithms for the $k$-means problem.
Alas, any solution can be improved by running Lloyd's algorithm on its output.
Therefore, such algorithms can be considered `seeding' algorithms which give initial assignments to Lloyd's algorithm.
A well known seeding procedure \cite{ArthurV07} is called $k$-means++.
\begin{algorithm}
\caption{$k$-means++ algorithm \cite{ArthurV07}}
\begin{algorithmic}
\STATE $C \leftarrow \{x_i\}$ where $i$ is chosen uniformly at random from $[n]$.
\FOR{$j \in \{2,\ldots,k\}$}
\STATE Pick point $x$ with probability proportional to $\min_{c \in C} \|x - c\|^2$
\STATE Add $x$ to $C$
\ENDFOR
\STATE {\bf return:} $C$
\end{algorithmic}
\end{algorithm}
In each iteration, the next center is chosen randomly from the input points.
The distribution over the points is not uniform: each point is picked with probability proportional to its minimal squared distance to the already picked centers.
Surprisingly, this simple and practical approach already gives an $O(\log(k))$ approximation guarantee.
More precisely, let $f_{k-means}(C)$ denote the cost of $k$-means with the set of centers $C$.
Also, denote by $C^*$ the optimal set of centers. Then
\[
\E[f_{k-means}(C)] \le 8 (\log(k)+2)f_{k-means}(C^*)
\]

In \cite{AilonJM09} the authors give a streaming algorithm for this problem.
They adapt ideas from \cite{ArthurV07} and combine them with a hierarchical divide and conquer methodology.
See also \cite{GuhaMMMO03} for a thorough survey and new techniques for clustering in streams.
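\noindent A short sketch of this $D^2$-sampling seeding (the output would then be handed to Lloyd's algorithm, e.g.\ the \texttt{lloyd} sketch earlier in these notes).
\begin{verbatim}
import numpy as np

def kmeanspp_seed(X, k, seed=0):
    """k-means++ seeding: D^2-sampling of k initial centers from rows of X."""
    rng = np.random.default_rng(seed)
    centers = [X[rng.integers(len(X))]]            # first center: uniform
    for _ in range(k - 1):
        d2 = np.min([((X - c) ** 2).sum(1) for c in centers], axis=0)
        probs = d2 / d2.sum()                      # proportional to min sq. dist
        centers.append(X[rng.choice(len(X), p=probs)])
    return np.array(centers)
\end{verbatim}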
Another problem which is closely related to $k$-means is the $k$-medians problem.
Given a set of points $x_1,\ldots,x_n$, the aim is to find centers $c_1,\ldots,c_k$ which minimize:
\[
f_{k-medians} = \sum_{i \in [n]} \min_{j \in [k]} \|x_i - c_j \|
\]
Both the $k$-means and the $k$-medians problems admit $1+\eps$ multiplicative approximation algorithms, but these are far from being simple. See \cite{hk-sckmk-05} for more details, related work, and a new core set based solution.

\section{The Inverted File Model (IVF)}
One of the most common approaches in vector search is to begin by clustering the set of points using $k$-means and, at search time, to consider only the points within the nearest clusters. This is called the inverted file model (IVF) and is used extensively in practice. We will expand on this discussion in class.

\bibliographystyle{plain}
\bibliography{vs}

\end{document}
%%%%%%%%
-------------------------------------------------------------------------------- /class_notes/Class_08_quantization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_08_quantization.pdf -------------------------------------------------------------------------------- /class_notes/Class_08_runbook_for_students.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "6adb5788",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import time\n",
11 | "import numpy as np\n",
12 | "\n",
13 | "# requires Faiss to be installed, see \n",
14 | "# https://github.com/facebookresearch/faiss/blob/main/INSTALL.md#installing-faiss-via-conda\n",
15 | "# or how to install the CPU version\n",
16 | "\n",
17 | "import faiss\n",
18 | "\n",
19 | "from faiss.contrib.datasets import SyntheticDataset\n",
20 | "\n",
21 | "from matplotlib import pyplot"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "id": "31018e88",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# setup that works for my machine. Adjust to yours \n",
32 | "faiss.omp_set_num_threads(32)\n",
33 | "\n",
34 | "%matplotlib inline\n",
35 | "%config InlineBackend.figure_format='retina'"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "id": "e823d6ca",
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# get some data\n",
46 | "ds = SyntheticDataset(64, 1000_000, 10000, 100)\n",
47 | "print(ds)"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "id": "c660a6ee",
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# get training set\n",
58 | "xt = ds.get_train()\n",
59 | "xt.shape"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "id": "e1ff2bf0",
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "d = ds.d"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "id": "d9d12999",
75 | "metadata": {},
76 | "source": [
77 | "# Run k-means "
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "id": "be5b07a0",
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "# 4096 centroids \n",
88 | "km = faiss.Kmeans(ds.d, 4096)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "id": "c6dfa79b",
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "%%time\n",
99 | "km.train(xt)"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "id": "e185e9ed",
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "centroids = km.centroids \n",
110 | "centroids.shape"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "id": "73149508",
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "MSE = km.obj[-1] / len(xt)\n",
121 | "MSE"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "id": "a85938b3",
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "pyplot.plot(km.obj / ds.nt)\n",
132 | "pyplot.ylabel(\"Mean Squared Error\")\n",
133 |
"pyplot.xlabel(\"Iteration\")\n", 134 | "pyplot.grid()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "5d7dd70d", 140 | "metadata": {}, 141 | "source": [ 142 | "# Hierarchical k-means " 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "4252e712", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "def recursive_run_kmeans(xt, k, level): \n", 153 | " if level == 0: \n", 154 | " # all vectors encoded to the same, compute MSE\n", 155 | " centroid = xt.mean(axis=0)\n", 156 | " s = ((xt - centroid) ** 2).sum()\n", 157 | " return [centroid], s\n", 158 | " else: \n", 159 | " km = faiss.Kmeans(ds.d, k)\n", 160 | " km.train(xt)\n", 161 | " _, labels = km.assign(xt)\n", 162 | " tot_sum = 0\n", 163 | " centroids = []\n", 164 | " for i in range(k): \n", 165 | " subset = labels == i\n", 166 | " cent_i, sum_i = recursive_run_kmeans(xt[subset], k, level - 1)\n", 167 | " centroids += cent_i\n", 168 | " tot_sum += sum_i \n", 169 | " return centroids, tot_sum " 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "id": "deab4bdf", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "%%time \n", 180 | "# 4096 = 8 ** 4\n", 181 | "cents, s = recursive_run_kmeans(xt, 8, 4)\n", 182 | "MSE = s / len(xt)\n", 183 | "MSE" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "fc7a74ce", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "%%time \n", 194 | "# 4096 = 64 ** 2\n", 195 | "cents, s = recursive_run_kmeans(xt, 64, 2)\n", 196 | "MSE = s / len(xt)\n", 197 | "MSE" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "id": "70c019a3", 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# search from centroids directly \n", 208 | "\n", 209 | "D, _ = faiss.knn(xt, cents, k=1)\n", 210 | "MSE = D.mean()\n", 211 | "MSE" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "id": "aa4e76cd", 217 | "metadata": {}, 218 | "source": [ 219 | "## Searching in a vector database " 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "id": "070a8371", 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "# the database and set of query vectors are arrays\n", 230 | "xq = ds.get_queries()\n", 231 | "xb = ds.get_database()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "id": "a881bc56", 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "xq.shape" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "id": "e55a1ce7", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "xb.shape" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "id": "3d8d787a", 257 | "metadata": {}, 258 | "source": [ 259 | "### Ground truth and the knn function" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "328cdcce", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "# find ground-truth nearest neighbors \n", 270 | "gt_dis, gt = faiss.knn(xq, xb, k=10)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "id": "368cac81", 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "gt.shape" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "id": "2419d184", 287 | "metadata": {}, 288 | "outputs": 
[], 289 | "source": [ 290 | "gt[:3]" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "id": "ec4a0f9a", 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "gt_dis[:3]" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "id": "f611e072", 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "((xq[1] - xb[6558])**2).sum()" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "id": "b89accbb", 316 | "metadata": {}, 317 | "source": [ 318 | "# The inverted file " 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "id": "dd8018b6", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "nlist = 4096\n", 329 | "\n", 330 | "# compute IVF entries for database = find the nearest centroid for each database vector \n", 331 | "_, list_nos = faiss.knn(xb, centroids, k=1)\n", 332 | "list_nos = list_nos.flatten()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "id": "ec84f3c3", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "ivf_vectors = []\n", 343 | "ivf_ids = []\n", 344 | "\n", 345 | "for list_no in range(nlist): \n", 346 | " ids = np.where(list_nos == list_no)[0]\n", 347 | " ivf_ids.append(ids)\n", 348 | " ivf_vectors.append(xb[ids])" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "id": "522a369f", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "len(ivf_ids), len(ivf_vectors)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "id": "81170e6b", 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "max(len(l) for l in ivf_ids)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "c6ef3af3", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "min(len(l) for l in ivf_ids)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "31fbda29", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "# searching in the nearest centroid \n", 389 | "_, q_list_nos = faiss.knn(xq, centroids, k=1)\n", 390 | "found_nns = []\n", 391 | "for q in range(100): \n", 392 | " query = xq[q]\n", 393 | " # fetch contents of cluster\n", 394 | " cluster_vectors = ivf_vectors[q_list_nos[q, 0]]\n", 395 | " cluster_ids = ivf_ids[q_list_nos[q, 0]]\n", 396 | " if cluster_ids.size == 0: \n", 397 | " found_nns.append(-1)\n", 398 | " continue\n", 399 | " # compute distances \n", 400 | " distances = ((query - cluster_vectors)**2).sum(1)\n", 401 | " # collect result id\n", 402 | " result_id = cluster_ids[distances.argmin()]\n", 403 | " found_nns.append(result_id)\n", 404 | " " 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "84b46874", 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "(found_nns == gt[:, 0]).sum()" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "b4ed63d2", 420 | "metadata": {}, 421 | "source": [ 422 | "That's not much. Maybe we need to explore more clusters?" 
423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "id": "784a32aa", 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "nprobe = 13\n", 433 | "# searching in the nearest centroid \n", 434 | "_, q_list_nos = faiss.knn(xq, centroids, k=nprobe)\n", 435 | "found_nns = []\n", 436 | "ndis = 0\n", 437 | "for q in range(100): \n", 438 | " query = xq[q]\n", 439 | " # fetch contents of clusters \n", 440 | " cluster_vectors = np.vstack([\n", 441 | " ivf_vectors[i]\n", 442 | " for i in q_list_nos[q]\n", 443 | " ])\n", 444 | " cluster_ids = np.hstack([\n", 445 | " ivf_ids[i]\n", 446 | " for i in q_list_nos[q]\n", 447 | " ])\n", 448 | " if cluster_ids.size == 0: \n", 449 | " found_nns.append(-1)\n", 450 | " continue\n", 451 | " # compute distances \n", 452 | " distances = ((query - cluster_vectors)**2).sum(1)\n", 453 | " ndis += len(cluster_ids)\n", 454 | " # collect result id\n", 455 | " result_id = cluster_ids[distances.argmin()]\n", 456 | " found_nns.append(result_id)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "id": "40055f37", 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "(found_nns == gt[:, 0]).sum()" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "id": "3b06f0e5", 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "ndis / 100 " 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "id": "f1d8b744", 482 | "metadata": {}, 483 | "source": [ 484 | "That's better, we computed just 106 distances on average per query (out of 10000)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "id": "99729b82", 490 | "metadata": {}, 491 | "source": [ 492 | "## Inverted file in Faiss " 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "id": "41a72316", 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "index = faiss.index_factory(d, \"IVF1024,Flat\") # flat means: don't encode the vectors!" 
503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "id": "45727dbd", 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "index.train(xt)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "id": "6415958b", 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [ 522 | "index.add(xb)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "id": "cf2ae5c9", 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "D, I = index.search(xq, 10)" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "id": "14990831", 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "(I[:, 0] == gt[:, 0]).sum()" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "id": "10ee442a", 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "index.nprobe = 10\n", 553 | "D, I = index.search(xq, 10)\n", 554 | "(I[:, 0] == gt[:, 0]).sum()" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "id": "53cc6c5c", 560 | "metadata": {}, 561 | "source": [ 562 | "## Tradeoff speed / accuracy " 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "id": "fc32eca0", 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "results = {}\n", 573 | "for nlist in 64, 256, 1024: \n", 574 | " index = faiss.index_factory(d, f\"IVF{nlist},Flat\")\n", 575 | " index.train(xt)\n", 576 | " index.add(xb)\n", 577 | " for nprobe in 1, 2, 4, 8, 16, 32, 64, 128:\n", 578 | " if nprobe > nlist: \n", 579 | " continue\n", 580 | " index.nprobe = nprobe\n", 581 | " t0 = time.time()\n", 582 | " for run in range(100): # several runs to get stable timings\n", 583 | " D, I = index.search(xq, 10)\n", 584 | " t1 = time.time() \n", 585 | " recall = (I[:, 0] == gt[:, 0]).sum()\n", 586 | " print(f\"{nlist=:} {nprobe=:} {recall=:} time={(t1 - t0) * 1000 :.3f} ms\")\n", 587 | " results[(nlist, nprobe)] = (recall, (t1 - t0) * 1000)\n", 588 | " " 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "id": "9c492714", 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "for nlist in 64, 256, 1024: \n", 599 | " index = faiss.index_factory(d, f\"IVF{nlist},Flat\")\n", 600 | " index.train(xt)\n", 601 | " index.add(xb)\n", 602 | " res = [results[(nlist, nprobe)] for nprobe in [1, 2, 4, 8, 16, 32, 64, 128] if nprobe < nlist]\n", 603 | " recalls = [r[0] for r in res]\n", 604 | " times = [r[1] for r in res]\n", 605 | " pyplot.plot(recalls, times, label=f\"{nlist=:}\")\n", 606 | "\n", 607 | "pyplot.ylabel(\"time (ms)\")\n", 608 | "pyplot.xlabel(\"R@1\")\n", 609 | "pyplot.legend()\n", 610 | "pyplot.grid()\n", 611 | " " 612 | ] 613 | }, 614 | { 615 | "cell_type": "markdown", 616 | "id": "7b3f90f5", 617 | "metadata": {}, 618 | "source": [ 619 | "## Search cost as a function of the database size " 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "id": "9b0f60b6", 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "ns = 2 ** np.arange(10, 25)\n", 630 | "nprobe = 15 # fix nprobe \n", 631 | "for k in 4 ** np.arange(3, 7): \n", 632 | " coarse_quantization_cost = k\n", 633 | " ivf_scanning_cost = nprobe / k * ns\n", 634 | " pyplot.loglog(ns, coarse_quantization_cost + ivf_scanning_cost, label=f\"{k=:}\")\n", 635 | "pyplot.xlabel(\"database size\")\n", 636 | "pyplot.ylabel(\"nb distance 
computations\")\n", 637 | "pyplot.title(f\"search cost at {nprobe=:}\")\n", 638 | "pyplot.legend()\n", 639 | "pyplot.grid()" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "id": "49dec309", 645 | "metadata": {}, 646 | "source": [ 647 | "# Searching in compressed vectors " 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "id": "db326b75", 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "# work on a smaller subset because otherwise we don't see anything with such small codes \n", 658 | "xb_small = xb[:1000]\n", 659 | "_, gt_small = faiss.knn(xq, xb_small, k=10)" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "id": "d6455720", 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "# compute codes for database = find the nearest centroid for each database vector \n", 670 | "encoding_errors, codes = faiss.knn(xb_small, centroids, k=1)" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "id": "eed66ff0", 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "codes.shape" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "id": "2e230786", 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [ 690 | "codes = codes.flatten()" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "id": "592cf1d6", 697 | "metadata": {}, 698 | "outputs": [], 699 | "source": [ 700 | "# reconstruct \n", 701 | "reconstructed_xb = centroids[codes]" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "id": "86b7e431", 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "MSE = ((reconstructed_xb - xb_small) ** 2).sum(1).mean()\n", 712 | "MSE" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "id": "6fa261cb", 718 | "metadata": {}, 719 | "source": [ 720 | "Similar but a bit worse than the training MSE " 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": null, 726 | "id": "ae1ab0ee", 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [ 730 | "# anothe way of computing it\n", 731 | "encoding_errors.mean()" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "id": "3c5bd736", 737 | "metadata": {}, 738 | "source": [ 739 | "## Asymmetric search" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": null, 745 | "id": "1cee40e0", 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "found_dis, found_indices = faiss.knn(xq, reconstructed_xb, k=10)" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "id": "63737db3", 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [ 759 | "(gt_small[:, 0] == found_indices[:, 0]).sum() " 760 | ] 761 | }, 762 | { 763 | "cell_type": "markdown", 764 | "id": "0be8a494", 765 | "metadata": {}, 766 | "source": [ 767 | "We loose 73% of nearest neighbors because the vectors are compressed a lot (12 bits). 
But note chance is at 1/1000 = 0.1%" 768 | ] 769 | }, 770 | { 771 | "cell_type": "markdown", 772 | "id": "d1e78019", 773 | "metadata": {}, 774 | "source": [ 775 | "## Symmetric search " 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "id": "e9ae2b9c", 782 | "metadata": {}, 783 | "outputs": [], 784 | "source": [ 785 | "# let's encode and decode the queries as well \n", 786 | "_, xq_codes = faiss.knn(xq, centroids, k=1)\n", 787 | "xq_codes = xq_codes.flatten()\n", 788 | "reconstructed_xq = centroids[xq_codes]" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "id": "f61bc16a", 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [ 798 | "found_dis, found_indices = faiss.knn(reconstructed_xq, reconstructed_xb, k=10)" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "id": "b4c98736", 805 | "metadata": {}, 806 | "outputs": [], 807 | "source": [ 808 | "(gt_small[:, 0] == found_indices[:, 0]).sum() " 809 | ] 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "id": "c38a209e", 814 | "metadata": {}, 815 | "source": [ 816 | "Wow that's even worse" 817 | ] 818 | }, 819 | { 820 | "cell_type": "markdown", 821 | "id": "415e294d", 822 | "metadata": {}, 823 | "source": [ 824 | "## Asymmetric search with look-up tables " 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": null, 830 | "id": "56f9ede7", 831 | "metadata": {}, 832 | "outputs": [], 833 | "source": [ 834 | "# recall reference results\n", 835 | "found_dis, found_indices = faiss.knn(xq, reconstructed_xb, k=10)" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "execution_count": null, 841 | "id": "94838b00", 842 | "metadata": {}, 843 | "outputs": [], 844 | "source": [ 845 | "# make look-up tables for all queries\n", 846 | "def pairwise_distances(A, B): \n", 847 | " return (A ** 2).sum(1)[:, None] + (B ** 2).sum(1) - 2 * A @ B.T " 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": null, 853 | "id": "fbccc070", 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [ 857 | "LUT = pairwise_distances(xq, centroids)" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": null, 863 | "id": "02a3fc04", 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [ 867 | "LUT.shape" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "id": "ea02f2d7", 874 | "metadata": {}, 875 | "outputs": [], 876 | "source": [ 877 | "codes.shape" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": null, 883 | "id": "5f9ef31d", 884 | "metadata": {}, 885 | "outputs": [], 886 | "source": [ 887 | "distances = LUT[:, codes]" 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": null, 893 | "id": "678b9a5a", 894 | "metadata": {}, 895 | "outputs": [], 896 | "source": [ 897 | "distances.shape" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": null, 903 | "id": "ecba60fb", 904 | "metadata": {}, 905 | "outputs": [], 906 | "source": [ 907 | "found_indices_2 = distances.argmin(axis=1)" 908 | ] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "execution_count": null, 913 | "id": "fc8f7d7d", 914 | "metadata": {}, 915 | "outputs": [], 916 | "source": [ 917 | "np.all(found_indices[:, 0] == found_indices_2)" 918 | ] 919 | }, 920 | { 921 | "cell_type": "code", 922 | "execution_count": null, 923 | "id": "d6ac2963", 924 | "metadata": {}, 925 | "outputs": [], 926 | 
"source": [ 927 | "found_indices_2" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": null, 933 | "id": "9f7c2514", 934 | "metadata": {}, 935 | "outputs": [], 936 | "source": [ 937 | "found_indices[:, 0]" 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": null, 943 | "id": "57aa95ea", 944 | "metadata": {}, 945 | "outputs": [], 946 | "source": [ 947 | "np.where(found_indices[:, 0] != found_indices_2)" 948 | ] 949 | }, 950 | { 951 | "cell_type": "markdown", 952 | "id": "4e3c366e", 953 | "metadata": {}, 954 | "source": [ 955 | "# Product Quantization" 956 | ] 957 | }, 958 | { 959 | "cell_type": "code", 960 | "execution_count": null, 961 | "id": "19ba01fc", 962 | "metadata": {}, 963 | "outputs": [], 964 | "source": [ 965 | "# 4 sub-vectors, encode each in 2^8 elements\n", 966 | "pq = faiss.ProductQuantizer(d, 4, 8)" 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": null, 972 | "id": "7865e4da", 973 | "metadata": {}, 974 | "outputs": [], 975 | "source": [ 976 | "pq.code_size # in bytes, bits/8 rounded up to next integer" 977 | ] 978 | }, 979 | { 980 | "cell_type": "code", 981 | "execution_count": null, 982 | "id": "21fb7138", 983 | "metadata": {}, 984 | "outputs": [], 985 | "source": [ 986 | "pq.train(xt)" 987 | ] 988 | }, 989 | { 990 | "cell_type": "code", 991 | "execution_count": null, 992 | "id": "48543659", 993 | "metadata": {}, 994 | "outputs": [], 995 | "source": [ 996 | "xb_codes = pq.compute_codes(xb)" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": null, 1002 | "id": "0f6c3eac", 1003 | "metadata": {}, 1004 | "outputs": [], 1005 | "source": [ 1006 | "pq_reconstruction = pq.decode(xb_codes)" 1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "code", 1011 | "execution_count": null, 1012 | "id": "84d2814e", 1013 | "metadata": {}, 1014 | "outputs": [], 1015 | "source": [ 1016 | "# compute the MSE\n", 1017 | "((pq_reconstruction - xb) ** 2).sum(1).mean()" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "markdown", 1022 | "id": "a6f0a55e", 1023 | "metadata": {}, 1024 | "source": [ 1025 | "Better MSE than the 12-bit k-means one" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "markdown", 1030 | "id": "0d71662c", 1031 | "metadata": {}, 1032 | "source": [ 1033 | "## Manual reconstruction" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "code", 1038 | "execution_count": null, 1039 | "id": "63ad6728", 1040 | "metadata": {}, 1041 | "outputs": [], 1042 | "source": [ 1043 | "from faiss.contrib.inspect_tools import get_pq_centroids, get_additive_quantizer_codebooks" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "code", 1048 | "execution_count": null, 1049 | "id": "57b275a0", 1050 | "metadata": {}, 1051 | "outputs": [], 1052 | "source": [ 1053 | "pq_centroids = get_pq_centroids(pq)" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": null, 1059 | "id": "6d96cd6b", 1060 | "metadata": {}, 1061 | "outputs": [], 1062 | "source": [ 1063 | "pq_centroids.shape" 1064 | ] 1065 | }, 1066 | { 1067 | "cell_type": "markdown", 1068 | "id": "416f93e9", 1069 | "metadata": {}, 1070 | "source": [ 1071 | "Layout: number of subvectors, K, subvector dimension" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": null, 1077 | "id": "9124a6e5", 1078 | "metadata": {}, 1079 | "outputs": [], 1080 | "source": [ 1081 | "xb_codes[:2]" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "code", 1086 | "execution_count": null, 1087 | "id": "0d081828", 1088 | "metadata": 
{}, 1089 | "outputs": [], 1090 | "source": [ 1091 | "# reconstruct vector no 123 -- TODO implement the re-construction! \n", 1092 | "xb123_recons = " 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "code", 1097 | "execution_count": null, 1098 | "id": "410d41d7", 1099 | "metadata": {}, 1100 | "outputs": [], 1101 | "source": [ 1102 | "np.all(pq_reconstruction[123] == xb123_recons)" 1103 | ] 1104 | }, 1105 | { 1106 | "cell_type": "markdown", 1107 | "id": "9fedac66", 1108 | "metadata": {}, 1109 | "source": [ 1110 | "## Compare options for fixed code_size\n", 1111 | "fix number of quantizers " 1112 | ] 1113 | }, 1114 | { 1115 | "cell_type": "code", 1116 | "execution_count": null, 1117 | "id": "cde5f63d", 1118 | "metadata": {}, 1119 | "outputs": [], 1120 | "source": [ 1121 | "budget = 6 # budget 6 bytes per vector\n", 1122 | "for M in 4, 8, 16: \n", 1123 | " nbits = budget * 8 // M\n", 1124 | " print(f\"PQ {M}x{nbits}\")\n", 1125 | " pq = faiss.ProductQuantizer(d, M, nbits)\n", 1126 | " print(f\"Sub-vector size {pq.dsub} K={pq.ksub} code size {pq.code_size}\")\n", 1127 | " pq.train(xt)\n", 1128 | " t0 = time.time()\n", 1129 | " pq_reconstruction = pq.decode(pq.compute_codes(xb))\n", 1130 | " t1 = time.time()\n", 1131 | " MSE = ((pq_reconstruction - xb) ** 2).sum(1).mean()\n", 1132 | " print(f\"{MSE=:.2f} encode-decode time: {(t1 - t0)*1000:.3f} ms\")" 1133 | ] 1134 | }, 1135 | { 1136 | "cell_type": "markdown", 1137 | "id": "aea675ce", 1138 | "metadata": {}, 1139 | "source": [ 1140 | "## Optimized product quantization" 1141 | ] 1142 | }, 1143 | { 1144 | "cell_type": "code", 1145 | "execution_count": null, 1146 | "id": "c6b8f7f1", 1147 | "metadata": {}, 1148 | "outputs": [], 1149 | "source": [ 1150 | "from faiss.contrib.inspect_tools import get_LinearTransform_matrix" 1151 | ] 1152 | }, 1153 | { 1154 | "cell_type": "code", 1155 | "execution_count": null, 1156 | "id": "5eb63d88", 1157 | "metadata": {}, 1158 | "outputs": [], 1159 | "source": [ 1160 | "opq = faiss.OPQMatrix(d, 4)\n", 1161 | "pq = faiss.ProductQuantizer(d, 4, 8)" 1162 | ] 1163 | }, 1164 | { 1165 | "cell_type": "code", 1166 | "execution_count": null, 1167 | "id": "7f0e6e5c", 1168 | "metadata": {}, 1169 | "outputs": [], 1170 | "source": [ 1171 | "opq.train(xt)" 1172 | ] 1173 | }, 1174 | { 1175 | "cell_type": "code", 1176 | "execution_count": null, 1177 | "id": "ae9913a3", 1178 | "metadata": {}, 1179 | "outputs": [], 1180 | "source": [ 1181 | "pq.train(opq.apply(xt))" 1182 | ] 1183 | }, 1184 | { 1185 | "cell_type": "code", 1186 | "execution_count": null, 1187 | "id": "3afb9004", 1188 | "metadata": {}, 1189 | "outputs": [], 1190 | "source": [ 1191 | "xb_t = opq.apply(xb)" 1192 | ] 1193 | }, 1194 | { 1195 | "cell_type": "code", 1196 | "execution_count": null, 1197 | "id": "038bcb8a", 1198 | "metadata": {}, 1199 | "outputs": [], 1200 | "source": [ 1201 | "xb_t_recons = pq.decode(pq.compute_codes(xb_t))" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "code", 1206 | "execution_count": null, 1207 | "id": "6a855571", 1208 | "metadata": {}, 1209 | "outputs": [], 1210 | "source": [ 1211 | "((xb_t - xb_t_recons) ** 2).sum(1).mean()" 1212 | ] 1213 | }, 1214 | { 1215 | "cell_type": "markdown", 1216 | "id": "742355f8", 1217 | "metadata": {}, 1218 | "source": [ 1219 | "The MSE for regular PQ was 13 --> improves" 1220 | ] 1221 | }, 1222 | { 1223 | "cell_type": "code", 1224 | "execution_count": null, 1225 | "id": "fb17c49c", 1226 | "metadata": {}, 1227 | "outputs": [], 1228 | "source": [ 1229 | "A, bias = get_LinearTransform_matrix(opq) # how to get 
the OPQ matrix" 1230 | ] 1231 | }, 1232 | { 1233 | "cell_type": "code", 1234 | "execution_count": null, 1235 | "id": "155ac4a1", 1236 | "metadata": {}, 1237 | "outputs": [], 1238 | "source": [ 1239 | "A.shape" 1240 | ] 1241 | }, 1242 | { 1243 | "cell_type": "markdown", 1244 | "id": "b1d0ef83", 1245 | "metadata": {}, 1246 | "source": [ 1247 | "## PQ in an index" 1248 | ] 1249 | }, 1250 | { 1251 | "cell_type": "markdown", 1252 | "id": "5f0c9f7f", 1253 | "metadata": {}, 1254 | "source": [ 1255 | "A product quantizer with a search function (uses look-up tables)" 1256 | ] 1257 | }, 1258 | { 1259 | "cell_type": "code", 1260 | "execution_count": null, 1261 | "id": "e7c2b2d2", 1262 | "metadata": {}, 1263 | "outputs": [], 1264 | "source": [ 1265 | "index = faiss.index_factory(d, \"PQ8x6np\")\n", 1266 | "index.train(xt)\n", 1267 | "index.add(xb)\n", 1268 | "D, I = index.search(xq, 10)\n", 1269 | "(I[:, 0] == gt[:, 0]).sum()" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "code", 1274 | "execution_count": null, 1275 | "id": "93889500", 1276 | "metadata": {}, 1277 | "outputs": [], 1278 | "source": [ 1279 | "index = faiss.index_factory(d, \"OPQ4,PQ8x6np\")\n", 1280 | "index.train(xt)\n", 1281 | "index.add(xb)\n", 1282 | "D, I = index.search(xq, 10)\n", 1283 | "(I[:, 0] == gt[:, 0]).sum()" 1284 | ] 1285 | }, 1286 | { 1287 | "cell_type": "markdown", 1288 | "id": "ff3843b9", 1289 | "metadata": {}, 1290 | "source": [ 1291 | "OPQ a bit better, but free at search time." 1292 | ] 1293 | }, 1294 | { 1295 | "cell_type": "markdown", 1296 | "id": "4930ab30", 1297 | "metadata": {}, 1298 | "source": [ 1299 | "# Residual quantization" 1300 | ] 1301 | }, 1302 | { 1303 | "cell_type": "code", 1304 | "execution_count": null, 1305 | "id": "cc11a1bf", 1306 | "metadata": {}, 1307 | "outputs": [], 1308 | "source": [ 1309 | "rq = faiss.ResidualQuantizer(d, 4, 8)" 1310 | ] 1311 | }, 1312 | { 1313 | "cell_type": "code", 1314 | "execution_count": null, 1315 | "id": "19d003a0", 1316 | "metadata": {}, 1317 | "outputs": [], 1318 | "source": [ 1319 | "rq.max_beam_size " 1320 | ] 1321 | }, 1322 | { 1323 | "cell_type": "code", 1324 | "execution_count": null, 1325 | "id": "170d394e", 1326 | "metadata": {}, 1327 | "outputs": [], 1328 | "source": [ 1329 | "%%time \n", 1330 | "rq.train(xt[:50_000])" 1331 | ] 1332 | }, 1333 | { 1334 | "cell_type": "code", 1335 | "execution_count": null, 1336 | "id": "5cdf572f", 1337 | "metadata": {}, 1338 | "outputs": [], 1339 | "source": [ 1340 | "xb_recons = rq.decode(rq.compute_codes(xb))\n", 1341 | "((xb - xb_recons) ** 2).sum(1).mean()" 1342 | ] 1343 | }, 1344 | { 1345 | "cell_type": "markdown", 1346 | "id": "81566a1b", 1347 | "metadata": {}, 1348 | "source": [ 1349 | "A bit better than OPQ" 1350 | ] 1351 | }, 1352 | { 1353 | "cell_type": "code", 1354 | "execution_count": null, 1355 | "id": "37821ac8", 1356 | "metadata": {}, 1357 | "outputs": [], 1358 | "source": [ 1359 | "rq.max_beam_size = 50" 1360 | ] 1361 | }, 1362 | { 1363 | "cell_type": "code", 1364 | "execution_count": null, 1365 | "id": "913e5caf", 1366 | "metadata": {}, 1367 | "outputs": [], 1368 | "source": [ 1369 | "%%time\n", 1370 | "xb_recons = rq.decode(rq.compute_codes(xb))\n", 1371 | "((xb - xb_recons) ** 2).sum(1).mean()" 1372 | ] 1373 | }, 1374 | { 1375 | "cell_type": "markdown", 1376 | "id": "b03a5f4a", 1377 | "metadata": {}, 1378 | "source": [ 1379 | "Improves (slowly)" 1380 | ] 1381 | }, 1382 | { 1383 | "cell_type": "markdown", 1384 | "id": "857e79bf", 1385 | "metadata": {}, 1386 | "source": [ 1387 | "# Search with additive 
quantizers" 1388 | ] 1389 | }, 1390 | { 1391 | "cell_type": "code", 1392 | "execution_count": null, 1393 | "id": "6888cb5b", 1394 | "metadata": {}, 1395 | "outputs": [], 1396 | "source": [ 1397 | "index = faiss.index_factory(d, \"RQ8x6\")\n", 1398 | "index.code_size" 1399 | ] 1400 | }, 1401 | { 1402 | "cell_type": "code", 1403 | "execution_count": null, 1404 | "id": "09331ce1", 1405 | "metadata": {}, 1406 | "outputs": [], 1407 | "source": [ 1408 | "index.train(xt[:50_000])" 1409 | ] 1410 | }, 1411 | { 1412 | "cell_type": "code", 1413 | "execution_count": null, 1414 | "id": "86e183da", 1415 | "metadata": {}, 1416 | "outputs": [], 1417 | "source": [ 1418 | "index.add(xb)\n", 1419 | "D, I = index.search(xq, 10)\n", 1420 | "(I[:, 0] == gt[:, 0]).sum()" 1421 | ] 1422 | }, 1423 | { 1424 | "cell_type": "markdown", 1425 | "id": "53b7af9e", 1426 | "metadata": {}, 1427 | "source": [ 1428 | "Better than PQ & OPQ" 1429 | ] 1430 | }, 1431 | { 1432 | "cell_type": "code", 1433 | "execution_count": null, 1434 | "id": "ddd9a100", 1435 | "metadata": {}, 1436 | "outputs": [], 1437 | "source": [ 1438 | "%timeit index.search(xq, 10)" 1439 | ] 1440 | }, 1441 | { 1442 | "cell_type": "markdown", 1443 | "id": "31faa0af", 1444 | "metadata": {}, 1445 | "source": [ 1446 | "This is a search timing with decoding " 1447 | ] 1448 | }, 1449 | { 1450 | "cell_type": "code", 1451 | "execution_count": null, 1452 | "id": "deab7f5e", 1453 | "metadata": {}, 1454 | "outputs": [], 1455 | "source": [ 1456 | "index = faiss.index_factory(d, \"RQ8x6_Nqint8\")\n", 1457 | "index.code_size" 1458 | ] 1459 | }, 1460 | { 1461 | "cell_type": "code", 1462 | "execution_count": null, 1463 | "id": "40cff484", 1464 | "metadata": {}, 1465 | "outputs": [], 1466 | "source": [ 1467 | "index.train(xt[:50_000])\n", 1468 | "index.add(xb)\n", 1469 | "D, I = index.search(xq, 10)\n", 1470 | "(I[:, 0] == gt[:, 0]).sum()" 1471 | ] 1472 | }, 1473 | { 1474 | "cell_type": "code", 1475 | "execution_count": null, 1476 | "id": "9daf0ba4", 1477 | "metadata": {}, 1478 | "outputs": [], 1479 | "source": [ 1480 | "%timeit index.search(xq, 10)" 1481 | ] 1482 | }, 1483 | { 1484 | "cell_type": "markdown", 1485 | "id": "ae050618", 1486 | "metadata": {}, 1487 | "source": [ 1488 | "Same result but much faster (uses encoded norm) " 1489 | ] 1490 | }, 1491 | { 1492 | "cell_type": "markdown", 1493 | "id": "674563a7", 1494 | "metadata": {}, 1495 | "source": [ 1496 | "# Scalar quantizers" 1497 | ] 1498 | }, 1499 | { 1500 | "cell_type": "code", 1501 | "execution_count": null, 1502 | "id": "cafaafed", 1503 | "metadata": {}, 1504 | "outputs": [], 1505 | "source": [ 1506 | "for key in \"Flat\", \"SQfp16\", \"SQ8\", \"SQ6\", \"SQ4\", \"LSHrt\": \n", 1507 | " index = faiss.index_factory(d, key)\n", 1508 | " index.train(xt[:50_000])\n", 1509 | " index.add(xb)\n", 1510 | " D, I = index.search(xq, 10)\n", 1511 | " nfound = (I[:, 0] == gt[:, 0]).sum()\n", 1512 | " \n", 1513 | " print(f\"{key} {index.code_size=:} {nfound=:}\")" 1514 | ] 1515 | }, 1516 | { 1517 | "cell_type": "markdown", 1518 | "id": "886f7b1a", 1519 | "metadata": {}, 1520 | "source": [ 1521 | "# Polysemous codes " 1522 | ] 1523 | }, 1524 | { 1525 | "cell_type": "code", 1526 | "execution_count": null, 1527 | "id": "b7b97cf4", 1528 | "metadata": {}, 1529 | "outputs": [], 1530 | "source": [ 1531 | "index = faiss.index_factory(d, \"PQ8x8\") # omit the np" 1532 | ] 1533 | }, 1534 | { 1535 | "cell_type": "code", 1536 | "execution_count": null, 1537 | "id": "72ef7542", 1538 | "metadata": {}, 1539 | "outputs": [], 1540 | "source": 
[ 1541 | "index.code_size" 1542 | ] 1543 | }, 1544 | { 1545 | "cell_type": "code", 1546 | "execution_count": null, 1547 | "id": "10ef0e5d", 1548 | "metadata": {}, 1549 | "outputs": [], 1550 | "source": [ 1551 | "index.train(xt)\n", 1552 | "index.add(xb)" 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "code", 1557 | "execution_count": null, 1558 | "id": "9debd581", 1559 | "metadata": {}, 1560 | "outputs": [], 1561 | "source": [ 1562 | "index.polysemous_ht # threshold of binary code comparison -- default does not filter " 1563 | ] 1564 | }, 1565 | { 1566 | "cell_type": "code", 1567 | "execution_count": null, 1568 | "id": "412b57eb", 1569 | "metadata": {}, 1570 | "outputs": [], 1571 | "source": [ 1572 | "D, I = index.search(xq, 10)\n", 1573 | "(I[:, 0] == gt[:, 0]).sum()" 1574 | ] 1575 | }, 1576 | { 1577 | "cell_type": "code", 1578 | "execution_count": null, 1579 | "id": "60336e65", 1580 | "metadata": {}, 1581 | "outputs": [], 1582 | "source": [ 1583 | "%timeit index.search(xq, 10)" 1584 | ] 1585 | }, 1586 | { 1587 | "cell_type": "code", 1588 | "execution_count": null, 1589 | "id": "d0f73061", 1590 | "metadata": {}, 1591 | "outputs": [], 1592 | "source": [ 1593 | "index.search_type = faiss.IndexPQ.ST_polysemous\n", 1594 | "index.polysemous_ht = 24\n", 1595 | "D, I = index.search(xq, 10)\n", 1596 | "(I[:, 0] == gt[:, 0]).sum()" 1597 | ] 1598 | }, 1599 | { 1600 | "cell_type": "code", 1601 | "execution_count": null, 1602 | "id": "2b6ff69d", 1603 | "metadata": {}, 1604 | "outputs": [], 1605 | "source": [ 1606 | "%timeit index.search(xq, 10)" 1607 | ] 1608 | }, 1609 | { 1610 | "cell_type": "markdown", 1611 | "id": "9a03984a", 1612 | "metadata": {}, 1613 | "source": [ 1614 | "About twice faster, same accuracy" 1615 | ] 1616 | }, 1617 | { 1618 | "cell_type": "markdown", 1619 | "id": "c7ccba6c", 1620 | "metadata": {}, 1621 | "source": [ 1622 | "# IVFPQ index" 1623 | ] 1624 | }, 1625 | { 1626 | "cell_type": "code", 1627 | "execution_count": null, 1628 | "id": "cea808f2", 1629 | "metadata": {}, 1630 | "outputs": [], 1631 | "source": [ 1632 | "index = faiss.index_factory(d, \"IVF200,PQ16x8np\") " 1633 | ] 1634 | }, 1635 | { 1636 | "cell_type": "code", 1637 | "execution_count": null, 1638 | "id": "cc6a9584", 1639 | "metadata": {}, 1640 | "outputs": [], 1641 | "source": [ 1642 | "index.train(xt)" 1643 | ] 1644 | }, 1645 | { 1646 | "cell_type": "code", 1647 | "execution_count": null, 1648 | "id": "e5ab2fdb", 1649 | "metadata": {}, 1650 | "outputs": [], 1651 | "source": [ 1652 | "index.add(xb)" 1653 | ] 1654 | }, 1655 | { 1656 | "cell_type": "code", 1657 | "execution_count": null, 1658 | "id": "8102f0a9", 1659 | "metadata": {}, 1660 | "outputs": [], 1661 | "source": [ 1662 | "D, I = index.search(xq, 10)" 1663 | ] 1664 | }, 1665 | { 1666 | "cell_type": "code", 1667 | "execution_count": null, 1668 | "id": "def2e3e2", 1669 | "metadata": {}, 1670 | "outputs": [], 1671 | "source": [ 1672 | "(I[:, 0] == gt[:, 0]).sum()" 1673 | ] 1674 | }, 1675 | { 1676 | "cell_type": "code", 1677 | "execution_count": null, 1678 | "id": "43cb68b7", 1679 | "metadata": {}, 1680 | "outputs": [], 1681 | "source": [ 1682 | "index.nprobe " 1683 | ] 1684 | }, 1685 | { 1686 | "cell_type": "code", 1687 | "execution_count": null, 1688 | "id": "dbc7d35c", 1689 | "metadata": {}, 1690 | "outputs": [], 1691 | "source": [ 1692 | "for nprobe in 2, 5, 10, 20, 50: \n", 1693 | " index.nprobe = nprobe \n", 1694 | " t0 = time.time()\n", 1695 | " for _ in range(50): \n", 1696 | " D, I = index.search(xq, 10)\n", 1697 | " t1 = time.time()\n", 1698 | " 
nok = (I[:, 0] == gt[:, 0]).sum()\n", 1699 | " print(f\"{nprobe=:} {nok=:} {(t1 - t0)*1000:.3f} ms\")" 1700 | ] 1701 | }, 1702 | { 1703 | "cell_type": "markdown", 1704 | "id": "c43314dc", 1705 | "metadata": {}, 1706 | "source": [ 1707 | "## Fast-scan SIMD implementation" 1708 | ] 1709 | }, 1710 | { 1711 | "cell_type": "code", 1712 | "execution_count": null, 1713 | "id": "d8842cd9", 1714 | "metadata": {}, 1715 | "outputs": [], 1716 | "source": [ 1717 | "index = faiss.index_factory(d, \"IVF200,PQ32x4fsr\") \n", 1718 | "index.train(xt)\n", 1719 | "index.add(xb)" 1720 | ] 1721 | }, 1722 | { 1723 | "cell_type": "code", 1724 | "execution_count": null, 1725 | "id": "986b6a5c", 1726 | "metadata": {}, 1727 | "outputs": [], 1728 | "source": [ 1729 | "for nprobe in 2, 5, 10, 20, 50: \n", 1730 | " index.nprobe = nprobe \n", 1731 | " t0 = time.time()\n", 1732 | " for _ in range(50): \n", 1733 | " D, I = index.search(xq, 10)\n", 1734 | " t1 = time.time()\n", 1735 | " nok = (I[:, 0] == gt[:, 0]).sum()\n", 1736 | " print(f\"{nprobe=:} {nok=:} {(t1 - t0)*1000:.3f} ms\")" 1737 | ] 1738 | }, 1739 | { 1740 | "cell_type": "code", 1741 | "execution_count": null, 1742 | "id": "3d9f505e", 1743 | "metadata": {}, 1744 | "outputs": [], 1745 | "source": [] 1746 | } 1747 | ], 1748 | "metadata": { 1749 | "kernelspec": { 1750 | "display_name": "Python 3 (ipykernel)", 1751 | "language": "python", 1752 | "name": "python3" 1753 | }, 1754 | "language_info": { 1755 | "codemirror_mode": { 1756 | "name": "ipython", 1757 | "version": 3 1758 | }, 1759 | "file_extension": ".py", 1760 | "mimetype": "text/x-python", 1761 | "name": "python", 1762 | "nbconvert_exporter": "python", 1763 | "pygments_lexer": "ipython3", 1764 | "version": "3.10.13" 1765 | } 1766 | }, 1767 | "nbformat": 4, 1768 | "nbformat_minor": 5 1769 | } 1770 | -------------------------------------------------------------------------------- /class_notes/Class_09_graph_indexes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_09_graph_indexes.pdf -------------------------------------------------------------------------------- /class_notes/README.md: -------------------------------------------------------------------------------- 1 | # Long Term Memory in AI - Vector Search and Databases 2 | These are the materials for [COS 597A](https://github.com/edoliberty/vector-search-class-notes/tree/main), given at Princeton during the Fall 2023 semester. 3 | 4 | ## Disclaimer 5 | The following content will be evolving rapidly over the next several months. Please regard the materials above as a sandbox rather than as polished, shareable material. 6 | 7 | ## Contribute 8 | These class notes are intended to be used freely by academics anywhere, students and professors alike. Please feel free to contribute by opening issues or submitting pull requests.
9 | -------------------------------------------------------------------------------- /class_notes/images/chernoff-exp-bounds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/chernoff-exp-bounds.png -------------------------------------------------------------------------------- /class_notes/images/dragon_diff_dup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/dragon_diff_dup.jpg -------------------------------------------------------------------------------- /class_notes/images/kdtrees-construction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/kdtrees-construction.png -------------------------------------------------------------------------------- /class_notes/images/kdtrees-proof.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/kdtrees-proof.png -------------------------------------------------------------------------------- /class_notes/images/kdtrees-search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/kdtrees-search.png -------------------------------------------------------------------------------- /class_notes/images/kmeans-proj.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/kmeans-proj.png -------------------------------------------------------------------------------- /class_notes/images/nnsearch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/nnsearch.png -------------------------------------------------------------------------------- /class_notes/images/pca.excalidraw: -------------------------------------------------------------------------------- 1 | { 2 | "type": "excalidraw", 3 | "version": 2, 4 | "source": "https://app.excalidraw.com", 5 | "elements": [ 6 | { 7 | "type": "arrow", 8 | "version": 89, 9 | "versionNonce": 1711806385, 10 | "isDeleted": false, 11 | "id": "psR6eSvp2P4QEWZr2DQKN", 12 | "fillStyle": "hachure", 13 | "strokeWidth": 1, 14 | "strokeStyle": "solid", 15 | "roughness": 1, 16 | "opacity": 100, 17 | "angle": 0, 18 | "x": 364.1484375, 19 | "y": 697.4765625, 20 | "strokeColor": "#1e1e1e", 21 | "backgroundColor": "transparent", 22 | "width": 3.27734375, 23 | "height": 385.296875, 24 | "seed": 287312799, 25 | "groupIds": [], 26 | "frameId": null, 27 | "roundness": { 28 | "type": 2 29 | }, 30 | "boundElements": [], 31 | "updated": 1693239320995, 32 | "link": null, 33 | "locked": false, 34 | "startBinding": null, 35 | "endBinding": null, 36 | "lastCommittedPoint": null, 37 | "startArrowhead": null, 38 
| "endArrowhead": "arrow", 39 | "points": [ 40 | [ 41 | 0, 42 | 0 43 | ], 44 | [ 45 | -3.27734375, 46 | -385.296875 47 | ] 48 | ] 49 | }, 50 | { 51 | "type": "arrow", 52 | "version": 137, 53 | "versionNonce": 1877686239, 54 | "isDeleted": false, 55 | "id": "MFR8U6_UeWjWwRz45RJi3", 56 | "fillStyle": "hachure", 57 | "strokeWidth": 1, 58 | "strokeStyle": "solid", 59 | "roughness": 1, 60 | "opacity": 100, 61 | "angle": 0, 62 | "x": 279.73046875, 63 | "y": 619.62109375, 64 | "strokeColor": "#1e1e1e", 65 | "backgroundColor": "transparent", 66 | "width": 755.75, 67 | "height": 4.08203125, 68 | "seed": 794312191, 69 | "groupIds": [], 70 | "frameId": null, 71 | "roundness": { 72 | "type": 2 73 | }, 74 | "boundElements": [], 75 | "updated": 1693239320995, 76 | "link": null, 77 | "locked": false, 78 | "startBinding": null, 79 | "endBinding": null, 80 | "lastCommittedPoint": null, 81 | "startArrowhead": null, 82 | "endArrowhead": "arrow", 83 | "points": [ 84 | [ 85 | 0, 86 | 0 87 | ], 88 | [ 89 | 755.75, 90 | -4.08203125 91 | ] 92 | ] 93 | }, 94 | { 95 | "type": "diamond", 96 | "version": 865, 97 | "versionNonce": 1028163807, 98 | "isDeleted": false, 99 | "id": "8HSTYnEQtwMgEsmaZvHzy", 100 | "fillStyle": "hachure", 101 | "strokeWidth": 1, 102 | "strokeStyle": "solid", 103 | "roughness": 1, 104 | "opacity": 100, 105 | "angle": 5.956825058112036, 106 | "x": 349.19270593476466, 107 | "y": 367.4721183769821, 108 | "strokeColor": "#1e1e1e", 109 | "backgroundColor": "transparent", 110 | "width": 609.5141984414641, 111 | "height": 304.80769228380507, 112 | "seed": 1971597727, 113 | "groupIds": [], 114 | "frameId": null, 115 | "roundness": null, 116 | "boundElements": [ 117 | { 118 | "id": "JxAYXYDpuFLjxcxo4fGGt", 119 | "type": "arrow" 120 | }, 121 | { 122 | "id": "xZClQRoQkSaRFfs9laIPW", 123 | "type": "arrow" 124 | } 125 | ], 126 | "updated": 1693239518301, 127 | "link": null, 128 | "locked": false 129 | }, 130 | { 131 | "type": "ellipse", 132 | "version": 1072, 133 | "versionNonce": 1452729343, 134 | "isDeleted": false, 135 | "id": "jSIw3zMX4MMfJx63tvCod", 136 | "fillStyle": "solid", 137 | "strokeWidth": 2, 138 | "strokeStyle": "solid", 139 | "roughness": 1, 140 | "opacity": 100, 141 | "angle": 0, 142 | "x": 643.9316406249991, 143 | "y": 444.44726562499943, 144 | "strokeColor": "#1e1e1e", 145 | "backgroundColor": "#1e1e1e", 146 | "width": 11.36328125, 147 | "height": 11.296875, 148 | "seed": 1533171921, 149 | "groupIds": [], 150 | "frameId": null, 151 | "roundness": null, 152 | "boundElements": [], 153 | "updated": 1693239320995, 154 | "link": null, 155 | "locked": false 156 | }, 157 | { 158 | "type": "ellipse", 159 | "version": 1107, 160 | "versionNonce": 767074161, 161 | "isDeleted": false, 162 | "id": "nGxVo89H30T_e4-LYRUKn", 163 | "fillStyle": "solid", 164 | "strokeWidth": 2, 165 | "strokeStyle": "solid", 166 | "roughness": 1, 167 | "opacity": 100, 168 | "angle": 0, 169 | "x": 565.0117187499991, 170 | "y": 363.34960937499943, 171 | "strokeColor": "#1e1e1e", 172 | "backgroundColor": "#1e1e1e", 173 | "width": 11.36328125, 174 | "height": 11.296875, 175 | "seed": 1344345777, 176 | "groupIds": [], 177 | "frameId": null, 178 | "roundness": null, 179 | "boundElements": [], 180 | "updated": 1693239320995, 181 | "link": null, 182 | "locked": false 183 | }, 184 | { 185 | "type": "ellipse", 186 | "version": 1204, 187 | "versionNonce": 1829136191, 188 | "isDeleted": false, 189 | "id": "cTsABDIHJxAw41elopIyV", 190 | "fillStyle": "solid", 191 | "strokeWidth": 2, 192 | "strokeStyle": "solid", 193 | "roughness": 1, 
194 | "opacity": 100, 195 | "angle": 0, 196 | "x": 473.2695312499991, 197 | "y": 570.0683593750008, 198 | "strokeColor": "#1e1e1e", 199 | "backgroundColor": "#1e1e1e", 200 | "width": 11.36328125, 201 | "height": 11.296875, 202 | "seed": 920510065, 203 | "groupIds": [], 204 | "frameId": null, 205 | "roundness": null, 206 | "boundElements": [], 207 | "updated": 1693239419995, 208 | "link": null, 209 | "locked": false 210 | }, 211 | { 212 | "type": "ellipse", 213 | "version": 1152, 214 | "versionNonce": 321203775, 215 | "isDeleted": false, 216 | "id": "e3JE3PUcczgJzM4WOZn76", 217 | "fillStyle": "solid", 218 | "strokeWidth": 2, 219 | "strokeStyle": "solid", 220 | "roughness": 1, 221 | "opacity": 100, 222 | "angle": 0, 223 | "x": 684.4999999999991, 224 | "y": 691.5097656250013, 225 | "strokeColor": "#1e1e1e", 226 | "backgroundColor": "#1e1e1e", 227 | "width": 11.36328125, 228 | "height": 11.296875, 229 | "seed": 1767565393, 230 | "groupIds": [], 231 | "frameId": null, 232 | "roundness": null, 233 | "boundElements": [], 234 | "updated": 1693239363160, 235 | "link": null, 236 | "locked": false 237 | }, 238 | { 239 | "type": "ellipse", 240 | "version": 1327, 241 | "versionNonce": 1515191217, 242 | "isDeleted": false, 243 | "id": "YR3c0Co_UxaclMGYqADWB", 244 | "fillStyle": "solid", 245 | "strokeWidth": 2, 246 | "strokeStyle": "solid", 247 | "roughness": 1, 248 | "opacity": 100, 249 | "angle": 0, 250 | "x": 509.8320312499991, 251 | "y": 536.2949218750008, 252 | "strokeColor": "#1e1e1e", 253 | "backgroundColor": "#1e1e1e", 254 | "width": 11.36328125, 255 | "height": 11.296875, 256 | "seed": 1062975025, 257 | "groupIds": [], 258 | "frameId": null, 259 | "roundness": null, 260 | "boundElements": [], 261 | "updated": 1693239360627, 262 | "link": null, 263 | "locked": false 264 | }, 265 | { 266 | "type": "ellipse", 267 | "version": 1123, 268 | "versionNonce": 2139676401, 269 | "isDeleted": false, 270 | "id": "Ncc_A-KxecqBQGiXYwUkt", 271 | "fillStyle": "solid", 272 | "strokeWidth": 2, 273 | "strokeStyle": "solid", 274 | "roughness": 1, 275 | "opacity": 100, 276 | "angle": 0, 277 | "x": 810.8046874999991, 278 | "y": 365.01367187499943, 279 | "strokeColor": "#1e1e1e", 280 | "backgroundColor": "#1e1e1e", 281 | "width": 11.36328125, 282 | "height": 11.296875, 283 | "seed": 1047132255, 284 | "groupIds": [], 285 | "frameId": null, 286 | "roundness": null, 287 | "boundElements": [], 288 | "updated": 1693239320995, 289 | "link": null, 290 | "locked": false 291 | }, 292 | { 293 | "type": "ellipse", 294 | "version": 1171, 295 | "versionNonce": 1439815889, 296 | "isDeleted": false, 297 | "id": "kebTOWkuV1Gi2T2pdIkUX", 298 | "fillStyle": "solid", 299 | "strokeWidth": 2, 300 | "strokeStyle": "solid", 301 | "roughness": 1, 302 | "opacity": 100, 303 | "angle": 0, 304 | "x": 742.3906249999991, 305 | "y": 567.4082031250008, 306 | "strokeColor": "#1e1e1e", 307 | "backgroundColor": "#1e1e1e", 308 | "width": 11.36328125, 309 | "height": 11.296875, 310 | "seed": 610315423, 311 | "groupIds": [], 312 | "frameId": null, 313 | "roundness": null, 314 | "boundElements": [], 315 | "updated": 1693239320995, 316 | "link": null, 317 | "locked": false 318 | }, 319 | { 320 | "type": "ellipse", 321 | "version": 1163, 322 | "versionNonce": 191994047, 323 | "isDeleted": false, 324 | "id": "NLleLR66x0sdOMy0wselI", 325 | "fillStyle": "solid", 326 | "strokeWidth": 2, 327 | "strokeStyle": "solid", 328 | "roughness": 1, 329 | "opacity": 100, 330 | "angle": 0, 331 | "x": 877.7812499999991, 332 | "y": 540.0175781250013, 333 | "strokeColor": 
"#1e1e1e", 334 | "backgroundColor": "#1e1e1e", 335 | "width": 11.36328125, 336 | "height": 11.296875, 337 | "seed": 1845477567, 338 | "groupIds": [], 339 | "frameId": null, 340 | "roundness": null, 341 | "boundElements": [], 342 | "updated": 1693239320995, 343 | "link": null, 344 | "locked": false 345 | }, 346 | { 347 | "type": "ellipse", 348 | "version": 1424, 349 | "versionNonce": 1363037983, 350 | "isDeleted": false, 351 | "id": "fgLhBWmoRKYV2V19yUIVc", 352 | "fillStyle": "solid", 353 | "strokeWidth": 2, 354 | "strokeStyle": "solid", 355 | "roughness": 1, 356 | "opacity": 100, 357 | "angle": 0, 358 | "x": 687.5546874999991, 359 | "y": 313.22851562499943, 360 | "strokeColor": "#1e1e1e", 361 | "backgroundColor": "#1e1e1e", 362 | "width": 11.36328125, 363 | "height": 11.296875, 364 | "seed": 2024267007, 365 | "groupIds": [], 366 | "frameId": null, 367 | "roundness": null, 368 | "boundElements": [], 369 | "updated": 1693239392930, 370 | "link": null, 371 | "locked": false 372 | }, 373 | { 374 | "id": "f26_GHdCGVl2N_wGG2VkZ", 375 | "type": "line", 376 | "x": 881.390625, 377 | "y": 542.84765625, 378 | "width": 43.875, 379 | "height": 77.04296875, 380 | "angle": 0, 381 | "strokeColor": "#1e1e1e", 382 | "backgroundColor": "transparent", 383 | "fillStyle": "hachure", 384 | "strokeWidth": 1, 385 | "strokeStyle": "dashed", 386 | "roughness": 1, 387 | "opacity": 100, 388 | "groupIds": [], 389 | "frameId": null, 390 | "roundness": null, 391 | "seed": 1393721105, 392 | "version": 39, 393 | "versionNonce": 360953087, 394 | "isDeleted": false, 395 | "boundElements": null, 396 | "updated": 1693239320995, 397 | "link": null, 398 | "locked": false, 399 | "points": [ 400 | [ 401 | 0, 402 | 0 403 | ], 404 | [ 405 | -43.875, 406 | -77.04296875 407 | ] 408 | ], 409 | "lastCommittedPoint": null, 410 | "startBinding": null, 411 | "endBinding": null, 412 | "startArrowhead": null, 413 | "endArrowhead": null 414 | }, 415 | { 416 | "type": "ellipse", 417 | "version": 1214, 418 | "versionNonce": 1031626353, 419 | "isDeleted": false, 420 | "id": "3s37WmvPbgH_4E5OjXE1m", 421 | "fillStyle": "solid", 422 | "strokeWidth": 2, 423 | "strokeStyle": "solid", 424 | "roughness": 1, 425 | "opacity": 100, 426 | "angle": 0, 427 | "x": 830.505859375, 428 | "y": 459.7695312500001, 429 | "strokeColor": "#1e1e1e", 430 | "backgroundColor": "transparent", 431 | "width": 11.36328125, 432 | "height": 11.296875, 433 | "seed": 975525087, 434 | "groupIds": [], 435 | "frameId": null, 436 | "roundness": null, 437 | "boundElements": [], 438 | "updated": 1693239320995, 439 | "link": null, 440 | "locked": false 441 | }, 442 | { 443 | "type": "ellipse", 444 | "version": 1335, 445 | "versionNonce": 1041238961, 446 | "isDeleted": false, 447 | "id": "f73_5TYYv5Mt5mhqPso08", 448 | "fillStyle": "solid", 449 | "strokeWidth": 2, 450 | "strokeStyle": "solid", 451 | "roughness": 1, 452 | "opacity": 100, 453 | "angle": 0, 454 | "x": 715.7826731642708, 455 | "y": 525.6458047041671, 456 | "strokeColor": "#1e1e1e", 457 | "backgroundColor": "transparent", 458 | "width": 11.36328125, 459 | "height": 11.296875, 460 | "seed": 1258547409, 461 | "groupIds": [], 462 | "frameId": null, 463 | "roundness": null, 464 | "boundElements": [], 465 | "updated": 1693239333950, 466 | "link": null, 467 | "locked": false 468 | }, 469 | { 470 | "id": "hh_z4WA9tpZLxJ6LYUnLS", 471 | "type": "line", 472 | "x": 745.4140625, 473 | "y": 570.6171875, 474 | "width": 20.11328125, 475 | "height": 34.45703125, 476 | "angle": 0, 477 | "strokeColor": "#1e1e1e", 478 | "backgroundColor": 
"transparent", 479 | "fillStyle": "hachure", 480 | "strokeWidth": 1, 481 | "strokeStyle": "dashed", 482 | "roughness": 1, 483 | "opacity": 100, 484 | "groupIds": [], 485 | "frameId": null, 486 | "roundness": null, 487 | "seed": 1593905873, 488 | "version": 38, 489 | "versionNonce": 843463935, 490 | "isDeleted": false, 491 | "boundElements": null, 492 | "updated": 1693239331724, 493 | "link": null, 494 | "locked": false, 495 | "points": [ 496 | [ 497 | 0, 498 | 0 499 | ], 500 | [ 501 | -20.11328125, 502 | -34.45703125 503 | ] 504 | ], 505 | "lastCommittedPoint": null, 506 | "startBinding": null, 507 | "endBinding": null, 508 | "startArrowhead": null, 509 | "endArrowhead": null 510 | }, 511 | { 512 | "id": "dQv-OSQHmxyifigNoEpmt", 513 | "type": "line", 514 | "x": 655.41796875, 515 | "y": 459.48828125, 516 | "width": 15.4296875, 517 | "height": 32.8359375, 518 | "angle": 0, 519 | "strokeColor": "#1e1e1e", 520 | "backgroundColor": "transparent", 521 | "fillStyle": "hachure", 522 | "strokeWidth": 1, 523 | "strokeStyle": "dashed", 524 | "roughness": 1, 525 | "opacity": 100, 526 | "groupIds": [], 527 | "frameId": null, 528 | "roundness": null, 529 | "seed": 1012231569, 530 | "version": 43, 531 | "versionNonce": 1541485215, 532 | "isDeleted": false, 533 | "boundElements": null, 534 | "updated": 1693239340345, 535 | "link": null, 536 | "locked": false, 537 | "points": [ 538 | [ 539 | 0, 540 | 0 541 | ], 542 | [ 543 | 15.4296875, 544 | 32.8359375 545 | ] 546 | ], 547 | "lastCommittedPoint": null, 548 | "startBinding": null, 549 | "endBinding": null, 550 | "startArrowhead": null, 551 | "endArrowhead": null 552 | }, 553 | { 554 | "type": "ellipse", 555 | "version": 1386, 556 | "versionNonce": 486839807, 557 | "isDeleted": false, 558 | "id": "R1-4ATAv_nN6yBknTiz9E", 559 | "fillStyle": "solid", 560 | "strokeWidth": 2, 561 | "strokeStyle": "solid", 562 | "roughness": 1, 563 | "opacity": 100, 564 | "angle": 0, 565 | "x": 668.892578125, 566 | "y": 497.1523437499999, 567 | "strokeColor": "#1e1e1e", 568 | "backgroundColor": "transparent", 569 | "width": 11.36328125, 570 | "height": 11.296875, 571 | "seed": 791709873, 572 | "groupIds": [], 573 | "frameId": null, 574 | "roundness": null, 575 | "boundElements": [], 576 | "updated": 1693239344248, 577 | "link": null, 578 | "locked": false 579 | }, 580 | { 581 | "id": "NXBahAn28pAT6DtCWimAG", 582 | "type": "line", 583 | "x": 520.6484375, 584 | "y": 553.5625, 585 | "width": 9.765625, 586 | "height": 18.26171875, 587 | "angle": 0, 588 | "strokeColor": "#1e1e1e", 589 | "backgroundColor": "transparent", 590 | "fillStyle": "hachure", 591 | "strokeWidth": 1, 592 | "strokeStyle": "dashed", 593 | "roughness": 1, 594 | "opacity": 100, 595 | "groupIds": [], 596 | "frameId": null, 597 | "roundness": null, 598 | "seed": 1897889311, 599 | "version": 48, 600 | "versionNonce": 2103633695, 601 | "isDeleted": false, 602 | "boundElements": null, 603 | "updated": 1693239427160, 604 | "link": null, 605 | "locked": false, 606 | "points": [ 607 | [ 608 | 0, 609 | 0 610 | ], 611 | [ 612 | 9.765625, 613 | 18.26171875 614 | ] 615 | ], 616 | "lastCommittedPoint": null, 617 | "startBinding": null, 618 | "endBinding": null, 619 | "startArrowhead": null, 620 | "endArrowhead": null 621 | }, 622 | { 623 | "type": "ellipse", 624 | "version": 1443, 625 | "versionNonce": 722951569, 626 | "isDeleted": false, 627 | "id": "FK_Gy7rcuVejdls-McB1n", 628 | "fillStyle": "solid", 629 | "strokeWidth": 2, 630 | "strokeStyle": "solid", 631 | "roughness": 1, 632 | "opacity": 100, 633 | "angle": 0, 634 | "x": 
530.998046875, 635 | "y": 575.703125, 636 | "strokeColor": "#1e1e1e", 637 | "backgroundColor": "transparent", 638 | "width": 11.36328125, 639 | "height": 11.296875, 640 | "seed": 1282704561, 641 | "groupIds": [], 642 | "frameId": null, 643 | "roundness": null, 644 | "boundElements": [], 645 | "updated": 1693239360627, 646 | "link": null, 647 | "locked": false 648 | }, 649 | { 650 | "id": "GjS4uJ6VLFK9oZcXJr0Ox", 651 | "type": "line", 652 | "x": 689.03515625, 653 | "y": 697.7890625, 654 | "width": 76.89453125, 655 | "height": 113.76171875, 656 | "angle": 0, 657 | "strokeColor": "#1e1e1e", 658 | "backgroundColor": "transparent", 659 | "fillStyle": "hachure", 660 | "strokeWidth": 1, 661 | "strokeStyle": "dashed", 662 | "roughness": 1, 663 | "opacity": 100, 664 | "groupIds": [], 665 | "frameId": null, 666 | "roundness": null, 667 | "seed": 2142968113, 668 | "version": 103, 669 | "versionNonce": 457782481, 670 | "isDeleted": false, 671 | "boundElements": null, 672 | "updated": 1693239370270, 673 | "link": null, 674 | "locked": false, 675 | "points": [ 676 | [ 677 | 0, 678 | 0 679 | ], 680 | [ 681 | -76.89453125, 682 | -113.76171875 683 | ] 684 | ], 685 | "lastCommittedPoint": null, 686 | "startBinding": null, 687 | "endBinding": null, 688 | "startArrowhead": null, 689 | "endArrowhead": null 690 | }, 691 | { 692 | "type": "ellipse", 693 | "version": 1463, 694 | "versionNonce": 784994257, 695 | "isDeleted": false, 696 | "id": "OyKsnV-vE9KVPTC1LgaYS", 697 | "fillStyle": "solid", 698 | "strokeWidth": 2, 699 | "strokeStyle": "solid", 700 | "roughness": 1, 701 | "opacity": 100, 702 | "angle": 0, 703 | "x": 603.291015625, 704 | "y": 569.59765625, 705 | "strokeColor": "#1e1e1e", 706 | "backgroundColor": "transparent", 707 | "width": 11.36328125, 708 | "height": 11.296875, 709 | "seed": 1349389535, 710 | "groupIds": [], 711 | "frameId": null, 712 | "roundness": null, 713 | "boundElements": [], 714 | "updated": 1693239373992, 715 | "link": null, 716 | "locked": false 717 | }, 718 | { 719 | "id": "2QJVUvB1aPHrepxQvZdwm", 720 | "type": "line", 721 | "x": 820.875, 722 | "y": 379.046875, 723 | "width": 32.0234375, 724 | "height": 51.73828125, 725 | "angle": 0, 726 | "strokeColor": "#1e1e1e", 727 | "backgroundColor": "transparent", 728 | "fillStyle": "hachure", 729 | "strokeWidth": 1, 730 | "strokeStyle": "dashed", 731 | "roughness": 1, 732 | "opacity": 100, 733 | "groupIds": [], 734 | "frameId": null, 735 | "roundness": null, 736 | "seed": 982924863, 737 | "version": 19, 738 | "versionNonce": 353800031, 739 | "isDeleted": false, 740 | "boundElements": null, 741 | "updated": 1693239386996, 742 | "link": null, 743 | "locked": false, 744 | "points": [ 745 | [ 746 | 0, 747 | 0 748 | ], 749 | [ 750 | 32.0234375, 751 | 51.73828125 752 | ] 753 | ], 754 | "lastCommittedPoint": null, 755 | "startBinding": null, 756 | "endBinding": null, 757 | "startArrowhead": null, 758 | "endArrowhead": null 759 | }, 760 | { 761 | "type": "ellipse", 762 | "version": 1258, 763 | "versionNonce": 1590588959, 764 | "isDeleted": false, 765 | "id": "AU0JHXmpi1KVNnpTNWfme", 766 | "fillStyle": "solid", 767 | "strokeWidth": 2, 768 | "strokeStyle": "solid", 769 | "roughness": 1, 770 | "opacity": 100, 771 | "angle": 0, 772 | "x": 856.162109375, 773 | "y": 434.15234375, 774 | "strokeColor": "#1e1e1e", 775 | "backgroundColor": "transparent", 776 | "width": 11.36328125, 777 | "height": 11.296875, 778 | "seed": 1012953073, 779 | "groupIds": [], 780 | "frameId": null, 781 | "roundness": null, 782 | "boundElements": [ 783 | { 784 | "id": 
"APhoaq5Vf-fOMoHzNEmxq", 785 | "type": "arrow" 786 | } 787 | ], 788 | "updated": 1693239665678, 789 | "link": null, 790 | "locked": false 791 | }, 792 | { 793 | "id": "hWKcuVKHTedL0zaT3viK0", 794 | "type": "line", 795 | "x": 699.31640625, 796 | "y": 328.25390625, 797 | "width": 44.28125, 798 | "height": 81.23046875, 799 | "angle": 0, 800 | "strokeColor": "#1e1e1e", 801 | "backgroundColor": "transparent", 802 | "fillStyle": "hachure", 803 | "strokeWidth": 1, 804 | "strokeStyle": "dashed", 805 | "roughness": 1, 806 | "opacity": 100, 807 | "groupIds": [], 808 | "frameId": null, 809 | "roundness": null, 810 | "seed": 602469969, 811 | "version": 78, 812 | "versionNonce": 2131126129, 813 | "isDeleted": false, 814 | "boundElements": null, 815 | "updated": 1693239399787, 816 | "link": null, 817 | "locked": false, 818 | "points": [ 819 | [ 820 | 0, 821 | 0 822 | ], 823 | [ 824 | 44.28125, 825 | 81.23046875 826 | ] 827 | ], 828 | "lastCommittedPoint": null, 829 | "startBinding": null, 830 | "endBinding": null, 831 | "startArrowhead": null, 832 | "endArrowhead": null 833 | }, 834 | { 835 | "type": "ellipse", 836 | "version": 1318, 837 | "versionNonce": 157246321, 838 | "isDeleted": false, 839 | "id": "4iDQENfrnMiAu1qA26bJ3", 840 | "fillStyle": "solid", 841 | "strokeWidth": 2, 842 | "strokeStyle": "solid", 843 | "roughness": 1, 844 | "opacity": 100, 845 | "angle": 0, 846 | "x": 745.466796875, 847 | "y": 418.5390625, 848 | "strokeColor": "#1e1e1e", 849 | "backgroundColor": "transparent", 850 | "width": 11.36328125, 851 | "height": 11.296875, 852 | "seed": 603646015, 853 | "groupIds": [], 854 | "frameId": null, 855 | "roundness": null, 856 | "boundElements": [], 857 | "updated": 1693239403082, 858 | "link": null, 859 | "locked": false 860 | }, 861 | { 862 | "id": "eKBOrAoy26LMq-IqdE9Mn", 863 | "type": "line", 864 | "x": 570.17578125, 865 | "y": 377.1015625, 866 | "width": 22.75, 867 | "height": 41.0234375, 868 | "angle": 0, 869 | "strokeColor": "#1e1e1e", 870 | "backgroundColor": "transparent", 871 | "fillStyle": "hachure", 872 | "strokeWidth": 1, 873 | "strokeStyle": "dashed", 874 | "roughness": 1, 875 | "opacity": 100, 876 | "groupIds": [], 877 | "frameId": null, 878 | "roundness": null, 879 | "seed": 1229948959, 880 | "version": 35, 881 | "versionNonce": 17255999, 882 | "isDeleted": false, 883 | "boundElements": null, 884 | "updated": 1693239407390, 885 | "link": null, 886 | "locked": false, 887 | "points": [ 888 | [ 889 | 0, 890 | 0 891 | ], 892 | [ 893 | 22.75, 894 | 41.0234375 895 | ] 896 | ], 897 | "lastCommittedPoint": null, 898 | "startBinding": null, 899 | "endBinding": null, 900 | "startArrowhead": null, 901 | "endArrowhead": null 902 | }, 903 | { 904 | "type": "ellipse", 905 | "version": 1377, 906 | "versionNonce": 1566092831, 907 | "isDeleted": false, 908 | "id": "cp5uAThKVgKBkKksASfq3", 909 | "fillStyle": "solid", 910 | "strokeWidth": 2, 911 | "strokeStyle": "solid", 912 | "roughness": 1, 913 | "opacity": 100, 914 | "angle": 0, 915 | "x": 590.556640625, 916 | "y": 420.15234375, 917 | "strokeColor": "#1e1e1e", 918 | "backgroundColor": "transparent", 919 | "width": 11.36328125, 920 | "height": 11.296875, 921 | "seed": 202812177, 922 | "groupIds": [], 923 | "frameId": null, 924 | "roundness": null, 925 | "boundElements": [], 926 | "updated": 1693239412631, 927 | "link": null, 928 | "locked": false 929 | }, 930 | { 931 | "type": "ellipse", 932 | "version": 1500, 933 | "versionNonce": 1069620991, 934 | "isDeleted": false, 935 | "id": "ggRTxyZhqy4sauhGA09nR", 936 | "fillStyle": "solid", 937 | 
"strokeWidth": 2, 938 | "strokeStyle": "solid", 939 | "roughness": 1, 940 | "opacity": 100, 941 | "angle": 0, 942 | "x": 453.912109375, 943 | "y": 540.4609375, 944 | "strokeColor": "#1e1e1e", 945 | "backgroundColor": "transparent", 946 | "width": 11.36328125, 947 | "height": 11.296875, 948 | "seed": 1299890705, 949 | "groupIds": [], 950 | "frameId": null, 951 | "roundness": null, 952 | "boundElements": [], 953 | "updated": 1693239425731, 954 | "link": null, 955 | "locked": false 956 | }, 957 | { 958 | "type": "line", 959 | "version": 80, 960 | "versionNonce": 1794690367, 961 | "isDeleted": false, 962 | "id": "7iZC_vinFqhDINfcEXiM3", 963 | "fillStyle": "hachure", 964 | "strokeWidth": 1, 965 | "strokeStyle": "dashed", 966 | "roughness": 1, 967 | "opacity": 100, 968 | "angle": 0, 969 | "x": 467.0243193823844, 970 | "y": 556.9658289533108, 971 | "strokeColor": "#1e1e1e", 972 | "backgroundColor": "transparent", 973 | "width": 9.765625, 974 | "height": 18.26171875, 975 | "seed": 265726783, 976 | "groupIds": [], 977 | "frameId": null, 978 | "roundness": null, 979 | "boundElements": [], 980 | "updated": 1693239430271, 981 | "link": null, 982 | "locked": false, 983 | "startBinding": null, 984 | "endBinding": null, 985 | "lastCommittedPoint": null, 986 | "startArrowhead": null, 987 | "endArrowhead": null, 988 | "points": [ 989 | [ 990 | 0, 991 | 0 992 | ], 993 | [ 994 | 9.765625, 995 | 18.26171875 996 | ] 997 | ] 998 | }, 999 | { 1000 | "id": "JxAYXYDpuFLjxcxo4fGGt", 1001 | "type": "arrow", 1002 | "x": 366.08984375, 1003 | "y": 615.4140625, 1004 | "width": 177.61328125, 1005 | "height": 29.21484375, 1006 | "angle": 0, 1007 | "strokeColor": "#1e1e1e", 1008 | "backgroundColor": "transparent", 1009 | "fillStyle": "hachure", 1010 | "strokeWidth": 1, 1011 | "strokeStyle": "solid", 1012 | "roughness": 1, 1013 | "opacity": 100, 1014 | "groupIds": [], 1015 | "frameId": null, 1016 | "roundness": null, 1017 | "seed": 1261621745, 1018 | "version": 45, 1019 | "versionNonce": 399426033, 1020 | "isDeleted": false, 1021 | "boundElements": null, 1022 | "updated": 1693239515433, 1023 | "link": null, 1024 | "locked": false, 1025 | "points": [ 1026 | [ 1027 | 0, 1028 | 0 1029 | ], 1030 | [ 1031 | 177.61328125, 1032 | 29.21484375 1033 | ] 1034 | ], 1035 | "lastCommittedPoint": null, 1036 | "startBinding": { 1037 | "elementId": "8HSTYnEQtwMgEsmaZvHzy", 1038 | "focus": 1.0481540308011237, 1039 | "gap": 1 1040 | }, 1041 | "endBinding": null, 1042 | "startArrowhead": null, 1043 | "endArrowhead": "arrow" 1044 | }, 1045 | { 1046 | "id": "xgCU6Gc5ZQpPFJr5o1brZ", 1047 | "type": "text", 1048 | "x": 433.5530014038086, 1049 | "y": 640.15625, 1050 | "width": 14.643997192382812, 1051 | "height": 35, 1052 | "angle": 0, 1053 | "strokeColor": "#1e1e1e", 1054 | "backgroundColor": "transparent", 1055 | "fillStyle": "hachure", 1056 | "strokeWidth": 1, 1057 | "strokeStyle": "solid", 1058 | "roughness": 1, 1059 | "opacity": 100, 1060 | "groupIds": [], 1061 | "frameId": null, 1062 | "roundness": null, 1063 | "seed": 794945329, 1064 | "version": 32, 1065 | "versionNonce": 1063307761, 1066 | "isDeleted": false, 1067 | "boundElements": null, 1068 | "updated": 1693239493942, 1069 | "link": null, 1070 | "locked": false, 1071 | "text": "v", 1072 | "fontSize": 28, 1073 | "fontFamily": 1, 1074 | "textAlign": "center", 1075 | "verticalAlign": "top", 1076 | "baseline": 25, 1077 | "containerId": null, 1078 | "originalText": "v", 1079 | "lineHeight": 1.25 1080 | }, 1081 | { 1082 | "type": "text", 1083 | "version": 62, 1084 | "versionNonce": 1719395423, 
1085 | "isDeleted": false, 1086 | "id": "cyCapA8uGbYaE0xsKFUtV", 1087 | "fillStyle": "hachure", 1088 | "strokeWidth": 1, 1089 | "strokeStyle": "solid", 1090 | "roughness": 1, 1091 | "opacity": 100, 1092 | "angle": 0, 1093 | "x": 388.1623764038086, 1094 | "y": 532.13671875, 1095 | "strokeColor": "#1e1e1e", 1096 | "backgroundColor": "transparent", 1097 | "width": 14.643997192382812, 1098 | "height": 35, 1099 | "seed": 1411027295, 1100 | "groupIds": [], 1101 | "frameId": null, 1102 | "roundness": null, 1103 | "boundElements": [], 1104 | "updated": 1693239492634, 1105 | "link": null, 1106 | "locked": false, 1107 | "fontSize": 28, 1108 | "fontFamily": 1, 1109 | "text": "v", 1110 | "textAlign": "center", 1111 | "verticalAlign": "top", 1112 | "containerId": null, 1113 | "originalText": "v", 1114 | "lineHeight": 1.25, 1115 | "baseline": 25 1116 | }, 1117 | { 1118 | "type": "text", 1119 | "version": 89, 1120 | "versionNonce": 794662271, 1121 | "isDeleted": false, 1122 | "id": "0hIddUaa_QgZ7vFLqvf7U", 1123 | "fillStyle": "hachure", 1124 | "strokeWidth": 1, 1125 | "strokeStyle": "solid", 1126 | "roughness": 1, 1127 | "opacity": 100, 1128 | "angle": 0, 1129 | "x": 450.7109069824219, 1130 | "y": 659.8984375, 1131 | "strokeColor": "#1e1e1e", 1132 | "backgroundColor": "transparent", 1133 | "width": 4.33599853515625, 1134 | "height": 20, 1135 | "seed": 1232364497, 1136 | "groupIds": [], 1137 | "frameId": null, 1138 | "roundness": null, 1139 | "boundElements": [], 1140 | "updated": 1693239507281, 1141 | "link": null, 1142 | "locked": false, 1143 | "fontSize": 16, 1144 | "fontFamily": 1, 1145 | "text": "1", 1146 | "textAlign": "center", 1147 | "verticalAlign": "top", 1148 | "containerId": null, 1149 | "originalText": "1", 1150 | "lineHeight": 1.25, 1151 | "baseline": 14 1152 | }, 1153 | { 1154 | "type": "text", 1155 | "version": 144, 1156 | "versionNonce": 1166858801, 1157 | "isDeleted": false, 1158 | "id": "DSegTrSal4dMNN6IJ7TLa", 1159 | "fillStyle": "hachure", 1160 | "strokeWidth": 1, 1161 | "strokeStyle": "solid", 1162 | "roughness": 1, 1163 | "opacity": 100, 1164 | "angle": 0, 1165 | "x": 401.0930633544922, 1166 | "y": 549.80078125, 1167 | "strokeColor": "#1e1e1e", 1168 | "backgroundColor": "transparent", 1169 | "width": 11.391998291015625, 1170 | "height": 20, 1171 | "seed": 111641023, 1172 | "groupIds": [], 1173 | "frameId": null, 1174 | "roundness": null, 1175 | "boundElements": [], 1176 | "updated": 1693239513312, 1177 | "link": null, 1178 | "locked": false, 1179 | "fontSize": 16, 1180 | "fontFamily": 1, 1181 | "text": "2", 1182 | "textAlign": "center", 1183 | "verticalAlign": "top", 1184 | "containerId": null, 1185 | "originalText": "2", 1186 | "lineHeight": 1.25, 1187 | "baseline": 14 1188 | }, 1189 | { 1190 | "type": "arrow", 1191 | "version": 181, 1192 | "versionNonce": 892260657, 1193 | "isDeleted": false, 1194 | "id": "xZClQRoQkSaRFfs9laIPW", 1195 | "fillStyle": "hachure", 1196 | "strokeWidth": 1, 1197 | "strokeStyle": "solid", 1198 | "roughness": 1, 1199 | "opacity": 100, 1200 | "angle": 0, 1201 | "x": 365.4546954613179, 1202 | "y": 615.3624257761985, 1203 | "strokeColor": "#1e1e1e", 1204 | "backgroundColor": "transparent", 1205 | "width": 136.6953125, 1206 | "height": 134.9609375, 1207 | "seed": 1365466065, 1208 | "groupIds": [], 1209 | "frameId": null, 1210 | "roundness": null, 1211 | "boundElements": [], 1212 | "updated": 1693239522713, 1213 | "link": null, 1214 | "locked": false, 1215 | "startBinding": { 1216 | "elementId": "8HSTYnEQtwMgEsmaZvHzy", 1217 | "focus": -0.9831183739783577, 
1218 | "gap": 1.4362444909314434 1219 | }, 1220 | "endBinding": null, 1221 | "lastCommittedPoint": null, 1222 | "startArrowhead": null, 1223 | "endArrowhead": "arrow", 1224 | "points": [ 1225 | [ 1226 | 0, 1227 | 0 1228 | ], 1229 | [ 1230 | 136.6953125, 1231 | -134.9609375 1232 | ] 1233 | ] 1234 | }, 1235 | { 1236 | "id": "bZWMnxvDKIUGwt_KoiAum", 1237 | "type": "text", 1238 | "x": 908.4673156738281, 1239 | "y": 349.69921875, 1240 | "width": 93.45599365234375, 1241 | "height": 45, 1242 | "angle": 0, 1243 | "strokeColor": "#1e1e1e", 1244 | "backgroundColor": "transparent", 1245 | "fillStyle": "hachure", 1246 | "strokeWidth": 1, 1247 | "strokeStyle": "solid", 1248 | "roughness": 1, 1249 | "opacity": 100, 1250 | "groupIds": [], 1251 | "frameId": null, 1252 | "roundness": null, 1253 | "seed": 1048544017, 1254 | "version": 78, 1255 | "versionNonce": 567293343, 1256 | "isDeleted": false, 1257 | "boundElements": null, 1258 | "updated": 1693239650468, 1259 | "link": null, 1260 | "locked": false, 1261 | "text": "V V x", 1262 | "fontSize": 36, 1263 | "fontFamily": 1, 1264 | "textAlign": "center", 1265 | "verticalAlign": "top", 1266 | "baseline": 32, 1267 | "containerId": null, 1268 | "originalText": "V V x", 1269 | "lineHeight": 1.25 1270 | }, 1271 | { 1272 | "id": "Og00EPx6Lhp3iSq-SJHvF", 1273 | "type": "text", 1274 | "x": 930.0948486328125, 1275 | "y": 371.5, 1276 | "width": 9.739990234375, 1277 | "height": 25, 1278 | "angle": 0, 1279 | "strokeColor": "#1e1e1e", 1280 | "backgroundColor": "transparent", 1281 | "fillStyle": "hachure", 1282 | "strokeWidth": 1, 1283 | "strokeStyle": "solid", 1284 | "roughness": 1, 1285 | "opacity": 100, 1286 | "groupIds": [], 1287 | "frameId": null, 1288 | "roundness": null, 1289 | "seed": 1512582833, 1290 | "version": 109, 1291 | "versionNonce": 1252964305, 1292 | "isDeleted": false, 1293 | "boundElements": null, 1294 | "updated": 1693239650468, 1295 | "link": null, 1296 | "locked": false, 1297 | "text": "k", 1298 | "fontSize": 20, 1299 | "fontFamily": 1, 1300 | "textAlign": "center", 1301 | "verticalAlign": "top", 1302 | "baseline": 18, 1303 | "containerId": null, 1304 | "originalText": "k", 1305 | "lineHeight": 1.25 1306 | }, 1307 | { 1308 | "type": "text", 1309 | "version": 140, 1310 | "versionNonce": 322080255, 1311 | "isDeleted": false, 1312 | "id": "wQcrlcXpUAOq9d8wmlEQw", 1313 | "fillStyle": "hachure", 1314 | "strokeWidth": 1, 1315 | "strokeStyle": "solid", 1316 | "roughness": 1, 1317 | "opacity": 100, 1318 | "angle": 0, 1319 | "x": 963.8292236328125, 1320 | "y": 372.15234375, 1321 | "strokeColor": "#1e1e1e", 1322 | "backgroundColor": "transparent", 1323 | "width": 9.739990234375, 1324 | "height": 25, 1325 | "seed": 1834865887, 1326 | "groupIds": [], 1327 | "frameId": null, 1328 | "roundness": null, 1329 | "boundElements": [ 1330 | { 1331 | "id": "APhoaq5Vf-fOMoHzNEmxq", 1332 | "type": "arrow" 1333 | } 1334 | ], 1335 | "updated": 1693239665678, 1336 | "link": null, 1337 | "locked": false, 1338 | "fontSize": 20, 1339 | "fontFamily": 1, 1340 | "text": "k", 1341 | "textAlign": "center", 1342 | "verticalAlign": "top", 1343 | "containerId": null, 1344 | "originalText": "k", 1345 | "lineHeight": 1.25, 1346 | "baseline": 18 1347 | }, 1348 | { 1349 | "type": "text", 1350 | "version": 160, 1351 | "versionNonce": 1955173809, 1352 | "isDeleted": false, 1353 | "id": "CaEHaW4foPnuLnVxFw3cD", 1354 | "fillStyle": "hachure", 1355 | "strokeWidth": 1, 1356 | "strokeStyle": "solid", 1357 | "roughness": 1, 1358 | "opacity": 100, 1359 | "angle": 0, 1360 | "x": 965.2725067138672, 
1361 | "y": 338.3046875, 1362 | "strokeColor": "#1e1e1e", 1363 | "backgroundColor": "transparent", 1364 | "width": 16.079986572265625, 1365 | "height": 25, 1366 | "seed": 818709247, 1367 | "groupIds": [], 1368 | "frameId": null, 1369 | "roundness": null, 1370 | "boundElements": [], 1371 | "updated": 1693239650468, 1372 | "link": null, 1373 | "locked": false, 1374 | "fontSize": 20, 1375 | "fontFamily": 1, 1376 | "text": "T", 1377 | "textAlign": "center", 1378 | "verticalAlign": "top", 1379 | "containerId": null, 1380 | "originalText": "T", 1381 | "lineHeight": 1.25, 1382 | "baseline": 18 1383 | }, 1384 | { 1385 | "type": "text", 1386 | "version": 161, 1387 | "versionNonce": 1602954111, 1388 | "isDeleted": false, 1389 | "id": "s79jHU6wtiQlE8-i6WKSj", 1390 | "fillStyle": "hachure", 1391 | "strokeWidth": 1, 1392 | "strokeStyle": "solid", 1393 | "roughness": 1, 1394 | "opacity": 100, 1395 | "angle": 0, 1396 | "x": 790.0090026855469, 1397 | "y": 312.109375, 1398 | "strokeColor": "#1e1e1e", 1399 | "backgroundColor": "transparent", 1400 | "width": 20.23199462890625, 1401 | "height": 45, 1402 | "seed": 1282579825, 1403 | "groupIds": [], 1404 | "frameId": null, 1405 | "roundness": null, 1406 | "boundElements": [], 1407 | "updated": 1693239659788, 1408 | "link": null, 1409 | "locked": false, 1410 | "fontSize": 36, 1411 | "fontFamily": 1, 1412 | "text": "x", 1413 | "textAlign": "center", 1414 | "verticalAlign": "top", 1415 | "containerId": null, 1416 | "originalText": "x", 1417 | "lineHeight": 1.25, 1418 | "baseline": 32 1419 | }, 1420 | { 1421 | "id": "APhoaq5Vf-fOMoHzNEmxq", 1422 | "type": "arrow", 1423 | "x": 956.984375, 1424 | "y": 406.6328125, 1425 | "width": 75.328125, 1426 | "height": 26.7578125, 1427 | "angle": 0, 1428 | "strokeColor": "#1e1e1e", 1429 | "backgroundColor": "transparent", 1430 | "fillStyle": "hachure", 1431 | "strokeWidth": 1, 1432 | "strokeStyle": "solid", 1433 | "roughness": 1, 1434 | "opacity": 100, 1435 | "groupIds": [], 1436 | "frameId": null, 1437 | "roundness": null, 1438 | "seed": 214570993, 1439 | "version": 57, 1440 | "versionNonce": 1568222047, 1441 | "isDeleted": false, 1442 | "boundElements": null, 1443 | "updated": 1693239669626, 1444 | "link": null, 1445 | "locked": false, 1446 | "points": [ 1447 | [ 1448 | 0, 1449 | 0 1450 | ], 1451 | [ 1452 | -75.328125, 1453 | 26.7578125 1454 | ] 1455 | ], 1456 | "lastCommittedPoint": null, 1457 | "startBinding": { 1458 | "elementId": "wQcrlcXpUAOq9d8wmlEQw", 1459 | "focus": -1.2522333285519662, 1460 | "gap": 9.48046875 1461 | }, 1462 | "endBinding": { 1463 | "elementId": "AU0JHXmpi1KVNnpTNWfme", 1464 | "focus": 0.10462789327715792, 1465 | "gap": 15.145177073482863 1466 | }, 1467 | "startArrowhead": null, 1468 | "endArrowhead": "arrow" 1469 | } 1470 | ], 1471 | "appState": { 1472 | "gridSize": null, 1473 | "viewBackgroundColor": "#ffffff" 1474 | }, 1475 | "files": {} 1476 | } -------------------------------------------------------------------------------- /class_notes/images/pca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/pca.png -------------------------------------------------------------------------------- /class_notes/images/vectorsearch.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/vectorsearch.png -------------------------------------------------------------------------------- /class_notes/vs.bib: -------------------------------------------------------------------------------- 1 | %% This BibTeX bibliography file was created using BibDesk. 2 | %% https://bibdesk.sourceforge.io/ 3 | 4 | %% Created for Edo Liberty at 2023-08-28 13:55:33 -0400 5 | 6 | 7 | %% Saved with string encoding Unicode (UTF-8) 8 | 9 | 10 | 11 | @article{RokhlinST09, 12 | author = {Vladimir Rokhlin and Arthur Szlam and Mark Tygert}, 13 | bibsource = {dblp computer science bibliography, http://dblp.org}, 14 | biburl = {http://dblp.uni-trier.de/rec/bib/journals/siammax/RokhlinST09}, 15 | date-added = {2023-08-28 13:55:06 -0400}, 16 | date-modified = {2023-08-28 13:55:06 -0400}, 17 | doi = {10.1137/080736417}, 18 | journal = {{SIAM} J. Matrix Analysis Applications}, 19 | number = {3}, 20 | pages = {1100--1124}, 21 | timestamp = {Tue, 22 Mar 2011 09:17:45 +0100}, 22 | title = {A Randomized Algorithm for Principal Component Analysis}, 23 | url = {http://dx.doi.org/10.1137/080736417}, 24 | volume = {31}, 25 | year = {2009}, 26 | bdsk-url-1 = {http://dx.doi.org/10.1137/080736417}} 27 | 28 | @proceedings{DBLP:conf/nips/2015, 29 | bibsource = {dblp computer science bibliography, http://dblp.org}, 30 | biburl = {http://dblp.uni-trier.de/rec/bib/conf/nips/2015}, 31 | date-added = {2023-08-28 13:55:06 -0400}, 32 | date-modified = {2023-08-28 13:55:06 -0400}, 33 | editor = {Corinna Cortes and Neil D. Lawrence and Daniel D. Lee and Masashi Sugiyama and Roman Garnett}, 34 | timestamp = {Fri, 08 Apr 2016 19:32:52 +0200}, 35 | title = {Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada}, 36 | url = {http://papers.nips.cc/book/advances-in-neural-information-processing-systems-28-2015}, 37 | year = {2015}, 38 | bdsk-url-1 = {http://papers.nips.cc/book/advances-in-neural-information-processing-systems-28-2015}} 39 | 40 | @article{HalkoMT2011, 41 | acmid = {2078881}, 42 | address = {Philadelphia, PA, USA}, 43 | author = {Halko, N. and Martinsson, P. G. and Tropp, J. 
A.}, 44 | date-added = {2023-08-28 13:55:06 -0400}, 45 | date-modified = {2023-08-28 13:55:06 -0400}, 46 | doi = {10.1137/090771806}, 47 | issn = {0036-1445}, 48 | issue_date = {May 2011}, 49 | journal = {SIAM Rev.}, 50 | keywords = {Johnson-Lindenstrauss lemma, dimension reduction, eigenvalue decomposition, interpolative decomposition, matrix approximation, parallel algorithm, pass-efficient algorithm, principal component analysis, random matrix, randomized algorithm, rank-revealing QR factorization, singular value decomposition, streaming algorithm}, 51 | month = may, 52 | number = {2}, 53 | numpages = {72}, 54 | pages = {217--288}, 55 | publisher = {Society for Industrial and Applied Mathematics}, 56 | title = {Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions}, 57 | url = {http://dx.doi.org/10.1137/090771806}, 58 | volume = {53}, 59 | year = {2011}, 60 | bdsk-url-1 = {http://dx.doi.org/10.1137/090771806}} 61 | 62 | @article{Rudelson08, 63 | author = {Mark Rudelson}, 64 | date-added = {2023-08-28 13:55:06 -0400}, 65 | date-modified = {2023-08-28 13:55:06 -0400}, 66 | journal = {Annals of Mathematics}, 67 | pages = {575-600}, 68 | title = {Invertibility of random matrices: norm of the inverse}, 69 | volume = {168}, number = {2}, 70 | year = {2008}} 71 | 72 | @article{WittenE15, 73 | acmid = {2756048}, 74 | address = {Secaucus, NJ, USA}, 75 | author = {Witten, Rafi and Cand\`{e}s, Emmanuel}, 76 | date-added = {2023-08-28 13:55:06 -0400}, 77 | date-modified = {2023-08-28 13:55:06 -0400}, 78 | doi = {10.1007/s00453-014-9891-7}, 79 | issn = {0178-4617}, 80 | issue_date = {May 2015}, 81 | journal = {Algorithmica}, 82 | keywords = {Dimension reduction, Matrix approximation, Pass efficient algorithm, Random matrix, Randomized linear algebra}, 83 | month = may, 84 | number = {1}, 85 | numpages = {18}, 86 | pages = {264--281}, 87 | publisher = {Springer-Verlag New York, Inc.}, 88 | title = {Randomized Algorithms for Low-Rank Matrix Factorizations: Sharp Performance Bounds}, 89 | url = {http://dx.doi.org/10.1007/s00453-014-9891-7}, 90 | volume = {72}, 91 | year = {2015}, 92 | bdsk-url-1 = {http://dx.doi.org/10.1007/s00453-014-9891-7}} 93 | 94 | @inproceedings{MuscoM15, 95 | author = {Cameron Musco and Christopher Musco}, 96 | bibsource = {dblp computer science bibliography, http://dblp.org}, 97 | biburl = {http://dblp.uni-trier.de/rec/bib/conf/nips/MuscoM15}, 98 | booktitle = {Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada}, 99 | crossref = {DBLP:conf/nips/2015}, 100 | date-added = {2023-08-28 13:55:06 -0400}, 101 | date-modified = {2023-08-28 13:55:06 -0400}, 102 | pages = {1396--1404}, 103 | timestamp = {Fri, 08 Apr 2016 19:32:53 +0200}, 104 | title = {Randomized Block Krylov Methods for Stronger and Faster Approximate Singular Value Decomposition}, 105 | url = {http://papers.nips.cc/paper/5735-randomized-block-krylov-methods-for-stronger-and-faster-approximate-singular-value-decomposition}, 106 | year = {2015}, 107 | bdsk-url-1 = {http://papers.nips.cc/paper/5735-randomized-block-krylov-methods-for-stronger-and-faster-approximate-singular-value-decomposition}} 108 | 109 | @misc{liberty2016short, 110 | archiveprefix = {arXiv}, 111 | author = {Edo Liberty}, 112 | date-added = {2023-08-28 12:52:04 -0400}, 113 | date-modified = {2023-08-28 12:52:04 -0400}, 114 | eprint = {1605.05610}, 115 | primaryclass = {cs.NA}, 116 | title =
{A Short Proof for Gap Independence of Simultaneous Iteration}, 117 | year = {2016}} 118 | 119 | @inproceedings{NIPS2013_6e0721b2, 120 | author = {Achlioptas, Dimitris and Karnin, Zohar S and Liberty, Edo}, 121 | booktitle = {Advances in Neural Information Processing Systems}, 122 | date-added = {2023-08-28 07:26:11 -0400}, 123 | date-modified = {2023-08-28 07:26:11 -0400}, 124 | editor = {C.J. Burges and L. Bottou and M. Welling and Z. Ghahramani and K.Q. Weinberger}, 125 | publisher = {Curran Associates, Inc.}, 126 | title = {Near-Optimal Entrywise Sampling for Data Matrices}, 127 | url = {https://proceedings.neurips.cc/paper_files/paper/2013/file/6e0721b2c6977135b916ef286bcb49ec-Paper.pdf}, 128 | volume = {26}, 129 | year = {2013}, 130 | bdsk-url-1 = {https://proceedings.neurips.cc/paper_files/paper/2013/file/6e0721b2c6977135b916ef286bcb49ec-Paper.pdf}} 131 | 132 | @inbook{doi:10.1137/1.9781611974317.7, 133 | abstract = {This paper shows that one can be competitive with the k-means objective while operating online. In this model, the algorithm receives vectors $v_1, \ldots, v_n$ one by one in an arbitrary order. For each vector $v_t$ the algorithm outputs a cluster identifier before receiving $v_{t+1}$. Our online algorithm generates $O(k \log n \log(\gamma n))$ clusters whose expected k-means cost is $O(W^* \log n)$. Here, $W^*$ is the optimal k-means cost using $k$ clusters and $\gamma$ is the aspect ratio of the data. The dependence on $\gamma$ is shown to be unavoidable and tight. We also show that, experimentally, it is not much worse than k-means++ while operating in a strictly more constrained computational model. }, 134 | author = {Edo Liberty and Ram Sriharsha and Maxim Sviridenko}, 135 | booktitle = {2016 Proceedings of the Meeting on Algorithm Engineering and Experiments (ALENEX)}, 136 | date-added = {2023-08-27 23:57:26 -0400}, 137 | date-modified = {2023-08-27 23:57:26 -0400}, 138 | doi = {10.1137/1.9781611974317.7}, 139 | eprint = {https://epubs.siam.org/doi/pdf/10.1137/1.9781611974317.7}, 140 | pages = {81-89}, 141 | title = {An Algorithm for Online K-Means Clustering}, 142 | url = {https://epubs.siam.org/doi/abs/10.1137/1.9781611974317.7}, 143 | bdsk-url-1 = {https://epubs.siam.org/doi/abs/10.1137/1.9781611974317.7}, 144 | bdsk-url-2 = {https://doi.org/10.1137/1.9781611974317.7}} 145 | 146 | @misc{liberty2022simpler, 147 | archiveprefix = {arXiv}, 148 | author = {Edo Liberty}, 149 | date-added = {2023-08-27 23:55:56 -0400}, 150 | date-modified = {2023-08-27 23:55:56 -0400}, 151 | eprint = {2202.01780}, 152 | primaryclass = {cs.DS}, 153 | title = {Even Simpler Deterministic Matrix Sketching}, 154 | year = {2022}} 155 | 156 | @inproceedings{nngraph, 157 | abstract = {K-Nearest Neighbor Graph (K-NNG) construction is an important operation with many web related applications, including collaborative filtering, similarity search, and many others in data mining and machine learning. Existing methods for K-NNG construction either do not scale, or are specific to certain similarity measures. We present NN-Descent, a simple yet efficient algorithm for approximate K-NNG construction with arbitrary similarity measures. Our method is based on local search, has minimal space overhead and does not rely on any shared global index. Hence, it is especially suitable for large-scale applications where data structures need to be distributed over the network.
We have shown with a variety of datasets and similarity measures that the proposed method typically converges to above 90\% recall with each point comparing only to several percent of the whole dataset on average.}, 158 | address = {New York, NY, USA}, 159 | author = {Dong, Wei and Charikar, Moses and Li, Kai}, 160 | booktitle = {Proceedings of the 20th International Conference on World Wide Web}, 161 | date-added = {2023-08-27 22:55:23 -0400}, 162 | date-modified = {2023-08-27 22:55:31 -0400}, 163 | doi = {10.1145/1963405.1963487}, 164 | isbn = {9781450306324}, 165 | keywords = {iterative method, k-nearest neighbor graph, arbitrary similarity measure}, 166 | location = {Hyderabad, India}, 167 | numpages = {10}, 168 | pages = {577--586}, 169 | publisher = {Association for Computing Machinery}, 170 | series = {WWW '11}, 171 | title = {Efficient K-Nearest Neighbor Graph Construction for Generic Similarity Measures}, 172 | url = {https://doi.org/10.1145/1963405.1963487}, 173 | year = {2011}, 174 | bdsk-url-1 = {https://doi.org/10.1145/1963405.1963487}} 175 | 176 | @inproceedings{NEURIPS2019_09853c7f, 177 | author = {Jayaram Subramanya, Suhas and Devvrit, Fnu and Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Kadekodi, Rohan}, 178 | booktitle = {Advances in Neural Information Processing Systems}, 179 | date-added = {2023-08-27 22:54:28 -0400}, 180 | date-modified = {2023-08-27 22:54:28 -0400}, 181 | editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett}, 182 | publisher = {Curran Associates, Inc.}, 183 | title = {DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node}, 184 | url = {https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf}, 185 | volume = {32}, 186 | year = {2019}, 187 | bdsk-url-1 = {https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf}} 188 | 189 | @misc{tay2022transformer, 190 | archiveprefix = {arXiv}, 191 | author = {Yi Tay and Vinh Q. Tran and Mostafa Dehghani and Jianmo Ni and Dara Bahri and Harsh Mehta and Zhen Qin and Kai Hui and Zhe Zhao and Jai Gupta and Tal Schuster and William W. Cohen and Donald Metzler}, 192 | date-added = {2023-08-27 22:53:43 -0400}, 193 | date-modified = {2023-08-27 22:53:43 -0400}, 194 | eprint = {2202.06991}, 195 | primaryclass = {cs.CL}, 196 | title = {Transformer Memory as a Differentiable Search Index}, 197 | year = {2022}} 198 | 199 | @inproceedings{neuralQuantization, 200 | abstract = {We tackle the problem of unsupervised visual descriptors compression, which is a key ingredient of large-scale image retrieval systems. While the deep learning machinery has benefited literally all computer vision pipelines, the existing state-of-the-art compression methods employ shallow architectures, and we aim to close this gap by our paper. In more detail, we introduce a DNN architecture for the unsupervised compressed-domain retrieval, based on multi-codebook quantization. The proposed architecture is designed to incorporate both fast data encoding and efficient distances computation via lookup tables. We demonstrate the exceptional advantage of our scheme over existing quantization approaches on several datasets of visual descriptors via outperforming the previous state-of-the-art by a large margin.}, 201 | address = {Los Alamitos, CA, USA}, 202 | author = {S. Morozov and A.
Babenko}, 203 | booktitle = {2019 IEEE/CVF International Conference on Computer Vision (ICCV)}, 204 | date-added = {2023-08-27 22:52:56 -0400}, 205 | date-modified = {2023-08-27 22:53:14 -0400}, 206 | doi = {10.1109/ICCV.2019.00313}, 207 | keywords = {quantization (signal);image coding;computer architecture;visualization;computer vision;encoding;databases}, 208 | month = {nov}, 209 | pages = {3036-3045}, 210 | publisher = {IEEE Computer Society}, 211 | title = {Unsupervised Neural Quantization for Compressed-Domain Similarity Search}, 212 | url = {https://doi.ieeecomputersociety.org/10.1109/ICCV.2019.00313}, 213 | year = {2019}, 214 | bdsk-url-1 = {https://doi.ieeecomputersociety.org/10.1109/ICCV.2019.00313}, 215 | bdsk-url-2 = {https://doi.org/10.1109/ICCV.2019.00313}} 216 | 217 | @inproceedings{LSQ, 218 | abstract = {Multi-codebook quantization (MCQ) is the task of expressing a set of vectors as accurately as possible in terms of discrete entries in multiple bases. Work in MCQ is heavily focused on lowering quantization error, thereby improving distance estimation and recall on benchmarks of visual descriptors at a fixed memory budget. However, recent studies and methods in this area are hard to compare against each other, because they use different datasets, different protocols, and, perhaps most importantly, different computational budgets. In this work, we first benchmark a series of MCQ baselines on an equal footing and provide an analysis of their recall-vs-running-time performance. We observe that local search quantization (LSQ) is in practice much faster than its competitors, but is not the most accurate method in all cases. We then introduce two novel improvements that render LSQ (i) more accurate and (ii) faster. These improvements are easy to implement, and define a new state of the art in MCQ.}, 219 | address = {Berlin, Heidelberg}, 220 | author = {Martinez, Julieta and Zakhmi, Shobhit and Hoos, Holger H. and Little, James J.}, 221 | booktitle = {Computer Vision -- ECCV 2018: 15th European Conference, Munich, Germany, September 8-14, 2018, Proceedings, Part XVI}, 222 | date-added = {2023-08-27 22:51:54 -0400}, 223 | date-modified = {2023-08-27 22:52:02 -0400}, 224 | doi = {10.1007/978-3-030-01270-0_30}, 225 | isbn = {978-3-030-01269-4}, 226 | location = {Munich, Germany}, 227 | numpages = {16}, 228 | pages = {508--523}, 229 | publisher = {Springer-Verlag}, 230 | title = {LSQ++: Lower Running Time and Higher Recall in Multi-Codebook Quantization}, 231 | url = {https://doi.org/10.1007/978-3-030-01270-0_30}, 232 | year = {2018}, 233 | bdsk-url-1 = {https://doi.org/10.1007/978-3-030-01270-0_30}} 234 | 235 | @inproceedings{Martinez2016RevisitingAQ, 236 | author = {Julieta Martinez and Joris Clement and Holger H. Hoos and J. 
Little}, 237 | booktitle = {European Conference on Computer Vision}, 238 | date-added = {2023-08-27 22:51:08 -0400}, 239 | date-modified = {2023-08-27 22:51:08 -0400}, 240 | title = {Revisiting Additive Quantization}, 241 | url = {https://api.semanticscholar.org/CorpusID:7340738}, 242 | year = {2016}, 243 | bdsk-url-1 = {https://api.semanticscholar.org/CorpusID:7340738}} 244 | 245 | @misc{fu2018fast, 246 | archiveprefix = {arXiv}, 247 | author = {Cong Fu and Chao Xiang and Changxu Wang and Deng Cai}, 248 | date-added = {2023-08-27 22:49:58 -0400}, 249 | date-modified = {2023-08-27 22:49:58 -0400}, 250 | eprint = {1707.00143}, 251 | primaryclass = {cs.LG}, 252 | title = {Fast Approximate Nearest Neighbor Search With The Navigating Spreading-out Graph}, 253 | year = {2018}} 254 | 255 | @misc{guo2020accelerating, 256 | archiveprefix = {arXiv}, 257 | author = {Ruiqi Guo and Philip Sun and Erik Lindgren and Quan Geng and David Simcha and Felix Chern and Sanjiv Kumar}, 258 | date-added = {2023-08-27 22:49:28 -0400}, 259 | date-modified = {2023-08-27 22:49:28 -0400}, 260 | eprint = {1908.10396}, 261 | primaryclass = {cs.LG}, 262 | title = {Accelerating Large-Scale Inference with Anisotropic Vector Quantization}, 263 | year = {2020}} 264 | 265 | @article{Andre_2021, 266 | author = {Fabien Andr{\'e} and Anne-Marie Kermarrec and Nicolas Le Scouarnec}, 267 | date-added = {2023-08-27 22:48:12 -0400}, 268 | date-modified = {2023-08-27 22:48:12 -0400}, 269 | doi = {10.1109/tpami.2019.2952606}, 270 | journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence}, 271 | month = {may}, 272 | number = {5}, 273 | pages = {1666--1677}, 274 | publisher = {Institute of Electrical and Electronics Engineers ({IEEE})}, 275 | title = {Quicker {ADC}: Unlocking the Hidden Potential of Product Quantization With {SIMD}}, 276 | url = {https://doi.org/10.1109%2Ftpami.2019.2952606}, 277 | volume = {43}, 278 | year = 2021, 279 | bdsk-url-1 = {https://doi.org/10.1109%2Ftpami.2019.2952606}, 280 | bdsk-url-2 = {https://doi.org/10.1109/tpami.2019.2952606}} 281 | 282 | @misc{hnsw, 283 | archiveprefix = {arXiv}, 284 | author = {Yu. A. Malkov and D. A.
Yashunin}, 285 | date-added = {2023-08-27 22:47:22 -0400}, 286 | date-modified = {2023-08-27 22:47:31 -0400}, 287 | eprint = {1603.09320}, 288 | primaryclass = {cs.DS}, 289 | title = {Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs}, 290 | year = {2018}} 291 | 292 | @misc{johnson2017billionscale, 293 | archiveprefix = {arXiv}, 294 | author = {Jeff Johnson and Matthijs Douze and Herv{\'e} J{\'e}gou}, 295 | date-added = {2023-08-27 22:46:42 -0400}, 296 | date-modified = {2023-08-27 22:46:42 -0400}, 297 | eprint = {1702.08734}, 298 | primaryclass = {cs.CV}, 299 | title = {Billion-scale similarity search with GPUs}, 300 | year = {2017}} 301 | 302 | @article{pq, 303 | author = {Jegou, Herve and Douze, Matthijs and Schmid, Cordelia}, 304 | date-added = {2023-08-27 22:45:03 -0400}, 305 | date-modified = {2023-08-28 10:10:02 -0400}, 306 | doi = {10.1109/TPAMI.2010.57}, 307 | journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, 308 | number = {1}, 309 | pages = {117-128}, 310 | title = {Product Quantization for Nearest Neighbor Search}, 311 | volume = {33}, 312 | year = {2011}, 313 | bdsk-url-1 = {https://doi.org/10.1109/TPAMI.2010.57}} 314 | 315 | @article{kdtree-worstcase, 316 | abstract = {Given a file of N records each of which has k keys, the worst-case analysis for the region and partial region queries in multidimensional binary search trees and balanced quad trees are presented. It is shown that the search algorithms proposed in [1, 3] run in time $O(k \cdot N^{1-1/k})$ for region queries in both tree structures. For partial region queries with s keys specified, the search algorithms run at most in time $O(s \cdot N^{1-1/k})$ in both structures.}, 317 | address = {Berlin, Heidelberg}, 318 | author = {Lee, D. T. and Wong, C.
K.}, 319 | date-added = {2023-08-26 14:50:53 -0400}, 320 | date-modified = {2023-08-26 14:51:02 -0400}, 321 | doi = {10.1007/BF00263763}, 322 | issn = {0001-5903}, 323 | issue_date = {March 1977}, 324 | journal = {Acta Inf.}, 325 | month = {mar}, 326 | number = {1}, 327 | numpages = {7}, 328 | pages = {23--29}, 329 | publisher = {Springer-Verlag}, 330 | title = {Worst-Case Analysis for Region and Partial Region Searches in Multidimensional Binary Search Trees and Balanced Quad Trees}, 331 | url = {https://doi.org/10.1007/BF00263763}, 332 | volume = {9}, 333 | year = {1977}, 334 | bdsk-url-1 = {https://doi.org/10.1007/BF00263763}} 335 | 336 | @inproceedings{rptrees, 337 | abstract = {We present a simple variant of the k-d tree which automatically adapts to intrinsic low dimensional structure in data without having to explicitly learn this structure.}, 338 | address = {New York, NY, USA}, 339 | author = {Dasgupta, Sanjoy and Freund, Yoav}, 340 | booktitle = {Proceedings of the Fortieth Annual ACM Symposium on Theory of Computing}, 341 | date-added = {2023-08-26 14:41:09 -0400}, 342 | date-modified = {2023-08-26 14:41:18 -0400}, 343 | doi = {10.1145/1374376.1374452}, 344 | isbn = {9781605580470}, 345 | keywords = {manifold, random projection, k-d tree, curse of dimension}, 346 | location = {Victoria, British Columbia, Canada}, 347 | numpages = {10}, 348 | pages = {537--546}, 349 | publisher = {Association for Computing Machinery}, 350 | series = {STOC '08}, 351 | title = {Random Projection Trees and Low Dimensional Manifolds}, 352 | url = {https://doi.org/10.1145/1374376.1374452}, 353 | year = {2008}, 354 | bdsk-url-1 = {https://doi.org/10.1145/1374376.1374452}} 355 | 356 | @article{LibertyMatrixSketching2012, 357 | author = {Edo Liberty}, 358 | bibsource = {DBLP, http://dblp.uni-trier.de}, 359 | date-added = {2012-12-23 13:06:13 +0200}, 360 | date-modified = {2012-12-23 13:06:27 +0200}, 361 | ee = {http://arxiv.org/abs/1206.0594}, 362 | journal = {CoRR}, 363 | title = {Simple and Deterministic Matrix Sketching}, 364 | volume = {abs/1206.0594}, 365 | year = {2012}} 366 | 367 | @article{rvSamplingFromLargeMatrices2007, 368 | acmid = {1255449}, 369 | address = {New York, NY, USA}, 370 | articleno = {21}, 371 | author = {Rudelson, Mark and Vershynin, Roman}, 372 | date-added = {2012-12-23 10:23:13 +0200}, 373 | date-modified = {2012-12-23 10:23:55 +0200}, 374 | doi = {10.1145/1255443.1255449}, 375 | issn = {0004-5411}, 376 | issue_date = {July 2007}, 377 | journal = {J. 
ACM}, 378 | keywords = {Monte-Carlo methods, Randomized algorithms, low-rank approximations, massive data sets, singular-value decompositions}, 379 | month = jul, 380 | number = {4}, 381 | publisher = {ACM}, 382 | title = {Sampling from large matrices: An approach through geometric functional analysis}, 383 | url = {http://doi.acm.org/10.1145/1255443.1255449}, 384 | volume = {54}, 385 | year = {2007}, 386 | bdsk-url-1 = {http://doi.acm.org/10.1145/1255443.1255449}, 387 | bdsk-url-2 = {http://dx.doi.org/10.1145/1255443.1255449}} 388 | 389 | @inproceedings{AroraHaKaFRS06, 390 | acmid = {2165259}, 391 | address = {Berlin, Heidelberg}, 392 | author = {Arora, Sanjeev and Hazan, Elad and Kale, Satyen}, 393 | booktitle = {Proceedings of the 9th international conference on Approximation Algorithms for Combinatorial Optimization Problems, and 10th international conference on Randomization and Computation}, 394 | date-added = {2012-12-16 10:02:26 +0200}, 395 | date-modified = {2012-12-16 10:02:26 +0200}, 396 | doi = {10.1007/11830924_26}, 397 | isbn = {3-540-38044-2, 978-3-540-38044-3}, 398 | location = {Barcelona, Spain}, 399 | numpages = {8}, 400 | pages = {272--279}, 401 | publisher = {Springer-Verlag}, 402 | series = {APPROX'06/RANDOM'06}, 403 | title = {A fast random sampling algorithm for sparsifying matrices}, 404 | url = {http://dx.doi.org/10.1007/11830924_26}, 405 | year = {2006}, 406 | bdsk-url-1 = {http://dx.doi.org/10.1007/11830924_26}} 407 | 408 | @inproceedings{JelaniH2012, 409 | author = {Jelani Nelson and Huy L. Nguyen}, 410 | booktitle = {arXiv:1211.0995v1}, 411 | date-added = {2012-12-02 10:16:07 +0200}, 412 | date-modified = {2012-12-02 10:16:45 +0200}, 413 | title = {Sparsity Lower Bounds for Dimensionality Reducing Maps}, 414 | year = {2012}} 415 | 416 | @inproceedings{KaneN12, 417 | author = {Daniel M. Kane and Jelani Nelson}, 418 | bibsource = {DBLP, http://dblp.uni-trier.de}, 419 | booktitle = {SODA}, 420 | crossref = {DBLP:conf/soda/2012}, 421 | date-added = {2012-12-02 10:12:45 +0200}, 422 | date-modified = {2012-12-02 10:12:51 +0200}, 423 | ee = {http://portal.acm.org/citation.cfm?id=2095210{\&}CFID=63838676{\&}CFTOKEN=79617016}, 424 | pages = {1195-1206}, 425 | title = {Sparser Johnson-Lindenstrauss transforms}, 426 | year = {2012}} 427 | 428 | @article{GuhaMMMO03, 429 | author = {Sudipto Guha and Adam Meyerson and Nina Mishra and Rajeev Motwani and Liadan O'Callaghan}, 430 | bibsource = {DBLP, http://dblp.uni-trier.de}, 431 | date-added = {2012-01-15 15:34:54 +0200}, 432 | date-modified = {2012-01-15 15:35:01 +0200}, 433 | ee = {http://doi.ieeecomputersociety.org/10.1109/TKDE.2003.1198387}, 434 | journal = {IEEE Trans. Knowl. 
Data Eng.}, 435 | number = {3}, 436 | pages = {515-528}, 437 | title = {Clustering Data Streams: Theory and Practice}, 438 | volume = {15}, 439 | year = {2003}} 440 | 441 | @inproceedings{AilonJM09, 442 | author = {Nir Ailon and Ragesh Jaiswal and Claire Monteleoni}, 443 | bibsource = {DBLP, http://dblp.uni-trier.de}, 444 | booktitle = {NIPS}, 445 | crossref = {DBLP:conf/nips/2009}, 446 | date-added = {2012-01-15 15:17:28 +0200}, 447 | date-modified = {2012-01-15 15:17:36 +0200}, 448 | ee = {http://books.nips.cc/papers/files/nips22/NIPS2009_1085.pdf}, 449 | pages = {10-18}, 450 | title = {Streaming k-means approximation}, 451 | year = {2009}} 452 | 453 | @inproceedings{ArthurV07, 454 | author = {David Arthur and Sergei Vassilvitskii}, 455 | bibsource = {DBLP, http://dblp.uni-trier.de}, 456 | booktitle = {SODA}, 457 | crossref = {DBLP:conf/soda/2007}, 458 | date-added = {2012-01-15 15:14:52 +0200}, 459 | date-modified = {2012-01-15 15:15:02 +0200}, 460 | ee = {http://doi.acm.org/10.1145/1283383.1283494}, 461 | pages = {1027-1035}, 462 | title = {k-means++: the advantages of careful seeding}, 463 | year = {2007}} 464 | 465 | @inproceedings{hk-sckmk-05, 466 | author = {S. {Har-Peled} and A. Kushal}, 467 | booktitle = {Proceedings of the 21st Annual Symposium on Computational Geometry (SoCG)}, 468 | date-added = {2012-01-15 14:17:41 +0200}, 469 | date-modified = {2023-08-28 07:17:01 -0400}, 470 | pages = {126--134}, 471 | title = {Smaller Coresets for k-Median and k-Means Clustering}, 472 | year = {2005}} 473 | 474 | @inproceedings{DingH04a, 475 | author = {Chris H. Q. Ding and Xiaofeng He}, 476 | bibsource = {DBLP, http://dblp.uni-trier.de}, 477 | booktitle = {ICML}, 478 | crossref = {DBLP:conf/icml/2004}, 479 | date-added = {2012-01-14 18:01:07 +0200}, 480 | date-modified = {2023-08-28 07:16:45 -0400}, 481 | ee = {http://doi.acm.org/10.1145/1015330.1015408}, 482 | title = {K-means clustering via principal component analysis}, 483 | year = {2004}} 484 | 485 | @article{Lloyd82leastsquares, 486 | author = {Stuart P. Lloyd}, 487 | date-added = {2012-01-14 17:55:10 +0200}, 488 | date-modified = {2012-01-14 17:55:10 +0200}, 489 | journal = {IEEE Transactions on Information Theory}, 490 | pages = {129--137}, 491 | title = {Least squares quantization in {PCM}}, 492 | volume = {28}, 493 | year = {1982}} 494 | 495 | @inproceedings{ZhaHDGS01, 496 | author = {Hongyuan Zha and Xiaofeng He and Chris H. Q. Ding and Ming Gu and Horst D. Simon}, 497 | bibsource = {DBLP, http://dblp.uni-trier.de}, 498 | booktitle = {NIPS}, 499 | crossref = {DBLP:conf/nips/2001}, 500 | date-added = {2012-01-14 17:53:42 +0200}, 501 | date-modified = {2012-01-14 17:54:11 +0200}, 502 | ee = {http://www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/AA41.ps.gz}, 503 | pages = {1057-1064}, 504 | title = {Spectral Relaxation for K-means Clustering}, 505 | year = {2001}} 506 | 507 | @article{Bentley75, 508 | acmid = {361007}, 509 | address = {New York, NY, USA}, 510 | author = {Bentley, Jon Louis}, 511 | date-added = {2011-12-30 14:57:39 +0200}, 512 | date-modified = {2011-12-30 14:57:58 +0200}, 513 | doi = {http://doi.acm.org/10.1145/361002.361007}, 514 | issn = {0001-0782}, 515 | issue = {9}, 516 | journal = {Commun.
ACM}, 517 | keywords = {associative retrieval, attribute, binary search trees, binary tree insertion, information retrieval system, intersection queries, key, nearest neighbor queries, partial match queries}, 518 | month = {September}, 519 | numpages = {9}, 520 | pages = {509--517}, 521 | publisher = {ACM}, 522 | title = {Multidimensional binary search trees used for associative searching}, 523 | url = {http://doi.acm.org/10.1145/361002.361007}, 524 | volume = {18}, 525 | year = {1975}, 526 | bdsk-url-1 = {http://doi.acm.org/10.1145/361002.361007}} 527 | 528 | @inproceedings{GionisIM99, 529 | author = {Aristides Gionis and Piotr Indyk and Rajeev Motwani}, 530 | booktitle = {VLDB}, 531 | date-added = {2011-12-30 14:48:57 +0200}, 532 | date-modified = {2011-12-30 14:54:13 +0200}, 533 | pages = {518-529}, 534 | title = {Similarity Search in High Dimensions via Hashing}, 535 | year = {1999}} 536 | 537 | @inproceedings{Charikar02, 538 | author = {Moses Charikar}, 539 | bibsource = {DBLP, http://dblp.uni-trier.de}, 540 | booktitle = {STOC}, 541 | date-added = {2011-12-30 14:44:59 +0200}, 542 | date-modified = {2011-12-30 14:45:18 +0200}, 543 | ee = {http://doi.acm.org/10.1145/509907.509965}, 544 | pages = {380-388}, 545 | title = {Similarity estimation techniques from rounding algorithms}, 546 | year = {2002}} 547 | 548 | @misc{Drineas03passefficient, 549 | author = {Petros Drineas and Ravi Kannan}, 550 | date-added = {2011-12-18 16:54:46 +0200}, 551 | date-modified = {2011-12-18 16:54:46 +0200}, 552 | title = {Pass Efficient Algorithms for Approximating Large Matrices}, 553 | year = {2003}} 554 | 555 | @article{AhlswedeW02, 556 | author = {Rudolf Ahlswede and Andreas Winter}, 557 | bibsource = {DBLP, http://dblp.uni-trier.de}, 558 | date-added = {2011-12-18 11:47:32 +0200}, 559 | date-modified = {2011-12-18 11:47:42 +0200}, 560 | ee = {http://dx.doi.org/10.1109/18.985947}, 561 | journal = {IEEE Transactions on Information Theory}, 562 | number = {3}, 563 | pages = {569-579}, 564 | title = {Strong converse for identification via quantum channels}, 565 | volume = {48}, 566 | year = {2002}} 567 | 568 | @inproceedings{AilonL11, 569 | author = {Nir Ailon and Edo Liberty}, 570 | bibsource = {DBLP, http://dblp.uni-trier.de}, 571 | booktitle = {SODA}, 572 | crossref = {DBLP:conf/soda/2011}, 573 | date-added = {2011-11-27 14:08:34 +0200}, 574 | date-modified = {2011-11-27 14:08:39 +0200}, 575 | ee = {http://www.siam.org/proceedings/soda/2011/SODA11_017_ailonn.pdf}, 576 | pages = {185-191}, 577 | title = {An Almost Optimal Unrestricted Fast Johnson-Lindenstrauss Transform}, 578 | year = {2011}} 579 | 580 | @article{DasGuptaGupta99, 581 | author = {S. Dasgupta and A. Gupta}, 582 | date-added = {2011-11-27 14:06:16 +0200}, 583 | date-modified = {2023-08-28 07:29:49 -0400}, 584 | journal = {Technical Report, UC Berkeley}, 585 | title = {An elementary proof of the Johnson-Lindenstrauss lemma}, 586 | volume = {99-006}, 587 | year = 1999} 588 | 589 | @inproceedings{AilonCh06, 590 | address = {Seattle, WA}, 591 | author = {Nir Ailon and Bernard Chazelle}, 592 | booktitle = {Proceedings of the 38th Annual Symposium on the Theory of Computing (STOC)}, 593 | date-added = {2011-11-27 14:05:58 +0200}, 594 | date-modified = {2023-08-28 07:30:03 -0400}, 595 | pages = {557--563}, 596 | title = {Approximate nearest neighbors and the fast Johnson-Lindenstrauss transform}, 597 | year = 2006} 598 | 599 | @article{JL84, 600 | author = {W. B. Johnson and J.
Lindenstrauss}, 601 | date-added = {2011-11-27 14:05:01 +0200}, 602 | date-modified = {2023-08-28 07:18:14 -0400}, 603 | journal = {Contemporary Mathematics}, 604 | pages = {189--206}, 605 | title = {Extensions of Lipschitz mappings into a Hilbert space}, 606 | volume = 26, 607 | year = 1984} 608 | 609 | @article{douze2015quickcsg, 610 | author = {Douze, Matthijs and Franco, Jean-S{\'e}bastien and Raffin, Bruno}, 611 | school = {Inria-Research Centre Grenoble--Rh{\^o}ne-Alpes; INRIA}, 612 | title = {QuickCSG: Arbitrary and faster boolean combinations of n solids}, 613 | year = {2015}} 614 | 615 | @phdthesis{subramanian1990search, 616 | author = {Subramanian, K. R. and Fussell, D. S.}, 617 | school = {The University of Texas at Austin}, 618 | title = {A search structure based on kd trees for efficient ray tracing}, 619 | year = {1990}} 620 | -------------------------------------------------------------------------------- /class_notes/vs.sty: -------------------------------------------------------------------------------- 1 | 2 | %%%% PACKAGES %%%% 3 | \usepackage{fullpage} 4 | \usepackage{algorithm,algorithmic} 5 | \usepackage{amsfonts, amsmath, amsthm} 6 | \usepackage{graphicx} 7 | %for dotted lines inside matrices 8 | \usepackage{arydshln} 9 | \setlength{\dashlinedash}{.8pt} % 10 | \setlength{\dashlinegap}{1.2pt} % 11 | 12 | %%%% ENVIRONMENTS %%%% 13 | \newtheorem{definition}{Definition}[section] 14 | \newtheorem{fact}{Fact}[section] 15 | \newtheorem{claim}{Claim}[section] 16 | \newtheorem{lemma}{Lemma}[section] 17 | \newtheorem{remark}{Remark}[section] 18 | \newtheorem{theorem}{Theorem}[section] 19 | \newtheorem{proposition}{Proposition}[section] 20 | 21 | %%%% COMMANDS %%%% 22 | \newcommand{\E}{{\mathbb E}} 23 | \newcommand{\Var}{{\operatorname{Var}}} 24 | \newcommand{\var}{{\operatorname{Var}}} 25 | \newcommand{\poly}{{\operatorname{poly}}} 26 | \newcommand{\const}{{\operatorname{const}}} 27 | \newcommand{\OPT}{{\operatorname{OPT}}} 28 | \newcommand{\ALG}{{\operatorname{ALG}}} 29 | 30 | 31 | \newcommand{\allones}{\mathbf{1}} 32 | \newcommand{\abs}[1]{\left| #1 \right|} 33 | \newcommand{\norm}[1]{\| #1 \|} 34 | \newcommand{\eps}{\varepsilon} 35 | \newcommand{\tab}{\hspace{.5cm}} 36 | \newcommand{\R}{{\mathbb{R}}} 37 | \newcommand{\Sph}{{\mathbb{S}}} 38 | \newcommand{\N}{{\mathcal{N}}} 39 | 40 | 41 | \newcommand{\lecturetitle}[1]{ 42 | \noindent 43 | \begin{center} 44 | \framebox{ 45 | \vbox{\vspace{2mm} 46 | \hbox to 6.28in { {\bf Long Term Memory in AI - Vector Search and Databases 47 | \hfill COS 597A Fall 2023} } 48 | \vspace{4mm} 49 | \hbox to 6.28in { {\Large \hfill #1 \hfill} } 50 | \vspace{2mm} 51 | \hbox to 6.28in { {\it Lectures: Edo Liberty and Matthijs Douze \hfill}} 52 | \vspace{2mm}} 53 | } 54 | \end{center} 55 | \markboth{Lectures: Edo Liberty and Matthijs Douze}{Lectures: Edo Liberty and Matthijs Douze} 56 | {\small 57 | {\bf Warning}: {\it 58 | Please do not cite this note as a peer reviewed source. 59 | Please submit requests and corrections as issues or pull requests at github.com/edoliberty/vector-search-class-notes} 60 | \vspace*{4mm}} 61 | \hrule 62 | \vspace{1cm} 63 | } 64 | 65 | %%%% COMMON DEFS %%%% 66 | \date{} 67 | 68 | --------------------------------------------------------------------------------
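
Editor's note: vs.sty above supplies the shared preamble for the class notes (packages, section-numbered theorem-like environments, shorthand macros such as \E, \eps, \norm, and \R, and the boxed \lecturetitle header), and vs.bib supplies the citation keys. As a quick illustration of how the pieces fit together, here is a minimal, hypothetical class-note source; the file name and body text are assumptions made for this sketch and not files from the repository, but every command and the `pq` citation key come from vs.sty and vs.bib above.

% Class_XX_example.tex -- hypothetical minimal class note, not a file in this repository
\documentclass{article}
\usepackage{vs}  % loads the packages, theorem environments, and macros defined in vs.sty

\begin{document}

% Typesets the boxed course header defined by \lecturetitle in vs.sty
\lecturetitle{Class 8: Quantization}

\section{Product Quantization}
Product quantization \cite{pq} encodes a vector $x \in \R^d$ by splitting it
into subvectors and quantizing each part against a small codebook.

% The theorem-like environments from vs.sty are numbered within sections
\begin{fact}
For any $x \in \R^d$, $\norm{x}^2 = \sum_{i=1}^{d} x_i^2$.
\end{fact}

\bibliographystyle{alpha}
\bibliography{vs}  % resolves \cite keys against vs.bib above

\end{document}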