├── .gitignore
├── LICENSE
├── README.md
├── bibtomarkdown.py
├── build.sh
└── class_notes
    ├── Class_01_introduction.pdf
    ├── Class_02_text_embeddings.pdf
    ├── Class_03_image_embeddings.pdf
    ├── Class_03_image_embeddings.tex
    ├── Class_04_low_dimensional_vector_search.pdf
    ├── Class_04_low_dimensional_vector_search.tex
    ├── Class_05_dimensionality_reduction.pdf
    ├── Class_05_dimensionality_reduction.tex
    ├── Class_06_aproximate_nearest_neighbor_search.pdf
    ├── Class_06_aproximate_nearest_neighbor_search.tex
    ├── Class_07_clustering.pdf
    ├── Class_07_clustering.tex
    ├── Class_08_quantization.pdf
    ├── Class_08_runbook_for_students.ipynb
    ├── Class_09_graph_indexes.pdf
    ├── README.md
    ├── images
    │   ├── chernoff-exp-bounds.png
    │   ├── dragon_diff_dup.jpg
    │   ├── kdtrees-construction.excalidraw
    │   ├── kdtrees-construction.png
    │   ├── kdtrees-proof.excalidraw
    │   ├── kdtrees-proof.png
    │   ├── kdtrees-search.excalidraw
    │   ├── kdtrees-search.png
    │   ├── kmeans-proj.excalidraw
    │   ├── kmeans-proj.png
    │   ├── nnsearch.png
    │   ├── pca.excalidraw
    │   ├── pca.png
    │   ├── vectorsearch.excalidraw
    │   └── vectorsearch.png
    ├── vs.bib
    └── vs.sty

/.gitignore:
--------------------------------------------------------------------------------
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb

## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf

## Generated if empty string is given at "Please type another file name for output:"
.pdf

## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml

## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync

## Build tool directories for auxiliary files
# latexrun
latex.out/

## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa

# achemso
acs-*.bib

# amsthm
*.thm

# beamer
*.nav
*.pre
*.snm
*.vrb

# changes
*.soc

# comment
*.cut

# cprotect
*.cpt

# elsarticle (documentclass of Elsevier journals)
*.spl

# endnotes
*.ent

# fixme
*.lox

# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm

#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R

# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs

# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist

# gnuplottex
*-gnuplottex-*

# gregoriotex
*.gaux
*.gtex

# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref

# hyperref
*.brf

# knitr
*-concordance.tex
# TODO Comment the next line if you want to keep your tikz graphics files
*.tikz
*-tikzDictionary

# listings
*.lol

# luatexja-ruby
*.ltjruby

# makeidx
*.idx
*.ilg
*.ind

# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*

# minted
_minted*
*.pyg

# morewrites
*.mw

# nomencl
*.nlg
*.nlo
*.nls

# pax
*.pax

# pdfpcnotes
*.pdfpc

# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd

# scrwfile
*.wrt

# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/

# pdfcomment
*.upa
*.upb

# pythontex
*.pytxcode
pythontex-files-*/

# tcolorbox
*.listing

# thmtools
*.loe

# TikZ & PGF
*.dpth
*.md5
*.auxlock

# todonotes
*.tdo

# vhistory
*.hst
*.ver

# easy-todo
*.lod

# xcolor
*.xcp

# xmpincl
*.xmpi

# xindy
*.xdy

# xypic precompiled matrices and outlines
*.xyc
*.xyd

# endfloat
*.ttt
*.fff

# Latexian
TSWLatexianTemp*

## Editors:
# WinEdt
*.bak
*.sav

# Texpad
.texpadtmp

# LyX
*.lyx~

# Kile
*.backup

# gummi
.*.swp

# KBibTeX
*~[0-9]*

# TeXnicCenter
*.tps

# auto folder when using emacs and auctex
./auto/*
*.el

# expex forward references with \gathertags
*-tags.tex

# standalone packages
*.sta

# Makeindex log files
*.lpz

# mac files
.DS_Store

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Edo Liberty

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Long Term Memory in AI - Vector Search and Databases

**NOTE:** COS 597A class times changed for the Fall 2023 semester. Classes will be held **9am-12noon**.

## Instructors

* [Edo Liberty](https://edoliberty.github.io), Founder and CEO of [Pinecone](https://www.pinecone.io), the world's leading Vector Database. [Publications](https://scholar.google.com/citations?user=QHS_pZAAAAAJ&hl=en).

* [Matthijs Douze](https://ai.meta.com/people/matthijs-douze/), Research Scientist at Meta. Architect and main developer of [FAISS](https://github.com/facebookresearch/faiss), the most popular and advanced open source library for vector search. [Publications](https://scholar.google.com/citations?user=0eFZtREAAAAJ&hl=en).

* Teaching assistant: [Nataly Brukhim](https://www.cs.princeton.edu/~nbrukhim/), PhD student working with Prof. Elad Hazan and researcher at Google AI Princeton. email: . [Publications](https://scholar.google.com/citations?user=jZwEDZoAAAAJ&hl=en).

* Guest lecture by [Harsha Vardhan Simhadri](https://harsha-simhadri.org/), Senior Principal Researcher at Microsoft Research. Creator of [DiskANN](https://github.com/Microsoft/DiskANN). [Publications](https://scholar.google.com/citations?user=bW65tuAAAAAJ&hl=en)


## Overview
Long Term Memory is a foundational capability in the modern AI stack. At their core, these systems use vector search. Vector search is also a basic tool for systems that manipulate large collections of media, such as search engines, knowledge bases, content moderation tools, and recommendation systems. As such, the discipline lies at the intersection of Artificial Intelligence and Database Management Systems. This course will cover the theoretical foundations and practical implementation of vector search applications, algorithms, and systems. The course will be evaluated with a project and an in-class presentation.

## Contribute

All class materials are intended to be used freely by academics anywhere, students and professors alike. Please contribute in the form of pull requests or by opening issues.

```
https://github.com/edoliberty/vector-search-class-notes
```

On unix-like systems (e.g. macOS) with bibtex and pdflatex available you should be able to run this:

```
git clone git@github.com:edoliberty/vector-search-class-notes.git
cd vector-search-class-notes
./build.sh
```

## Syllabus

* 9/8 - [Class 1 - Introduction to Vector Search](class_notes/Class_01_introduction.pdf) [Matthijs + Edo + Nataly]
  * Intro to the course: Topic, Schedule, Project, Grading, ...
  * Embeddings as an information bottleneck. Instead of learning end-to-end, use embeddings as an intermediate representation
  * Advantages: scalability, instant updates, and explainability
  * Typical volumes of data and scalability. Embeddings are the only way to manage / access large databases
  * The embedding contract: the embedding extractor and embedding indexer agree on the meaning of the distance. Separation of concerns.
  * The vector space model in information retrieval
  * Vector embeddings in machine learning
  * Define vector, vector search, ranking, retrieval, recall


* 9/15 - [Class 2 - Text embeddings](class_notes/Class_02_text_embeddings.pdf) [Matthijs]
  * 2-layer word embeddings. Word2vec and fastText, obtained via a factorization of a co-occurrence matrix. Embedding arithmetic: king + woman - man = queen (already based on similarity search)
  * Sentence embeddings: How to train, masked LM. Properties of sentence embeddings.
  * Large Language Models: reasoning as an emerging property of an LM. What happens when the training set = the whole web


* 9/22 - [Class 3 - Image embeddings](class_notes/Class_03_image_embeddings.pdf) [Matthijs]
  * Pixel structures of images. Early works on direct pixel indexing
  * Traditional CV models. Global descriptors (GIST). Local descriptors (SIFT and friends). Direct indexing of local descriptors for image matching, local descriptor pooling (Fisher, VLAD)
  * Convolutional Neural Nets. Off-the-shelf models. Trained specifically (contrastive learning, self-supervised learning)
  * Modern Computer Vision models


* 9/29 - [Class 4 - Low Dimensional Vector Search](class_notes/Class_04_low_dimensional_vector_search.pdf) [Edo]
  * Vector search problem definition
  * k-d tree, space partitioning data structures
  * Worst case proof for kd-trees
  * Probabilistic inequalities. Recap of basic inequalities: Markov, Chernoff, Hoeffding
  * Concentration of Measure phenomena. Orthogonality of random vectors in high dimensions
  * Curse of dimensionality and the failure of space partitioning

* 10/6 - [Class 5 - Dimensionality Reduction](class_notes/Class_05_dimensionality_reduction.pdf) [Edo]
  * Singular Value Decomposition (SVD)
  * Applications of the SVD
  * Rank-k approximation in the spectral norm
  * Rank-k approximation in the Frobenius norm
  * Linear regression in the least-squares loss
  * PCA, Optimal squared loss dimension reduction
  * Closest orthogonal matrix
  * Computing the SVD: The power method
  * Random projection
  * Matrices with normally distributed independent entries
  * Fast Random Projections

* 10/13 - No Class - Midterm Examination Week

* 10/20 - No Class - Fall Recess

* 10/27 - [Class 6 - Approximate Nearest Neighbor Search](class_notes/Class_06_aproximate_nearest_neighbor_search.pdf) [Edo]
  * Definition of Approximate Nearest Neighbor Search (ANNS)
  * Criteria: Speed / accuracy / memory usage / updateability / index construction time
  * Definition of Locality Sensitive Hashing and examples
  * The LSH Algorithm
  * LSH Analysis, proof of correctness, and asymptotics

* 11/3 - [Class 7 - Clustering](class_notes/Class_07_clustering.pdf) [Edo]
  * K-means clustering - mean squared error criterion.
  * Lloyd’s Algorithm
  * k-means and PCA
  * ε-net argument for fixed dimensions
  * Sampling based seeding for k-means
  * k-means++
  * The Inverted File Model (IVF)

* 11/10 - [Class 8 - Quantization for lossy vector compression](class_notes/Class_08_quantization.pdf) **This class will take place remotely via Zoom, see the edstem message to get the link** [Matthijs]
  * Python notebook corresponding to the class: [Class_08_runbook_for_students.ipynb](class_notes/Class_08_runbook_for_students.ipynb)
  * Vector quantization is a topline (directly optimizes the objective)
  * Binary quantization and Hamming comparison
  * Product quantization. Chunked vector quantization. Optimized vector quantization
  * Additive quantization. Extension of product quantization. Difficulty in training approximations (Residual quantization, CQ, TQ, LSQ, etc.)
  * Cost of coarse quantization vs. inverted list scanning

* 11/17 - [Class 9 - Graph based indexes](class_notes/Class_09_graph_indexes.pdf) by guest lecturer [Harsha Vardhan Simhadri](https://harsha-simhadri.org/)
  * Early works: hierarchical k-means
  * Neighborhood graphs. How to construct them. Nearest Neighbor Descent
  * Greedy search in neighborhood graphs. That does not work -- need long jumps
  * HNSW. A practical hierarchical graph-based index
  * NSG. Evolving a k-NN graph


* 11/24 - No Class - Thanksgiving Recess

* 12/1 - Class 10 - Student project and paper presentations [Edo + Nataly]


## Project

Class work includes a final project. It will be graded based on

1. 50% - Project submission
1. 50% - In-class presentation

**Projects can be in three different flavors**

* _Theory/Research_: propose a new algorithm for a problem we explored in class (or modify an existing one), explain what it achieves, and give experimental evidence or a proof for its behavior. If you choose this kind of project you are expected to submit a write-up.
* _Data Science/AI_: create an interesting use case for vector search using Pinecone, explain what data you used, what value your application brings, and what insights you gained. If you choose this kind of project you are expected to submit code (e.g. Jupyter Notebooks) and a write-up of your results and insights.
* _Engineering/HPC_: adapt or add to FAISS, explain your improvements, and show experimental results. If you choose this kind of project you are expected to submit a branch of FAISS for review along with a short write-up of your suggested improvement and experiments.


**Project schedule**

* 11/24 - One-page project proposal approved by the instructors
* 12/1 - Final project submission
* 12/1 - In-class presentation


**Some more details**

* Project Instructor: Nataly
* Projects can be worked on individually, or in teams of two or at most three students.
* Expect to spend a few hours over the semester on the project proposal. Try to get it approved well ahead of the deadline.
* Expect to spend 3-5 _full days_ on the project itself (on par with preparing for a final exam).
* In-class project presentations are 5 minutes _per student_ (teams of two students present for 10 minutes; teams of three, 15 minutes).

## Selected Literature

* [A fast random sampling algorithm for sparsifying matrices](http://dx.doi.org/10.1007/11830924_26) - Arora, Sanjeev and Hazan, Elad and Kale, Satyen - 2006
* [A Randomized Algorithm for Principal Component Analysis](http://dx.doi.org/10.1137/080736417) - Vladimir Rokhlin and Arthur Szlam and Mark Tygert - 2009
* A search structure based on kd trees for efficient ray tracing - Subramanian, KR and Fussel, DS - 1990
* A Short Proof for Gap Independence of Simultaneous Iteration - Edo Liberty - 2016
* Accelerating Large-Scale Inference with Anisotropic Vector Quantization - Ruiqi Guo and Philip Sun and Erik Lindgren and Quan Geng and David Simcha and Felix Chern and Sanjiv Kumar - 2020
* [Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada](http://papers.nips.cc/book/advances-in-neural-information-processing-systems-28-2015) - 2015
* [An Algorithm for Online K-Means Clustering](https://epubs.siam.org/doi/abs/10.1137/1.9781611974317.7) - Edo Liberty and Ram Sriharsha and Maxim Sviridenko
* An Almost Optimal Unrestricted Fast Johnson-Lindenstrauss Transform - Nir Ailon and Edo Liberty - 2011
* An elementary proof of the Johnson-Lindenstrauss lemma - S. DasGupta and A. Gupta - 1999
* Approximate nearest neighbors and the fast Johnson-Lindenstrauss transform - Nir Ailon and Bernard Chazelle - 2006
* Billion-scale similarity search with GPUs - Jeff Johnson and Matthijs Douze and Hervé Jégou - 2017
* Clustering Data Streams: Theory and Practice - Sudipto Guha and Adam Meyerson and Nina Mishra and Rajeev Motwani and Liadan O'Callaghan - 2003
* [DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node](https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf) - Jayaram Subramanya, Suhas and Devvrit, Fnu and Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Kadekodi, Rohan - 2019
* Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs - Yu. A. Malkov and D. A. Yashunin - 2018
* [Efficient K-Nearest Neighbor Graph Construction for Generic Similarity Measures](https://doi.org/10.1145/1963405.1963487) - Dong, Wei and Moses, Charikar and Li, Kai - 2011
* Even Simpler Deterministic Matrix Sketching - Edo Liberty - 2022
* Extensions of Lipschitz mappings into a Hilbert space - W. B. Johnson and J. Lindenstrauss - 1984
* Fast Approximate Nearest Neighbor Search With The Navigating Spreading-out Graph - Cong Fu and Chao Xiang and Changxu Wang and Deng Cai - 2018
* [Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions](http://dx.doi.org/10.1137/090771806) - Halko, N. and Martinsson, P. G. and Tropp, J. A. - 2011
* Invertibility of random matrices: norm of the inverse - Mark Rudelson - 2008
* K-means clustering via principal component analysis - Chris H. Q. Ding and Xiaofeng He - 2004
* k-means++: the advantages of careful seeding - David Arthur and Sergei Vassilvitskii - 2007
* Least squares quantization in PCM - Stuart P. Lloyd - 1982
* [LSQ++: Lower Running Time and Higher Recall in Multi-Codebook Quantization](https://doi.org/10.1007/978-3-030-01270-0_30) - Martinez, Julieta and Zakhmi, Shobhit and Hoos, Holger H. and Little, James J. - 2018
* [Multidimensional binary search trees used for associative searching](http://doi.acm.org/10.1145/361002.361007) - Bentley, Jon Louis - 1975
* [Near-Optimal Entrywise Sampling for Data Matrices](https://proceedings.neurips.cc/paper_files/paper/2013/file/6e0721b2c6977135b916ef286bcb49ec-Paper.pdf) - Achlioptas, Dimitris and Karnin, Zohar S and Liberty, Edo - 2013
* Pass Efficient Algorithms for Approximating Large Matrices - Petros Drineas and Ravi Kannan - 2003
* Product Quantization for Nearest Neighbor Search - Jegou, Herve and Douze, Matthijs and Schmid, Cordelia - 2011
* QuickCSG: Arbitrary and faster boolean combinations of n solids - Douze, Matthijs and Franco, Jean-Sébastien and Raffin, Bruno - 2015
* [Quicker ADC: Unlocking the Hidden Potential of Product Quantization With SIMD](https://doi.org/10.1109%2Ftpami.2019.2952606) - Fabien Andre and Anne-Marie Kermarrec and Nicolas Le Scouarnec - 2021
* [Random Projection Trees and Low Dimensional Manifolds](https://doi.org/10.1145/1374376.1374452) - Dasgupta, Sanjoy and Freund, Yoav - 2008
* [Randomized Algorithms for Low-Rank Matrix Factorizations: Sharp Performance Bounds](http://dx.doi.org/10.1007/s00453-014-9891-7) - Witten, Rafi and Candès, Emmanuel - 2015
* [Randomized Block Krylov Methods for Stronger and Faster Approximate Singular Value Decomposition](http://papers.nips.cc/paper/5735-randomized-block-krylov-methods-for-stronger-and-faster-approximate-singular-value-decomposition) - Cameron Musco and Christopher Musco - 2015
* [Revisiting Additive Quantization](https://api.semanticscholar.org/CorpusID:7340738) - Julieta Martinez and Joris Clement and Holger H. Hoos and J. Little - 2016
* [Sampling from large matrices: An approach through geometric functional analysis](http://doi.acm.org/10.1145/1255443.1255449) - Rudelson, Mark and Vershynin, Roman - 2007
* Similarity estimation techniques from rounding algorithms - Moses Charikar - 2002
* Similarity Search in High Dimensions via Hashing - Aristides Gionis and Piotr Indyk and Rajeev Motwani - 1999
* Simple and Deterministic Matrix Sketching - Edo Liberty - 2012
* Smaller Coresets for k-Median and k-Means Clustering - S. Har-Peled and A. Kushal - 2005
* Sparser Johnson-Lindenstrauss transforms - Daniel M. Kane and Jelani Nelson - 2012
* Sparsity Lower Bounds for Dimensionality Reducing Maps - Jelani Nelson and Huy L. Nguyen - 2012
* Spectral Relaxation for K-means Clustering - Hongyuan Zha and Xiaofeng He and Chris H. Q. Ding and Ming Gu and Horst D. Simon - 2001
* Streaming k-means approximation - Nir Ailon and Ragesh Jaiswal and Claire Monteleoni - 2009
* Strong converse for identification via quantum channels - Rudolf Ahlswede and Andreas Winter - 2002
* Transformer Memory as a Differentiable Search Index - Yi Tay and Vinh Q. Tran and Mostafa Dehghani and Jianmo Ni and Dara Bahri and Harsh Mehta and Zhen Qin and Kai Hui and Zhe Zhao and Jai Gupta and Tal Schuster and William W. Cohen and Donald Metzler - 2022
* [Unsupervised Neural Quantization for Compressed-Domain Similarity Search](https://doi.ieeecomputersociety.org/10.1109/ICCV.2019.00313) - S. Morozov and A. Babenko - 2019
* [Worst-Case Analysis for Region and Partial Region Searches in Multidimensional Binary Search Trees and Balanced Quad Trees](https://doi.org/10.1007/BF00263763) - Lee, D. T. and Wong, C. K. - 1977
* [A Comprehensive Survey and Experimental Comparison of Graph-Based Approximate Nearest Neighbor Search](http://www.vldb.org/pvldb/vol14/p1964-wang.pdf) - Mengzhao Wang and Xiaoliang Xu and Qiang Yue and Yuxiang Wang - 2021
* [Approximate Nearest Neighbor Search on High Dimensional Data - Experiments, Analyses, and Improvement](https://doi.org/10.1109/TKDE.2019.2909204) - Wen Li and Ying Zhang and Yifang Sun and Wei Wang and Mingjie Li and Wenjie Zhang and Xuemin Lin - 2020
* [Survey of vector database management systems](https://doi.org/10.1007/s00778-024-00864-x) - James Jie Pan and Jianguo Wang and Guoliang Li - 2024

--------------------------------------------------------------------------------
/bibtomarkdown.py:
--------------------------------------------------------------------------------
## This is a hacky util script to create markdown from the class bib file.
## you can run it like this
## >> python bibtomarkdown.py class_notes/vs.bib > bib.md
## For convenience, please add urls and notes to the bib file and
## regenerate the markdown instead of changing it directly in the README


def bibtokvdict(bib):
    lines = [line.strip() for line in bib.split('\n') if "=" in line]
    pairs = [line.split('=', 1) for line in lines]
    return dict([(pair[0].strip(), pair[1].strip('{}, ')) for pair in pairs])

def kvdicttomarkdown(kvdict):
    if 'title' not in kvdict:
        return ''
    s = '*'
    if 'url' in kvdict:
        s += f" [{kvdict['title']}]({kvdict['url']})"
    else:
        s += f" {kvdict.get('title','')}"
    if 'author' in kvdict:
        s += f" - {kvdict['author']}"
    if 'year' in kvdict:
        s += f" - {kvdict['year']}"
    return s

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("bib_file_name", help="path to a *.bib file")
    args = parser.parse_args()

    with open(args.bib_file_name) as f:
        text = f.read()

    bibs = [s.strip() for s in text.split('\n\n') if s.startswith('@')]
    kvdicts = [bibtokvdict(bib) for bib in bibs]
    # use .get so entries without a title sort first instead of crashing
    kvdicts.sort(key=lambda m: m.get('title', '').lower())
    mds = [kvdicttomarkdown(kvdict) for kvdict in kvdicts]
    for md in mds:
        print(md)

--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
pushd class_notes
rm -f *.aux *.bbl *.blg *.dvi *.log
for FILE in *.tex; do
    pdflatex $FILE;
    bibtex "${FILE%.*}" ;
    pdflatex $FILE;
    pdflatex $FILE;
done
rm -f *.aux *.bbl *.blg *.dvi *.log
popd

--------------------------------------------------------------------------------
/class_notes/Class_01_introduction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_01_introduction.pdf
--------------------------------------------------------------------------------
/class_notes/Class_02_text_embeddings.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_02_text_embeddings.pdf
--------------------------------------------------------------------------------
/class_notes/Class_03_image_embeddings.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_03_image_embeddings.pdf
--------------------------------------------------------------------------------
/class_notes/Class_03_image_embeddings.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{vs}
\begin{document}

\lecturetitle{Class 3 - Image Embeddings}

\bibliographystyle{plain}
\bibliography{vs}

\end{document}
%%%%%%%%

--------------------------------------------------------------------------------
/class_notes/Class_04_low_dimensional_vector_search.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_04_low_dimensional_vector_search.pdf
--------------------------------------------------------------------------------
/class_notes/Class_04_low_dimensional_vector_search.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{vs}

\begin{document}

\lecturetitle{Class 4 - Low Dimensional Vector Search}
\section{Nearest Neighbor Search Definition}

As we have seen, nearest neighbor search is a fundamental computational building block in computer vision, semantic search, data mining, machine learning, and many other applications.

\begin{definition}{\bf Nearest Neighbor Search:} Given a set of points $X = \{x_1,\ldots,x_n\} \subset \R^{d}$,
preprocess them into a data structure of size $\poly(n,d)$ in time $\poly(n,d)$ such that nearest neighbor queries can
be performed. Given a query point $q$ and a radius $r$, the data structure should
return $X_{q,r} = \{i \;\; | \;\; \|q - x_i \| \le r \}$.
\end{definition}
We will later extend this definition in the obvious way to max-inner-product search (MIPS) and cosine similarity search.
Improving the practical and theoretical asymptotic time complexity of computing $X_{q,r}$ is the topic of this class.

\begin{center}
\includegraphics[width=1.0\textwidth]{images/vectorsearch.png}
\end{center}

One option we have is to do nothing at the preprocessing stage. Then, at query time, scan the data points and find those which minimize $\|q - x_i\|$.
This would give a query time of $\Omega(nd)$. But, of course, it would be significantly better if we could reduce the dependence on $n$, or in other words, avoid scanning the entire data set for every query point.

\section{kd-trees}
First, we shall review a well known and widely used algorithm for this problem called kd-trees \cite{Bentley75}.
The data structure holds the points in a geometrically partitioned tree.
Each subtree contains all the points in an axis-aligned bounding box.
At each depth in the tree, the bounding boxes are split along an axis.
There are many different heuristics for splitting the boxes. For didactic reasons, let's consider a simple construction that lends itself to easy analysis.

In our setting, we assume the number of points is exactly a power of $2$ and that coordinate values are distinct.
Both assumptions are relatively harmless and should not change the asymptotic behavior.

We will organize the points in a perfectly balanced binary tree. Points will be associated only with leaves.
The construction of the tree is simple. We start with all points in a single box (a node in the tree).
We split the box in two along the first coordinate such that exactly half the points lie on each side.
The splitting offset is given by the median of these coordinates.
Then, we do the same for each of the two resulting boxes with respect to the second coordinate.
Then, again, for the four resulting boxes with respect to the third coordinate, and so on.
Splitting stops when there is exactly one point in each leaf.

\begin{center}
\includegraphics[width=0.9\textwidth]{images/kdtrees-construction.png}
\end{center}

\noindent When searching the tree, we consider only points whose bounding boxes touch the query region (a ball in this case).

\begin{center}
\includegraphics[width=0.9\textwidth]{images/kdtrees-search.png}
\end{center}

\begin{fact}
Any axis aligned hyperplane touches at most $n^{1-1/d}$ boxes.
\end{fact}
\begin{proof}
Assume the hyperplane is aligned with the $i$'th coordinate. That is, it consists of the points $x\in\R^d$ such that $x_i = c$.
When splitting according to coordinate $i$, the split value is either larger or smaller than $c$.
When that happens, the hyperplane touches only one of the two boxes one level lower in the tree.
The result is that each split according to the $i$'th coordinate cuts the number of touched leaf boxes (points) in half.
Finally, since we cycle through the coordinates in a round robin fashion, each coordinate is used for splitting $\log_2(n)/d$ times.
Note that $\log_2(n)$ is the depth of the tree. We can now calculate the number of touched leaf boxes: $n (1/2)^{\log_2(n)/d} = n^{1-1/d}$.
\end{proof}

\begin{center}
\includegraphics[width=0.9\textwidth]{images/kdtrees-proof.png}
\end{center}

To complete the analysis, note that the axis-aligned bounding box of the query has $2d$ facets. Therefore, the number of touched cells is $O(dn^{1-1/d})$.
For each touched cell we need to decide whether the associated point is a valid search result, which takes $O(d)$ operations.
The resulting search complexity of the algorithm is $O(d^2n^{1-1/d})$, which is sub-linear in $n$.
While this analysis is rather simple, the bound cannot be made much better \cite{kdtree-worstcase}. There are many variants of this algorithm \cite{rptrees}, and practical observations claim that kd-trees perform well in low dimensions.

Kd-trees are used extensively in 3D libraries to organize objects, lights, etc.
They are data adaptive, meaning that they can concentrate on areas where there is data, and keep large boxes when there is not much detail~\cite{subramanian1990search,douze2015quickcsg}.
In addition to nearest neighbor search they support following rays, organizing data for collision detection, etc.

\begin{center}
\includegraphics[width=0.9\textwidth]{images/dragon_diff_dup.jpg}
\end{center}

Nevertheless, this runtime is only better than the brute force solution of $O(nd)$ when $n > d^d$, which already hints that there might be a problem with high dimensions.
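As an illustrative aside (not part of the original notes), here is a minimal Python sketch of the construction and search just described. It assumes \texttt{numpy} and the same simplifying assumptions: a power-of-two number of points and distinct coordinate values.

\begin{verbatim}
# Minimal kd-tree sketch matching the construction in this section:
# split on coordinates in round-robin order at the median; points in leaves.
import numpy as np

def build(points, depth=0):
    # points: (n, d) array, n a power of two, distinct coordinate values
    if len(points) == 1:
        return points[0]                      # leaf holds a single point
    axis = depth % points.shape[1]            # round-robin splitting axis
    order = np.argsort(points[:, axis])
    half = len(points) // 2
    split = points[order[half - 1], axis]     # median offset along the axis
    return (axis, split,
            build(points[order[:half]], depth + 1),
            build(points[order[half:]], depth + 1))

def search(node, q, r, out):
    # visit only subtrees whose bounding region can touch the ball B(q, r)
    if not isinstance(node, tuple):
        if np.linalg.norm(q - node) <= r:
            out.append(node)
        return
    axis, split, left, right = node
    if q[axis] - r <= split:                  # ball reaches the left half
        search(left, q, r, out)
    if q[axis] + r > split:                   # ball reaches the right half
        search(right, q, r, out)

rng = np.random.default_rng(0)
X = rng.random((1024, 2))                     # n = 2^10 points in [0,1]^2
tree = build(X)
hits = []
search(tree, rng.random(2), 0.05, hits)
print(len(hits), "points within radius 0.05")
\end{verbatim}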
Before introducing the source of the problem, we should cover some basic facts about random variables --- facts that will serve us for the rest of the course.

\section{Probability Tools and Facts Recap}
A variable $X$ is a random variable if it assumes different values
according to a probability distribution. For example, $X$ can
denote the outcome of a three-sided die throw.
The variable $X$ takes the values $x = 1,2,3$ with equal probabilities.

The expectation of $X$ is the sum over the possible values times the probability of the events.
\begin{equation}
\E[X] = \sum_{x=1}^{3}x \Pr(X = x)=
1\cdot\frac{1}{3}+2\cdot\frac{1}{3}+3\cdot\frac{1}{3} = 2
\end{equation}

Continuous scalar random variables are described by their distribution function $\varphi$.
$$
\Pr[Y \in [a,b]] = \int_{a}^{b}\varphi (t) dt.
$$
For a function $\varphi$ to be a valid distribution we must have:
\begin{eqnarray}
\forall \;t, \;\; \varphi(t) &\ge& 0  \mbox{\;\;\; (where it is defined)}\\
\int_{a}^{b}\varphi (t) dt && \mbox{is well defined for all $a$ and $b$}\\
\int_{-\infty}^{\infty}\varphi (t) dt &=& 1
\end{eqnarray}

For example, consider the continuous variable $Y$ taking values in
$[0,1]$ uniformly. That means $\varphi(t) = 1$ if $t \in [0,1]$ and zero otherwise.
This means that the probability of $Y$ being in the interval $[t,t + dt]$ is exactly $dt$. And so the expectation of $Y$ is:
\begin{equation}
\E[Y] = \int_{t=0}^{1}t \varphi(t) dt = \int_{t=0}^{1}t \cdot dt = \frac{1}{2}t^2\Big|_{0}^{1} = 1/2
\end{equation}

\begin{remark}
Strictly speaking, distributions are not necessarily continuous or bounded functions.
In fact, they need not be functions at all.
For example, the distribution of $X$ above includes three Dirac $\delta$-functions, which are not, strictly speaking, functions.
In this class, though, we will see only well behaved distributions.
\end{remark}

\subsection{Dependence and Independence}
A variable $X$ is said to be {\it dependent} on $Y$ if the distribution of $X$ given $Y$ is different from the distribution of $X$.
For example, assume the variable $X$ takes the value $1$ if $Y$ takes a
value of less than $1/3$, and the values $2$ or $3$ with equal probability otherwise ($1/2$ each).
%
Clearly, the probability of $X$ assuming each of its values is still
$1/3$. However, if we know that $Y$ is $0.7234$, the probability of
$X$ assuming the value $1$ is zero. Let us calculate the expectation of $X$ given $Y$ as an exercise.
\begin{eqnarray}
\E(X | Y) = \sum_{x=1}^{3} x \Pr(X = x | Y \le 1/3) = 1\cdot 1\\
\E(X | Y) = \sum_{x=1}^{3} x \Pr(X = x | Y > 1/3) = 1\cdot 0 + 2\cdot
\frac{1}{2} + 3\cdot\frac{1}{2}  = 2.5
\end{eqnarray}
$\E(X | Y) = 1$ for $y \in [0,1/3]$ and $\E(X | Y) = 2.5$ for $y \in (1/3,1]$.\\
Remember: $\E(X | Y)$ is a function of $y$!

\begin{definition}[Independence]
Two variables are said to be {\it independent} if:
\[
\forall x, y,\;\;\Pr[ X=x | Y = y] = \Pr[X=x].
\]
They are {\it dependent} otherwise.
\end{definition}

\begin{fact}
If two variables $X$ and $Y$ are {\it independent}, then so are $f(X)$ and $g(Y)$ for any functions $f$ and $g$.
\end{fact}

\begin{fact}[Linearity of expectation 1]%
For any random variable $X$ and any constant $\alpha$:
\begin{equation}
\E[\alpha X] = \alpha \E[X]
\end{equation}
\end{fact}

\begin{fact}[Linearity of expectation 2]%
For any two random variables
\begin{equation}
\E_{X,Y}[X+Y] = \E[X] + \E[Y]
\end{equation}
even if they are dependent.
\end{fact}

\begin{fact}[Multiplication of random variables]%
For any two {\bf independent} random variables
\begin{equation}
\E_{X,Y}[XY] = \E[X]\E[Y]
\end{equation}
This does not necessarily hold if they are dependent.
\end{fact}

\begin{definition}[Variance]%
For a random variable $X$ we have
\begin{equation}
\Var[X] = \E[(X - \E[X])^2] = \E[X^2] - (\E[X])^2
\end{equation}
The standard deviation $\sigma$ of $X$ is defined to be $\sigma(X) \equiv \sqrt{\Var[X]}$.
\end{definition}

\begin{fact}[Additivity of variances]%
For any two {\bf independent} variables $X$ and $Y$ we have
\begin{equation}
\Var[X + Y] = \Var[X] + \Var[Y]
\end{equation}
\end{fact}

\begin{fact}[Markov's inequality]%
For any {\it non-negative} random variable $X$ and any $t > 0$:
\begin{equation}
\Pr(X > t) \le \frac{\E[X]}{t}
\end{equation}
\end{fact}
\begin{proof}
Let $\psi$ denote the distribution of $X$. Then
\[
\E_\psi[X] = \int_0^{\infty} z\psi(z)dz = \int_0^{t} z\psi(z)dz + \int_t^{\infty} z\psi(z)dz \ge 0 + \int_t^{\infty} t\psi(z)dz = t \Pr[X>t]
\]
\end{proof}

\begin{fact}[Chebyshev's inequality]%
For any random variable $X$
\begin{equation}
\Pr[|X-\E[X]| > t] \le \frac{\sigma^2(X)}{t^2}
\end{equation}
\end{fact}
\begin{proof}
\[
\Pr[|X-\E[X]| > t]  = \Pr[(X-\E[X])^2 > t^2] \le \frac{\E[(X-\E[X])^2]}{t^2} = \frac{\sigma^2(X)}{t^2}
\]
\end{proof}

\begin{lemma}[The union bound]
For any set of $m$ events $A_1,\ldots,A_m$:
\[
\Pr[\cup_{i=1}^{m}A_i] \le \sum_{i=1}^{m}\Pr[A_i].
\]
\end{lemma}
In words, the probability that one or more events happen is at most the sum of the
individual event probabilities.

\begin{theorem}[Chernoff's bound]
Let $X_i$ be a set of {\bf independent} random variables such that $\E[X_i] = 0$ and $|X_i| \le 1$ almost surely.
Also define $\sigma_i^2 = \E[X_i^2]$ and $\sigma^2 = \sum_i \sigma_i^2$. Then:
\[
\Pr[ \sum_i X_i \ge t ] \le \max(e^{-t^2/4\sigma^2} , e^{-t/2})
\]
\end{theorem}
\begin{proof}
\begin{eqnarray}
\Pr[ \sum_i X_i \ge t ] &=& \Pr[ \lambda \sum_i X_i \ge \lambda  t ]  \mbox{\;\; (for $\lambda \ge 0$)} \\
&= &\Pr[ e^{\lambda \sum_i X_i} \ge e^{\lambda  t} ]   \mbox{\;\; (because $e^x$ is monotone)} \\
&\le &\E[e^{\lambda \sum_i X_i}] /e^{\lambda  t} \mbox{\;\; (by Markov)} \\
&=& \Pi_i \E[e^{\lambda X_i}] /e^{\lambda  t} \mbox{\;\; (by independence of the $X_i$)}
\end{eqnarray}

\begin{center}
\includegraphics[width=0.5\textwidth]{images/chernoff-exp-bounds.png}
\end{center}

Now, for $x \in [-1,1]$ we have that $e^x \le 1 + x + x^2$, so $\E[e^{\lambda X_i}] \le 1 + \E[\lambda X_i] + \lambda^2 \E[X_i^2] \le 1 + \lambda^2 \sigma^2_i$.
Now, since $1+x \le e^x$ we have that $1 + \lambda^2 \sigma^2_i \le e^{\lambda^2 \sigma_i^2}$.
Combining the above, we have that $\E[e^{\lambda X_i}] \le e^{\lambda^2 \sigma_i^2}$.

\begin{eqnarray}
\Pi_i \E[e^{\lambda X_i}] /e^{\lambda  t} &\le& \Pi_i e^{\lambda^2 \sigma_i^2} /e^{\lambda  t}\\
&= & e^{\lambda^2 \sigma^2 - \lambda t}
\end{eqnarray}
Now, optimizing over $\lambda \in [0,1]$ we get $\lambda = \min(1,t/2\sigma^2)$, which completes the proof.
\end{proof}
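As a quick illustrative example (an aside, not in the original notes): let $X_1,\ldots,X_n$ be independent and uniform on $\{-1,+1\}$, so that $\E[X_i]=0$, $|X_i| \le 1$, $\sigma_i^2 = 1$ and $\sigma^2 = n$. Taking $t = 2\sqrt{n\log(1/\delta)}$ (and assuming $\delta \ge e^{-n}$ so that the first term in the $\max$ dominates), the theorem gives
\[
\Pr\Big[\sum_i X_i \ge 2\sqrt{n\log(1/\delta)}\Big] \le e^{-t^2/4\sigma^2} = \delta.
\]
That is, a sum of $n$ independent $\pm 1$ signs deviates from zero only at the scale $\sqrt{n}$; this is the kind of concentration we will use repeatedly below.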
\begin{theorem}[Chernoff's bound; another useful form]
Let $X_1,\ldots,X_n$ be independent
$\{0,1\}$ valued random variables. Each $X_i$ takes the value $1$
with probability $p_i$ and $0$ otherwise. Let $X = \sum_{i=1}^{n}X_i$ and
let $\mu = \E[X] = \sum_{i=1}^{n}p_i$. Then:
\begin{eqnarray*}
\Pr[X > (1+\eps)\mu] &\le& e^{-\mu \eps^2/4}\\
\Pr[X < (1-\eps)\mu] &\le& e^{-\mu \eps^2/2}
\end{eqnarray*}
Or, using the union bound:
\[
\Pr[|X - \mu| > \eps\mu] \le 2e^{-\mu \eps^2/4}
\]
\end{theorem}

\section{Curse of Dimensionality}
A prime example of the curse of dimensionality is that a random point in $[0,1]^d$ is likely to be far from any set of $n$ points in the unit cube.
Consider the distance between the query point $q$ and an input data vector $x$.
We want to show that $\|x_i-q\|^2 \in \Omega(d)$ for all $i$.

First, notice that $\Pr[|x(j)- q(j)| \ge 1/4] \ge 1/2$, so the expected squared distance between $x$ and $q$ is at least $d/32$.
Since the coordinates $q(j)$ are drawn independently, we can apply the Chernoff bound (and a union bound over the points) and get that
$\|x_i-q\|^2 \ge d/64$ for all $n$ points in the data set if $d \ge \const\cdot\log(n)$.

Now, consider the kd-tree data structure and algorithm run on a random query.
If the squared radius of the ball around $q$ is less than $d/64$, the query is ``uninteresting'' since it is likely to return no results at all.
On the other hand, if the squared radius is greater than $d/64$ (a radius greater than $\sqrt{d}/8$), then the ball around $q$ will cross all the major partitions
along one of the axes. That means that the algorithm will visit at least $2^d$ partitions.

\section{Volumes of balls and cubes}
Another interesting phenomenon that occurs in high dimensions is the fact that unit spheres
are exponentially smaller (in volume) than their containing boxes.
Let us see this without using the explicit formulas for the volume of $d$-dimensional spheres.

To compute the volume of a unit sphere, we perform a thought experiment.
First, bound the sphere in a box (with sides of length $2$).
Then, pick a point in the box uniformly at random. What is the probability $p$ that
the point is also in the sphere? This is exactly the ratio between the volume of the ball and that of the box ($2^d$).
More accurately, $V = p2^d$ where $V$ is the volume of the sphere.

Now, we can bound $p$ from above.
A uniformly chosen random point from the cube is a vector $x \in \R^d$ such that each coordinate $x(i)$
is chosen uniformly from $[-1,1]$. The event that $x$ is in the unit sphere is the event that $\|x\|^2 = \sum_{i=1}^{d}x(i)^2 \le 1$.
Let $z_i = x(i)^2$, and note that
$\E[z_i] = \int_{-1}^{1}\frac{1}{2}t^2 dt = 1/3$. Therefore, $\E[\|x\|^2] = d/3$.
Also,
\[
\var(z_i)  = \int_{-1}^{1}\frac{1}{2}t^4 dt  - (1/3)^2  = 1/5 - 1/9 \le 1/10
\]
so by Chernoff's inequality
$p = \Pr[\sum_{i=1}^{d}x(i)^2 \le 1]  = \Pr[\sum_{i=1}^{d}(z_i -\E[z_i] ) \le 1-d/3] \le e^{-\frac{(d/3)^2}{4d/10}} \le e^{-d/4}$.
This concludes the observation that the fraction of the volume which is inside the sphere is
exponentially small compared to the cube.
A counter-intuitive way of viewing this is that almost the entire volume of the cube is concentrated at the ``corners''.
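This exponential collapse is easy to observe numerically. As an illustrative aside (not part of the original notes, assuming \texttt{numpy}), the following Monte Carlo sketch estimates $p$ for growing $d$:

\begin{verbatim}
# Monte Carlo estimate of p = Vol(unit ball) / Vol([-1,1]^d):
# sample points uniformly from the cube, count how many land in the ball.
import numpy as np

rng = np.random.default_rng(0)
samples = 200_000
for d in (2, 5, 10, 15, 20):
    x = rng.uniform(-1.0, 1.0, size=(samples, d))
    inside = (x ** 2).sum(axis=1) <= 1.0
    print(f"d={d:2d}  estimated p = {inside.mean():.2e}")
\end{verbatim}

Already at $d = 20$ essentially no sample lands inside the ball, matching the $e^{-d/4}$ bound above.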
\section{Orthogonality of random vectors}

A uniformly chosen random vector from the unit sphere ($q$) is almost orthogonal to any fixed vector ($x$) with high probability.
We can see this in two ways. First, we can see that the expected dot product of any vector $x$ with the random vector $q$ is small.
It is trivial that $\E[\langle x,q \rangle] = \langle x,\E[q] \rangle = 0$.

Moreover, $\E[\langle x,q \rangle^2] = \|x\|^2/d$.
To see this, consider $q_1,q_2,\ldots,q_d$ where $q_1 = q$ and $q_2,\ldots,q_d$ complete $q$ to an orthonormal basis.
Clearly, the distributions of all the $q_i$ are identical (but not independent), so
$\E[\langle x, q \rangle^2] = \E[\langle x,q_1\rangle^2] = \E[\frac{1}{d}\sum_{i=1}^{d}\langle x,q_i\rangle^2] = \frac{1}{d}\|x\|^2$.

It is not hard to show that, in fact, for any unit vector $x$, if $q$ is chosen uniformly at random from the unit sphere
then $\Pr[ \langle x, q \rangle  \ge \frac{t}{\sqrt{d}}] \le e^{-t^2/2}$.
First, replace the uniform distribution over the unit sphere with an i.i.d. distribution of Gaussians $q(i)\sim \N(0,\frac{1}{\sqrt{d}})$.
Note that $\E[\|q\|^2] = 1$; moreover, from the sharp concentration of the $\chi^2$ distribution we know that $\|q\|^2 \approx 1$ with high probability.
For convenience we will assume that $\|q\|^2 = 1$ and will ignore the small inaccuracy.
Moreover, due to the rotational invariance of the Gaussian distribution we have that any direction is equally likely, and thus this
new distribution approximates the uniform distribution over the sphere.
Next, notice that due to the rotational invariance $\langle x,q \rangle \sim \N(0,\frac{\|x\|}{\sqrt{d}})$
and $\Pr[ \langle x, q \rangle  \ge \frac{t\|x\|}{\sqrt{d}}] \le e^{-t^2/2}$.

\section{The failure of space partitioning schemes}
As we've seen, a ``typical'' distance from a random query to its nearest neighbor in a $d$-dimensional unit box is at the scale of $\sqrt{d}$.
Note that the side of any box in any space partitioning scheme (including kd-trees, of course) is at most $1$.
As a result, we are guaranteed to scan at least $\Omega(d)$ levels of the tree, which amounts to $\exp(d)$ points.
Note that this is worse than the brute force solution of $O(nd)$ when $d > \log(n)$.
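The near-orthogonality above is also easy to see empirically. As an illustrative aside (not part of the original notes, assuming \texttt{numpy}), the following sketch samples random unit vectors and shows that their dot product with a fixed unit vector concentrates at the scale $1/\sqrt{d}$:

\begin{verbatim}
# Random unit vectors in high dimension are nearly orthogonal to any
# fixed vector: the dot product concentrates at the scale 1/sqrt(d).
import numpy as np

rng = np.random.default_rng(0)
for d in (10, 100, 1000, 10000):
    q = rng.standard_normal((1000, d))
    q /= np.linalg.norm(q, axis=1, keepdims=True)   # ~uniform on the sphere
    x = np.zeros(d); x[0] = 1.0                     # any fixed unit vector
    dots = q @ x
    print(f"d={d:5d}  mean |<x,q>| = {np.abs(dots).mean():.4f}  "
          f"1/sqrt(d) = {d**-0.5:.4f}")
\end{verbatim}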
\section{Bonus section: solution of the ``riddle'' from the first class}
You were asked to prove that the $\ell_\infty$ metric is the most general metric possible for any finite collection of objects.

Consider $n$ objects $a_1,\ldots,a_n$ and a general metric $d(\cdot,\cdot)$.
Consider the mapping $\psi$ of each item to an $n$-dimensional vector space, $x_i = [d(a_i,a_1),d(a_i,a_2),\ldots,d(a_i,a_n)]$.
It is now easy to see that $ \| x_i - x_j \|_\infty = d(a_i,a_j)$.
\[
\|x_i -x_j\|_\infty = \max_k \; | x_i(k) - x_j(k)| = \max_k |d(a_i,a_k) - d(a_j,a_k) | = |d(a_i,a_j) - d(a_j,a_j) | = d(a_i,a_j)
\]
The maximum is attained at $k = j$ (or, symmetrically, $k = i$). Note that if $k\ne i,j$ then $ |d(a_i,a_k) - d(a_j,a_k)| \le d(a_i,a_j)$ because of the triangle inequalities $d(a_i,a_k) \le d(a_i,a_j) + d(a_j,a_k)$ and $d(a_j,a_k) \le d(a_i,a_j) + d(a_i,a_k)$.




\bibliographystyle{plain}
\bibliography{vs}

\end{document}
%%%%%%%%

--------------------------------------------------------------------------------
/class_notes/Class_05_dimensionality_reduction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_05_dimensionality_reduction.pdf
--------------------------------------------------------------------------------
/class_notes/Class_05_dimensionality_reduction.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{vs}
\begin{document}

\lecturetitle{Class 5 - Dimensionality Reduction}



\section{Singular Value Decomposition (SVD)}


\noindent We will see that any matrix $A \in \R^{m \times n}$ (w.l.o.g. $m \le n$) can be written as
\begin{eqnarray}
A &=& \sum_{\ell=1}^{m} \sigma_{\ell} u_{\ell} v_{\ell}^{T}\\
&\forall \;\; \ell& \sigma_\ell \in \R, \;\; \sigma_\ell \ge 0\\
&\forall \;\; \ell, \ell'& \langle u_{\ell}, u_{\ell'} \rangle= \langle v_{\ell}, v_{\ell'} \rangle = \delta(\ell,\ell')
\end{eqnarray}
%
To prove this, consider the matrix $AA^{T} \in \R^{m\times m}$.
Set $u_\ell$ to be the $\ell$'th eigenvector of $AA^{T}$.
By definition we have that $AA^{T}u_\ell = \lambda_\ell u_\ell$.
Since $AA^{T}$ is positive semidefinite we have $\lambda_\ell \ge 0$.
Since $AA^{T}$ is symmetric we have that $\forall \;\; \ell, \ell' \; \langle u_{\ell}, u_{\ell'} \rangle = \delta(\ell,\ell')$.
Set $\sigma_\ell = \sqrt{\lambda_\ell}$ and $v_\ell = \frac{1}{\sigma_\ell}A^{T}u_{\ell}$.
Now we can compute the following:
\[
\langle v_{\ell}, v_{\ell'} \rangle = \frac{1}{\sigma^{2}_\ell}u_{\ell}^{T}AA^{T}u_{\ell'} = \frac{1}{\sigma_{\ell}^{2}}\lambda_\ell \langle u_{\ell}, u_{\ell'} \rangle = \delta(\ell,\ell')
\]
%
We are only left to show that $A = \sum_{\ell=1}^{m} \sigma_{\ell} u_{\ell} v_{\ell}^{T}$.
To do that, consider the test vector $w = \sum_{i=1}^{m} \alpha_i u_i$.
\begin{eqnarray*}
w^TA = \sum_{i=1}^{m} \alpha_i u_i^TA = \sum_{i=1}^{m} \alpha_i \sigma_i v_i^T = \sum_{i=1}^{m}\sum_{j=1}^{m}\alpha_i \sigma_j (u_i^Tu_j) v_j^T = (\sum_{i=1}^{m}\alpha_i u_i^T)(\sum_{j=1}^{m}\sigma_j u_j v_j^T) = w^T(\sum_{j=1}^{m}\sigma_j u_j v_j^T)\\
\end{eqnarray*}
%
The vectors $u_\ell$ and $v_{\ell}$ are called the left and right singular vectors of $A$, and the $\sigma_\ell$ are the singular values of $A$.
It is customary to order the singular values in descending order, $\sigma_1 \ge \sigma_2 \ge \cdots \ge \sigma_m \ge 0$.
Also, we will denote by $r$ the rank of $A$.
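As a quick illustrative aside (not part of the original notes), the decomposition and the properties above are easy to verify numerically with \texttt{numpy}:

\begin{verbatim}
# Sanity check of the SVD: A = U @ diag(s) @ Vt with orthonormal
# singular vectors and descending singular values.
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 8))              # m <= n as in the text
U, s, Vt = np.linalg.svd(A, full_matrices=False)

print(np.allclose(A, U @ np.diag(s) @ Vt))   # A equals the sum of rank-1 terms
print(np.allclose(U.T @ U, np.eye(5)))       # left singular vectors orthonormal
print(np.allclose(Vt @ Vt.T, np.eye(5)))     # right singular vectors orthonormal
print(np.all(s[:-1] >= s[1:]))               # singular values in descending order
\end{verbatim}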
Here is another very convenient way to write the fact that $A = \sum_{\ell=1}^{m} \sigma_{\ell} u_{\ell} v_{\ell}^{T}$:
\begin{itemize}
\item Let $\Sigma \in \R^{r \times r}$ be a diagonal matrix whose entries are $\Sigma(i,i) = \sigma_i$ and $\sigma_1 \ge \sigma_2 \ge \ldots \ge \sigma_r$.
\item Let $U \in \R^{m \times r}$ be the matrix whose $i$'th column is the left singular vector of $A$ corresponding to the singular value $\sigma_i$.
\item Let $V \in \R^{n \times r}$ be the matrix whose $i$'th column is the right singular vector of $A$ corresponding to the singular value $\sigma_i$.
\end{itemize}
We have that $A = U\Sigma V^T$ and that $U^{T}U = V^{T}V = I_r$. Note that the sum goes only up to $r$, which is the rank of $A$. Clearly, not summing over zero-valued singular values does not change the sum.

\subsection*{Applications of the SVD}
\begin{enumerate}
\item Determining range, null space and rank (also numerical rank).
\item Matrix approximation.
\item Inverse and pseudo-inverse: If $A=U \Sigma V^{T}$ and $\Sigma$
is full rank, then $A^{-1}=V \Sigma^{-1} U^{T}$. If $\Sigma$ is
singular, then its pseudo-inverse is given by $A^{\dagger}=V
\Sigma^{\dagger} U^{T}$, where $\Sigma^{\dagger}$ is formed by
replacing every nonzero entry by its reciprocal.
\item Least squares: If we need to solve $Ax=b$ in the least-squares
sense, then $x_{LS}=V \Sigma^{\dagger} U^{T} b$.
\item De-noising -- Small singular values typically correspond to
noise. Take the matrix whose columns are the signals, compute the SVD,
zero out the small singular values, and reconstruct.
\item Compression -- We have signals as the columns of the matrix
$S$, that is, the $i$'th signal is given by
\begin{equation*}
S_{i} = \sum_{j=1}^{r} \left ( \sigma_{j} v_{ij} \right ) u_{j}.
\end{equation*}
If some of the $\sigma_{j}$ are small, we can discard them with
small error, thus obtaining a compressed representation of each
signal. We have to keep the coefficients $\sigma_{j} v_{ij}$ for
each signal and the dictionary, that is, the vectors $u_{j}$ that
correspond to the retained coefficients.
\end{enumerate}

\noindent The SVD and the eigen-decomposition are related, but there are quite a few differences between them.
\begin{enumerate}
\item Not every matrix has an eigen-decomposition (not even every
square matrix). Any matrix (even a rectangular one) has an SVD.
\item In the eigen-decomposition $A=X \Lambda X^{-1}$, that is, the
eigen-basis is not always orthogonal. The basis of singular vectors
is always orthogonal.
\item In the SVD we have two singular-spaces (right and left).
\item Computing the SVD of a matrix is more numerically stable.
%\item Relation to condition number; the numerical problems with eigen-decomposition; multiplication by an orthogonal matrix is perfectly conditioned.
\end{enumerate}




\subsection*{Rank-k approximation in the spectral norm}
The following fact shows that the best approximation of $A$ by a rank deficient
matrix is obtained from the top singular values and vectors of $A$. More accurately:
\begin{fact}
Set
\begin{equation*}
A_{k} = \sum_{j=1}^{k} \sigma_{j} u_{j} v_{j}^{T}.
\end{equation*}
Then,
\begin{equation*}
\min_{\substack{B \in \mathbb{R}^{m \times n} \\
\operatorname{rank}(B) \leq k}} \norm{A-B}_{2} = \norm{A-A_{k}}_{2}
= \sigma_{k+1}.
\end{equation*}
\end{fact}


\begin{proof}
\begin{equation*}
\norm{A-A_{k}}_{2} = \norm{\sum_{j=1}^{r} \sigma_{j} u_{j} v_{j}^{T} - \sum_{j=1}^{k}
\sigma_{j} u_{j} v_{j}^{T}}_{2} = \norm{\sum_{j=k+1}^{r} \sigma_{j} u_{j}
v_{j}^{T}}_{2} = \sigma_{k+1}
\end{equation*}
and thus $\sigma_{k+1}$ is the largest singular value of $A-A_{k}$.
Alternatively, look at $U^{T} A_{k} V =
\operatorname{diag}(\sigma_{1},\ldots,\sigma_{k},0,\ldots,0)$, which
means that $\operatorname{rank}(A_{k}) = k$, and that
\begin{equation*}
\norm{A-A_{k}}_{2} = \norm{U^{T} (A-A_{k}) V}_{2} =
\norm{\operatorname{diag}(0,\ldots,0,\sigma_{k+1},\ldots,\sigma_{r})}_{2}
= \sigma_{k+1}.
\end{equation*}

Let $B$ be an arbitrary matrix with $\operatorname{rank}(B) =
k$. Then, it has a null space of dimension $n-k$, that is,
\begin{equation*}
\operatorname{null}(B) = \operatorname{span}(w_{1},\ldots,w_{n-k}).
\end{equation*}
A dimension argument shows that
\begin{equation*}
\operatorname{span}(w_{1},\ldots,w_{n-k}) \cap
\operatorname{span}(v_{1},\ldots,v_{k+1}) \ne \{ 0 \}.
\end{equation*}
Let $w$ be a unit vector from the intersection. Since
\begin{equation*}
Aw = \sum_{j=1}^{k+1} \sigma_{j} (v_{j}^{T}w) u_{j},
\end{equation*}
we have
\begin{equation*}
\norm{A-B}_{2}^{2} \ge \norm{(A-B)w}_{2}^{2} = \norm{Aw}_{2}^{2} =
\sum_{j=1}^{k+1} \sigma_{j}^{2} \abs{v_{j}^{T}w}^{2} \ge
\sigma_{k+1}^{2} \sum_{j=1}^{k+1} \abs{v_{j}^{T}w}^{2} =
\sigma_{k+1}^{2},
\end{equation*}
since $w \in \operatorname{span}\{v_{1},\ldots,v_{k+1}\}$, and the
$v_{j}$ are orthonormal.
\end{proof}

\subsection*{Rank-k approximation in the Frobenius norm}

The same theorem holds for the Frobenius norm.
\begin{theorem} Set
\begin{equation*}
A_{k} = \sum_{j=1}^{k} \sigma_{j} u_{j} v_{j}^{T}.
\end{equation*}
Then,
\begin{equation*}
\min_{\substack{B \in \mathbb{R}^{m \times n} \\
\operatorname{rank}(B) \leq k}} \norm{A-B}_{F} = \norm{A-A_{k}}_{F}
= \sqrt{\sum_{i=k+1}^{m} \sigma_{i}^{2}}.
\end{equation*}
\end{theorem}
\begin{proof}
Suppose $A=U \Sigma V^{T}$. Then
\begin{equation*}
\min_{\operatorname{rank}(B) \leq k} \norm{A-B}^{2}_{F} =
\min_{\operatorname{rank}(B) \leq k} \norm{U \Sigma V^{T} - UU^{T} B
VV^{T}}^{2}_{F} = \min_{\operatorname{rank}(B) \leq k} \norm{\Sigma
- U^{T} B V}^{2}_{F}.
\end{equation*}
Now,
\begin{equation*}
\norm{\Sigma - U^{T} B V}^{2}_{F} = \sum_{i=1}^{n} \left (
\Sigma_{ii} - \left (U^{T}B V \right )_{ii} \right )^{2} +
\text{off-diagonal terms}.
\end{equation*}
If $B$ is the best approximating matrix and $U^{T}B V$ is not
diagonal, then write $U^{T}B V=D+O$, where $D$ is diagonal and $O$
contains the off-diagonal elements. Then the matrix $B' = U D V^{T}$
is a better approximation, which is a contradiction.

Thus, $U^{T}B V$ must be diagonal. Hence,
\begin{equation*}
\norm{\Sigma - D}^{2}_{F} = \sum_{i=1}^{n} \left (\sigma_{i} - d_{i}
\right )^{2} = \sum_{i=1}^{k} \left (\sigma_{i} - d_{i} \right )^{2}
+ \sum_{i=k+1}^{n} \sigma_{i}^{2},
\end{equation*}
and this is minimal when $d_{i}=\sigma_{i}$, $i=1,\ldots,k$. The
best approximating matrix is $A_{k} = U D V^{T}$, and the
approximation error is $\sqrt{\sum_{i=k+1}^{m} \sigma_{i}^{2}}$.
\end{proof}
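In code, the optimal rank-$k$ approximation is exactly the truncated SVD. As an illustrative aside (not part of the original notes, assuming \texttt{numpy}), the following sketch checks both norms against the tail of the singular values:

\begin{verbatim}
# Truncated SVD gives the best rank-k approximation (in both norms):
# Frobenius error = sqrt(sum of squared tail singular values),
# spectral error  = sigma_{k+1}.
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((20, 30))
U, s, Vt = np.linalg.svd(A, full_matrices=False)

k = 5
A_k = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]   # sum of the top-k rank-1 terms
err = np.linalg.norm(A - A_k, "fro")
print(np.isclose(err, np.sqrt((s[k:] ** 2).sum())))   # True
print(np.isclose(np.linalg.norm(A - A_k, 2), s[k]))   # spectral error, True
\end{verbatim}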
\section{PCA, Optimal squared loss dimension reduction}

Given a set of $n$ vectors $x_1,\ldots,x_n$ in $\R^{m}$, we look for a rank $k$ projection matrix $P \in \R^{m \times m}$ that minimizes:
\[
\sum_{i=1}^{n} ||Px_{i} - x_{i}||_{2}^{2}
\]
If we denote by $A$ the matrix whose $i$'th column is $x_i$, then this is equivalent to minimizing $||PA - A||_{F}^{2}$.
Since the best possible rank $k$ approximation to the matrix $A$ is $A_{k} = \sum_{i=1}^{k}\sigma_{i}u_{i}v_{i}^{T}$, the best possible solution is a projection $P$ for which $PA = A_{k}$. This is achieved by $P = U_{k}U_{k}^{T}$ where $U_{k}$ is the matrix whose columns are the first $k$ left singular vectors of $A$.

If we define $y_i = U_{k}^{T}x_{i}$ we see that the values $y_i \in \R^{k}$ are optimally fitted to the set of points $x_i$ in the sense that they minimize:
\[
\min_{y_1,\ldots,y_n } \min_{\Psi \in \R^{m \times k}}\sum_{i=1}^{n} ||\Psi y_i - x_{i}||_{2}^{2}
\]
The mapping $x_i \rightarrow U_{k}^{T}x_i = y_i $ thus reduces the dimension of any set of points $x_1,\ldots,x_n$ in $\R^{m}$ to a set of points $y_1,\ldots,y_n$ in $\R^{k}$ optimally in the squared loss sense. This is commonly referred to as Principal Component Analysis (PCA).

\begin{center}
\includegraphics[width=0.6\textwidth]{images/pca.png}
\end{center}
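\noindent Here is a short \texttt{numpy} sketch of the above (the synthetic data, with an assumed approximate rank $k$ plus noise, is an arbitrary choice for illustration). It forms $y_i = U_k^T x_i$ and checks that the squared loss equals the sum of the squared tail singular values.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
m, n, k = 20, 200, 3
# columns of A are the points x_i: approximately rank-k data plus small noise
A = rng.standard_normal((m, k)) @ rng.standard_normal((k, n)) \
    + 0.01 * rng.standard_normal((m, n))

U, s, Vt = np.linalg.svd(A, full_matrices=False)
U_k = U[:, :k]
Y = U_k.T @ A                                  # the k-dimensional points y_i
err = np.linalg.norm(A - U_k @ Y, "fro") ** 2  # squared loss of the projection
print(err, np.sum(s[k:] ** 2))                 # the two quantities coincide
\end{verbatim}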
\section{Closest orthogonal matrix}
The SVD also allows us to find the orthogonal matrix that is closest to a given matrix. Again, suppose that $A = U \Sigma V^{T}$ and $W$ is an orthogonal matrix that minimizes $\norm{A-W}^{2}_{F}$ among all orthogonal matrices. Now,
\begin{equation*}
\norm{U \Sigma V^{T} - W}_{F}^{2} = \norm{U \Sigma V^{T} - UU^{T} W VV^{T}}_{F}^{2} = \norm{\Sigma - \tilde{W}}_{F}^{2},
\end{equation*}
where $\tilde{W}=U^{T} W V$ is another orthogonal matrix. We need to find the orthogonal matrix $\tilde{W}$ that is closest to $\Sigma$. Alternatively, we need to minimize $\norm{\tilde{W}^{T} \Sigma - I}_{F}^{2}$.

If $U$ is orthogonal and $D$ is diagonal and positive, then by Cauchy--Schwarz
\begin{equation}\label{eq1}
\begin{aligned}
\operatorname{trace} (UD) &= \sum_{i,k} u_{ik} d_{ki} \leq \sum _{i}
\left ( \left ( \sum_{k} u_{ik}^{2} \right )^{1/2} \left ( \sum_{k}
d_{ki}^{2} \right )^{1/2} \right ) \\
&= \sum_{i} \left ( \sum_{k} d_{ki}^{2} \right )^{1/2} = \sum_{i}
\left ( d_{ii}^{2} \right )^{1/2} = \sum_{i} d_{ii} =
\operatorname{trace}(D).
\end{aligned}
\end{equation}
Now
\begin{align*}
\norm{\tilde{W}^{T} \Sigma - I}_{F}^{2} &= \operatorname{trace}
\left ( \left( \tilde{W}^{T} \Sigma - I \right ) \left(
\tilde{W}^{T} \Sigma - I \right )^{T} \right ) \\
&= \operatorname{trace} \left ( \left( \tilde{W}^{T} \Sigma - I
\right ) \left( \Sigma \tilde{W} - I \right ) \right ) \\
&= \operatorname{trace} \left ( \tilde{W}^{T} \Sigma^{2} \tilde{W}
\right ) - \operatorname{trace} \left ( \tilde{W}^{T} \Sigma \right
) - \operatorname{trace} \left ( \Sigma \tilde{W} \right ) + n \\
&= \operatorname{trace} \left ( \left ( \Sigma \tilde{W} \right
)^{T} \left ( \Sigma \tilde{W} \right ) \right ) - 2
\operatorname{trace} \left (\Sigma \tilde{W} \right ) + n \\
&= \norm{\Sigma \tilde{W}}_{F}^{2} - 2 \operatorname{trace} \left
(\Sigma \tilde{W} \right ) + n \\
&= \norm{\Sigma }_{F}^{2} - 2 \operatorname{trace} \left (\Sigma
\tilde{W} \right ) + n.
\end{align*}
Thus, we need to maximize $\operatorname{trace} \left (\Sigma \tilde{W} \right )$. But this is maximized by $\tilde{W} = I$ by \eqref{eq1}. Thus, the best approximating matrix is $W=UV^{T}$.
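\noindent A minimal \texttt{numpy} sketch of this result (dimensions and the number of random trials are arbitrary): it computes $W = UV^{T}$ and, as a crude sanity check, compares its error to that of randomly drawn orthogonal matrices.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 5))
U, s, Vt = np.linalg.svd(A)
W = U @ Vt                                   # closest orthogonal matrix

print(np.allclose(W.T @ W, np.eye(5)))       # W is indeed orthogonal
# brute-force check: W beats random orthogonal matrices (QR of a Gaussian)
best_random = min(
    np.linalg.norm(A - np.linalg.qr(rng.standard_normal((5, 5)))[0], "fro")
    for _ in range(2000))
print(np.linalg.norm(A - W, "fro"), best_random)
\end{verbatim}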
\section{Computing the SVD: The power method}

We give a simple algorithm for computing the Singular Value Decomposition of a matrix $A \in \R^{m \times n}$. We start by computing the first singular value $\sigma_1$ and the corresponding left and right singular vectors $u_1$ and $v_1$ of $A$, assuming a spectral gap $\lambda$, that is, $\log(\sigma_1/\sigma_i) \ge \lambda$ for all $\sigma_i < \sigma_1$. The algorithm picks a random vector $x^{0}$ with i.i.d.\ entries $x^{0}(i) \sim \N(0,1)$ and repeatedly multiplies it by $A^{T}A$, so that $x^{s} = (A^{T}A)^{s}x^{0}$; it then returns $v_1 = x^{s}/\norm{x^{s}}$, $\sigma_1 = \norm{Av_1}$, and $u_1 = Av_1/\sigma_1$. To see why this works, write $x^{0}$ in the basis of right singular vectors, $x^{0} = \sum_{i}\alpha^{0}_{i}v_{i}$, and note that $x^{s} = \sum_{i}\alpha^{0}_{i}\sigma_{i}^{2s}v_{i}$. The coefficient of $v_1$ in $x^{s}$ is therefore larger than that of $v_i$ by a factor of
\[
\frac{|\alpha^{0}_{1}|}{|\alpha^{0}_{i}|}\left(\frac{\sigma_1}{\sigma_i}\right)^{2s}.
\]
Demanding that the error in the estimation of $\sigma_1$ is less than $\eps$ gives the requirement on $s$:
\begin{eqnarray}
\frac{|\alpha^{0}_{1}|}{|\alpha^{0}_{i}|}\left(\frac{\sigma_1}{\sigma_i}\right)^{2s} &\ge& \frac{n}{\eps}\\
s &\ge& \frac{\log(n|\alpha^{0}_{i}|/\eps|\alpha^{0}_{1}|)}{2\log(\sigma_1/\sigma_i)}
\end{eqnarray}
From the two-stability of the Gaussian distribution we have that $\alpha^{0}_{i} \sim \N(0,1)$. Therefore, $\Pr[|\alpha^{0}_{i}| > t] \le 2e^{-t^2/2}$, which gives that with probability at least $1-\delta/2$ we have, for all $i$, $|\alpha^{0}_{i}| \le \sqrt{2\log(4n/\delta)}$. Also, $\Pr[|\alpha^{0}_{1}| \le \delta/4 ] \le \delta/2$ (this is because $\Pr[|z| < t] \le 2t\cdot\max_{r}\psi_{z}(r)$ for any distribution, and the density $\psi$ of the normal distribution attains its maximum at zero, where it is less than $1/2$). Thus, with probability at least $1-\delta$ we have, for all $i$, $\frac{|\alpha^{0}_{i}|}{|\alpha^{0}_{1}|} \le \frac{\sqrt{2\log(4n/\delta)}}{\delta/4}$. Combining all of the above, it is sufficient to set $s = \log\left(4n\sqrt{2\log(4n/\delta)}/\eps\delta\right)/2\lambda = O(\log(n/\eps\delta)/\lambda)$ in order to get $\eps$ precision with probability at least $1-\delta$.

We now describe how to extend this to a full SVD of $A$. Since we have computed $(\sigma_1,u_1,v_1)$, we can repeat this procedure for $A - \sigma_{1}u_{1}v_{1}^{T} = \sum_{i=2}^{r}{\sigma_{i}u_{i}v_{i}^{T}}$, whose top singular value and vectors are $(\sigma_2,u_2,v_2)$. Thus, computing the rank-k approximation of $A$ requires $O(mnks) = O(mnk\log(n/\eps\delta)/\lambda)$ operations. This is because computing $A^{T}Ax$ requires $O(mn)$ operations, and this is performed $s$ times for each of the first $k$ singular values and vectors.

The main problem with this algorithm is that its running time is heavily influenced by the value of the gap $\lambda$. This is, in fact, an artifact of the analysis rather than of the algorithm. Next, we see a gap independent analysis.
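\noindent A minimal sketch of the power method described above (the iteration count and seeds are arbitrary; a real implementation would choose $s$ according to the analysis, and the per-step normalization is only there to avoid numerical overflow).
\begin{verbatim}
import numpy as np

def power_method(A, s, seed=0):
    """Estimate the top singular triple of A with s power iterations."""
    rng = np.random.default_rng(seed)
    x = rng.standard_normal(A.shape[1])       # x^0 with i.i.d. N(0,1) entries
    for _ in range(s):
        x = A.T @ (A @ x)                     # multiply by A^T A
        x /= np.linalg.norm(x)                # renormalize for stability
    v1 = x
    sigma1 = np.linalg.norm(A @ v1)
    u1 = A @ v1 / sigma1
    return sigma1, u1, v1

A = np.random.default_rng(1).standard_normal((100, 50))
sigma1, u1, v1 = power_method(A, 500)         # many iterations: the gap is small
print(sigma1, np.linalg.svd(A, compute_uv=False)[0])  # should nearly match
\end{verbatim}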
\section{Gap independent analysis}
We show a short proof from \cite{liberty2016short} of a spectral gap independent property of simultaneous iterations. It follows similar analyses in \cite{RokhlinST09,HalkoMT2011,MuscoM15,WittenE15}.

\begin{lemma} Let $A \in \R^{n \times m}$ be an arbitrary matrix and let $G \in \R^{m \times k}$ be a matrix of i.i.d.\ random Gaussian entries.
Let $t = c\cdot \log(n/\eps)/\eps$ and $Z = \operatorname{span}((AA^T)^t A G)$. Then
\[
||A - ZZ^TA|| \le (1+\eps)\sigma_{k+1}
\]
with high probability, depending only on the universal constant $c$.
\end{lemma}
\begin{proof}
$||A - ZZ^TA|| = \max_{x :\|x\|=1} \|x^T A\|$ such that $\|x^TZ\| = 0$.
We change variables $A = USV^T$, $x = Uy$, and $G' = V^TG$.
Note that $G'$ is also a matrix of i.i.d.\ Gaussian entries because $V$ is orthogonal.
This reduces the problem to $\max_{y:\|y\|=1} \|y^TS\|$ such that $y^TS^{2t+1}G' = 0$.
We now break $y$, $S$, and $G'$ into two blocks each such that
\[
y =
\left(\begin{array}{c}
y_1 \\ \hline
y_2 \\
\end{array}\right)
%
\mbox{,\;\;}
%
S = \left(\begin{array}{c|c}
S_1 & 0 \\ \hline
0 & S_2 \\
\end{array}\right)
%
\mbox{,\;\;}
%
G' = \left(\begin{array}{c}
G'_1 \\ \hline
G'_2 \\
\end{array}\right)
\]
and $y_1 \in \R^{k}$, $y_2 \in \R^{n-k}$, $S_1 \in \R^{k \times k}$, $S_2 \in \R^{(n-k) \times (n-k)}$, $G'_1 \in \R^{k \times k}$, and $G'_2 \in \R^{(n-k) \times k}$.
\begin{eqnarray*}
0 &=& \|y^T S^{2t+1} G'\| = \|y_1^T S^{2t+1}_1 G'_1+ y_2^T S^{2t+1}_2 G'_2\| \\
&\ge& \|y_1^T S^{2t+1}_1 G'_1\| - \|y_2^T S^{2t+1}_2 G'_2\| \\
&\ge& \|y_1^T S^{2t+1}_1\|/\|G'^{-1}_1\| - \|y_2^T\| \cdot \|S^{2t+1}_2 \| \cdot \|G'_2\| \\
&\ge& |y_1(i)| \sigma_{i}^{2t+1}/\|G'^{-1}_1\| - \sigma_{k+1}^{2t+1} \cdot \|G'_2\| \ .
\end{eqnarray*}
\noindent This gives that $|y_1(i)| \le (\sigma_{k+1}/\sigma_i)^{2t+1}\|G'_2\| \|G'^{-1}_1\|$. Equipped with this inequality we bound the expression $\|y^TS\|$.
Let $k' \le k$ be such that $\sigma_{k'} \ge (1+\eps)\sigma_{k+1}$ and $\sigma_{k'+1} < (1+\eps)\sigma_{k+1}$.
\begin{eqnarray}
||A - ZZ^TA||^2 &=& \|y^TS\|^2 = \sum_{i=1}^{k'}y^2_i \sigma_i^2 + \sum_{i=k'+1}^{n}y^2_i \sigma_i^2 \\
&\le& \left( \|G'_2\|^2 \|G'^{-1}_1\|^2 \sum_{i=1}^{k'}(\sigma_{k+1}/\sigma_i)^{4t} \sigma_{k+1}^2 \right) + (1+\eps)\sigma_{k+1}^2 \\
&\le& \left[ \|G'_2\|^2 \|G'^{-1}_1\|^2 k (1/(1+\eps))^{4t} + (1+\eps)\right]\sigma_{k+1}^2 \le (1+2\eps)\sigma_{k+1}^2
\end{eqnarray}
The last step is correct as long as $\|G'_2\|^2 \|G'^{-1}_1\|^2 k (1/(1+\eps))^{4t} \le \eps$, which holds for $t \ge \log(\|G'_2\|^2 \|G'^{-1}_1\|^2 k/\eps) /4\log(1+\eps) = O(\log(n/\eps)/\eps)$. The last inequality uses the fact that $G'_1$ and $G'_2$ are random Gaussian matrices, due to the rotational invariance of the Gaussian distribution; this gives $\|G'_2\|^2 \|G'^{-1}_1\|^2 = O(\operatorname{poly}(n))$ with high probability \cite{Rudelson08}.
Finally, $||A - ZZ^TA|| \le \sqrt{1+2\eps}\cdot\sigma_{k+1} \le (1+\eps)\sigma_{k+1}$.
\end{proof}
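\noindent Here is a minimal sketch of simultaneous (subspace) iteration as analyzed above. The QR re-orthonormalization between multiplications is a standard numerical-stability choice and does not change the span; sizes and the iteration count are arbitrary.
\begin{verbatim}
import numpy as np

def subspace_iteration(A, k, t, seed=0):
    """Return Z, an orthonormal basis of span((A A^T)^t A G), G Gaussian."""
    rng = np.random.default_rng(seed)
    G = rng.standard_normal((A.shape[1], k))
    Y = A @ G
    for _ in range(t):
        Y, _ = np.linalg.qr(Y)        # keep the columns well conditioned
        Y = A @ (A.T @ Y)             # one (A A^T) multiplication
    Z, _ = np.linalg.qr(Y)
    return Z

A = np.random.default_rng(1).standard_normal((200, 100))
k = 10
Z = subspace_iteration(A, k, 8)
err = np.linalg.norm(A - Z @ (Z.T @ A), 2)
print(err, np.linalg.svd(A, compute_uv=False)[k])  # err close to sigma_{k+1}
\end{verbatim}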
\section{Random-projection}

We will give a simple proof of the following, rather amazing, fact. Every set of $n$ points in a Euclidean space (say of dimension $d$) can be embedded into a Euclidean space of dimension $k = O(\log(n)/\eps^2)$ such that all pairwise distances are preserved up to distortion $1\pm \eps$. We will prove the construction of \cite{DasGuptaGupta99}, which is simpler than the one in \cite{JL84}.

We will argue that a certain distribution over the choice of a matrix $R \in \R^{k \times d}$ gives that:
\begin{equation}
\label{e1}
\forall x \in \Sph^{d-1} \;\; \Pr\left[ \left| ||\frac{1}{\sqrt{k}}Rx|| - 1 \right| > \eps \right] \le \frac{1}{n^2}
\end{equation}
Before we pick this distribution and show that Equation~\ref{e1} holds for it, let us first see why it implies the opening statement.

Consider a set of $n$ points $x_1,\ldots, x_n$ in Euclidean space $\R^d$. Embedding these points into a lower dimension while preserving all distances between them up to distortion $1\pm \eps$ means approximately preserving the norms of all ${n \choose 2}$ vectors $x_i - x_j$. Assuming Equation~\ref{e1} holds, the union bound gives that this property fails for at least one $x_i - x_j$ pair with probability at most ${n \choose 2}\frac{1}{n^2} \le 1/2$. This means that all ${n \choose 2}$ pairwise distances are preserved up to distortion $1 \pm \eps$ with probability at least $1/2$.

\section{Matrices with normally distributed independent entries}
We consider the distribution of matrices $R$ such that each $R(i,j)$ is drawn independently from a normal distribution with mean zero and variance $1$, $R(i,j) \sim \N(0,1)$. We show that for this distribution Equation~\ref{e1} holds for some $k \in O(\log(n)/\eps^2)$.

First consider the random variable $z = \sum_{j=1}^{d}r(j)x(j)$ where $r(j) \sim \N(0,1)$. To understand how the variable $z$ is distributed we recall the two-stability of the normal distribution. Namely, if $z_3 = z_1 + z_2$, $z_1 \sim \N(\mu_1,\sigma_{1})$ and $z_2 \sim \N(\mu_2,\sigma_{2})$, then $$z_3 \sim \N(\mu_1 + \mu_2,\sqrt{\sigma^{2}_{1} + \sigma^{2}_{2}}).$$
In our case, $r(j)x(j) \sim \N(0,|x(j)|)$ and therefore $z = \sum_{j=1}^{d}r(j)x(j) \sim \N(0,\sqrt{\sum_{j=1}^{d}x^{2}(j)}) = \N(0,1)$.
%
Now, note that each entry of the vector $Rx$ is distributed exactly like $z$. Taking $k$ independent copies $z_1,\ldots,z_k$ of $z$, we get that $||\frac{1}{\sqrt{k}}Rx||$ is distributed exactly like $\sqrt{\frac{1}{k}\sum_{i=1}^{k} z^{2}_{i}}$. Thus, proving Equation~\ref{e1} reduces to showing that
\begin{equation}
\Pr\left[ \left| \sqrt{\frac{1}{k}\sum_{i=1}^{k} z^{2}_{i}} - 1 \right| > \eps \right] \le \frac{1}{n^2}
\end{equation}
for a set of independent normal random variables $z_1,\ldots,z_k \sim \N(0,1)$.
It is sufficient to demand that $\Pr[\sum_{i=1}^{k} z^{2}_{i} \ge k(1+\eps)^2]$ and $\Pr[\sum_{i=1}^{k} z^{2}_{i} \le k(1-\eps)^2]$ are both smaller than $1/2n^2$.
We start by bounding the probability that $\sum_{i=1}^{k} z^{2}_{i} \ge k(1+\eps)$ (this is enough because $k(1+\eps) < k(1+\eps)^2$). For any $\lambda > 0$, by Markov's inequality,
\[
\Pr[\sum z^{2}_{i} \ge k(1+\eps)] = \Pr[e^{\lambda \sum z^{2}_{i}} \ge e^{\lambda k (1+\eps)}] \le (\E[e^{\lambda z^2}])^k/e^{\lambda k (1+\eps)}
\]
Since $z \sim \N(0,1)$ we can compute $\E[e^{\lambda z^2}]$ exactly:
\[
\E [e^{\lambda z^{2}}] = \frac{1}{\sqrt{2\pi}}\int_{-\infty}^{\infty} e^{\lambda t^{2}} e^{-\frac{t^{2}}{2}} dt =\frac{1}{\sqrt{2\pi}}\int_{-\infty}^{\infty} e^{-\frac{(t\sqrt{1-2\lambda})^{2}}{2}}dt = \frac{1}{\sqrt{1-2\lambda}}
\]
The final step is by substituting $t' = t\sqrt{1-2\lambda}$ and recalling that $\frac{1}{\sqrt{2\pi}}\int_{-\infty}^{\infty} e^{-\frac{t'^{2}}{2}}dt' = 1$.
Finally, using the fact that $\frac{1}{\sqrt{1-2\lambda}} \le e^{\lambda + 2\lambda^2}$ for $\lambda \in [0,1/4]$, we have:
\[
\E [e^{\lambda z^{2}}] \le e^{\lambda + 2\lambda^2}
\]
Substituting this into the bound above, with $\lambda = \eps/4$, we have:
\[
\Pr[\sum_{i=1}^{k} z^{2}_{i} \ge k(1+\eps)] \le e^{k(\lambda + 2\lambda^2) - k\lambda (1+\eps)} = e^{ 2k\lambda^2 - k\lambda\eps} = e^{ - k\eps^2/8}
\]
Finally, our condition that
\[
\Pr[\sum_{i=1}^{k} z^{2}_{i} \ge k(1+\eps)] \le e^{ - k\eps^2/8} \le 1/2n^2
\]
is achieved by $k = c\log(n)/\eps^2$. Calculating for $\Pr[\sum_{i=1}^{k} z^{2}_{i} \le k(1-\eps)]$ in the same manner shows that $k = c\log(n)/\eps^2$ is also sufficient for this case. This completes the proof.
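\noindent The construction is a few lines of \texttt{numpy}. The constant $8$ in the choice of $k$ below stands in for the unspecified constant $c$, and the sizes are arbitrary; this is a demonstration, not a proof.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, d, eps = 100, 1000, 0.5
k = int(np.ceil(8 * np.log(n) / eps**2))   # k = O(log(n)/eps^2)

X = rng.standard_normal((n, d))            # n points in R^d (rows)
R = rng.standard_normal((k, d))            # Gaussian projection matrix
Y = (X @ R.T) / np.sqrt(k)                 # projected points in R^k

# check distortion over all n-choose-2 pairwise distances
i, j = np.triu_indices(n, 1)
orig = np.linalg.norm(X[i] - X[j], axis=1)
proj = np.linalg.norm(Y[i] - Y[j], axis=1)
print((proj / orig).min(), (proj / orig).max())  # within [1-eps, 1+eps] w.h.p.
\end{verbatim}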
\section{Fast Random Projections}
We discussed in class the fact that random projection matrices cannot, in general, be made sparse. This is because projecting sparse vectors while preserving their norms requires the projecting matrix to be almost fully dense; see also \cite{JelaniH2012} and \cite{KaneN12}.

But, the question is, can we actively make sure that $x$ is not sparse? And if so, can we achieve a sparse random projection for non-sparse vectors? These two questions received a positive answer in the seminal work of Ailon and Chazelle \cite{AilonCh06}. The results of \cite{AilonCh06} were improved and simplified over the years; see \cite{AilonL11} for the latest result and an overview.

In this lesson we will produce a very simple algorithm based on the ideas in \cite{AilonCh06}. This algorithm requires a target dimension of $O(\log^2(n)/\eps^2)$ instead of $O(\log(n)/\eps^2)$ but is much simpler to analyze.

\subsection{Fast vector $\ell_4$ norm reduction}
The goal of this subsection is to devise a linear mapping which preserves vectors' $\ell_2$ norms but reduces their $\ell_4$ norms with high probability. This will work to our advantage because, intuitively, vectors whose $\ell_4$ norm is small cannot be too sparse. For this we will need to learn what Hadamard matrices are.

Hadamard matrices are commonly used in coding theory and are conceptually close to Fourier matrices. We assume for convenience that $d$ is a power of $2$ (otherwise we can pad our vectors with zeros). The Walsh--Hadamard transform of a vector $x \in \R^{d}$ is the result of the matrix-vector multiplication $Hx$, where $H$ is a $d \times d$ matrix whose entries are $H(i,j) = \frac{1}{\sqrt{d}}(-1)^{\langle i,j\rangle}$. Here ${\langle i,j\rangle}$ denotes the dot product over $F_2$ of the bit representations of $i$ and $j$ as binary vectors of length $\log(d)$.
Another way to view this is to define Hadamard matrices recursively:
\begin{equation*} %
H_{1} = \frac{1}{\sqrt{2}}\left(
\begin{array}{rr}
1 & 1 \\
1 & -1\\
\end{array}
\right)
,\;\;
H_{d} = \frac{1}{\sqrt{2}}\left(
\begin{array}{r:r}
H_{d/2} & H_{d/2} \\ \hdashline
H_{d/2} & -H_{d/2}\\
\end{array}
\right)
\end{equation*} %
Here are a few interesting (and easy to show) facts about Hadamard matrices.
\begin{enumerate}
\item $H_d$ is a unitary matrix: $\|Hx\| = \|x\|$ for any vector $x\in \R^d$.
%\item $H_{d}(i,j) \in \{ \frac{1}{\sqrt{d}},- \frac{1}{\sqrt{d}}\}$
\item Computing $x \mapsto Hx$ requires $O(d\log(d))$ operations.
\end{enumerate}
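\noindent The second fact in code: a short (and naive, but still $O(d\log d)$) sketch of the fast Walsh--Hadamard transform. The normalization by $\sqrt{d}$ at the end makes the transform unitary, matching the definition above.
\begin{verbatim}
import numpy as np

def fwht(x):
    """Fast Walsh-Hadamard transform, O(d log d), normalized to be unitary."""
    x = np.asarray(x, dtype=float).copy()
    d = len(x)                      # assumed to be a power of 2
    h = 1
    while h < d:
        for i in range(0, d, 2 * h):
            a = x[i:i + h].copy()
            b = x[i + h:i + 2 * h].copy()
            x[i:i + h] = a + b      # butterfly step
            x[i + h:i + 2 * h] = a - b
        h *= 2
    return x / np.sqrt(d)

x = np.random.default_rng(0).standard_normal(8)
print(np.linalg.norm(fwht(x)), np.linalg.norm(x))  # equal norms: H is unitary
\end{verbatim}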
We also define a diagonal matrix $D$ such that $D(i,i) \in \{1,-1\}$ uniformly at random. Clearly, we have that $\|HDx\|_2 = \|x\|_2$ since both $H$ and $D$ are isometries. Let us now bound $\|HDx\|_\infty$. Note that $(HDx)(1) = \sum_{i=1}^{d}H(1,i)D(i,i) x_i = \sum_{i=1}^{d}\frac{x_i}{\sqrt{d}}s_i$, where $s_i \in \{-1,1\}$ uniformly (and the same holds for every other coordinate). To bound this we recap Hoeffding's inequality.
\begin{fact}[Hoeffding's inequality]
Let $X_1,\ldots,X_n$ be independent random variables s.t.\ $X_i \in [a_i,b_i]$. Let $X = \sum_{i=1}^{n} X_i$. Then
\begin{equation}
\Pr[|X - \E[X]| \ge t] \le 2e^{-\frac{2 t^2}{\sum_{i=1}^{n} (b_i -a_i)^2}}
\end{equation}
\end{fact}
Invoking Hoeffding's inequality with $t = \sqrt{\frac{c \log(n)}{d}}$, and then taking the union bound over all coordinates and all points, we get that $\|HDx\|_\infty \le \sqrt{\frac{c \log(n)}{d}}$ for all $n$ points $x$ simultaneously, with high probability. Remark: for this we assumed $\log(d) = O(\log(n))$; otherwise we should have had $\log(nd)$ in the bound. The situation where the dimension is super-polynomial in the number of points is, however, unlikely; usually we have $n > d$.

\begin{lemma}
Let $x \in \R^d$ be such that $\|x\|=1$. Then
\[
\|HDx\|^4_4 = O(\log(n)/d)
\]
with probability at least $1-1/\poly(n)$.
\end{lemma}
\begin{proof}
Let us define $y = HDx$ and $z_i = y_i^2$.
From the above we have that $z_i \le \frac{c \log(n)}{d} = \eta$ for all $i$, with probability at least $1-1/\poly(n)$.
The quantity $\|HDx\|^4_4 = \|y\|_{4}^{4} = \sum_{i}z_i^2$ is a convex function of the $z$ variables, defined over the polytope $z_i \in [0,\eta]$, $\sum_{i} z_i = 1$ (the latter because $\|y\|_2^2 = 1$).
Its maximal value is therefore attained at an extreme point of this polytope, for example the point with $1/\eta$ coordinates equal to $\eta$ and the rest zero, $z = [\eta,\eta,\ldots,\eta,0,0,\ldots,0]$.
Computing the value of the function at this point gives $\sum_{i}z_i^2 \le (1/\eta)\cdot (\eta^2) = \eta$. Recalling that $\eta = \frac{c \log(n)}{d}$ completes the proof.
\end{proof}

\subsection{Sampling from vectors with low $\ell_4$ norms}
Here we prove a very simple fact: for vectors whose $\ell_4$ norm is low, dimensionality reduction can be obtained by sampling coordinates.

Let $y$ be a vector such that $\|y\|_2 = 1$. Let $z$ be a sampled version of $y$ such that $z_i = y_i/\sqrt{p}$ with probability $p$ and $z_i = 0$ otherwise. This is akin to sampling, in expectation, $d\cdot p$ coordinates from $y$ (and scaling them up by $1/\sqrt{p}$). Note that $\E[\|z\|^2] = \|y\|^2 = 1$. Moreover,
\[
\Pr[|\|z\|^2 - 1| > \eps] = \Pr[|\sum z_i^2 - 1| > \eps] = \Pr[|\sum b_i y_i^2/p - 1| > \eps]
\]
where the $b_i$ are independent indicator variables taking $b_i = 1$ with probability $p$ and $b_i = 0$ otherwise.
To apply Chernoff's bound we must assert that $y_i^2/p \le 1$. Indeed, from the previous section $y_i^2 \le c\log(n)/d$ with high probability, while we will take $p = c\log^2(n)/d\eps^2$, so $y_i^2/p \le \eps^2/\log(n) \le 1$.
Applying Chernoff's bound we get
\[
\Pr[|\sum b_i y_i^2/p - 1| > \eps] \le e^{-\frac{c\eps^2}{\sigma^2}}
\]
where $\sigma^2 = \sum_{i} \E[(b_i y_i^2/p)^2] = \|y\|_{4}^{4}/p$.
Concluding,
\[
\Pr[|\|z\|^2 - 1| > \eps] \le e^{-\frac{cp\eps^2}{\|y\|_4^4}}.
\]
This shows that the concentration of the sampling procedure depends directly on the $\ell_4$ norm of the sampled vector.
If we plug in the bound $\|y\|_4^4 = \|HDx\|_4^4 = O(\log(n)/d)$ from the previous section we get
\[
\Pr[|\|z\|^2 - 1| > \eps] \le e^{-\frac{cp\eps^2 d}{\log(n)}} \le \frac{1}{\poly(n)}
\]
for some $p \in O(\log^2(n)/d\eps^2)$.
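\noindent A sketch of the $HD$ spreading step followed by coordinate sampling, on a maximally sparse unit vector. It uses \texttt{scipy.linalg.hadamard} (an assumed dependency; the dense matrix is for clarity only) and a fixed sampling rate $p$ chosen for illustration rather than by the formula above.
\begin{verbatim}
import numpy as np
from scipy.linalg import hadamard

rng = np.random.default_rng(0)
d = 1024
x = np.zeros(d); x[0] = 1.0            # the worst case: a 1-sparse unit vector

H = hadamard(d) / np.sqrt(d)           # normalized Walsh-Hadamard matrix
D = rng.choice([-1.0, 1.0], size=d)    # random signs
y = H @ (D * x)                        # y = HDx

print(np.abs(y).max())                 # mass is spread: |y_i| = 1/sqrt(d) here

p = 0.25                               # illustrative; notes: p = c log^2(n)/(d eps^2)
keep = rng.random(d) < p
z = np.where(keep, y / np.sqrt(p), 0.0)
print(np.linalg.norm(z))               # concentrates around ||x|| = 1
\end{verbatim}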
\subsection{Random Projection by Sampling}
Putting it all together we obtain the following.
\begin{lemma}
Define the following matrices:
\begin{itemize}
\item $D$: a diagonal matrix such that $D_{i,i} \in \{+1,-1\}$ uniformly at random.
\item $H$: the $d\times d$ Walsh--Hadamard transform matrix.
\item $P$: a `sampling matrix' which contains each row of the matrix $\frac{1}{\sqrt{p}}\cdot I_d$ independently with probability $p= c\log^2(n)/d\eps^2$.
\end{itemize}
Then, with at least constant probability the following holds.
\begin{enumerate}
\item The target dimension of the mapping is $k = c\log^2(n)/\eps^2$ (a factor $\log(n)$ worse than optimal).
\item The mapping $x \mapsto PHDx$ is a $(1\pm\eps)$-distortion mapping for any set of $n$ points.
That is, for any set $x_1,\ldots,x_n \in \R^d$ we have
\[
\|x_i -x_j\|(1-\eps) \le \|PHDx_i - PHDx_j\| \le \|x_i -x_j\|(1+\eps)
\]
\item Storing $PHD$ requires at most $O(d + k\log(d))$ space.
\item Applying the mapping $x \mapsto PHDx$ requires at most $O(d\log(d))$ floating point operations.
\end{enumerate}
\end{lemma}


\bibliographystyle{plain}
\bibliography{vs}

\end{document}
%%%%%%%%
-------------------------------------------------------------------------------- /class_notes/Class_06_aproximate_nearest_neighbor_search.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_06_aproximate_nearest_neighbor_search.pdf -------------------------------------------------------------------------------- /class_notes/Class_06_aproximate_nearest_neighbor_search.tex: --------------------------------------------------------------------------------
\documentclass{article}
\usepackage{vs}
\begin{document}

\lecturetitle{Class 6 - Approximate Nearest Neighbor Search}

\section{Locality Sensitive Hashing}

In this section we will review ideas from \cite{Charikar02} and \cite{GionisIM99}.
We define a family $\mathcal{H}$ of functions as $(r_1,r_2,p_1,p_2)$-sensitive if:
\begin{eqnarray*}
|| x- y || < r_1 &\rightarrow& \Pr_{h \sim \mathcal{H}}(h(x)=h(y)) > p_1\\
|| x- y || > r_2 &\rightarrow& \Pr_{h \sim \mathcal{H}}(h(x)=h(y)) < p_2
\end{eqnarray*}
This is only meaningful if $r_1 < r_2$ and $p_1 > p_2$. This means that if $x$ and $y$ are ``close'' then the probability that they hash to the same value is at least $p_1$, but if they are far apart then it is at most $p_2$. In other words, the probability of two points being hashed to the same value decreases with their distance.

Let us assume such functions exist and give some intuition on how to use them.
First we concatenate $k$ different hash functions from $\mathcal{H}$ to construct a new hash function $g(x) = [h_1(x),\ldots,h_k(x)]$.
We choose $k$ such that $\Pr(g(x)=g(y)) \le 1/n$ if $||x-y|| > r_2$.
Using the $(r_1,r_2,p_1,p_2)$-sensitivity of $\mathcal{H}$ we will get that if $||x-y|| < r_1$ then $\Pr(g(x)=g(y)) \ge 1/n^{\rho}$ for some $\rho<1$.

Now, if we generate $\ell = n^{\rho}$ independent copies of $g$, namely $g_1,\ldots,g_\ell$, and consider every $x$ in the data for which $g_i(x)=g_i(q)$ for some $i$, we will find every close point $x$ with constant probability while considering only $O(n^\rho)$ far points.

Let us make this statement more precise. The preprocessing step is as follows.
\begin{enumerate}
\item $\rho \leftarrow \log(1/p_1)/\log(1/p_2)$
\item $\ell \leftarrow n^{\rho}$
\item $k \leftarrow \log(n)/\log(1/p_2)$
\item for $\ell' \in \{1,\ldots,\ell\}$
\item \tab $g_{\ell'} \leftarrow [h^{\ell'}_1,\ldots,h^{\ell'}_k]$, where the $h^{\ell'}_j$ are drawn independently from $\mathcal{H}$
\item for $x \in X$
\item \tab for $\ell' \in \{1,\ldots,\ell\} $
\item \tab \tab add $x$ to $T_{\ell'}(g_{\ell'}(x))$
\end{enumerate}

The search stage is as follows:
\begin{enumerate}
\item $S \leftarrow \emptyset$
\item for $\ell' \in \{1,\ldots,\ell\} $
\item \tab add $T_{\ell'}(g_{\ell'}(q))$ to $S$
\item if $|S| \le 2n^{\rho}$
\item \tab for $x' \in S$
\item \tab \tab if $||x' - q|| \le r_2$
\item \tab \tab \tab return $x'$
\end{enumerate}
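\noindent Before analyzing this scheme, here is a compact Python sketch of the two stages. The callable \texttt{hash\_family} is hypothetical: it is assumed to draw a fresh random $h \sim \mathcal{H}$ on each call (concrete families appear in the next section).
\begin{verbatim}
import random
from collections import defaultdict

def build_lsh(points, hash_family, k, ell):
    """Preprocessing: ell tables, each keyed by a concatenation of k hashes."""
    gs, tables = [], []
    for _ in range(ell):
        g = [hash_family() for _ in range(k)]            # g = (h_1, ..., h_k)
        table = defaultdict(list)
        for idx, x in enumerate(points):
            table[tuple(h(x) for h in g)].append(idx)
        gs.append(g)
        tables.append(table)
    return gs, tables

def query_lsh(q, gs, tables):
    """Search: collect the candidates in q's bucket across all ell tables."""
    S = set()
    for g, table in zip(gs, tables):
        S.update(table.get(tuple(h(q) for h in g), []))
    return S
\end{verbatim}
A real implementation would also cap the number of inspected candidates at $2n^{\rho}$ and verify distances, as in the pseudocode above.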
\begin{fact}
The number of points $x$ such that $||x-q|| \ge r_2$ and $x \in S$ is smaller than $2\cdot n^{\rho}$ with probability at least $1/2$.
\end{fact}
\begin{proof}
A point $x$ with $||x-q||>r_2$ enters $S$ if for some $\ell'$ we have $g_{\ell'}(q) = g_{\ell'}(x)$; for a single table this happens with probability $p_{2}^{\log(n)/\log(1/p_2)} = 1/n$. Thus, the expected number of such points per table is at most $1$. Since we have $\ell = n^{\rho}$ different $g$ functions, the total expected number of such points is at most $n^{\rho}$. Due to the above and Markov's inequality, $\Pr[|S| > 2n^{\rho}] \le \Pr[|S| > 2\E[|S|]] \le 1/2$.
\end{proof}

\begin{fact}
If $||x-q|| \le r_1$ then $x \in S$ with constant probability.
\end{fact}
\begin{proof}
By the $(r_1,r_2,p_1,p_2)$-sensitivity of $\mathcal{H}$,
\[
\Pr[g(x) = g(q)] \ge p_{1}^{k} = p_{1}^{\log(n)/\log(1/p_2)} = n^{-\log(1/p_1)/\log(1/p_2)} = n^{-\rho}.
\]
Since we repeat this $\ell = n^{\rho}$ times independently, we have that $g_{\ell'}(x) \not = g_{\ell'}(q)$ for all $\ell'$ with probability at most $(1-n^{-\rho})^{n^{\rho}} < e^{-1}$.
\end{proof}

Thus, both events happen simultaneously with probability at least $1 - 1/2 - e^{-1}$, a positive constant.
We can duplicate the entire data structure $O(\log(1/\delta))$ times to achieve success probability $1-\delta$, at the cost of an $O(\log(1/\delta))$ factor in data storage and search time.
This means that the search running time is $O(dn^{\rho})$.

\section{LSH functions}
\subsection{$\{0,1\}^d$ with the Hamming distance}
The Hamming distance between points $x,y\in \{0,1\}^d$ is defined as the number of coordinates in which $x$ and $y$ differ. We claim that choosing a random coordinate from each vector is a locality sensitive function and examine its parameters.
\begin{fact}
Let $\mathcal{H}$ be the family of $d$ functions $h_i(x) = x_i$.
Then, $\mathcal{H}$ is $(r,(1+\eps)r,1-\frac{r}{d},1-\frac{(1+\eps)r}{d})$-sensitive.
\end{fact}
\begin{fact}
If $r \le d/\log(n)$ then $\rho = \log(1/p_1)/\log(1/p_2) \le 1/(1+\eps)$.
\end{fact}
\begin{proof}
See Fact 3 in \cite{GionisIM99}. Moreover, assuming $r \le d/\log(n)$ is harmless since we can always extend each vector by $d\log(n)$ zeros, which does not change the distances and guarantees that $r \le d/\log(n)$.
\end{proof}

\begin{remark}
This result is also applicable to the Euclidean distance setting because it is possible to map $\ell_{2}^{d}$ into $\ell_{1}^{O(d)}$, and it is also trivially possible to map $\ell_{1}^{d}$ into $\{0,1\}^{O(d/\eps)}$ with distortion $\eps$ for bounded valued vectors.
\end{remark}

Thus, the running time of $O(n^{\rho})$ is in fact $O(n^{1/(1+\eps)})$. In other words, finding the closest neighbor up to a factor of $2$ in this distance is possible while examining only $O(\sqrt{n})$ data points. This, however, does not achieve the bound of $O(\poly(d,\log(n)))$.
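\noindent The bit-sampling family in code, a small sketch (dimension and the number of sampled hash functions are arbitrary). It estimates the collision probabilities empirically and matches the $1-\frac{\text{dist}}{d}$ behavior of the fact above.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
d = 64

def random_bit_hash():
    i = rng.integers(d)                   # h_i(x) = x_i for a random coordinate
    return lambda x: x[i]

x = rng.integers(0, 2, d)
y = x.copy(); y[:4] ^= 1                  # Hamming distance 4 from x
far = rng.integers(0, 2, d)               # distance roughly d/2 from x

hs = [random_bit_hash() for _ in range(10000)]
print(np.mean([h(x) == h(y) for h in hs]))    # ~ 1 - 4/64
print(np.mean([h(x) == h(far) for h in hs]))  # ~ 1/2
\end{verbatim}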
\subsection{Searching with similarities}
Note that in the above we never used the fact that the distance function is a metric. Indeed, it is possible to search through items as long as we can produce a locality sensitive hashing scheme for them. In \cite{Charikar02} Charikar defined locality sensitive hashing as:
\[
\Pr_{h}[h(x)=h(y)] = sim(x,y)
\]
For example, let $x$ and $y$ be sets of items. Their set similarity can be defined as $\frac{| x \cap y|}{|x \cup y|}$.
Here we can use a famous trick, known as min-hashing. We define $h(x) = \arg \min_{x_i \in x} g(x_i)$, where $g$ is a random permutation of the entire universe (or, for example, a random function into $[0,1]$). The reason this works is that $h(x) = h(y)$ exactly when the minimizer of $g$ over $x \cup y$ happens to lie in $x \cap y$, and since the distribution is uniform, the probability of this event is $\frac{| x \cap y|}{|x \cup y|}$.

\subsection{LSH for points in $\Sph^{d-1}$}
The set of unit length vectors in $\R^{d}$ is called the unit sphere and is denoted by $\Sph^{d-1}$ (the exponent is $d-1$ because it is actually a $(d-1)$-dimensional manifold; do not be confused, the points are still in $\R^d$).
For these points, we can define the distance as the angle between the vectors, $d(x,y) = \cos^{-1}(x^{T}y)$.
We can thus define a hash function $h(x) = \operatorname{sign}(u^{T}x)$ for a vector $u$ chosen uniformly at random from $\Sph^{d-1}$.
It is immediate to show that $h$ is locality sensitive with respect to the angular distance, with parameters similar to those in the previous subsection.
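\noindent A sketch of this sign-of-random-projection hash (often called SimHash). It relies on the well known fact that for two unit vectors the collision probability is $1 - \theta(x,y)/\pi$, where $\theta$ is the angle between them; the dimensions and sample counts below are arbitrary.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
d = 50

def simhash():
    u = rng.standard_normal(d)
    u /= np.linalg.norm(u)                 # uniform random direction
    return lambda x: np.sign(u @ x)

x = rng.standard_normal(d); x /= np.linalg.norm(x)
y = x + 0.1 * rng.standard_normal(d); y /= np.linalg.norm(y)

hs = [simhash() for _ in range(20000)]
collision = np.mean([h(x) == h(y) for h in hs])
theta = np.arccos(np.clip(x @ y, -1, 1))
print(collision, 1 - theta / np.pi)        # empirical vs. exact probability
\end{verbatim}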
\bibliographystyle{plain}
\bibliography{vs}

\end{document}
%%%%%%%%
-------------------------------------------------------------------------------- /class_notes/Class_07_clustering.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_07_clustering.pdf -------------------------------------------------------------------------------- /class_notes/Class_07_clustering.tex: --------------------------------------------------------------------------------
\documentclass{article}
\usepackage{vs}
\begin{document}

\lecturetitle{Class 7 - Clustering}

\section{K-means clustering}
\begin{definition}[$k$-means]
Given $n$ vectors $x_1,\ldots,x_n\in \R^d$ and an integer $k$, find $k$ points $c_1,\ldots,c_k \in \R^d$ which minimize the expression:
\[
f = \sum_{i \in [n]} \min_{j \in [k]} \|x_i - c_j \|^2
\]
\end{definition}
In words, we aim to find $k$ cluster centers. The cost is the sum of squared distances from each point to its closest cluster center.
$k$-means clustering and Lloyd's algorithm \cite{Lloyd82leastsquares} are probably the most widely used clustering objective and algorithm.
This is for three main reasons:
\begin{itemize}
\item The objective function is simple and natural.
\item Lloyd's algorithm (which we see below) is simple, efficient in practice, and often results in optimal or close to optimal results.
\item The results are easily interpretable and are often quite descriptive for real data sets.
\end{itemize}
In 1957 Stuart Lloyd suggested a simple alternating minimization algorithm which efficiently finds a local minimum for this problem.
This algorithm (a.k.a.\ Lloyd's algorithm) seems to work so well in practice that it is sometimes referred to as $k$-means or the $k$-means algorithm. A code sketch of it appears after the analysis below.

\begin{algorithm}
\caption{Lloyd's Algorithm}
\begin{algorithmic}
\STATE $c_1,\ldots,c_k \leftarrow$ randomly chosen centers
\WHILE {Objective function still improves}
\STATE $S_1,\ldots,S_k \leftarrow \emptyset$
\FOR {$i \in 1,\ldots,n$}
\STATE $j \leftarrow \arg\min_{j'}\|x_i- c_{j'}\|^2$
\STATE add $i$ to $S_j$
\ENDFOR
\FOR {$j \in 1,\ldots,k$}
\STATE $c_j \leftarrow \frac{1}{|S_j|}\sum_{i \in S_j} x_i$
\ENDFOR
\ENDWHILE
\end{algorithmic}
\end{algorithm}
\noindent This algorithm can be thought of as a potential function reducing algorithm.
The potential function is our objective function from above:
\[
f = \sum_{j \in [k]} \sum_{i \in S_j} \|x_i - c_j\|^2,
\]
where the sets $S_j$ are the sets of points to which $c_j$ is the closest center.
In each step of the algorithm the potential function is reduced. Let's examine that.
First, if the centers $c_j$ are fixed, the best assignment is clearly the one which assigns each data point to its closest center.
Then, if the sets $S_j$ are fixed, the optimal center is $c_j = \frac{1}{|S_j|}\sum_{i \in S_j} x_i$ (this can easily be seen by differentiating the cost function).
Therefore, moving $c_j$ to its optimal position can only reduce the potential function.
The algorithm therefore terminates in a local minimum.
There are only two remaining questions: one, whether the number of iterations until convergence is bounded; and two, whether we can guarantee that the solution is close to optimal.
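\noindent Here is a plain \texttt{numpy} sketch of Lloyd's algorithm (random initialization; a practical implementation would use a seeding procedure such as the ones discussed later in these notes).
\begin{verbatim}
import numpy as np

def lloyd(X, k, iters=100, seed=0):
    """Lloyd's algorithm on the rows of X; returns centers, labels, cost."""
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), k, replace=False)]
    for _ in range(iters):
        # assignment step: each point goes to its closest center
        d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1)
        labels = d2.argmin(1)
        # update step: each center moves to the mean of its cluster
        new = np.array([X[labels == j].mean(0) if np.any(labels == j)
                        else centers[j] for j in range(k)])
        if np.allclose(new, centers):
            break                          # objective stopped improving
        centers = new
    d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1)
    return centers, d2.argmin(1), d2.min(1).sum()
\end{verbatim}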
\section{k-means and PCA}
This section presents a simple connection between $k$-means and PCA (similar ideas are given in \cite{DingH04a}).
First, consider the similarity between the $k$-means cost function and that of PCA. Let $C_k = \{c_1,\ldots,c_k\}$; then
\[
f_{k-means} = \min_{C_k} \sum_{i \in [n]} \min_{c \in C_k} \|x_i - c\|^2
\]
while
\[
f_{PCA} = \min_{P_k} \sum_{i \in [n]} \min_{z \in P_k} \|x_i - z\|^2
\]
where $P_k$ is a projection onto a $k$ dimensional subspace and $z \in P_k$ means that $P_k z = z$.
Now, think about the subspace $P^{*}_{k}$ which contains the $k$ optimal centers $C^{*}_{k}$.
Since $C^{*}_{k} \subset P^{*}_{k}$ we have that:
\[
f_{k-means} = \sum_{i \in [n]} \min_{c \in C^{*}_k} \|x_i - c\|^2 \ge \sum_{i \in [n]} \min_{z \in P^{*}_k} \|x_i - z\|^2 \ge f_{PCA}
\]

\noindent For PCA, we conveniently have a closed form expression $ \min_{z \in P_k} \|x_i - z\|^2 = \|x_i - P_{k} x_i \|^2$.
The equality stems from the fact that for any point $x$ and any projection operation $P$ we have that $P(x) = \arg\min_{z \in P} \|x - z\|$.
Now, consider solving $k$-means on the points $y_i = P_k x_i$ instead. Intuitively, this is an easier task because the $y_i$ are embedded in a lower dimension, namely $k$ (by the projection $P_k$).

\begin{center}
\includegraphics[width=0.6\textwidth]{images/kmeans-proj.png}
\end{center}

\noindent Before we do that, though, we should argue that a good clustering for the $y_i$ results in a good clustering for the $x_i$.
Note that all the $y_i$ lie in the subspace $P$. If we project the optimal centers $C^*$ onto $P$ as well, we get a solution for the projected points with cost at most $f_{k-means}$.
The optimal solution for the $y_i$ will clearly be even better (or at least, not worse). Therefore $\hat{f}_{k-means}\le f_{k-means}$, where $\hat{f}_{k-means} = f_{k-means}(y_1,\ldots,y_n)$.

The following gives us a simple algorithm. Compute the PCA of the points $x_i$ into dimension $k$.
Solve $k$-means on the points $y_i$ in dimension $k$. Output the resulting clusters and centers. Then
\[
f_{alg} = f_{PCA} + \hat{f}_{k-means} \le 2f_{k-means}
\]
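\noindent A sketch of this two-step algorithm, reusing the \texttt{lloyd} sketch above (so it is not fully self-contained). Note two assumptions: rows are used as points here, and, following the notes, the data is not mean-centered as classical PCA would do.
\begin{verbatim}
import numpy as np

def kmeans_via_pca(X, k, seed=0):
    """Project to the top-k PCA subspace, cluster there, lift centers back."""
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    Y = X @ Vt[:k].T                       # points y_i in R^k
    centers_k, labels, _ = lloyd(Y, k, seed=seed)
    centers = centers_k @ Vt[:k]           # lift centers back to R^d
    return centers, labels
\end{verbatim}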
\section{$\eps$-net argument for fixed dimensions}
Since computing the SVD of a matrix (and hence PCA) is well understood, we get that computing a $2$-approximation to the $k$-means problem in dimension $d$ is possible if it can be done in dimension $k$.

To solve the problem in dimension $k$ we adopt a brute force approach.
Let $Q_{\eps}$ be a set of points inside the unit ball $B^{k}_{1}$ such that:
\[
\forall z \in B^{k}_{1} \;\; \exists \; q \in Q_{\eps} \; s.t. \;\; \|z - q\| \le \eps
\]
Such sets of points exist with $|Q_{\eps}| \le (\frac{c}{\eps})^k$. There are probabilistic constructions for such sets as well, but we will not go into that.
Assuming w.l.o.g.\ that $\|x_i\| \le 1$, we can constrain the centers of the clusters to points in the $\eps$-net $Q_{\eps}$.
Let $q_j$ be the closest point in $Q_{\eps}$ to $c_j$ (so $\|c_j - q_j \| \le \eps$).
A simple calculation gives:
\[
\sum_{j \in [k]} \sum_{i \in S_j} \|x_i - q_j \|^2 \le \sum_{j \in [k]} \sum_{i \in S_j} \|x_i - c_j \|^2 + 5\eps n.
\]

To find the best clustering we can exhaustively search through every set of $k$ points from $Q_{\eps}$.
For each such set, compute the cost of the induced assignment on the original points and return the one minimizing the cost.
This requires ${(\frac{c}{\eps})^k \choose k}$ iterations over candidate solutions, each of which requires $O(ndk)$ time.
The final running time we achieve is $2^{O(k^2\log(1/\eps))}nd$.


\section{Sampling based seeding for k-means}
Another simple idea is to sample sufficiently many points from the input as candidate centers.
Ideas similar to the ones described here can be found in \cite{ZhaHDGS01}.

First, assume we have only one set of points $S$ with $|S|=n$.
Denote by $c$ the centroid of $S$, $c = \frac{1}{n}\sum_{i \in S} x_i$, and assume w.l.o.g.\ that $c=0$.
We claim that picking a random member of $S$ as the center is not much worse than picking $c=0$.
Let $q$ be a member of $S$ chosen uniformly at random, and let us compute the expectation of the cost function:
\begin{eqnarray}
\E[\sum_{i \in S} \|x_i - q\|^2] &=& \sum_{i \in S} \sum_{j \in S} \frac{1}{n}\|x_i - x_j\|^2 \\
&=& \sum_{i \in S} \|x_i\|^2 - \frac{2}{n}(\sum_{i \in S} x_i)^T(\sum_{j \in S} x_j) + \sum_{j \in S} \|x_j\|^2 \\
&\le& 2 \sum_{i \in S} \|x_i - c\|^2.
\end{eqnarray}

\noindent Using Markov's inequality we get that
\[
\Pr[\sum_{i \in S} \|x_i - q\|^2 \le 4\sum_{i \in S} \|x_i - c\|^2] \ge 1/2.
\]
If this happens we say that $q$ is a good representative for $S$ (at least half the points of $S$ are good representatives!).
Now consider again the situation where we have $k$ clusters $S_1,\dots,S_k$.
If we are given a set $Q$ which contains a good representative for each of the sets, then restricting ourselves to picking centers from $Q$ results in at most a multiplicative factor of $4$ in the cost.

The set $Q$ can be quite small if the sets are roughly balanced.
Let the smallest set contain $n_s$ points.
A single uniformly sampled point is a good representative for a given set with probability at least $\frac{1}{2}\frac{n_s}{n}$.
The probability that some set has no good representative in $Q$ is thus bounded by $k (1 - \frac{n_s}{2n})^{|Q|}$.
Therefore $|Q| = O(k \log(k))$ suffices if $n_s \in \Omega(n/k)$.

Again, iterating over all subsets of $Q$ of size $k$, we can find an approximate solution in time $O({ck \log(k) \choose k}knd) = 2^{O(k \log(k))}nd$.

\section{k-means++}

In the above, we gave approximation algorithms for the $k$-means problem.
Alas, any solution can be improved by running Lloyd's algorithm on its output.
Therefore, such algorithms can be considered `seeding' algorithms which give initial assignments to Lloyd's algorithm.
A well known seeding procedure \cite{ArthurV07} is called $k$-means++.
\begin{algorithm}
\caption{$k$-means++ algorithm \cite{ArthurV07}}
\begin{algorithmic}
\STATE $C \leftarrow \{x_i\}$ where $i$ is chosen uniformly at random from $[n]$.
\FOR{$j \in \{2,\ldots,k\}$}
\STATE Pick point $x$ with probability proportional to $\min_{c \in C} \|x - c\|^2$
\STATE Add $x$ to $C$
\ENDFOR
\STATE {\bf return:} $C$
\end{algorithmic}
\end{algorithm}
In each iteration, the next center is chosen randomly from the input points.
The distribution over the points is not uniform: each point is picked with probability proportional to its minimal squared distance to the already picked centers.
Surprisingly, this simple and practical approach already gives an $O(\log(k))$ approximation guarantee.
More precisely, let $f_{k-means}(C)$ denote the cost of $k$-means with the set of centers $C$.
Also, denote by $C^*$ the optimal set of centers. Then
\[
\E[f_{k-means}(C)] \le 8 (\log(k)+2)f_{k-means}(C^*)
\]

In \cite{AilonJM09} the authors give a streaming algorithm for this problem.
They adapt ideas from \cite{ArthurV07} and combine them with a hierarchical divide and conquer methodology.
See also \cite{GuhaMMMO03} for a thorough survey and new techniques for clustering in streams.
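\noindent A short sketch of this $D^2$-sampling seeding (the output would then be handed to Lloyd's algorithm, e.g.\ the \texttt{lloyd} sketch earlier in these notes).
\begin{verbatim}
import numpy as np

def kmeanspp_seed(X, k, seed=0):
    """k-means++ seeding: D^2-sampling of k initial centers from rows of X."""
    rng = np.random.default_rng(seed)
    centers = [X[rng.integers(len(X))]]            # first center: uniform
    for _ in range(k - 1):
        d2 = np.min([((X - c) ** 2).sum(1) for c in centers], axis=0)
        probs = d2 / d2.sum()                      # proportional to min sq. dist
        centers.append(X[rng.choice(len(X), p=probs)])
    return np.array(centers)
\end{verbatim}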
Another problem which is closely related to $k$-means is the $k$-medians problem.
Given a set of points $x_1,\ldots,x_n$, the aim is to find centers $c_1,\ldots,c_k$ which minimize:
\[
f_{k-medians} = \sum_{i \in [n]} \min_{j \in [k]} \|x_i - c_j \|
\]
Both the $k$-means and the $k$-medians problems admit $1+\eps$ multiplicative approximation algorithms, but these are far from being simple. See \cite{hk-sckmk-05} for more details, related work, and a new core set based solution.

\section{The Inverted File Model (IVF)}
One of the most common approaches in vector search is to begin by clustering the set of points using $k$-means and, at search time, to consider only the points within the nearest clusters. This is called the inverted file model (IVF) and is used extensively in practice. We will expand on this discussion in class.

\bibliographystyle{plain}
\bibliography{vs}

\end{document}
%%%%%%%%
-------------------------------------------------------------------------------- /class_notes/Class_08_quantization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_08_quantization.pdf -------------------------------------------------------------------------------- /class_notes/Class_08_runbook_for_students.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "6adb5788",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import time\n",
11 | "import numpy as np\n",
12 | "\n",
13 | "# requires Faiss to be installed, see \n",
14 | "# https://github.com/facebookresearch/faiss/blob/main/INSTALL.md#installing-faiss-via-conda\n",
15 | "# or how to install the CPU version\n",
16 | "\n",
17 | "import faiss\n",
18 | "\n",
19 | "from faiss.contrib.datasets import SyntheticDataset\n",
20 | "\n",
21 | "from matplotlib import pyplot"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "id": "31018e88",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# setup that works for my machine. Adjust to yours \n",
32 | "faiss.omp_set_num_threads(32)\n",
33 | "\n",
34 | "%matplotlib inline\n",
35 | "%config InlineBackend.figure_format='retina'"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "id": "e823d6ca",
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# get some data\n",
46 | "ds = SyntheticDataset(64, 1000_000, 10000, 100)\n",
47 | "print(ds)"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "id": "c660a6ee",
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# get training set\n",
58 | "xt = ds.get_train()\n",
59 | "xt.shape"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "id": "e1ff2bf0",
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "d = ds.d"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "id": "d9d12999",
75 | "metadata": {},
76 | "source": [
77 | "# Run k-means "
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "id": "be5b07a0",
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "# 4096 centroids \n",
88 | "km = faiss.Kmeans(ds.d, 4096)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "id": "c6dfa79b",
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "%%time\n",
99 | "km.train(xt)"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "id": "e185e9ed",
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "centroids = km.centroids \n",
110 | "centroids.shape"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "id": "73149508",
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "MSE = km.obj[-1] / len(xt)\n",
121 | "MSE"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "id": "a85938b3",
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "pyplot.plot(km.obj / ds.nt)\n",
132 | "pyplot.ylabel(\"Mean Squared Error\")\n",
133 |
"pyplot.xlabel(\"Iteration\")\n", 134 | "pyplot.grid()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "5d7dd70d", 140 | "metadata": {}, 141 | "source": [ 142 | "# Hierarchical k-means " 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "4252e712", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "def recursive_run_kmeans(xt, k, level): \n", 153 | " if level == 0: \n", 154 | " # all vectors encoded to the same, compute MSE\n", 155 | " centroid = xt.mean(axis=0)\n", 156 | " s = ((xt - centroid) ** 2).sum()\n", 157 | " return [centroid], s\n", 158 | " else: \n", 159 | " km = faiss.Kmeans(ds.d, k)\n", 160 | " km.train(xt)\n", 161 | " _, labels = km.assign(xt)\n", 162 | " tot_sum = 0\n", 163 | " centroids = []\n", 164 | " for i in range(k): \n", 165 | " subset = labels == i\n", 166 | " cent_i, sum_i = recursive_run_kmeans(xt[subset], k, level - 1)\n", 167 | " centroids += cent_i\n", 168 | " tot_sum += sum_i \n", 169 | " return centroids, tot_sum " 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "id": "deab4bdf", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "%%time \n", 180 | "# 4096 = 8 ** 4\n", 181 | "cents, s = recursive_run_kmeans(xt, 8, 4)\n", 182 | "MSE = s / len(xt)\n", 183 | "MSE" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "fc7a74ce", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "%%time \n", 194 | "# 4096 = 64 ** 2\n", 195 | "cents, s = recursive_run_kmeans(xt, 64, 2)\n", 196 | "MSE = s / len(xt)\n", 197 | "MSE" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "id": "70c019a3", 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# search from centroids directly \n", 208 | "\n", 209 | "D, _ = faiss.knn(xt, cents, k=1)\n", 210 | "MSE = D.mean()\n", 211 | "MSE" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "id": "aa4e76cd", 217 | "metadata": {}, 218 | "source": [ 219 | "## Searching in a vector database " 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "id": "070a8371", 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "# the database and set of query vectors are arrays\n", 230 | "xq = ds.get_queries()\n", 231 | "xb = ds.get_database()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "id": "a881bc56", 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "xq.shape" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "id": "e55a1ce7", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "xb.shape" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "id": "3d8d787a", 257 | "metadata": {}, 258 | "source": [ 259 | "### Ground truth and the knn function" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "328cdcce", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "# find ground-truth nearest neighbors \n", 270 | "gt_dis, gt = faiss.knn(xq, xb, k=10)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "id": "368cac81", 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "gt.shape" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "id": "2419d184", 287 | "metadata": {}, 288 | "outputs": 
[], 289 | "source": [ 290 | "gt[:3]" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "id": "ec4a0f9a", 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "gt_dis[:3]" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "id": "f611e072", 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "((xq[1] - xb[6558])**2).sum()" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "id": "b89accbb", 316 | "metadata": {}, 317 | "source": [ 318 | "# The inverted file " 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "id": "dd8018b6", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "nlist = 4096\n", 329 | "\n", 330 | "# compute IVF entries for database = find the nearest centroid for each database vector \n", 331 | "_, list_nos = faiss.knn(xb, centroids, k=1)\n", 332 | "list_nos = list_nos.flatten()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "id": "ec84f3c3", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "ivf_vectors = []\n", 343 | "ivf_ids = []\n", 344 | "\n", 345 | "for list_no in range(nlist): \n", 346 | " ids = np.where(list_nos == list_no)[0]\n", 347 | " ivf_ids.append(ids)\n", 348 | " ivf_vectors.append(xb[ids])" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "id": "522a369f", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "len(ivf_ids), len(ivf_vectors)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "id": "81170e6b", 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "max(len(l) for l in ivf_ids)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "c6ef3af3", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "min(len(l) for l in ivf_ids)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "31fbda29", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "# searching in the nearest centroid \n", 389 | "_, q_list_nos = faiss.knn(xq, centroids, k=1)\n", 390 | "found_nns = []\n", 391 | "for q in range(100): \n", 392 | " query = xq[q]\n", 393 | " # fetch contents of cluster\n", 394 | " cluster_vectors = ivf_vectors[q_list_nos[q, 0]]\n", 395 | " cluster_ids = ivf_ids[q_list_nos[q, 0]]\n", 396 | " if cluster_ids.size == 0: \n", 397 | " found_nns.append(-1)\n", 398 | " continue\n", 399 | " # compute distances \n", 400 | " distances = ((query - cluster_vectors)**2).sum(1)\n", 401 | " # collect result id\n", 402 | " result_id = cluster_ids[distances.argmin()]\n", 403 | " found_nns.append(result_id)\n", 404 | " " 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "84b46874", 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "(found_nns == gt[:, 0]).sum()" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "b4ed63d2", 420 | "metadata": {}, 421 | "source": [ 422 | "That's not much. Maybe we need to explore more clusters?" 
423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "id": "784a32aa", 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "nprobe = 13\n", 433 | "# searching in the nearest centroid \n", 434 | "_, q_list_nos = faiss.knn(xq, centroids, k=nprobe)\n", 435 | "found_nns = []\n", 436 | "ndis = 0\n", 437 | "for q in range(100): \n", 438 | " query = xq[q]\n", 439 | " # fetch contents of clusters \n", 440 | " cluster_vectors = np.vstack([\n", 441 | " ivf_vectors[i]\n", 442 | " for i in q_list_nos[q]\n", 443 | " ])\n", 444 | " cluster_ids = np.hstack([\n", 445 | " ivf_ids[i]\n", 446 | " for i in q_list_nos[q]\n", 447 | " ])\n", 448 | " if cluster_ids.size == 0: \n", 449 | " found_nns.append(-1)\n", 450 | " continue\n", 451 | " # compute distances \n", 452 | " distances = ((query - cluster_vectors)**2).sum(1)\n", 453 | " ndis += len(cluster_ids)\n", 454 | " # collect result id\n", 455 | " result_id = cluster_ids[distances.argmin()]\n", 456 | " found_nns.append(result_id)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "id": "40055f37", 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "(found_nns == gt[:, 0]).sum()" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "id": "3b06f0e5", 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "ndis / 100 " 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "id": "f1d8b744", 482 | "metadata": {}, 483 | "source": [ 484 | "That's better, we computed just 106 distances on average per query (out of 10000)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "id": "99729b82", 490 | "metadata": {}, 491 | "source": [ 492 | "## Inverted file in Faiss " 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "id": "41a72316", 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "index = faiss.index_factory(d, \"IVF1024,Flat\") # flat means: don't encode the vectors!" 
503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "id": "45727dbd", 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "index.train(xt)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "id": "6415958b", 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [ 522 | "index.add(xb)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "id": "cf2ae5c9", 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "D, I = index.search(xq, 10)" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "id": "14990831", 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "(I[:, 0] == gt[:, 0]).sum()" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "id": "10ee442a", 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "index.nprobe = 10\n", 553 | "D, I = index.search(xq, 10)\n", 554 | "(I[:, 0] == gt[:, 0]).sum()" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "id": "53cc6c5c", 560 | "metadata": {}, 561 | "source": [ 562 | "## Tradeoff speed / accuracy " 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "id": "fc32eca0", 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "results = {}\n", 573 | "for nlist in 64, 256, 1024: \n", 574 | " index = faiss.index_factory(d, f\"IVF{nlist},Flat\")\n", 575 | " index.train(xt)\n", 576 | " index.add(xb)\n", 577 | " for nprobe in 1, 2, 4, 8, 16, 32, 64, 128:\n", 578 | " if nprobe > nlist: \n", 579 | " continue\n", 580 | " index.nprobe = nprobe\n", 581 | " t0 = time.time()\n", 582 | " for run in range(100): # several runs to get stable timings\n", 583 | " D, I = index.search(xq, 10)\n", 584 | " t1 = time.time() \n", 585 | " recall = (I[:, 0] == gt[:, 0]).sum()\n", 586 | " print(f\"{nlist=:} {nprobe=:} {recall=:} time={(t1 - t0) * 1000 :.3f} ms\")\n", 587 | " results[(nlist, nprobe)] = (recall, (t1 - t0) * 1000)\n", 588 | " " 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "id": "9c492714", 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "for nlist in 64, 256, 1024: \n", 599 | " index = faiss.index_factory(d, f\"IVF{nlist},Flat\")\n", 600 | " index.train(xt)\n", 601 | " index.add(xb)\n", 602 | " res = [results[(nlist, nprobe)] for nprobe in [1, 2, 4, 8, 16, 32, 64, 128] if nprobe < nlist]\n", 603 | " recalls = [r[0] for r in res]\n", 604 | " times = [r[1] for r in res]\n", 605 | " pyplot.plot(recalls, times, label=f\"{nlist=:}\")\n", 606 | "\n", 607 | "pyplot.ylabel(\"time (ms)\")\n", 608 | "pyplot.xlabel(\"R@1\")\n", 609 | "pyplot.legend()\n", 610 | "pyplot.grid()\n", 611 | " " 612 | ] 613 | }, 614 | { 615 | "cell_type": "markdown", 616 | "id": "7b3f90f5", 617 | "metadata": {}, 618 | "source": [ 619 | "## Search cost as a function of the database size " 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "id": "9b0f60b6", 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "ns = 2 ** np.arange(10, 25)\n", 630 | "nprobe = 15 # fix nprobe \n", 631 | "for k in 4 ** np.arange(3, 7): \n", 632 | " coarse_quantization_cost = k\n", 633 | " ivf_scanning_cost = nprobe / k * ns\n", 634 | " pyplot.loglog(ns, coarse_quantization_cost + ivf_scanning_cost, label=f\"{k=:}\")\n", 635 | "pyplot.xlabel(\"database size\")\n", 636 | "pyplot.ylabel(\"nb distance 
computations\")\n", 637 | "pyplot.title(f\"search cost at {nprobe=:}\")\n", 638 | "pyplot.legend()\n", 639 | "pyplot.grid()" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "id": "49dec309", 645 | "metadata": {}, 646 | "source": [ 647 | "# Searching in compressed vectors " 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "id": "db326b75", 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "# work on a smaller subset because otherwise we don't see anything with such small codes \n", 658 | "xb_small = xb[:1000]\n", 659 | "_, gt_small = faiss.knn(xq, xb_small, k=10)" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "id": "d6455720", 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "# compute codes for database = find the nearest centroid for each database vector \n", 670 | "encoding_errors, codes = faiss.knn(xb_small, centroids, k=1)" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "id": "eed66ff0", 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "codes.shape" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "id": "2e230786", 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [ 690 | "codes = codes.flatten()" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "id": "592cf1d6", 697 | "metadata": {}, 698 | "outputs": [], 699 | "source": [ 700 | "# reconstruct \n", 701 | "reconstructed_xb = centroids[codes]" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "id": "86b7e431", 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "MSE = ((reconstructed_xb - xb_small) ** 2).sum(1).mean()\n", 712 | "MSE" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "id": "6fa261cb", 718 | "metadata": {}, 719 | "source": [ 720 | "Similar but a bit worse than the training MSE " 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": null, 726 | "id": "ae1ab0ee", 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [ 730 | "# anothe way of computing it\n", 731 | "encoding_errors.mean()" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "id": "3c5bd736", 737 | "metadata": {}, 738 | "source": [ 739 | "## Asymmetric search" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": null, 745 | "id": "1cee40e0", 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "found_dis, found_indices = faiss.knn(xq, reconstructed_xb, k=10)" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "id": "63737db3", 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [ 759 | "(gt_small[:, 0] == found_indices[:, 0]).sum() " 760 | ] 761 | }, 762 | { 763 | "cell_type": "markdown", 764 | "id": "0be8a494", 765 | "metadata": {}, 766 | "source": [ 767 | "We loose 73% of nearest neighbors because the vectors are compressed a lot (12 bits). 
But note chance is at 1/1000 = 0.1%" 768 | ] 769 | }, 770 | { 771 | "cell_type": "markdown", 772 | "id": "d1e78019", 773 | "metadata": {}, 774 | "source": [ 775 | "## Symmetric search " 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "id": "e9ae2b9c", 782 | "metadata": {}, 783 | "outputs": [], 784 | "source": [ 785 | "# let's encode and decode the queries as well \n", 786 | "_, xq_codes = faiss.knn(xq, centroids, k=1)\n", 787 | "xq_codes = xq_codes.flatten()\n", 788 | "reconstructed_xq = centroids[xq_codes]" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "id": "f61bc16a", 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [ 798 | "found_dis, found_indices = faiss.knn(reconstructed_xq, reconstructed_xb, k=10)" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "id": "b4c98736", 805 | "metadata": {}, 806 | "outputs": [], 807 | "source": [ 808 | "(gt_small[:, 0] == found_indices[:, 0]).sum() " 809 | ] 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "id": "c38a209e", 814 | "metadata": {}, 815 | "source": [ 816 | "Wow that's even worse" 817 | ] 818 | }, 819 | { 820 | "cell_type": "markdown", 821 | "id": "415e294d", 822 | "metadata": {}, 823 | "source": [ 824 | "## Asymmetric search with look-up tables " 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": null, 830 | "id": "56f9ede7", 831 | "metadata": {}, 832 | "outputs": [], 833 | "source": [ 834 | "# recall reference results\n", 835 | "found_dis, found_indices = faiss.knn(xq, reconstructed_xb, k=10)" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "execution_count": null, 841 | "id": "94838b00", 842 | "metadata": {}, 843 | "outputs": [], 844 | "source": [ 845 | "# make look-up tables for all queries\n", 846 | "def pairwise_distances(A, B): \n", 847 | " return (A ** 2).sum(1)[:, None] + (B ** 2).sum(1) - 2 * A @ B.T " 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": null, 853 | "id": "fbccc070", 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [ 857 | "LUT = pairwise_distances(xq, centroids)" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": null, 863 | "id": "02a3fc04", 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [ 867 | "LUT.shape" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "id": "ea02f2d7", 874 | "metadata": {}, 875 | "outputs": [], 876 | "source": [ 877 | "codes.shape" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": null, 883 | "id": "5f9ef31d", 884 | "metadata": {}, 885 | "outputs": [], 886 | "source": [ 887 | "distances = LUT[:, codes]" 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": null, 893 | "id": "678b9a5a", 894 | "metadata": {}, 895 | "outputs": [], 896 | "source": [ 897 | "distances.shape" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": null, 903 | "id": "ecba60fb", 904 | "metadata": {}, 905 | "outputs": [], 906 | "source": [ 907 | "found_indices_2 = distances.argmin(axis=1)" 908 | ] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "execution_count": null, 913 | "id": "fc8f7d7d", 914 | "metadata": {}, 915 | "outputs": [], 916 | "source": [ 917 | "np.all(found_indices[:, 0] == found_indices_2)" 918 | ] 919 | }, 920 | { 921 | "cell_type": "code", 922 | "execution_count": null, 923 | "id": "d6ac2963", 924 | "metadata": {}, 925 | "outputs": [], 926 | 
"source": [ 927 | "found_indices_2" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": null, 933 | "id": "9f7c2514", 934 | "metadata": {}, 935 | "outputs": [], 936 | "source": [ 937 | "found_indices[:, 0]" 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": null, 943 | "id": "57aa95ea", 944 | "metadata": {}, 945 | "outputs": [], 946 | "source": [ 947 | "np.where(found_indices[:, 0] != found_indices_2)" 948 | ] 949 | }, 950 | { 951 | "cell_type": "markdown", 952 | "id": "4e3c366e", 953 | "metadata": {}, 954 | "source": [ 955 | "# Product Quantization" 956 | ] 957 | }, 958 | { 959 | "cell_type": "code", 960 | "execution_count": null, 961 | "id": "19ba01fc", 962 | "metadata": {}, 963 | "outputs": [], 964 | "source": [ 965 | "# 4 sub-vectors, encode each in 2^8 elements\n", 966 | "pq = faiss.ProductQuantizer(d, 4, 8)" 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": null, 972 | "id": "7865e4da", 973 | "metadata": {}, 974 | "outputs": [], 975 | "source": [ 976 | "pq.code_size # in bytes, bits/8 rounded up to next integer" 977 | ] 978 | }, 979 | { 980 | "cell_type": "code", 981 | "execution_count": null, 982 | "id": "21fb7138", 983 | "metadata": {}, 984 | "outputs": [], 985 | "source": [ 986 | "pq.train(xt)" 987 | ] 988 | }, 989 | { 990 | "cell_type": "code", 991 | "execution_count": null, 992 | "id": "48543659", 993 | "metadata": {}, 994 | "outputs": [], 995 | "source": [ 996 | "xb_codes = pq.compute_codes(xb)" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": null, 1002 | "id": "0f6c3eac", 1003 | "metadata": {}, 1004 | "outputs": [], 1005 | "source": [ 1006 | "pq_reconstruction = pq.decode(xb_codes)" 1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "code", 1011 | "execution_count": null, 1012 | "id": "84d2814e", 1013 | "metadata": {}, 1014 | "outputs": [], 1015 | "source": [ 1016 | "# compute the MSE\n", 1017 | "((pq_reconstruction - xb) ** 2).sum(1).mean()" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "markdown", 1022 | "id": "a6f0a55e", 1023 | "metadata": {}, 1024 | "source": [ 1025 | "Better MSE than the 12-bit k-means one" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "markdown", 1030 | "id": "0d71662c", 1031 | "metadata": {}, 1032 | "source": [ 1033 | "## Manual reconstruction" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "code", 1038 | "execution_count": null, 1039 | "id": "63ad6728", 1040 | "metadata": {}, 1041 | "outputs": [], 1042 | "source": [ 1043 | "from faiss.contrib.inspect_tools import get_pq_centroids, get_additive_quantizer_codebooks" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "code", 1048 | "execution_count": null, 1049 | "id": "57b275a0", 1050 | "metadata": {}, 1051 | "outputs": [], 1052 | "source": [ 1053 | "pq_centroids = get_pq_centroids(pq)" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": null, 1059 | "id": "6d96cd6b", 1060 | "metadata": {}, 1061 | "outputs": [], 1062 | "source": [ 1063 | "pq_centroids.shape" 1064 | ] 1065 | }, 1066 | { 1067 | "cell_type": "markdown", 1068 | "id": "416f93e9", 1069 | "metadata": {}, 1070 | "source": [ 1071 | "Layout: number of subvectors, K, subvector dimension" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": null, 1077 | "id": "9124a6e5", 1078 | "metadata": {}, 1079 | "outputs": [], 1080 | "source": [ 1081 | "xb_codes[:2]" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "code", 1086 | "execution_count": null, 1087 | "id": "0d081828", 1088 | "metadata": 
{}, 1089 | "outputs": [], 1090 | "source": [ 1091 | "# reconstruct vector no 123 -- TODO implement the re-construction! \n", 1092 | "xb123_recons = " 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "code", 1097 | "execution_count": null, 1098 | "id": "410d41d7", 1099 | "metadata": {}, 1100 | "outputs": [], 1101 | "source": [ 1102 | "np.all(pq_reconstruction[123] == xb123_recons)" 1103 | ] 1104 | }, 1105 | { 1106 | "cell_type": "markdown", 1107 | "id": "9fedac66", 1108 | "metadata": {}, 1109 | "source": [ 1110 | "## Compare options for fixed code_size\n", 1111 | "fix number of quantizers " 1112 | ] 1113 | }, 1114 | { 1115 | "cell_type": "code", 1116 | "execution_count": null, 1117 | "id": "cde5f63d", 1118 | "metadata": {}, 1119 | "outputs": [], 1120 | "source": [ 1121 | "budget = 6 # budget 6 bytes per vector\n", 1122 | "for M in 4, 8, 16: \n", 1123 | " nbits = budget * 8 // M\n", 1124 | " print(f\"PQ {M}x{nbits}\")\n", 1125 | " pq = faiss.ProductQuantizer(d, M, nbits)\n", 1126 | " print(f\"Sub-vector size {pq.dsub} K={pq.ksub} code size {pq.code_size}\")\n", 1127 | " pq.train(xt)\n", 1128 | " t0 = time.time()\n", 1129 | " pq_reconstruction = pq.decode(pq.compute_codes(xb))\n", 1130 | " t1 = time.time()\n", 1131 | " MSE = ((pq_reconstruction - xb) ** 2).sum(1).mean()\n", 1132 | " print(f\"{MSE=:.2f} encode-decode time: {(t1 - t0)*1000:.3f} ms\")" 1133 | ] 1134 | }, 1135 | { 1136 | "cell_type": "markdown", 1137 | "id": "aea675ce", 1138 | "metadata": {}, 1139 | "source": [ 1140 | "## Optimized product quantization" 1141 | ] 1142 | }, 1143 | { 1144 | "cell_type": "code", 1145 | "execution_count": null, 1146 | "id": "c6b8f7f1", 1147 | "metadata": {}, 1148 | "outputs": [], 1149 | "source": [ 1150 | "from faiss.contrib.inspect_tools import get_LinearTransform_matrix" 1151 | ] 1152 | }, 1153 | { 1154 | "cell_type": "code", 1155 | "execution_count": null, 1156 | "id": "5eb63d88", 1157 | "metadata": {}, 1158 | "outputs": [], 1159 | "source": [ 1160 | "opq = faiss.OPQMatrix(d, 4)\n", 1161 | "pq = faiss.ProductQuantizer(d, 4, 8)" 1162 | ] 1163 | }, 1164 | { 1165 | "cell_type": "code", 1166 | "execution_count": null, 1167 | "id": "7f0e6e5c", 1168 | "metadata": {}, 1169 | "outputs": [], 1170 | "source": [ 1171 | "opq.train(xt)" 1172 | ] 1173 | }, 1174 | { 1175 | "cell_type": "code", 1176 | "execution_count": null, 1177 | "id": "ae9913a3", 1178 | "metadata": {}, 1179 | "outputs": [], 1180 | "source": [ 1181 | "pq.train(opq.apply(xt))" 1182 | ] 1183 | }, 1184 | { 1185 | "cell_type": "code", 1186 | "execution_count": null, 1187 | "id": "3afb9004", 1188 | "metadata": {}, 1189 | "outputs": [], 1190 | "source": [ 1191 | "xb_t = opq.apply(xb)" 1192 | ] 1193 | }, 1194 | { 1195 | "cell_type": "code", 1196 | "execution_count": null, 1197 | "id": "038bcb8a", 1198 | "metadata": {}, 1199 | "outputs": [], 1200 | "source": [ 1201 | "xb_t_recons = pq.decode(pq.compute_codes(xb_t))" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "code", 1206 | "execution_count": null, 1207 | "id": "6a855571", 1208 | "metadata": {}, 1209 | "outputs": [], 1210 | "source": [ 1211 | "((xb_t - xb_t_recons) ** 2).sum(1).mean()" 1212 | ] 1213 | }, 1214 | { 1215 | "cell_type": "markdown", 1216 | "id": "742355f8", 1217 | "metadata": {}, 1218 | "source": [ 1219 | "The MSE for regular PQ was 13 --> improves" 1220 | ] 1221 | }, 1222 | { 1223 | "cell_type": "code", 1224 | "execution_count": null, 1225 | "id": "fb17c49c", 1226 | "metadata": {}, 1227 | "outputs": [], 1228 | "source": [ 1229 | "A, bias = get_LinearTransform_matrix(opq) # how to get 
the OPQ matrix" 1230 | ] 1231 | }, 1232 | { 1233 | "cell_type": "code", 1234 | "execution_count": null, 1235 | "id": "155ac4a1", 1236 | "metadata": {}, 1237 | "outputs": [], 1238 | "source": [ 1239 | "A.shape" 1240 | ] 1241 | }, 1242 | { 1243 | "cell_type": "markdown", 1244 | "id": "b1d0ef83", 1245 | "metadata": {}, 1246 | "source": [ 1247 | "## PQ in an index" 1248 | ] 1249 | }, 1250 | { 1251 | "cell_type": "markdown", 1252 | "id": "5f0c9f7f", 1253 | "metadata": {}, 1254 | "source": [ 1255 | "A product quantizer with a search function (uses look-up tables)" 1256 | ] 1257 | }, 1258 | { 1259 | "cell_type": "code", 1260 | "execution_count": null, 1261 | "id": "e7c2b2d2", 1262 | "metadata": {}, 1263 | "outputs": [], 1264 | "source": [ 1265 | "index = faiss.index_factory(d, \"PQ8x6np\")\n", 1266 | "index.train(xt)\n", 1267 | "index.add(xb)\n", 1268 | "D, I = index.search(xq, 10)\n", 1269 | "(I[:, 0] == gt[:, 0]).sum()" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "code", 1274 | "execution_count": null, 1275 | "id": "93889500", 1276 | "metadata": {}, 1277 | "outputs": [], 1278 | "source": [ 1279 | "index = faiss.index_factory(d, \"OPQ4,PQ8x6np\")\n", 1280 | "index.train(xt)\n", 1281 | "index.add(xb)\n", 1282 | "D, I = index.search(xq, 10)\n", 1283 | "(I[:, 0] == gt[:, 0]).sum()" 1284 | ] 1285 | }, 1286 | { 1287 | "cell_type": "markdown", 1288 | "id": "ff3843b9", 1289 | "metadata": {}, 1290 | "source": [ 1291 | "OPQ a bit better, but free at search time." 1292 | ] 1293 | }, 1294 | { 1295 | "cell_type": "markdown", 1296 | "id": "4930ab30", 1297 | "metadata": {}, 1298 | "source": [ 1299 | "# Residual quantization" 1300 | ] 1301 | }, 1302 | { 1303 | "cell_type": "code", 1304 | "execution_count": null, 1305 | "id": "cc11a1bf", 1306 | "metadata": {}, 1307 | "outputs": [], 1308 | "source": [ 1309 | "rq = faiss.ResidualQuantizer(d, 4, 8)" 1310 | ] 1311 | }, 1312 | { 1313 | "cell_type": "code", 1314 | "execution_count": null, 1315 | "id": "19d003a0", 1316 | "metadata": {}, 1317 | "outputs": [], 1318 | "source": [ 1319 | "rq.max_beam_size " 1320 | ] 1321 | }, 1322 | { 1323 | "cell_type": "code", 1324 | "execution_count": null, 1325 | "id": "170d394e", 1326 | "metadata": {}, 1327 | "outputs": [], 1328 | "source": [ 1329 | "%%time \n", 1330 | "rq.train(xt[:50_000])" 1331 | ] 1332 | }, 1333 | { 1334 | "cell_type": "code", 1335 | "execution_count": null, 1336 | "id": "5cdf572f", 1337 | "metadata": {}, 1338 | "outputs": [], 1339 | "source": [ 1340 | "xb_recons = rq.decode(rq.compute_codes(xb))\n", 1341 | "((xb - xb_recons) ** 2).sum(1).mean()" 1342 | ] 1343 | }, 1344 | { 1345 | "cell_type": "markdown", 1346 | "id": "81566a1b", 1347 | "metadata": {}, 1348 | "source": [ 1349 | "A bit better than OPQ" 1350 | ] 1351 | }, 1352 | { 1353 | "cell_type": "code", 1354 | "execution_count": null, 1355 | "id": "37821ac8", 1356 | "metadata": {}, 1357 | "outputs": [], 1358 | "source": [ 1359 | "rq.max_beam_size = 50" 1360 | ] 1361 | }, 1362 | { 1363 | "cell_type": "code", 1364 | "execution_count": null, 1365 | "id": "913e5caf", 1366 | "metadata": {}, 1367 | "outputs": [], 1368 | "source": [ 1369 | "%%time\n", 1370 | "xb_recons = rq.decode(rq.compute_codes(xb))\n", 1371 | "((xb - xb_recons) ** 2).sum(1).mean()" 1372 | ] 1373 | }, 1374 | { 1375 | "cell_type": "markdown", 1376 | "id": "b03a5f4a", 1377 | "metadata": {}, 1378 | "source": [ 1379 | "Improves (slowly)" 1380 | ] 1381 | }, 1382 | { 1383 | "cell_type": "markdown", 1384 | "id": "857e79bf", 1385 | "metadata": {}, 1386 | "source": [ 1387 | "# Search with additive 
quantizers" 1388 | ] 1389 | }, 1390 | { 1391 | "cell_type": "code", 1392 | "execution_count": null, 1393 | "id": "6888cb5b", 1394 | "metadata": {}, 1395 | "outputs": [], 1396 | "source": [ 1397 | "index = faiss.index_factory(d, \"RQ8x6\")\n", 1398 | "index.code_size" 1399 | ] 1400 | }, 1401 | { 1402 | "cell_type": "code", 1403 | "execution_count": null, 1404 | "id": "09331ce1", 1405 | "metadata": {}, 1406 | "outputs": [], 1407 | "source": [ 1408 | "index.train(xt[:50_000])" 1409 | ] 1410 | }, 1411 | { 1412 | "cell_type": "code", 1413 | "execution_count": null, 1414 | "id": "86e183da", 1415 | "metadata": {}, 1416 | "outputs": [], 1417 | "source": [ 1418 | "index.add(xb)\n", 1419 | "D, I = index.search(xq, 10)\n", 1420 | "(I[:, 0] == gt[:, 0]).sum()" 1421 | ] 1422 | }, 1423 | { 1424 | "cell_type": "markdown", 1425 | "id": "53b7af9e", 1426 | "metadata": {}, 1427 | "source": [ 1428 | "Better than PQ & OPQ" 1429 | ] 1430 | }, 1431 | { 1432 | "cell_type": "code", 1433 | "execution_count": null, 1434 | "id": "ddd9a100", 1435 | "metadata": {}, 1436 | "outputs": [], 1437 | "source": [ 1438 | "%timeit index.search(xq, 10)" 1439 | ] 1440 | }, 1441 | { 1442 | "cell_type": "markdown", 1443 | "id": "31faa0af", 1444 | "metadata": {}, 1445 | "source": [ 1446 | "This is a search timing with decoding " 1447 | ] 1448 | }, 1449 | { 1450 | "cell_type": "code", 1451 | "execution_count": null, 1452 | "id": "deab7f5e", 1453 | "metadata": {}, 1454 | "outputs": [], 1455 | "source": [ 1456 | "index = faiss.index_factory(d, \"RQ8x6_Nqint8\")\n", 1457 | "index.code_size" 1458 | ] 1459 | }, 1460 | { 1461 | "cell_type": "code", 1462 | "execution_count": null, 1463 | "id": "40cff484", 1464 | "metadata": {}, 1465 | "outputs": [], 1466 | "source": [ 1467 | "index.train(xt[:50_000])\n", 1468 | "index.add(xb)\n", 1469 | "D, I = index.search(xq, 10)\n", 1470 | "(I[:, 0] == gt[:, 0]).sum()" 1471 | ] 1472 | }, 1473 | { 1474 | "cell_type": "code", 1475 | "execution_count": null, 1476 | "id": "9daf0ba4", 1477 | "metadata": {}, 1478 | "outputs": [], 1479 | "source": [ 1480 | "%timeit index.search(xq, 10)" 1481 | ] 1482 | }, 1483 | { 1484 | "cell_type": "markdown", 1485 | "id": "ae050618", 1486 | "metadata": {}, 1487 | "source": [ 1488 | "Same result but much faster (uses encoded norm) " 1489 | ] 1490 | }, 1491 | { 1492 | "cell_type": "markdown", 1493 | "id": "674563a7", 1494 | "metadata": {}, 1495 | "source": [ 1496 | "# Scalar quantizers" 1497 | ] 1498 | }, 1499 | { 1500 | "cell_type": "code", 1501 | "execution_count": null, 1502 | "id": "cafaafed", 1503 | "metadata": {}, 1504 | "outputs": [], 1505 | "source": [ 1506 | "for key in \"Flat\", \"SQfp16\", \"SQ8\", \"SQ6\", \"SQ4\", \"LSHrt\": \n", 1507 | " index = faiss.index_factory(d, key)\n", 1508 | " index.train(xt[:50_000])\n", 1509 | " index.add(xb)\n", 1510 | " D, I = index.search(xq, 10)\n", 1511 | " nfound = (I[:, 0] == gt[:, 0]).sum()\n", 1512 | " \n", 1513 | " print(f\"{key} {index.code_size=:} {nfound=:}\")" 1514 | ] 1515 | }, 1516 | { 1517 | "cell_type": "markdown", 1518 | "id": "886f7b1a", 1519 | "metadata": {}, 1520 | "source": [ 1521 | "# Polysemous codes " 1522 | ] 1523 | }, 1524 | { 1525 | "cell_type": "code", 1526 | "execution_count": null, 1527 | "id": "b7b97cf4", 1528 | "metadata": {}, 1529 | "outputs": [], 1530 | "source": [ 1531 | "index = faiss.index_factory(d, \"PQ8x8\") # omit the np" 1532 | ] 1533 | }, 1534 | { 1535 | "cell_type": "code", 1536 | "execution_count": null, 1537 | "id": "72ef7542", 1538 | "metadata": {}, 1539 | "outputs": [], 1540 | "source": 
[ 1541 | "index.code_size" 1542 | ] 1543 | }, 1544 | { 1545 | "cell_type": "code", 1546 | "execution_count": null, 1547 | "id": "10ef0e5d", 1548 | "metadata": {}, 1549 | "outputs": [], 1550 | "source": [ 1551 | "index.train(xt)\n", 1552 | "index.add(xb)" 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "code", 1557 | "execution_count": null, 1558 | "id": "9debd581", 1559 | "metadata": {}, 1560 | "outputs": [], 1561 | "source": [ 1562 | "index.polysemous_ht # threshold of binary code comparison -- default does not filter " 1563 | ] 1564 | }, 1565 | { 1566 | "cell_type": "code", 1567 | "execution_count": null, 1568 | "id": "412b57eb", 1569 | "metadata": {}, 1570 | "outputs": [], 1571 | "source": [ 1572 | "D, I = index.search(xq, 10)\n", 1573 | "(I[:, 0] == gt[:, 0]).sum()" 1574 | ] 1575 | }, 1576 | { 1577 | "cell_type": "code", 1578 | "execution_count": null, 1579 | "id": "60336e65", 1580 | "metadata": {}, 1581 | "outputs": [], 1582 | "source": [ 1583 | "%timeit index.search(xq, 10)" 1584 | ] 1585 | }, 1586 | { 1587 | "cell_type": "code", 1588 | "execution_count": null, 1589 | "id": "d0f73061", 1590 | "metadata": {}, 1591 | "outputs": [], 1592 | "source": [ 1593 | "index.search_type = faiss.IndexPQ.ST_polysemous\n", 1594 | "index.polysemous_ht = 24\n", 1595 | "D, I = index.search(xq, 10)\n", 1596 | "(I[:, 0] == gt[:, 0]).sum()" 1597 | ] 1598 | }, 1599 | { 1600 | "cell_type": "code", 1601 | "execution_count": null, 1602 | "id": "2b6ff69d", 1603 | "metadata": {}, 1604 | "outputs": [], 1605 | "source": [ 1606 | "%timeit index.search(xq, 10)" 1607 | ] 1608 | }, 1609 | { 1610 | "cell_type": "markdown", 1611 | "id": "9a03984a", 1612 | "metadata": {}, 1613 | "source": [ 1614 | "About twice faster, same accuracy" 1615 | ] 1616 | }, 1617 | { 1618 | "cell_type": "markdown", 1619 | "id": "c7ccba6c", 1620 | "metadata": {}, 1621 | "source": [ 1622 | "# IVFPQ index" 1623 | ] 1624 | }, 1625 | { 1626 | "cell_type": "code", 1627 | "execution_count": null, 1628 | "id": "cea808f2", 1629 | "metadata": {}, 1630 | "outputs": [], 1631 | "source": [ 1632 | "index = faiss.index_factory(d, \"IVF200,PQ16x8np\") " 1633 | ] 1634 | }, 1635 | { 1636 | "cell_type": "code", 1637 | "execution_count": null, 1638 | "id": "cc6a9584", 1639 | "metadata": {}, 1640 | "outputs": [], 1641 | "source": [ 1642 | "index.train(xt)" 1643 | ] 1644 | }, 1645 | { 1646 | "cell_type": "code", 1647 | "execution_count": null, 1648 | "id": "e5ab2fdb", 1649 | "metadata": {}, 1650 | "outputs": [], 1651 | "source": [ 1652 | "index.add(xb)" 1653 | ] 1654 | }, 1655 | { 1656 | "cell_type": "code", 1657 | "execution_count": null, 1658 | "id": "8102f0a9", 1659 | "metadata": {}, 1660 | "outputs": [], 1661 | "source": [ 1662 | "D, I = index.search(xq, 10)" 1663 | ] 1664 | }, 1665 | { 1666 | "cell_type": "code", 1667 | "execution_count": null, 1668 | "id": "def2e3e2", 1669 | "metadata": {}, 1670 | "outputs": [], 1671 | "source": [ 1672 | "(I[:, 0] == gt[:, 0]).sum()" 1673 | ] 1674 | }, 1675 | { 1676 | "cell_type": "code", 1677 | "execution_count": null, 1678 | "id": "43cb68b7", 1679 | "metadata": {}, 1680 | "outputs": [], 1681 | "source": [ 1682 | "index.nprobe " 1683 | ] 1684 | }, 1685 | { 1686 | "cell_type": "code", 1687 | "execution_count": null, 1688 | "id": "dbc7d35c", 1689 | "metadata": {}, 1690 | "outputs": [], 1691 | "source": [ 1692 | "for nprobe in 2, 5, 10, 20, 50: \n", 1693 | " index.nprobe = nprobe \n", 1694 | " t0 = time.time()\n", 1695 | " for _ in range(50): \n", 1696 | " D, I = index.search(xq, 10)\n", 1697 | " t1 = time.time()\n", 1698 | " 
nok = (I[:, 0] == gt[:, 0]).sum()\n", 1699 | " print(f\"{nprobe=:} {nok=:} {(t1 - t0)*1000:.3f} ms\")" 1700 | ] 1701 | }, 1702 | { 1703 | "cell_type": "markdown", 1704 | "id": "c43314dc", 1705 | "metadata": {}, 1706 | "source": [ 1707 | "## Fast-scan SIMD implementation" 1708 | ] 1709 | }, 1710 | { 1711 | "cell_type": "code", 1712 | "execution_count": null, 1713 | "id": "d8842cd9", 1714 | "metadata": {}, 1715 | "outputs": [], 1716 | "source": [ 1717 | "index = faiss.index_factory(d, \"IVF200,PQ32x4fsr\") \n", 1718 | "index.train(xt)\n", 1719 | "index.add(xb)" 1720 | ] 1721 | }, 1722 | { 1723 | "cell_type": "code", 1724 | "execution_count": null, 1725 | "id": "986b6a5c", 1726 | "metadata": {}, 1727 | "outputs": [], 1728 | "source": [ 1729 | "for nprobe in 2, 5, 10, 20, 50: \n", 1730 | " index.nprobe = nprobe \n", 1731 | " t0 = time.time()\n", 1732 | " for _ in range(50): \n", 1733 | " D, I = index.search(xq, 10)\n", 1734 | " t1 = time.time()\n", 1735 | " nok = (I[:, 0] == gt[:, 0]).sum()\n", 1736 | " print(f\"{nprobe=:} {nok=:} {(t1 - t0)*1000:.3f} ms\")" 1737 | ] 1738 | }, 1739 | { 1740 | "cell_type": "code", 1741 | "execution_count": null, 1742 | "id": "3d9f505e", 1743 | "metadata": {}, 1744 | "outputs": [], 1745 | "source": [] 1746 | } 1747 | ], 1748 | "metadata": { 1749 | "kernelspec": { 1750 | "display_name": "Python 3 (ipykernel)", 1751 | "language": "python", 1752 | "name": "python3" 1753 | }, 1754 | "language_info": { 1755 | "codemirror_mode": { 1756 | "name": "ipython", 1757 | "version": 3 1758 | }, 1759 | "file_extension": ".py", 1760 | "mimetype": "text/x-python", 1761 | "name": "python", 1762 | "nbconvert_exporter": "python", 1763 | "pygments_lexer": "ipython3", 1764 | "version": "3.10.13" 1765 | } 1766 | }, 1767 | "nbformat": 4, 1768 | "nbformat_minor": 5 1769 | } 1770 | -------------------------------------------------------------------------------- /class_notes/Class_09_graph_indexes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/Class_09_graph_indexes.pdf -------------------------------------------------------------------------------- /class_notes/README.md: -------------------------------------------------------------------------------- 1 | # Long Term Memory in AI - Vector Search and Databases 2 | These are the materials for [COS 597A](https://github.com/edoliberty/vector-search-class-notes/tree/main), given at Princeton during the Fall 2023 semester. 3 | 4 | ## Disclaimer 5 | The following content will be evolving rapidly over the next several months. Please regard the materials above as a sandbox rather than as polished, shareable material. 6 | 7 | ## Contribute 8 | These class notes are intended to be used freely by academics anywhere, students and professors alike. Please feel free to contribute by opening issues or submitting pull requests.
9 | -------------------------------------------------------------------------------- /class_notes/images/chernoff-exp-bounds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/chernoff-exp-bounds.png -------------------------------------------------------------------------------- /class_notes/images/dragon_diff_dup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/dragon_diff_dup.jpg -------------------------------------------------------------------------------- /class_notes/images/kdtrees-construction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/kdtrees-construction.png -------------------------------------------------------------------------------- /class_notes/images/kdtrees-proof.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/kdtrees-proof.png -------------------------------------------------------------------------------- /class_notes/images/kdtrees-search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/kdtrees-search.png -------------------------------------------------------------------------------- /class_notes/images/kmeans-proj.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/kmeans-proj.png -------------------------------------------------------------------------------- /class_notes/images/nnsearch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/nnsearch.png -------------------------------------------------------------------------------- /class_notes/images/pca.excalidraw: -------------------------------------------------------------------------------- 1 | { 2 | "type": "excalidraw", 3 | "version": 2, 4 | "source": "https://app.excalidraw.com", 5 | "elements": [ 6 | { 7 | "type": "arrow", 8 | "version": 89, 9 | "versionNonce": 1711806385, 10 | "isDeleted": false, 11 | "id": "psR6eSvp2P4QEWZr2DQKN", 12 | "fillStyle": "hachure", 13 | "strokeWidth": 1, 14 | "strokeStyle": "solid", 15 | "roughness": 1, 16 | "opacity": 100, 17 | "angle": 0, 18 | "x": 364.1484375, 19 | "y": 697.4765625, 20 | "strokeColor": "#1e1e1e", 21 | "backgroundColor": "transparent", 22 | "width": 3.27734375, 23 | "height": 385.296875, 24 | "seed": 287312799, 25 | "groupIds": [], 26 | "frameId": null, 27 | "roundness": { 28 | "type": 2 29 | }, 30 | "boundElements": [], 31 | "updated": 1693239320995, 32 | "link": null, 33 | "locked": false, 34 | "startBinding": null, 35 | "endBinding": null, 36 | "lastCommittedPoint": null, 37 | "startArrowhead": null, 38 
| "endArrowhead": "arrow", 39 | "points": [ 40 | [ 41 | 0, 42 | 0 43 | ], 44 | [ 45 | -3.27734375, 46 | -385.296875 47 | ] 48 | ] 49 | }, 50 | { 51 | "type": "arrow", 52 | "version": 137, 53 | "versionNonce": 1877686239, 54 | "isDeleted": false, 55 | "id": "MFR8U6_UeWjWwRz45RJi3", 56 | "fillStyle": "hachure", 57 | "strokeWidth": 1, 58 | "strokeStyle": "solid", 59 | "roughness": 1, 60 | "opacity": 100, 61 | "angle": 0, 62 | "x": 279.73046875, 63 | "y": 619.62109375, 64 | "strokeColor": "#1e1e1e", 65 | "backgroundColor": "transparent", 66 | "width": 755.75, 67 | "height": 4.08203125, 68 | "seed": 794312191, 69 | "groupIds": [], 70 | "frameId": null, 71 | "roundness": { 72 | "type": 2 73 | }, 74 | "boundElements": [], 75 | "updated": 1693239320995, 76 | "link": null, 77 | "locked": false, 78 | "startBinding": null, 79 | "endBinding": null, 80 | "lastCommittedPoint": null, 81 | "startArrowhead": null, 82 | "endArrowhead": "arrow", 83 | "points": [ 84 | [ 85 | 0, 86 | 0 87 | ], 88 | [ 89 | 755.75, 90 | -4.08203125 91 | ] 92 | ] 93 | }, 94 | { 95 | "type": "diamond", 96 | "version": 865, 97 | "versionNonce": 1028163807, 98 | "isDeleted": false, 99 | "id": "8HSTYnEQtwMgEsmaZvHzy", 100 | "fillStyle": "hachure", 101 | "strokeWidth": 1, 102 | "strokeStyle": "solid", 103 | "roughness": 1, 104 | "opacity": 100, 105 | "angle": 5.956825058112036, 106 | "x": 349.19270593476466, 107 | "y": 367.4721183769821, 108 | "strokeColor": "#1e1e1e", 109 | "backgroundColor": "transparent", 110 | "width": 609.5141984414641, 111 | "height": 304.80769228380507, 112 | "seed": 1971597727, 113 | "groupIds": [], 114 | "frameId": null, 115 | "roundness": null, 116 | "boundElements": [ 117 | { 118 | "id": "JxAYXYDpuFLjxcxo4fGGt", 119 | "type": "arrow" 120 | }, 121 | { 122 | "id": "xZClQRoQkSaRFfs9laIPW", 123 | "type": "arrow" 124 | } 125 | ], 126 | "updated": 1693239518301, 127 | "link": null, 128 | "locked": false 129 | }, 130 | { 131 | "type": "ellipse", 132 | "version": 1072, 133 | "versionNonce": 1452729343, 134 | "isDeleted": false, 135 | "id": "jSIw3zMX4MMfJx63tvCod", 136 | "fillStyle": "solid", 137 | "strokeWidth": 2, 138 | "strokeStyle": "solid", 139 | "roughness": 1, 140 | "opacity": 100, 141 | "angle": 0, 142 | "x": 643.9316406249991, 143 | "y": 444.44726562499943, 144 | "strokeColor": "#1e1e1e", 145 | "backgroundColor": "#1e1e1e", 146 | "width": 11.36328125, 147 | "height": 11.296875, 148 | "seed": 1533171921, 149 | "groupIds": [], 150 | "frameId": null, 151 | "roundness": null, 152 | "boundElements": [], 153 | "updated": 1693239320995, 154 | "link": null, 155 | "locked": false 156 | }, 157 | { 158 | "type": "ellipse", 159 | "version": 1107, 160 | "versionNonce": 767074161, 161 | "isDeleted": false, 162 | "id": "nGxVo89H30T_e4-LYRUKn", 163 | "fillStyle": "solid", 164 | "strokeWidth": 2, 165 | "strokeStyle": "solid", 166 | "roughness": 1, 167 | "opacity": 100, 168 | "angle": 0, 169 | "x": 565.0117187499991, 170 | "y": 363.34960937499943, 171 | "strokeColor": "#1e1e1e", 172 | "backgroundColor": "#1e1e1e", 173 | "width": 11.36328125, 174 | "height": 11.296875, 175 | "seed": 1344345777, 176 | "groupIds": [], 177 | "frameId": null, 178 | "roundness": null, 179 | "boundElements": [], 180 | "updated": 1693239320995, 181 | "link": null, 182 | "locked": false 183 | }, 184 | { 185 | "type": "ellipse", 186 | "version": 1204, 187 | "versionNonce": 1829136191, 188 | "isDeleted": false, 189 | "id": "cTsABDIHJxAw41elopIyV", 190 | "fillStyle": "solid", 191 | "strokeWidth": 2, 192 | "strokeStyle": "solid", 193 | "roughness": 1, 
194 | "opacity": 100, 195 | "angle": 0, 196 | "x": 473.2695312499991, 197 | "y": 570.0683593750008, 198 | "strokeColor": "#1e1e1e", 199 | "backgroundColor": "#1e1e1e", 200 | "width": 11.36328125, 201 | "height": 11.296875, 202 | "seed": 920510065, 203 | "groupIds": [], 204 | "frameId": null, 205 | "roundness": null, 206 | "boundElements": [], 207 | "updated": 1693239419995, 208 | "link": null, 209 | "locked": false 210 | }, 211 | { 212 | "type": "ellipse", 213 | "version": 1152, 214 | "versionNonce": 321203775, 215 | "isDeleted": false, 216 | "id": "e3JE3PUcczgJzM4WOZn76", 217 | "fillStyle": "solid", 218 | "strokeWidth": 2, 219 | "strokeStyle": "solid", 220 | "roughness": 1, 221 | "opacity": 100, 222 | "angle": 0, 223 | "x": 684.4999999999991, 224 | "y": 691.5097656250013, 225 | "strokeColor": "#1e1e1e", 226 | "backgroundColor": "#1e1e1e", 227 | "width": 11.36328125, 228 | "height": 11.296875, 229 | "seed": 1767565393, 230 | "groupIds": [], 231 | "frameId": null, 232 | "roundness": null, 233 | "boundElements": [], 234 | "updated": 1693239363160, 235 | "link": null, 236 | "locked": false 237 | }, 238 | { 239 | "type": "ellipse", 240 | "version": 1327, 241 | "versionNonce": 1515191217, 242 | "isDeleted": false, 243 | "id": "YR3c0Co_UxaclMGYqADWB", 244 | "fillStyle": "solid", 245 | "strokeWidth": 2, 246 | "strokeStyle": "solid", 247 | "roughness": 1, 248 | "opacity": 100, 249 | "angle": 0, 250 | "x": 509.8320312499991, 251 | "y": 536.2949218750008, 252 | "strokeColor": "#1e1e1e", 253 | "backgroundColor": "#1e1e1e", 254 | "width": 11.36328125, 255 | "height": 11.296875, 256 | "seed": 1062975025, 257 | "groupIds": [], 258 | "frameId": null, 259 | "roundness": null, 260 | "boundElements": [], 261 | "updated": 1693239360627, 262 | "link": null, 263 | "locked": false 264 | }, 265 | { 266 | "type": "ellipse", 267 | "version": 1123, 268 | "versionNonce": 2139676401, 269 | "isDeleted": false, 270 | "id": "Ncc_A-KxecqBQGiXYwUkt", 271 | "fillStyle": "solid", 272 | "strokeWidth": 2, 273 | "strokeStyle": "solid", 274 | "roughness": 1, 275 | "opacity": 100, 276 | "angle": 0, 277 | "x": 810.8046874999991, 278 | "y": 365.01367187499943, 279 | "strokeColor": "#1e1e1e", 280 | "backgroundColor": "#1e1e1e", 281 | "width": 11.36328125, 282 | "height": 11.296875, 283 | "seed": 1047132255, 284 | "groupIds": [], 285 | "frameId": null, 286 | "roundness": null, 287 | "boundElements": [], 288 | "updated": 1693239320995, 289 | "link": null, 290 | "locked": false 291 | }, 292 | { 293 | "type": "ellipse", 294 | "version": 1171, 295 | "versionNonce": 1439815889, 296 | "isDeleted": false, 297 | "id": "kebTOWkuV1Gi2T2pdIkUX", 298 | "fillStyle": "solid", 299 | "strokeWidth": 2, 300 | "strokeStyle": "solid", 301 | "roughness": 1, 302 | "opacity": 100, 303 | "angle": 0, 304 | "x": 742.3906249999991, 305 | "y": 567.4082031250008, 306 | "strokeColor": "#1e1e1e", 307 | "backgroundColor": "#1e1e1e", 308 | "width": 11.36328125, 309 | "height": 11.296875, 310 | "seed": 610315423, 311 | "groupIds": [], 312 | "frameId": null, 313 | "roundness": null, 314 | "boundElements": [], 315 | "updated": 1693239320995, 316 | "link": null, 317 | "locked": false 318 | }, 319 | { 320 | "type": "ellipse", 321 | "version": 1163, 322 | "versionNonce": 191994047, 323 | "isDeleted": false, 324 | "id": "NLleLR66x0sdOMy0wselI", 325 | "fillStyle": "solid", 326 | "strokeWidth": 2, 327 | "strokeStyle": "solid", 328 | "roughness": 1, 329 | "opacity": 100, 330 | "angle": 0, 331 | "x": 877.7812499999991, 332 | "y": 540.0175781250013, 333 | "strokeColor": 
"#1e1e1e", 334 | "backgroundColor": "#1e1e1e", 335 | "width": 11.36328125, 336 | "height": 11.296875, 337 | "seed": 1845477567, 338 | "groupIds": [], 339 | "frameId": null, 340 | "roundness": null, 341 | "boundElements": [], 342 | "updated": 1693239320995, 343 | "link": null, 344 | "locked": false 345 | }, 346 | { 347 | "type": "ellipse", 348 | "version": 1424, 349 | "versionNonce": 1363037983, 350 | "isDeleted": false, 351 | "id": "fgLhBWmoRKYV2V19yUIVc", 352 | "fillStyle": "solid", 353 | "strokeWidth": 2, 354 | "strokeStyle": "solid", 355 | "roughness": 1, 356 | "opacity": 100, 357 | "angle": 0, 358 | "x": 687.5546874999991, 359 | "y": 313.22851562499943, 360 | "strokeColor": "#1e1e1e", 361 | "backgroundColor": "#1e1e1e", 362 | "width": 11.36328125, 363 | "height": 11.296875, 364 | "seed": 2024267007, 365 | "groupIds": [], 366 | "frameId": null, 367 | "roundness": null, 368 | "boundElements": [], 369 | "updated": 1693239392930, 370 | "link": null, 371 | "locked": false 372 | }, 373 | { 374 | "id": "f26_GHdCGVl2N_wGG2VkZ", 375 | "type": "line", 376 | "x": 881.390625, 377 | "y": 542.84765625, 378 | "width": 43.875, 379 | "height": 77.04296875, 380 | "angle": 0, 381 | "strokeColor": "#1e1e1e", 382 | "backgroundColor": "transparent", 383 | "fillStyle": "hachure", 384 | "strokeWidth": 1, 385 | "strokeStyle": "dashed", 386 | "roughness": 1, 387 | "opacity": 100, 388 | "groupIds": [], 389 | "frameId": null, 390 | "roundness": null, 391 | "seed": 1393721105, 392 | "version": 39, 393 | "versionNonce": 360953087, 394 | "isDeleted": false, 395 | "boundElements": null, 396 | "updated": 1693239320995, 397 | "link": null, 398 | "locked": false, 399 | "points": [ 400 | [ 401 | 0, 402 | 0 403 | ], 404 | [ 405 | -43.875, 406 | -77.04296875 407 | ] 408 | ], 409 | "lastCommittedPoint": null, 410 | "startBinding": null, 411 | "endBinding": null, 412 | "startArrowhead": null, 413 | "endArrowhead": null 414 | }, 415 | { 416 | "type": "ellipse", 417 | "version": 1214, 418 | "versionNonce": 1031626353, 419 | "isDeleted": false, 420 | "id": "3s37WmvPbgH_4E5OjXE1m", 421 | "fillStyle": "solid", 422 | "strokeWidth": 2, 423 | "strokeStyle": "solid", 424 | "roughness": 1, 425 | "opacity": 100, 426 | "angle": 0, 427 | "x": 830.505859375, 428 | "y": 459.7695312500001, 429 | "strokeColor": "#1e1e1e", 430 | "backgroundColor": "transparent", 431 | "width": 11.36328125, 432 | "height": 11.296875, 433 | "seed": 975525087, 434 | "groupIds": [], 435 | "frameId": null, 436 | "roundness": null, 437 | "boundElements": [], 438 | "updated": 1693239320995, 439 | "link": null, 440 | "locked": false 441 | }, 442 | { 443 | "type": "ellipse", 444 | "version": 1335, 445 | "versionNonce": 1041238961, 446 | "isDeleted": false, 447 | "id": "f73_5TYYv5Mt5mhqPso08", 448 | "fillStyle": "solid", 449 | "strokeWidth": 2, 450 | "strokeStyle": "solid", 451 | "roughness": 1, 452 | "opacity": 100, 453 | "angle": 0, 454 | "x": 715.7826731642708, 455 | "y": 525.6458047041671, 456 | "strokeColor": "#1e1e1e", 457 | "backgroundColor": "transparent", 458 | "width": 11.36328125, 459 | "height": 11.296875, 460 | "seed": 1258547409, 461 | "groupIds": [], 462 | "frameId": null, 463 | "roundness": null, 464 | "boundElements": [], 465 | "updated": 1693239333950, 466 | "link": null, 467 | "locked": false 468 | }, 469 | { 470 | "id": "hh_z4WA9tpZLxJ6LYUnLS", 471 | "type": "line", 472 | "x": 745.4140625, 473 | "y": 570.6171875, 474 | "width": 20.11328125, 475 | "height": 34.45703125, 476 | "angle": 0, 477 | "strokeColor": "#1e1e1e", 478 | "backgroundColor": 
"transparent", 479 | "fillStyle": "hachure", 480 | "strokeWidth": 1, 481 | "strokeStyle": "dashed", 482 | "roughness": 1, 483 | "opacity": 100, 484 | "groupIds": [], 485 | "frameId": null, 486 | "roundness": null, 487 | "seed": 1593905873, 488 | "version": 38, 489 | "versionNonce": 843463935, 490 | "isDeleted": false, 491 | "boundElements": null, 492 | "updated": 1693239331724, 493 | "link": null, 494 | "locked": false, 495 | "points": [ 496 | [ 497 | 0, 498 | 0 499 | ], 500 | [ 501 | -20.11328125, 502 | -34.45703125 503 | ] 504 | ], 505 | "lastCommittedPoint": null, 506 | "startBinding": null, 507 | "endBinding": null, 508 | "startArrowhead": null, 509 | "endArrowhead": null 510 | }, 511 | { 512 | "id": "dQv-OSQHmxyifigNoEpmt", 513 | "type": "line", 514 | "x": 655.41796875, 515 | "y": 459.48828125, 516 | "width": 15.4296875, 517 | "height": 32.8359375, 518 | "angle": 0, 519 | "strokeColor": "#1e1e1e", 520 | "backgroundColor": "transparent", 521 | "fillStyle": "hachure", 522 | "strokeWidth": 1, 523 | "strokeStyle": "dashed", 524 | "roughness": 1, 525 | "opacity": 100, 526 | "groupIds": [], 527 | "frameId": null, 528 | "roundness": null, 529 | "seed": 1012231569, 530 | "version": 43, 531 | "versionNonce": 1541485215, 532 | "isDeleted": false, 533 | "boundElements": null, 534 | "updated": 1693239340345, 535 | "link": null, 536 | "locked": false, 537 | "points": [ 538 | [ 539 | 0, 540 | 0 541 | ], 542 | [ 543 | 15.4296875, 544 | 32.8359375 545 | ] 546 | ], 547 | "lastCommittedPoint": null, 548 | "startBinding": null, 549 | "endBinding": null, 550 | "startArrowhead": null, 551 | "endArrowhead": null 552 | }, 553 | { 554 | "type": "ellipse", 555 | "version": 1386, 556 | "versionNonce": 486839807, 557 | "isDeleted": false, 558 | "id": "R1-4ATAv_nN6yBknTiz9E", 559 | "fillStyle": "solid", 560 | "strokeWidth": 2, 561 | "strokeStyle": "solid", 562 | "roughness": 1, 563 | "opacity": 100, 564 | "angle": 0, 565 | "x": 668.892578125, 566 | "y": 497.1523437499999, 567 | "strokeColor": "#1e1e1e", 568 | "backgroundColor": "transparent", 569 | "width": 11.36328125, 570 | "height": 11.296875, 571 | "seed": 791709873, 572 | "groupIds": [], 573 | "frameId": null, 574 | "roundness": null, 575 | "boundElements": [], 576 | "updated": 1693239344248, 577 | "link": null, 578 | "locked": false 579 | }, 580 | { 581 | "id": "NXBahAn28pAT6DtCWimAG", 582 | "type": "line", 583 | "x": 520.6484375, 584 | "y": 553.5625, 585 | "width": 9.765625, 586 | "height": 18.26171875, 587 | "angle": 0, 588 | "strokeColor": "#1e1e1e", 589 | "backgroundColor": "transparent", 590 | "fillStyle": "hachure", 591 | "strokeWidth": 1, 592 | "strokeStyle": "dashed", 593 | "roughness": 1, 594 | "opacity": 100, 595 | "groupIds": [], 596 | "frameId": null, 597 | "roundness": null, 598 | "seed": 1897889311, 599 | "version": 48, 600 | "versionNonce": 2103633695, 601 | "isDeleted": false, 602 | "boundElements": null, 603 | "updated": 1693239427160, 604 | "link": null, 605 | "locked": false, 606 | "points": [ 607 | [ 608 | 0, 609 | 0 610 | ], 611 | [ 612 | 9.765625, 613 | 18.26171875 614 | ] 615 | ], 616 | "lastCommittedPoint": null, 617 | "startBinding": null, 618 | "endBinding": null, 619 | "startArrowhead": null, 620 | "endArrowhead": null 621 | }, 622 | { 623 | "type": "ellipse", 624 | "version": 1443, 625 | "versionNonce": 722951569, 626 | "isDeleted": false, 627 | "id": "FK_Gy7rcuVejdls-McB1n", 628 | "fillStyle": "solid", 629 | "strokeWidth": 2, 630 | "strokeStyle": "solid", 631 | "roughness": 1, 632 | "opacity": 100, 633 | "angle": 0, 634 | "x": 
530.998046875, 635 | "y": 575.703125, 636 | "strokeColor": "#1e1e1e", 637 | "backgroundColor": "transparent", 638 | "width": 11.36328125, 639 | "height": 11.296875, 640 | "seed": 1282704561, 641 | "groupIds": [], 642 | "frameId": null, 643 | "roundness": null, 644 | "boundElements": [], 645 | "updated": 1693239360627, 646 | "link": null, 647 | "locked": false 648 | }, 649 | { 650 | "id": "GjS4uJ6VLFK9oZcXJr0Ox", 651 | "type": "line", 652 | "x": 689.03515625, 653 | "y": 697.7890625, 654 | "width": 76.89453125, 655 | "height": 113.76171875, 656 | "angle": 0, 657 | "strokeColor": "#1e1e1e", 658 | "backgroundColor": "transparent", 659 | "fillStyle": "hachure", 660 | "strokeWidth": 1, 661 | "strokeStyle": "dashed", 662 | "roughness": 1, 663 | "opacity": 100, 664 | "groupIds": [], 665 | "frameId": null, 666 | "roundness": null, 667 | "seed": 2142968113, 668 | "version": 103, 669 | "versionNonce": 457782481, 670 | "isDeleted": false, 671 | "boundElements": null, 672 | "updated": 1693239370270, 673 | "link": null, 674 | "locked": false, 675 | "points": [ 676 | [ 677 | 0, 678 | 0 679 | ], 680 | [ 681 | -76.89453125, 682 | -113.76171875 683 | ] 684 | ], 685 | "lastCommittedPoint": null, 686 | "startBinding": null, 687 | "endBinding": null, 688 | "startArrowhead": null, 689 | "endArrowhead": null 690 | }, 691 | { 692 | "type": "ellipse", 693 | "version": 1463, 694 | "versionNonce": 784994257, 695 | "isDeleted": false, 696 | "id": "OyKsnV-vE9KVPTC1LgaYS", 697 | "fillStyle": "solid", 698 | "strokeWidth": 2, 699 | "strokeStyle": "solid", 700 | "roughness": 1, 701 | "opacity": 100, 702 | "angle": 0, 703 | "x": 603.291015625, 704 | "y": 569.59765625, 705 | "strokeColor": "#1e1e1e", 706 | "backgroundColor": "transparent", 707 | "width": 11.36328125, 708 | "height": 11.296875, 709 | "seed": 1349389535, 710 | "groupIds": [], 711 | "frameId": null, 712 | "roundness": null, 713 | "boundElements": [], 714 | "updated": 1693239373992, 715 | "link": null, 716 | "locked": false 717 | }, 718 | { 719 | "id": "2QJVUvB1aPHrepxQvZdwm", 720 | "type": "line", 721 | "x": 820.875, 722 | "y": 379.046875, 723 | "width": 32.0234375, 724 | "height": 51.73828125, 725 | "angle": 0, 726 | "strokeColor": "#1e1e1e", 727 | "backgroundColor": "transparent", 728 | "fillStyle": "hachure", 729 | "strokeWidth": 1, 730 | "strokeStyle": "dashed", 731 | "roughness": 1, 732 | "opacity": 100, 733 | "groupIds": [], 734 | "frameId": null, 735 | "roundness": null, 736 | "seed": 982924863, 737 | "version": 19, 738 | "versionNonce": 353800031, 739 | "isDeleted": false, 740 | "boundElements": null, 741 | "updated": 1693239386996, 742 | "link": null, 743 | "locked": false, 744 | "points": [ 745 | [ 746 | 0, 747 | 0 748 | ], 749 | [ 750 | 32.0234375, 751 | 51.73828125 752 | ] 753 | ], 754 | "lastCommittedPoint": null, 755 | "startBinding": null, 756 | "endBinding": null, 757 | "startArrowhead": null, 758 | "endArrowhead": null 759 | }, 760 | { 761 | "type": "ellipse", 762 | "version": 1258, 763 | "versionNonce": 1590588959, 764 | "isDeleted": false, 765 | "id": "AU0JHXmpi1KVNnpTNWfme", 766 | "fillStyle": "solid", 767 | "strokeWidth": 2, 768 | "strokeStyle": "solid", 769 | "roughness": 1, 770 | "opacity": 100, 771 | "angle": 0, 772 | "x": 856.162109375, 773 | "y": 434.15234375, 774 | "strokeColor": "#1e1e1e", 775 | "backgroundColor": "transparent", 776 | "width": 11.36328125, 777 | "height": 11.296875, 778 | "seed": 1012953073, 779 | "groupIds": [], 780 | "frameId": null, 781 | "roundness": null, 782 | "boundElements": [ 783 | { 784 | "id": 
"APhoaq5Vf-fOMoHzNEmxq", 785 | "type": "arrow" 786 | } 787 | ], 788 | "updated": 1693239665678, 789 | "link": null, 790 | "locked": false 791 | }, 792 | { 793 | "id": "hWKcuVKHTedL0zaT3viK0", 794 | "type": "line", 795 | "x": 699.31640625, 796 | "y": 328.25390625, 797 | "width": 44.28125, 798 | "height": 81.23046875, 799 | "angle": 0, 800 | "strokeColor": "#1e1e1e", 801 | "backgroundColor": "transparent", 802 | "fillStyle": "hachure", 803 | "strokeWidth": 1, 804 | "strokeStyle": "dashed", 805 | "roughness": 1, 806 | "opacity": 100, 807 | "groupIds": [], 808 | "frameId": null, 809 | "roundness": null, 810 | "seed": 602469969, 811 | "version": 78, 812 | "versionNonce": 2131126129, 813 | "isDeleted": false, 814 | "boundElements": null, 815 | "updated": 1693239399787, 816 | "link": null, 817 | "locked": false, 818 | "points": [ 819 | [ 820 | 0, 821 | 0 822 | ], 823 | [ 824 | 44.28125, 825 | 81.23046875 826 | ] 827 | ], 828 | "lastCommittedPoint": null, 829 | "startBinding": null, 830 | "endBinding": null, 831 | "startArrowhead": null, 832 | "endArrowhead": null 833 | }, 834 | { 835 | "type": "ellipse", 836 | "version": 1318, 837 | "versionNonce": 157246321, 838 | "isDeleted": false, 839 | "id": "4iDQENfrnMiAu1qA26bJ3", 840 | "fillStyle": "solid", 841 | "strokeWidth": 2, 842 | "strokeStyle": "solid", 843 | "roughness": 1, 844 | "opacity": 100, 845 | "angle": 0, 846 | "x": 745.466796875, 847 | "y": 418.5390625, 848 | "strokeColor": "#1e1e1e", 849 | "backgroundColor": "transparent", 850 | "width": 11.36328125, 851 | "height": 11.296875, 852 | "seed": 603646015, 853 | "groupIds": [], 854 | "frameId": null, 855 | "roundness": null, 856 | "boundElements": [], 857 | "updated": 1693239403082, 858 | "link": null, 859 | "locked": false 860 | }, 861 | { 862 | "id": "eKBOrAoy26LMq-IqdE9Mn", 863 | "type": "line", 864 | "x": 570.17578125, 865 | "y": 377.1015625, 866 | "width": 22.75, 867 | "height": 41.0234375, 868 | "angle": 0, 869 | "strokeColor": "#1e1e1e", 870 | "backgroundColor": "transparent", 871 | "fillStyle": "hachure", 872 | "strokeWidth": 1, 873 | "strokeStyle": "dashed", 874 | "roughness": 1, 875 | "opacity": 100, 876 | "groupIds": [], 877 | "frameId": null, 878 | "roundness": null, 879 | "seed": 1229948959, 880 | "version": 35, 881 | "versionNonce": 17255999, 882 | "isDeleted": false, 883 | "boundElements": null, 884 | "updated": 1693239407390, 885 | "link": null, 886 | "locked": false, 887 | "points": [ 888 | [ 889 | 0, 890 | 0 891 | ], 892 | [ 893 | 22.75, 894 | 41.0234375 895 | ] 896 | ], 897 | "lastCommittedPoint": null, 898 | "startBinding": null, 899 | "endBinding": null, 900 | "startArrowhead": null, 901 | "endArrowhead": null 902 | }, 903 | { 904 | "type": "ellipse", 905 | "version": 1377, 906 | "versionNonce": 1566092831, 907 | "isDeleted": false, 908 | "id": "cp5uAThKVgKBkKksASfq3", 909 | "fillStyle": "solid", 910 | "strokeWidth": 2, 911 | "strokeStyle": "solid", 912 | "roughness": 1, 913 | "opacity": 100, 914 | "angle": 0, 915 | "x": 590.556640625, 916 | "y": 420.15234375, 917 | "strokeColor": "#1e1e1e", 918 | "backgroundColor": "transparent", 919 | "width": 11.36328125, 920 | "height": 11.296875, 921 | "seed": 202812177, 922 | "groupIds": [], 923 | "frameId": null, 924 | "roundness": null, 925 | "boundElements": [], 926 | "updated": 1693239412631, 927 | "link": null, 928 | "locked": false 929 | }, 930 | { 931 | "type": "ellipse", 932 | "version": 1500, 933 | "versionNonce": 1069620991, 934 | "isDeleted": false, 935 | "id": "ggRTxyZhqy4sauhGA09nR", 936 | "fillStyle": "solid", 937 | 
"strokeWidth": 2, 938 | "strokeStyle": "solid", 939 | "roughness": 1, 940 | "opacity": 100, 941 | "angle": 0, 942 | "x": 453.912109375, 943 | "y": 540.4609375, 944 | "strokeColor": "#1e1e1e", 945 | "backgroundColor": "transparent", 946 | "width": 11.36328125, 947 | "height": 11.296875, 948 | "seed": 1299890705, 949 | "groupIds": [], 950 | "frameId": null, 951 | "roundness": null, 952 | "boundElements": [], 953 | "updated": 1693239425731, 954 | "link": null, 955 | "locked": false 956 | }, 957 | { 958 | "type": "line", 959 | "version": 80, 960 | "versionNonce": 1794690367, 961 | "isDeleted": false, 962 | "id": "7iZC_vinFqhDINfcEXiM3", 963 | "fillStyle": "hachure", 964 | "strokeWidth": 1, 965 | "strokeStyle": "dashed", 966 | "roughness": 1, 967 | "opacity": 100, 968 | "angle": 0, 969 | "x": 467.0243193823844, 970 | "y": 556.9658289533108, 971 | "strokeColor": "#1e1e1e", 972 | "backgroundColor": "transparent", 973 | "width": 9.765625, 974 | "height": 18.26171875, 975 | "seed": 265726783, 976 | "groupIds": [], 977 | "frameId": null, 978 | "roundness": null, 979 | "boundElements": [], 980 | "updated": 1693239430271, 981 | "link": null, 982 | "locked": false, 983 | "startBinding": null, 984 | "endBinding": null, 985 | "lastCommittedPoint": null, 986 | "startArrowhead": null, 987 | "endArrowhead": null, 988 | "points": [ 989 | [ 990 | 0, 991 | 0 992 | ], 993 | [ 994 | 9.765625, 995 | 18.26171875 996 | ] 997 | ] 998 | }, 999 | { 1000 | "id": "JxAYXYDpuFLjxcxo4fGGt", 1001 | "type": "arrow", 1002 | "x": 366.08984375, 1003 | "y": 615.4140625, 1004 | "width": 177.61328125, 1005 | "height": 29.21484375, 1006 | "angle": 0, 1007 | "strokeColor": "#1e1e1e", 1008 | "backgroundColor": "transparent", 1009 | "fillStyle": "hachure", 1010 | "strokeWidth": 1, 1011 | "strokeStyle": "solid", 1012 | "roughness": 1, 1013 | "opacity": 100, 1014 | "groupIds": [], 1015 | "frameId": null, 1016 | "roundness": null, 1017 | "seed": 1261621745, 1018 | "version": 45, 1019 | "versionNonce": 399426033, 1020 | "isDeleted": false, 1021 | "boundElements": null, 1022 | "updated": 1693239515433, 1023 | "link": null, 1024 | "locked": false, 1025 | "points": [ 1026 | [ 1027 | 0, 1028 | 0 1029 | ], 1030 | [ 1031 | 177.61328125, 1032 | 29.21484375 1033 | ] 1034 | ], 1035 | "lastCommittedPoint": null, 1036 | "startBinding": { 1037 | "elementId": "8HSTYnEQtwMgEsmaZvHzy", 1038 | "focus": 1.0481540308011237, 1039 | "gap": 1 1040 | }, 1041 | "endBinding": null, 1042 | "startArrowhead": null, 1043 | "endArrowhead": "arrow" 1044 | }, 1045 | { 1046 | "id": "xgCU6Gc5ZQpPFJr5o1brZ", 1047 | "type": "text", 1048 | "x": 433.5530014038086, 1049 | "y": 640.15625, 1050 | "width": 14.643997192382812, 1051 | "height": 35, 1052 | "angle": 0, 1053 | "strokeColor": "#1e1e1e", 1054 | "backgroundColor": "transparent", 1055 | "fillStyle": "hachure", 1056 | "strokeWidth": 1, 1057 | "strokeStyle": "solid", 1058 | "roughness": 1, 1059 | "opacity": 100, 1060 | "groupIds": [], 1061 | "frameId": null, 1062 | "roundness": null, 1063 | "seed": 794945329, 1064 | "version": 32, 1065 | "versionNonce": 1063307761, 1066 | "isDeleted": false, 1067 | "boundElements": null, 1068 | "updated": 1693239493942, 1069 | "link": null, 1070 | "locked": false, 1071 | "text": "v", 1072 | "fontSize": 28, 1073 | "fontFamily": 1, 1074 | "textAlign": "center", 1075 | "verticalAlign": "top", 1076 | "baseline": 25, 1077 | "containerId": null, 1078 | "originalText": "v", 1079 | "lineHeight": 1.25 1080 | }, 1081 | { 1082 | "type": "text", 1083 | "version": 62, 1084 | "versionNonce": 1719395423, 
1085 | "isDeleted": false, 1086 | "id": "cyCapA8uGbYaE0xsKFUtV", 1087 | "fillStyle": "hachure", 1088 | "strokeWidth": 1, 1089 | "strokeStyle": "solid", 1090 | "roughness": 1, 1091 | "opacity": 100, 1092 | "angle": 0, 1093 | "x": 388.1623764038086, 1094 | "y": 532.13671875, 1095 | "strokeColor": "#1e1e1e", 1096 | "backgroundColor": "transparent", 1097 | "width": 14.643997192382812, 1098 | "height": 35, 1099 | "seed": 1411027295, 1100 | "groupIds": [], 1101 | "frameId": null, 1102 | "roundness": null, 1103 | "boundElements": [], 1104 | "updated": 1693239492634, 1105 | "link": null, 1106 | "locked": false, 1107 | "fontSize": 28, 1108 | "fontFamily": 1, 1109 | "text": "v", 1110 | "textAlign": "center", 1111 | "verticalAlign": "top", 1112 | "containerId": null, 1113 | "originalText": "v", 1114 | "lineHeight": 1.25, 1115 | "baseline": 25 1116 | }, 1117 | { 1118 | "type": "text", 1119 | "version": 89, 1120 | "versionNonce": 794662271, 1121 | "isDeleted": false, 1122 | "id": "0hIddUaa_QgZ7vFLqvf7U", 1123 | "fillStyle": "hachure", 1124 | "strokeWidth": 1, 1125 | "strokeStyle": "solid", 1126 | "roughness": 1, 1127 | "opacity": 100, 1128 | "angle": 0, 1129 | "x": 450.7109069824219, 1130 | "y": 659.8984375, 1131 | "strokeColor": "#1e1e1e", 1132 | "backgroundColor": "transparent", 1133 | "width": 4.33599853515625, 1134 | "height": 20, 1135 | "seed": 1232364497, 1136 | "groupIds": [], 1137 | "frameId": null, 1138 | "roundness": null, 1139 | "boundElements": [], 1140 | "updated": 1693239507281, 1141 | "link": null, 1142 | "locked": false, 1143 | "fontSize": 16, 1144 | "fontFamily": 1, 1145 | "text": "1", 1146 | "textAlign": "center", 1147 | "verticalAlign": "top", 1148 | "containerId": null, 1149 | "originalText": "1", 1150 | "lineHeight": 1.25, 1151 | "baseline": 14 1152 | }, 1153 | { 1154 | "type": "text", 1155 | "version": 144, 1156 | "versionNonce": 1166858801, 1157 | "isDeleted": false, 1158 | "id": "DSegTrSal4dMNN6IJ7TLa", 1159 | "fillStyle": "hachure", 1160 | "strokeWidth": 1, 1161 | "strokeStyle": "solid", 1162 | "roughness": 1, 1163 | "opacity": 100, 1164 | "angle": 0, 1165 | "x": 401.0930633544922, 1166 | "y": 549.80078125, 1167 | "strokeColor": "#1e1e1e", 1168 | "backgroundColor": "transparent", 1169 | "width": 11.391998291015625, 1170 | "height": 20, 1171 | "seed": 111641023, 1172 | "groupIds": [], 1173 | "frameId": null, 1174 | "roundness": null, 1175 | "boundElements": [], 1176 | "updated": 1693239513312, 1177 | "link": null, 1178 | "locked": false, 1179 | "fontSize": 16, 1180 | "fontFamily": 1, 1181 | "text": "2", 1182 | "textAlign": "center", 1183 | "verticalAlign": "top", 1184 | "containerId": null, 1185 | "originalText": "2", 1186 | "lineHeight": 1.25, 1187 | "baseline": 14 1188 | }, 1189 | { 1190 | "type": "arrow", 1191 | "version": 181, 1192 | "versionNonce": 892260657, 1193 | "isDeleted": false, 1194 | "id": "xZClQRoQkSaRFfs9laIPW", 1195 | "fillStyle": "hachure", 1196 | "strokeWidth": 1, 1197 | "strokeStyle": "solid", 1198 | "roughness": 1, 1199 | "opacity": 100, 1200 | "angle": 0, 1201 | "x": 365.4546954613179, 1202 | "y": 615.3624257761985, 1203 | "strokeColor": "#1e1e1e", 1204 | "backgroundColor": "transparent", 1205 | "width": 136.6953125, 1206 | "height": 134.9609375, 1207 | "seed": 1365466065, 1208 | "groupIds": [], 1209 | "frameId": null, 1210 | "roundness": null, 1211 | "boundElements": [], 1212 | "updated": 1693239522713, 1213 | "link": null, 1214 | "locked": false, 1215 | "startBinding": { 1216 | "elementId": "8HSTYnEQtwMgEsmaZvHzy", 1217 | "focus": -0.9831183739783577, 
1218 | "gap": 1.4362444909314434 1219 | }, 1220 | "endBinding": null, 1221 | "lastCommittedPoint": null, 1222 | "startArrowhead": null, 1223 | "endArrowhead": "arrow", 1224 | "points": [ 1225 | [ 1226 | 0, 1227 | 0 1228 | ], 1229 | [ 1230 | 136.6953125, 1231 | -134.9609375 1232 | ] 1233 | ] 1234 | }, 1235 | { 1236 | "id": "bZWMnxvDKIUGwt_KoiAum", 1237 | "type": "text", 1238 | "x": 908.4673156738281, 1239 | "y": 349.69921875, 1240 | "width": 93.45599365234375, 1241 | "height": 45, 1242 | "angle": 0, 1243 | "strokeColor": "#1e1e1e", 1244 | "backgroundColor": "transparent", 1245 | "fillStyle": "hachure", 1246 | "strokeWidth": 1, 1247 | "strokeStyle": "solid", 1248 | "roughness": 1, 1249 | "opacity": 100, 1250 | "groupIds": [], 1251 | "frameId": null, 1252 | "roundness": null, 1253 | "seed": 1048544017, 1254 | "version": 78, 1255 | "versionNonce": 567293343, 1256 | "isDeleted": false, 1257 | "boundElements": null, 1258 | "updated": 1693239650468, 1259 | "link": null, 1260 | "locked": false, 1261 | "text": "V V x", 1262 | "fontSize": 36, 1263 | "fontFamily": 1, 1264 | "textAlign": "center", 1265 | "verticalAlign": "top", 1266 | "baseline": 32, 1267 | "containerId": null, 1268 | "originalText": "V V x", 1269 | "lineHeight": 1.25 1270 | }, 1271 | { 1272 | "id": "Og00EPx6Lhp3iSq-SJHvF", 1273 | "type": "text", 1274 | "x": 930.0948486328125, 1275 | "y": 371.5, 1276 | "width": 9.739990234375, 1277 | "height": 25, 1278 | "angle": 0, 1279 | "strokeColor": "#1e1e1e", 1280 | "backgroundColor": "transparent", 1281 | "fillStyle": "hachure", 1282 | "strokeWidth": 1, 1283 | "strokeStyle": "solid", 1284 | "roughness": 1, 1285 | "opacity": 100, 1286 | "groupIds": [], 1287 | "frameId": null, 1288 | "roundness": null, 1289 | "seed": 1512582833, 1290 | "version": 109, 1291 | "versionNonce": 1252964305, 1292 | "isDeleted": false, 1293 | "boundElements": null, 1294 | "updated": 1693239650468, 1295 | "link": null, 1296 | "locked": false, 1297 | "text": "k", 1298 | "fontSize": 20, 1299 | "fontFamily": 1, 1300 | "textAlign": "center", 1301 | "verticalAlign": "top", 1302 | "baseline": 18, 1303 | "containerId": null, 1304 | "originalText": "k", 1305 | "lineHeight": 1.25 1306 | }, 1307 | { 1308 | "type": "text", 1309 | "version": 140, 1310 | "versionNonce": 322080255, 1311 | "isDeleted": false, 1312 | "id": "wQcrlcXpUAOq9d8wmlEQw", 1313 | "fillStyle": "hachure", 1314 | "strokeWidth": 1, 1315 | "strokeStyle": "solid", 1316 | "roughness": 1, 1317 | "opacity": 100, 1318 | "angle": 0, 1319 | "x": 963.8292236328125, 1320 | "y": 372.15234375, 1321 | "strokeColor": "#1e1e1e", 1322 | "backgroundColor": "transparent", 1323 | "width": 9.739990234375, 1324 | "height": 25, 1325 | "seed": 1834865887, 1326 | "groupIds": [], 1327 | "frameId": null, 1328 | "roundness": null, 1329 | "boundElements": [ 1330 | { 1331 | "id": "APhoaq5Vf-fOMoHzNEmxq", 1332 | "type": "arrow" 1333 | } 1334 | ], 1335 | "updated": 1693239665678, 1336 | "link": null, 1337 | "locked": false, 1338 | "fontSize": 20, 1339 | "fontFamily": 1, 1340 | "text": "k", 1341 | "textAlign": "center", 1342 | "verticalAlign": "top", 1343 | "containerId": null, 1344 | "originalText": "k", 1345 | "lineHeight": 1.25, 1346 | "baseline": 18 1347 | }, 1348 | { 1349 | "type": "text", 1350 | "version": 160, 1351 | "versionNonce": 1955173809, 1352 | "isDeleted": false, 1353 | "id": "CaEHaW4foPnuLnVxFw3cD", 1354 | "fillStyle": "hachure", 1355 | "strokeWidth": 1, 1356 | "strokeStyle": "solid", 1357 | "roughness": 1, 1358 | "opacity": 100, 1359 | "angle": 0, 1360 | "x": 965.2725067138672, 
1361 | "y": 338.3046875, 1362 | "strokeColor": "#1e1e1e", 1363 | "backgroundColor": "transparent", 1364 | "width": 16.079986572265625, 1365 | "height": 25, 1366 | "seed": 818709247, 1367 | "groupIds": [], 1368 | "frameId": null, 1369 | "roundness": null, 1370 | "boundElements": [], 1371 | "updated": 1693239650468, 1372 | "link": null, 1373 | "locked": false, 1374 | "fontSize": 20, 1375 | "fontFamily": 1, 1376 | "text": "T", 1377 | "textAlign": "center", 1378 | "verticalAlign": "top", 1379 | "containerId": null, 1380 | "originalText": "T", 1381 | "lineHeight": 1.25, 1382 | "baseline": 18 1383 | }, 1384 | { 1385 | "type": "text", 1386 | "version": 161, 1387 | "versionNonce": 1602954111, 1388 | "isDeleted": false, 1389 | "id": "s79jHU6wtiQlE8-i6WKSj", 1390 | "fillStyle": "hachure", 1391 | "strokeWidth": 1, 1392 | "strokeStyle": "solid", 1393 | "roughness": 1, 1394 | "opacity": 100, 1395 | "angle": 0, 1396 | "x": 790.0090026855469, 1397 | "y": 312.109375, 1398 | "strokeColor": "#1e1e1e", 1399 | "backgroundColor": "transparent", 1400 | "width": 20.23199462890625, 1401 | "height": 45, 1402 | "seed": 1282579825, 1403 | "groupIds": [], 1404 | "frameId": null, 1405 | "roundness": null, 1406 | "boundElements": [], 1407 | "updated": 1693239659788, 1408 | "link": null, 1409 | "locked": false, 1410 | "fontSize": 36, 1411 | "fontFamily": 1, 1412 | "text": "x", 1413 | "textAlign": "center", 1414 | "verticalAlign": "top", 1415 | "containerId": null, 1416 | "originalText": "x", 1417 | "lineHeight": 1.25, 1418 | "baseline": 32 1419 | }, 1420 | { 1421 | "id": "APhoaq5Vf-fOMoHzNEmxq", 1422 | "type": "arrow", 1423 | "x": 956.984375, 1424 | "y": 406.6328125, 1425 | "width": 75.328125, 1426 | "height": 26.7578125, 1427 | "angle": 0, 1428 | "strokeColor": "#1e1e1e", 1429 | "backgroundColor": "transparent", 1430 | "fillStyle": "hachure", 1431 | "strokeWidth": 1, 1432 | "strokeStyle": "solid", 1433 | "roughness": 1, 1434 | "opacity": 100, 1435 | "groupIds": [], 1436 | "frameId": null, 1437 | "roundness": null, 1438 | "seed": 214570993, 1439 | "version": 57, 1440 | "versionNonce": 1568222047, 1441 | "isDeleted": false, 1442 | "boundElements": null, 1443 | "updated": 1693239669626, 1444 | "link": null, 1445 | "locked": false, 1446 | "points": [ 1447 | [ 1448 | 0, 1449 | 0 1450 | ], 1451 | [ 1452 | -75.328125, 1453 | 26.7578125 1454 | ] 1455 | ], 1456 | "lastCommittedPoint": null, 1457 | "startBinding": { 1458 | "elementId": "wQcrlcXpUAOq9d8wmlEQw", 1459 | "focus": -1.2522333285519662, 1460 | "gap": 9.48046875 1461 | }, 1462 | "endBinding": { 1463 | "elementId": "AU0JHXmpi1KVNnpTNWfme", 1464 | "focus": 0.10462789327715792, 1465 | "gap": 15.145177073482863 1466 | }, 1467 | "startArrowhead": null, 1468 | "endArrowhead": "arrow" 1469 | } 1470 | ], 1471 | "appState": { 1472 | "gridSize": null, 1473 | "viewBackgroundColor": "#ffffff" 1474 | }, 1475 | "files": {} 1476 | } -------------------------------------------------------------------------------- /class_notes/images/pca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/pca.png -------------------------------------------------------------------------------- /class_notes/images/vectorsearch.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edoliberty/vector-search-class-notes/3f2b3410adc6298b051f16f6cde378836f3e4379/class_notes/images/vectorsearch.png -------------------------------------------------------------------------------- /class_notes/vs.bib: -------------------------------------------------------------------------------- 1 | %% This BibTeX bibliography file was created using BibDesk. 2 | %% https://bibdesk.sourceforge.io/ 3 | 4 | %% Created for Edo Liberty at 2023-08-28 13:55:33 -0400 5 | 6 | 7 | %% Saved with string encoding Unicode (UTF-8) 8 | 9 | 10 | 11 | @article{RokhlinST09, 12 | author = {Vladimir Rokhlin and Arthur Szlam and Mark Tygert}, 13 | bibsource = {dblp computer science bibliography, http://dblp.org}, 14 | biburl = {http://dblp.uni-trier.de/rec/bib/journals/siammax/RokhlinST09}, 15 | date-added = {2023-08-28 13:55:06 -0400}, 16 | date-modified = {2023-08-28 13:55:06 -0400}, 17 | doi = {10.1137/080736417}, 18 | journal = {{SIAM} J. Matrix Analysis Applications}, 19 | number = {3}, 20 | pages = {1100--1124}, 21 | timestamp = {Tue, 22 Mar 2011 09:17:45 +0100}, 22 | title = {A Randomized Algorithm for Principal Component Analysis}, 23 | url = {http://dx.doi.org/10.1137/080736417}, 24 | volume = {31}, 25 | year = {2009}, 26 | bdsk-url-1 = {http://dx.doi.org/10.1137/080736417}} 27 | 28 | @proceedings{DBLP:conf/nips/2015, 29 | bibsource = {dblp computer science bibliography, http://dblp.org}, 30 | biburl = {http://dblp.uni-trier.de/rec/bib/conf/nips/2015}, 31 | date-added = {2023-08-28 13:55:06 -0400}, 32 | date-modified = {2023-08-28 13:55:06 -0400}, 33 | editor = {Corinna Cortes and Neil D. Lawrence and Daniel D. Lee and Masashi Sugiyama and Roman Garnett}, 34 | timestamp = {Fri, 08 Apr 2016 19:32:52 +0200}, 35 | title = {Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada}, 36 | url = {http://papers.nips.cc/book/advances-in-neural-information-processing-systems-28-2015}, 37 | year = {2015}, 38 | bdsk-url-1 = {http://papers.nips.cc/book/advances-in-neural-information-processing-systems-28-2015}} 39 | 40 | @article{HalkoMT2011, 41 | acmid = {2078881}, 42 | address = {Philadelphia, PA, USA}, 43 | author = {Halko, N. and Martinsson, P. G. and Tropp, J. 
A.}, 44 | date-added = {2023-08-28 13:55:06 -0400}, 45 | date-modified = {2023-08-28 13:55:06 -0400}, 46 | doi = {10.1137/090771806}, 47 | issn = {0036-1445}, 48 | issue_date = {May 2011}, 49 | journal = {SIAM Rev.}, 50 | keywords = {Johnson-Lindenstrauss lemma, dimension reduction, eigenvalue decomposition, interpolative decomposition, matrix approximation, parallel algorithm, pass-efficient algorithm, principal component analysis, random matrix, randomized algorithm, rank-revealing QR factorization, singular value decomposition, streaming algorithm}, 51 | month = may, 52 | number = {2}, 53 | numpages = {72}, 54 | pages = {217--288}, 55 | publisher = {Society for Industrial and Applied Mathematics}, 56 | title = {Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions}, 57 | url = {http://dx.doi.org/10.1137/090771806}, 58 | volume = {53}, 59 | year = {2011}, 60 | bdsk-url-1 = {http://dx.doi.org/10.1137/090771806}} 61 | 62 | @article{Rudelson08, 63 | author = {Mark Rudelson}, 64 | date-added = {2023-08-28 13:55:06 -0400}, 65 | date-modified = {2023-08-28 13:55:06 -0400}, 66 | journal = {Annals of Mathematics}, 67 | pages = {575-600}, 68 | title = {Invertibility of random matrices: norm of the inverse}, 69 | volume = {168}, number = {2}, 70 | year = {2008}} 71 | 72 | @article{WittenE15, 73 | acmid = {2756048}, 74 | address = {Secaucus, NJ, USA}, 75 | author = {Witten, Rafi and Cand\`{e}s, Emmanuel}, 76 | date-added = {2023-08-28 13:55:06 -0400}, 77 | date-modified = {2023-08-28 13:55:06 -0400}, 78 | doi = {10.1007/s00453-014-9891-7}, 79 | issn = {0178-4617}, 80 | issue_date = {May 2015}, 81 | journal = {Algorithmica}, 82 | keywords = {Dimension reduction, Matrix approximation, Pass efficient algorithm, Random matrix, Randomized linear algebra}, 83 | month = may, 84 | number = {1}, 85 | numpages = {18}, 86 | pages = {264--281}, 87 | publisher = {Springer-Verlag New York, Inc.}, 88 | title = {Randomized Algorithms for Low-Rank Matrix Factorizations: Sharp Performance Bounds}, 89 | url = {http://dx.doi.org/10.1007/s00453-014-9891-7}, 90 | volume = {72}, 91 | year = {2015}, 92 | bdsk-url-1 = {http://dx.doi.org/10.1007/s00453-014-9891-7}} 93 | 94 | @inproceedings{MuscoM15, 95 | author = {Cameron Musco and Christopher Musco}, 96 | bibsource = {dblp computer science bibliography, http://dblp.org}, 97 | biburl = {http://dblp.uni-trier.de/rec/bib/conf/nips/MuscoM15}, 98 | booktitle = {Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada}, 99 | crossref = {DBLP:conf/nips/2015}, 100 | date-added = {2023-08-28 13:55:06 -0400}, 101 | date-modified = {2023-08-28 13:55:06 -0400}, 102 | pages = {1396--1404}, 103 | timestamp = {Fri, 08 Apr 2016 19:32:53 +0200}, 104 | title = {Randomized Block Krylov Methods for Stronger and Faster Approximate Singular Value Decomposition}, 105 | url = {http://papers.nips.cc/paper/5735-randomized-block-krylov-methods-for-stronger-and-faster-approximate-singular-value-decomposition}, 106 | year = {2015}, 107 | bdsk-url-1 = {http://papers.nips.cc/paper/5735-randomized-block-krylov-methods-for-stronger-and-faster-approximate-singular-value-decomposition}} 108 | 109 | @misc{liberty2016short, 110 | archiveprefix = {arXiv}, 111 | author = {Edo Liberty}, 112 | date-added = {2023-08-28 12:52:04 -0400}, 113 | date-modified = {2023-08-28 12:52:04 -0400}, 114 | eprint = {1605.05610}, 115 | primaryclass = {cs.NA}, 116 | title =
{A Short Proof for Gap Independence of Simultaneous Iteration}, 117 | year = {2016}} 118 | 119 | @inproceedings{NIPS2013_6e0721b2, 120 | author = {Achlioptas, Dimitris and Karnin, Zohar S and Liberty, Edo}, 121 | booktitle = {Advances in Neural Information Processing Systems}, 122 | date-added = {2023-08-28 07:26:11 -0400}, 123 | date-modified = {2023-08-28 07:26:11 -0400}, 124 | editor = {C.J. Burges and L. Bottou and M. Welling and Z. Ghahramani and K.Q. Weinberger}, 125 | publisher = {Curran Associates, Inc.}, 126 | title = {Near-Optimal Entrywise Sampling for Data Matrices}, 127 | url = {https://proceedings.neurips.cc/paper_files/paper/2013/file/6e0721b2c6977135b916ef286bcb49ec-Paper.pdf}, 128 | volume = {26}, 129 | year = {2013}, 130 | bdsk-url-1 = {https://proceedings.neurips.cc/paper_files/paper/2013/file/6e0721b2c6977135b916ef286bcb49ec-Paper.pdf}} 131 | 132 | @inbook{doi:10.1137/1.9781611974317.7, 133 | abstract = {This paper shows that one can be competitive with the k-means objective while operating online. In this model, the algorithm receives vectors $v_1, \ldots, v_n$ one by one in an arbitrary order. For each vector $v_t$ the algorithm outputs a cluster identifier before receiving $v_{t+1}$. Our online algorithm generates $O(k \log n \log(\gamma n))$ clusters whose expected k-means cost is $O(W^* \log n)$. Here, $W^*$ is the optimal k-means cost using $k$ clusters and $\gamma$ is the aspect ratio of the data. The dependence on $\gamma$ is shown to be unavoidable and tight. We also show that, experimentally, it is not much worse than k-means++ while operating in a strictly more constrained computational model. }, 134 | author = {Edo Liberty and Ram Sriharsha and Maxim Sviridenko}, 135 | booktitle = {2016 Proceedings of the Meeting on Algorithm Engineering and Experiments (ALENEX)}, 136 | date-added = {2023-08-27 23:57:26 -0400}, 137 | date-modified = {2023-08-27 23:57:26 -0400}, 138 | doi = {10.1137/1.9781611974317.7}, 139 | eprint = {https://epubs.siam.org/doi/pdf/10.1137/1.9781611974317.7}, 140 | pages = {81-89}, 141 | title = {An Algorithm for Online K-Means Clustering}, 142 | url = {https://epubs.siam.org/doi/abs/10.1137/1.9781611974317.7}, 143 | bdsk-url-1 = {https://epubs.siam.org/doi/abs/10.1137/1.9781611974317.7}, 144 | bdsk-url-2 = {https://doi.org/10.1137/1.9781611974317.7}} 145 | 146 | @misc{liberty2022simpler, 147 | archiveprefix = {arXiv}, 148 | author = {Edo Liberty}, 149 | date-added = {2023-08-27 23:55:56 -0400}, 150 | date-modified = {2023-08-27 23:55:56 -0400}, 151 | eprint = {2202.01780}, 152 | primaryclass = {cs.DS}, 153 | title = {Even Simpler Deterministic Matrix Sketching}, 154 | year = {2022}} 155 | 156 | @inproceedings{nngraph, 157 | abstract = {K-Nearest Neighbor Graph (K-NNG) construction is an important operation with many web related applications, including collaborative filtering, similarity search, and many others in data mining and machine learning. Existing methods for K-NNG construction either do not scale, or are specific to certain similarity measures. We present NN-Descent, a simple yet efficient algorithm for approximate K-NNG construction with arbitrary similarity measures. Our method is based on local search, has minimal space overhead and does not rely on any shared global index. Hence, it is especially suitable for large-scale applications where data structures need to be distributed over the network.
We have shown with a variety of datasets and similarity measures that the proposed method typically converges to above 90\% recall with each point comparing only to several percent of the whole dataset on average.}, 158 | address = {New York, NY, USA}, 159 | author = {Dong, Wei and Charikar, Moses and Li, Kai}, 160 | booktitle = {Proceedings of the 20th International Conference on World Wide Web}, 161 | date-added = {2023-08-27 22:55:23 -0400}, 162 | date-modified = {2023-08-27 22:55:31 -0400}, 163 | doi = {10.1145/1963405.1963487}, 164 | isbn = {9781450306324}, 165 | keywords = {iterative method, k-nearest neighbor graph, arbitrary similarity measure}, 166 | location = {Hyderabad, India}, 167 | numpages = {10}, 168 | pages = {577--586}, 169 | publisher = {Association for Computing Machinery}, 170 | series = {WWW '11}, 171 | title = {Efficient K-Nearest Neighbor Graph Construction for Generic Similarity Measures}, 172 | url = {https://doi.org/10.1145/1963405.1963487}, 173 | year = {2011}, 174 | bdsk-url-1 = {https://doi.org/10.1145/1963405.1963487}} 175 | 176 | @inproceedings{NEURIPS2019_09853c7f, 177 | author = {Jayaram Subramanya, Suhas and Devvrit, Fnu and Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Kadekodi, Rohan}, 178 | booktitle = {Advances in Neural Information Processing Systems}, 179 | date-added = {2023-08-27 22:54:28 -0400}, 180 | date-modified = {2023-08-27 22:54:28 -0400}, 181 | editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett}, 182 | publisher = {Curran Associates, Inc.}, 183 | title = {DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node}, 184 | url = {https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf}, 185 | volume = {32}, 186 | year = {2019}, 187 | bdsk-url-1 = {https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf}} 188 | 189 | @misc{tay2022transformer, 190 | archiveprefix = {arXiv}, 191 | author = {Yi Tay and Vinh Q. Tran and Mostafa Dehghani and Jianmo Ni and Dara Bahri and Harsh Mehta and Zhen Qin and Kai Hui and Zhe Zhao and Jai Gupta and Tal Schuster and William W. Cohen and Donald Metzler}, 192 | date-added = {2023-08-27 22:53:43 -0400}, 193 | date-modified = {2023-08-27 22:53:43 -0400}, 194 | eprint = {2202.06991}, 195 | primaryclass = {cs.CL}, 196 | title = {Transformer Memory as a Differentiable Search Index}, 197 | year = {2022}} 198 | 199 | @inproceedings{neuralQuantization, 200 | abstract = {We tackle the problem of unsupervised visual descriptors compression, which is a key ingredient of large-scale image retrieval systems. While the deep learning machinery has benefited literally all computer vision pipelines, the existing state-of-the-art compression methods employ shallow architectures, and we aim to close this gap by our paper. In more detail, we introduce a DNN architecture for the unsupervised compressed-domain retrieval, based on multi-codebook quantization. The proposed architecture is designed to incorporate both fast data encoding and efficient distances computation via lookup tables. We demonstrate the exceptional advantage of our scheme over existing quantization approaches on several datasets of visual descriptors via outperforming the previous state-of-the-art by a large margin.}, 201 | address = {Los Alamitos, CA, USA}, 202 | author = {S. Morozov and A.
Babenko}, 203 | booktitle = {2019 IEEE/CVF International Conference on Computer Vision (ICCV)}, 204 | date-added = {2023-08-27 22:52:56 -0400}, 205 | date-modified = {2023-08-27 22:53:14 -0400}, 206 | doi = {10.1109/ICCV.2019.00313}, 207 | keywords = {quantization (signal);image coding;computer architecture;visualization;computer vision;encoding;databases}, 208 | month = {nov}, 209 | pages = {3036-3045}, 210 | publisher = {IEEE Computer Society}, 211 | title = {Unsupervised Neural Quantization for Compressed-Domain Similarity Search}, 212 | url = {https://doi.ieeecomputersociety.org/10.1109/ICCV.2019.00313}, 213 | year = {2019}, 214 | bdsk-url-1 = {https://doi.ieeecomputersociety.org/10.1109/ICCV.2019.00313}, 215 | bdsk-url-2 = {https://doi.org/10.1109/ICCV.2019.00313}} 216 | 217 | @inproceedings{LSQ, 218 | abstract = {Multi-codebook quantization (MCQ) is the task of expressing a set of vectors as accurately as possible in terms of discrete entries in multiple bases. Work in MCQ is heavily focused on lowering quantization error, thereby improving distance estimation and recall on benchmarks of visual descriptors at a fixed memory budget. However, recent studies and methods in this area are hard to compare against each other, because they use different datasets, different protocols, and, perhaps most importantly, different computational budgets. In this work, we first benchmark a series of MCQ baselines on an equal footing and provide an analysis of their recall-vs-running-time performance. We observe that local search quantization (LSQ) is in practice much faster than its competitors, but is not the most accurate method in all cases. We then introduce two novel improvements that render LSQ (i) more accurate and (ii) faster. These improvements are easy to implement, and define a new state of the art in MCQ.}, 219 | address = {Berlin, Heidelberg}, 220 | author = {Martinez, Julieta and Zakhmi, Shobhit and Hoos, Holger H. and Little, James J.}, 221 | booktitle = {Computer Vision -- ECCV 2018: 15th European Conference, Munich, Germany, September 8-14, 2018, Proceedings, Part XVI}, 222 | date-added = {2023-08-27 22:51:54 -0400}, 223 | date-modified = {2023-08-27 22:52:02 -0400}, 224 | doi = {10.1007/978-3-030-01270-0_30}, 225 | isbn = {978-3-030-01269-4}, 226 | location = {Munich, Germany}, 227 | numpages = {16}, 228 | pages = {508--523}, 229 | publisher = {Springer-Verlag}, 230 | title = {LSQ++: Lower Running Time and Higher Recall in Multi-Codebook Quantization}, 231 | url = {https://doi.org/10.1007/978-3-030-01270-0_30}, 232 | year = {2018}, 233 | bdsk-url-1 = {https://doi.org/10.1007/978-3-030-01270-0_30}} 234 | 235 | @inproceedings{Martinez2016RevisitingAQ, 236 | author = {Julieta Martinez and Joris Clement and Holger H. Hoos and J. 
Little}, 237 | booktitle = {European Conference on Computer Vision}, 238 | date-added = {2023-08-27 22:51:08 -0400}, 239 | date-modified = {2023-08-27 22:51:08 -0400}, 240 | title = {Revisiting Additive Quantization}, 241 | url = {https://api.semanticscholar.org/CorpusID:7340738}, 242 | year = {2016}, 243 | bdsk-url-1 = {https://api.semanticscholar.org/CorpusID:7340738}} 244 | 245 | @misc{fu2018fast, 246 | archiveprefix = {arXiv}, 247 | author = {Cong Fu and Chao Xiang and Changxu Wang and Deng Cai}, 248 | date-added = {2023-08-27 22:49:58 -0400}, 249 | date-modified = {2023-08-27 22:49:58 -0400}, 250 | eprint = {1707.00143}, 251 | primaryclass = {cs.LG}, 252 | title = {Fast Approximate Nearest Neighbor Search With The Navigating Spreading-out Graph}, 253 | year = {2018}} 254 | 255 | @misc{guo2020accelerating, 256 | archiveprefix = {arXiv}, 257 | author = {Ruiqi Guo and Philip Sun and Erik Lindgren and Quan Geng and David Simcha and Felix Chern and Sanjiv Kumar}, 258 | date-added = {2023-08-27 22:49:28 -0400}, 259 | date-modified = {2023-08-27 22:49:28 -0400}, 260 | eprint = {1908.10396}, 261 | primaryclass = {cs.LG}, 262 | title = {Accelerating Large-Scale Inference with Anisotropic Vector Quantization}, 263 | year = {2020}} 264 | 265 | @article{Andre_2021, 266 | author = {Fabien Andr{\'e} and Anne-Marie Kermarrec and Nicolas Le Scouarnec}, 267 | date-added = {2023-08-27 22:48:12 -0400}, 268 | date-modified = {2023-08-27 22:48:12 -0400}, 269 | doi = {10.1109/tpami.2019.2952606}, 270 | journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence}, 271 | month = {may}, 272 | number = {5}, 273 | pages = {1666--1677}, 274 | publisher = {Institute of Electrical and Electronics Engineers ({IEEE})}, 275 | title = {Quicker {ADC}: Unlocking the Hidden Potential of Product Quantization With {SIMD}}, 276 | url = {https://doi.org/10.1109%2Ftpami.2019.2952606}, 277 | volume = {43}, 278 | year = 2021, 279 | bdsk-url-1 = {https://doi.org/10.1109%2Ftpami.2019.2952606}, 280 | bdsk-url-2 = {https://doi.org/10.1109/tpami.2019.2952606}} 281 | 282 | @misc{hnsw, 283 | archiveprefix = {arXiv}, 284 | author = {Yu. A. Malkov and D. A.
Yashunin}, 285 | date-added = {2023-08-27 22:47:22 -0400}, 286 | date-modified = {2023-08-27 22:47:31 -0400}, 287 | eprint = {1603.09320}, 288 | primaryclass = {cs.DS}, 289 | title = {Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs}, 290 | year = {2018}} 291 | 292 | @misc{johnson2017billionscale, 293 | archiveprefix = {arXiv}, 294 | author = {Jeff Johnson and Matthijs Douze and Herv{\'e} J{\'e}gou}, 295 | date-added = {2023-08-27 22:46:42 -0400}, 296 | date-modified = {2023-08-27 22:46:42 -0400}, 297 | eprint = {1702.08734}, 298 | primaryclass = {cs.CV}, 299 | title = {Billion-scale similarity search with GPUs}, 300 | year = {2017}} 301 | 302 | @article{pq, 303 | author = {Jegou, Herve and Douze, Matthijs and Schmid, Cordelia}, 304 | date-added = {2023-08-27 22:45:03 -0400}, 305 | date-modified = {2023-08-28 10:10:02 -0400}, 306 | doi = {10.1109/TPAMI.2010.57}, 307 | journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, 308 | number = {1}, 309 | pages = {117-128}, 310 | title = {Product Quantization for Nearest Neighbor Search}, 311 | volume = {33}, 312 | year = {2011}, 313 | bdsk-url-1 = {https://doi.org/10.1109/TPAMI.2010.57}} 314 | 315 | @article{kdtree-worstcase, 316 | abstract = {Given a file of N records each of which has k keys, the worst-case analysis for the region and partial region queries in multidimensional binary search trees and balanced quad trees are presented. It is shown that the search algorithms proposed in [1, 3] run in time $O(k \cdot N^{1-1/k})$ for region queries in both tree structures. For partial region queries with s keys specified, the search algorithms run at most in time $O(s \cdot N^{1-1/k})$ in both structures.}, 317 | address = {Berlin, Heidelberg}, 318 | author = {Lee, D. T. and Wong, C.
K.}, 319 | date-added = {2023-08-26 14:50:53 -0400}, 320 | date-modified = {2023-08-26 14:51:02 -0400}, 321 | doi = {10.1007/BF00263763}, 322 | issn = {0001-5903}, 323 | issue_date = {March 1977}, 324 | journal = {Acta Inf.}, 325 | month = {mar}, 326 | number = {1}, 327 | numpages = {7}, 328 | pages = {23--29}, 329 | publisher = {Springer-Verlag}, 330 | title = {Worst-Case Analysis for Region and Partial Region Searches in Multidimensional Binary Search Trees and Balanced Quad Trees}, 331 | url = {https://doi.org/10.1007/BF00263763}, 332 | volume = {9}, 333 | year = {1977}, 334 | bdsk-url-1 = {https://doi.org/10.1007/BF00263763}} 335 | 336 | @inproceedings{rptrees, 337 | abstract = {We present a simple variant of the k-d tree which automatically adapts to intrinsic low dimensional structure in data without having to explicitly learn this structure.}, 338 | address = {New York, NY, USA}, 339 | author = {Dasgupta, Sanjoy and Freund, Yoav}, 340 | booktitle = {Proceedings of the Fortieth Annual ACM Symposium on Theory of Computing}, 341 | date-added = {2023-08-26 14:41:09 -0400}, 342 | date-modified = {2023-08-26 14:41:18 -0400}, 343 | doi = {10.1145/1374376.1374452}, 344 | isbn = {9781605580470}, 345 | keywords = {manifold, random projection, k-d tree, curse of dimension}, 346 | location = {Victoria, British Columbia, Canada}, 347 | numpages = {10}, 348 | pages = {537--546}, 349 | publisher = {Association for Computing Machinery}, 350 | series = {STOC '08}, 351 | title = {Random Projection Trees and Low Dimensional Manifolds}, 352 | url = {https://doi.org/10.1145/1374376.1374452}, 353 | year = {2008}, 354 | bdsk-url-1 = {https://doi.org/10.1145/1374376.1374452}} 355 | 356 | @article{LibertyMatrixSketching2012, 357 | author = {Edo Liberty}, 358 | bibsource = {DBLP, http://dblp.uni-trier.de}, 359 | date-added = {2012-12-23 13:06:13 +0200}, 360 | date-modified = {2012-12-23 13:06:27 +0200}, 361 | ee = {http://arxiv.org/abs/1206.0594}, 362 | journal = {CoRR}, 363 | title = {Simple and Deterministic Matrix Sketching}, 364 | volume = {abs/1206.0594}, 365 | year = {2012}} 366 | 367 | @article{rvSamplingFromLargeMatrices2007, 368 | acmid = {1255449}, 369 | address = {New York, NY, USA}, 370 | articleno = {21}, 371 | author = {Rudelson, Mark and Vershynin, Roman}, 372 | date-added = {2012-12-23 10:23:13 +0200}, 373 | date-modified = {2012-12-23 10:23:55 +0200}, 374 | doi = {10.1145/1255443.1255449}, 375 | issn = {0004-5411}, 376 | issue_date = {July 2007}, 377 | journal = {J. 
ACM}, 378 | keywords = {Monte-Carlo methods, Randomized algorithms, low-rank approximations, massive data sets, singular-value decompositions}, 379 | month = jul, 380 | number = {4}, 381 | publisher = {ACM}, 382 | title = {Sampling from large matrices: An approach through geometric functional analysis}, 383 | url = {http://doi.acm.org/10.1145/1255443.1255449}, 384 | volume = {54}, 385 | year = {2007}, 386 | bdsk-url-1 = {http://doi.acm.org/10.1145/1255443.1255449}, 387 | bdsk-url-2 = {http://dx.doi.org/10.1145/1255443.1255449}} 388 | 389 | @inproceedings{AroraHaKaFRS06, 390 | acmid = {2165259}, 391 | address = {Berlin, Heidelberg}, 392 | author = {Arora, Sanjeev and Hazan, Elad and Kale, Satyen}, 393 | booktitle = {Proceedings of the 9th international conference on Approximation Algorithms for Combinatorial Optimization Problems, and 10th international conference on Randomization and Computation}, 394 | date-added = {2012-12-16 10:02:26 +0200}, 395 | date-modified = {2012-12-16 10:02:26 +0200}, 396 | doi = {10.1007/11830924_26}, 397 | isbn = {3-540-38044-2, 978-3-540-38044-3}, 398 | location = {Barcelona, Spain}, 399 | numpages = {8}, 400 | pages = {272--279}, 401 | publisher = {Springer-Verlag}, 402 | series = {APPROX'06/RANDOM'06}, 403 | title = {A fast random sampling algorithm for sparsifying matrices}, 404 | url = {http://dx.doi.org/10.1007/11830924_26}, 405 | year = {2006}, 406 | bdsk-url-1 = {http://dx.doi.org/10.1007/11830924_26}} 407 | 408 | @inproceedings{JelaniH2012, 409 | author = {Jelani Nelson and Huy L. Nguyen}, 410 | booktitle = {arXiv:1211.0995v1}, 411 | date-added = {2012-12-02 10:16:07 +0200}, 412 | date-modified = {2012-12-02 10:16:45 +0200}, 413 | title = {Sparsity Lower Bounds for Dimensionality Reducing Maps}, 414 | year = {2012}} 415 | 416 | @inproceedings{KaneN12, 417 | author = {Daniel M. Kane and Jelani Nelson}, 418 | bibsource = {DBLP, http://dblp.uni-trier.de}, 419 | booktitle = {SODA}, 420 | crossref = {DBLP:conf/soda/2012}, 421 | date-added = {2012-12-02 10:12:45 +0200}, 422 | date-modified = {2012-12-02 10:12:51 +0200}, 423 | ee = {http://portal.acm.org/citation.cfm?id=2095210{\&}CFID=63838676{\&}CFTOKEN=79617016}, 424 | pages = {1195-1206}, 425 | title = {Sparser Johnson-Lindenstrauss transforms}, 426 | year = {2012}} 427 | 428 | @article{GuhaMMMO03, 429 | author = {Sudipto Guha and Adam Meyerson and Nina Mishra and Rajeev Motwani and Liadan O'Callaghan}, 430 | bibsource = {DBLP, http://dblp.uni-trier.de}, 431 | date-added = {2012-01-15 15:34:54 +0200}, 432 | date-modified = {2012-01-15 15:35:01 +0200}, 433 | ee = {http://doi.ieeecomputersociety.org/10.1109/TKDE.2003.1198387}, 434 | journal = {IEEE Trans. Knowl. 
Data Eng.}, 435 | number = {3}, 436 | pages = {515-528}, 437 | title = {Clustering Data Streams: Theory and Practice}, 438 | volume = {15}, 439 | year = {2003}} 440 | 441 | @inproceedings{AilonJM09, 442 | author = {Nir Ailon and Ragesh Jaiswal and Claire Monteleoni}, 443 | bibsource = {DBLP, http://dblp.uni-trier.de}, 444 | booktitle = {NIPS}, 445 | crossref = {DBLP:conf/nips/2009}, 446 | date-added = {2012-01-15 15:17:28 +0200}, 447 | date-modified = {2012-01-15 15:17:36 +0200}, 448 | ee = {http://books.nips.cc/papers/files/nips22/NIPS2009_1085.pdf}, 449 | pages = {10-18}, 450 | title = {Streaming k-means approximation}, 451 | year = {2009}} 452 | 453 | @inproceedings{ArthurV07, 454 | author = {David Arthur and Sergei Vassilvitskii}, 455 | bibsource = {DBLP, http://dblp.uni-trier.de}, 456 | booktitle = {SODA}, 457 | crossref = {DBLP:conf/soda/2007}, 458 | date-added = {2012-01-15 15:14:52 +0200}, 459 | date-modified = {2012-01-15 15:15:02 +0200}, 460 | ee = {http://doi.acm.org/10.1145/1283383.1283494}, 461 | pages = {1027-1035}, 462 | title = {k-means++: the advantages of careful seeding}, 463 | year = {2007}} 464 | 465 | @inproceedings{hk-sckmk-05, 466 | author = {S. {Har-Peled} and A. Kushal}, 467 | booktitle = {Proceedings of the 21st Annual Symposium on Computational Geometry (SoCG)}, 468 | date-added = {2012-01-15 14:17:41 +0200}, 469 | date-modified = {2023-08-28 07:17:01 -0400}, 470 | pages = {126--134}, 471 | title = {Smaller Coresets for k-Median and k-Means Clustering}, 472 | year = {2005}} 473 | 474 | @inproceedings{DingH04a, 475 | author = {Chris H. Q. Ding and Xiaofeng He}, 476 | bibsource = {DBLP, http://dblp.uni-trier.de}, 477 | booktitle = {ICML}, 478 | crossref = {DBLP:conf/icml/2004}, 479 | date-added = {2012-01-14 18:01:07 +0200}, 480 | date-modified = {2023-08-28 07:16:45 -0400}, 481 | ee = {http://doi.acm.org/10.1145/1015330.1015408}, 482 | title = {K-means clustering via principal component analysis}, 483 | year = {2004}} 484 | 485 | @article{Lloyd82leastsquares, 486 | author = {Stuart P. Lloyd}, 487 | date-added = {2012-01-14 17:55:10 +0200}, 488 | date-modified = {2012-01-14 17:55:10 +0200}, 489 | journal = {IEEE Transactions on Information Theory}, 490 | pages = {129--137}, 491 | title = {Least squares quantization in {PCM}}, 492 | volume = {28}, 493 | year = {1982}} 494 | 495 | @inproceedings{ZhaHDGS01, 496 | author = {Hongyuan Zha and Xiaofeng He and Chris H. Q. Ding and Ming Gu and Horst D. Simon}, 497 | bibsource = {DBLP, http://dblp.uni-trier.de}, 498 | booktitle = {NIPS}, 499 | crossref = {DBLP:conf/nips/2001}, 500 | date-added = {2012-01-14 17:53:42 +0200}, 501 | date-modified = {2012-01-14 17:54:11 +0200}, 502 | ee = {http://www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/AA41.ps.gz}, 503 | pages = {1057-1064}, 504 | title = {Spectral Relaxation for K-means Clustering}, 505 | year = {2001}} 506 | 507 | @article{Bentley75, 508 | acmid = {361007}, 509 | address = {New York, NY, USA}, 510 | author = {Bentley, Jon Louis}, 511 | date-added = {2011-12-30 14:57:39 +0200}, 512 | date-modified = {2011-12-30 14:57:58 +0200}, 513 | doi = {http://doi.acm.org/10.1145/361002.361007}, 514 | issn = {0001-0782}, 515 | issue = {9}, 516 | journal = {Commun.
ACM}, 517 | keywords = {associative retrieval, attribute, binary search trees, binary tree insertion, information retrieval system, intersection queries, key, nearest neighbor queries, partial match queries}, 518 | month = {September}, 519 | numpages = {9}, 520 | pages = {509--517}, 521 | publisher = {ACM}, 522 | title = {Multidimensional binary search trees used for associative searching}, 523 | url = {http://doi.acm.org/10.1145/361002.361007}, 524 | volume = {18}, 525 | year = {1975}, 526 | bdsk-url-1 = {http://doi.acm.org/10.1145/361002.361007}} 527 | 528 | @inproceedings{GionisIM99, 529 | author = {Aristides Gionis and Piotr Indyk and Rajeev Motwani}, 530 | booktitle = {VLDB}, 531 | date-added = {2011-12-30 14:48:57 +0200}, 532 | date-modified = {2011-12-30 14:54:13 +0200}, 533 | pages = {518-529}, 534 | title = {Similarity Search in High Dimensions via Hashing}, 535 | year = {1999}} 536 | 537 | @inproceedings{Charikar02, 538 | author = {Moses Charikar}, 539 | bibsource = {DBLP, http://dblp.uni-trier.de}, 540 | booktitle = {STOC}, 541 | date-added = {2011-12-30 14:44:59 +0200}, 542 | date-modified = {2011-12-30 14:45:18 +0200}, 543 | ee = {http://doi.acm.org/10.1145/509907.509965}, 544 | pages = {380-388}, 545 | title = {Similarity estimation techniques from rounding algorithms}, 546 | year = {2002}} 547 | 548 | @misc{Drineas03passefficient, 549 | author = {Petros Drineas and Ravi Kannan}, 550 | date-added = {2011-12-18 16:54:46 +0200}, 551 | date-modified = {2011-12-18 16:54:46 +0200}, 552 | title = {Pass Efficient Algorithms for Approximating Large Matrices}, 553 | year = {2003}} 554 | 555 | @article{AhlswedeW02, 556 | author = {Rudolf Ahlswede and Andreas Winter}, 557 | bibsource = {DBLP, http://dblp.uni-trier.de}, 558 | date-added = {2011-12-18 11:47:32 +0200}, 559 | date-modified = {2011-12-18 11:47:42 +0200}, 560 | ee = {http://dx.doi.org/10.1109/18.985947}, 561 | journal = {IEEE Transactions on Information Theory}, 562 | number = {3}, 563 | pages = {569-579}, 564 | title = {Strong converse for identification via quantum channels}, 565 | volume = {48}, 566 | year = {2002}} 567 | 568 | @inproceedings{AilonL11, 569 | author = {Nir Ailon and Edo Liberty}, 570 | bibsource = {DBLP, http://dblp.uni-trier.de}, 571 | booktitle = {SODA}, 572 | crossref = {DBLP:conf/soda/2011}, 573 | date-added = {2011-11-27 14:08:34 +0200}, 574 | date-modified = {2011-11-27 14:08:39 +0200}, 575 | ee = {http://www.siam.org/proceedings/soda/2011/SODA11_017_ailonn.pdf}, 576 | pages = {185-191}, 577 | title = {An Almost Optimal Unrestricted Fast Johnson-Lindenstrauss Transform}, 578 | year = {2011}} 579 | 580 | @article{DasGuptaGupta99, 581 | author = {S. Dasgupta and A. Gupta}, 582 | date-added = {2011-11-27 14:06:16 +0200}, 583 | date-modified = {2023-08-28 07:29:49 -0400}, 584 | journal = {Technical Report, UC Berkeley}, 585 | title = {An elementary proof of the Johnson-Lindenstrauss lemma}, 586 | volume = {99-006}, 587 | year = 1999} 588 | 589 | @inproceedings{AilonCh06, 590 | address = {Seattle, WA}, 591 | author = {Nir Ailon and Bernard Chazelle}, 592 | booktitle = {Proceedings of the 38th Annual Symposium on the Theory of Computing (STOC)}, 593 | date-added = {2011-11-27 14:05:58 +0200}, 594 | date-modified = {2023-08-28 07:30:03 -0400}, 595 | pages = {557--563}, 596 | title = {Approximate nearest neighbors and the fast Johnson-Lindenstrauss transform}, 597 | year = 2006} 598 | 599 | @article{JL84, 600 | author = {W. B. Johnson and J.
Lindenstrauss}, 601 | date-added = {2011-11-27 14:05:01 +0200}, 602 | date-modified = {2023-08-28 07:18:14 -0400}, 603 | journal = {Contemporary Mathematics}, 604 | pages = {189--206}, 605 | title = {Extensions of Lipschitz mappings into a Hilbert space}, 606 | volume = 26, 607 | year = 1984} 608 | 609 | @article{douze2015quickcsg, 610 | author = {Douze, Matthijs and Franco, Jean-S{\'e}bastien and Raffin, Bruno}, 611 | school = {Inria-Research Centre Grenoble--Rh{\^o}ne-Alpes; INRIA}, 612 | title = {QuickCSG: Arbitrary and faster boolean combinations of n solids}, 613 | year = {2015}} 614 | 615 | @phdthesis{subramanian1990search, 616 | author = {Subramanian, K. R. and Fussell, D. S.}, 617 | school = {The University of Texas at Austin}, 618 | title = {A search structure based on kd trees for efficient ray tracing}, 619 | year = {1990}} 620 | -------------------------------------------------------------------------------- /class_notes/vs.sty: -------------------------------------------------------------------------------- 1 | 2 | %%%% PACKAGES %%%% 3 | \usepackage{fullpage} 4 | \usepackage{algorithm,algorithmic} 5 | \usepackage{amsfonts, amsmath, amsthm} 6 | \usepackage{graphicx} 7 | %for dotted lines inside matrices 8 | \usepackage{arydshln} 9 | \setlength{\dashlinedash}{.8pt} % 10 | \setlength{\dashlinegap}{1.2pt} % 11 | 12 | %%%% ENVIRONMENTS %%%% 13 | \newtheorem{definition}{Definition}[section] 14 | \newtheorem{fact}{Fact}[section] 15 | \newtheorem{claim}{Claim}[section] 16 | \newtheorem{lemma}{Lemma}[section] 17 | \newtheorem{remark}{Remark}[section] 18 | \newtheorem{theorem}{Theorem}[section] 19 | \newtheorem{proposition}{Proposition}[section] 20 | 21 | %%%% COMMANDS %%%% 22 | \newcommand{\E}{{\mathbb E}} 23 | \newcommand{\Var}{{\operatorname{Var}}} 24 | \newcommand{\var}{{\operatorname{Var}}} 25 | \newcommand{\poly}{{\operatorname{poly}}} 26 | \newcommand{\const}{{\operatorname{const}}} 27 | \newcommand{\OPT}{{\operatorname{OPT}}} 28 | \newcommand{\ALG}{{\operatorname{ALG}}} 29 | 30 | 31 | \newcommand{\allones}{\mathbf{1}} 32 | \newcommand{\abs}[1]{\left| #1 \right|} 33 | \newcommand{\norm}[1]{\| #1 \|} 34 | \newcommand{\eps}{\varepsilon} 35 | \newcommand{\tab}{\hspace{.5cm}} 36 | \newcommand{\R}{{\mathbb{R}}} 37 | \newcommand{\Sph}{{\mathbb{S}}} 38 | \newcommand{\N}{{\mathcal{N}}} 39 | 40 | 41 | \newcommand{\lecturetitle}[1]{ 42 | \noindent 43 | \begin{center} 44 | \framebox{ 45 | \vbox{\vspace{2mm} 46 | \hbox to 6.28in { {\bf Long Term Memory in AI - Vector Search and Databases 47 | \hfill COS 597A Fall 2023} } 48 | \vspace{4mm} 49 | \hbox to 6.28in { {\Large \hfill #1 \hfill} } 50 | \vspace{2mm} 51 | \hbox to 6.28in { {\it Lectures: Edo Liberty and Matthijs Douze \hfill}} 52 | \vspace{2mm}} 53 | } 54 | \end{center} 55 | \markboth{Lectures: Edo Liberty and Matthijs Douze}{Lectures: Edo Liberty and Matthijs Douze} 56 | {\small 57 | {\bf Warning}: {\it 58 | Please do not cite this note as a peer reviewed source. 59 | Please submit requests and corrections as issues or pull requests at github.com/edoliberty/vector-search-class-notes} 60 | \vspace*{4mm}} 61 | \hrule 62 | \vspace{1cm} 63 | } 64 | 65 | %%%% COMMON DEFS %%%% 66 | \date{} 67 | 68 | --------------------------------------------------------------------------------
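
Editor's note: vs.sty above supplies the shared preamble for the class notes (packages, section-numbered theorem-like environments, shorthand macros such as \E, \eps, \norm, and \R, and the boxed \lecturetitle header), and vs.bib supplies the citation keys. As a quick illustration of how the pieces fit together, here is a minimal, hypothetical class-note source; the file name and body text are assumptions made for this sketch and not files from the repository, but every command and the `pq` citation key come from vs.sty and vs.bib above.

% Class_XX_example.tex -- hypothetical minimal class note, not a file in this repository
\documentclass{article}
\usepackage{vs}  % loads the packages, theorem environments, and macros defined in vs.sty

\begin{document}

% Typesets the boxed course header defined by \lecturetitle in vs.sty
\lecturetitle{Class 8: Quantization}

\section{Product Quantization}
Product quantization \cite{pq} encodes a vector $x \in \R^d$ by splitting it
into subvectors and quantizing each part against a small codebook.

% The theorem-like environments from vs.sty are numbered within sections
\begin{fact}
For any $x \in \R^d$, $\norm{x}^2 = \sum_{i=1}^{d} x_i^2$.
\end{fact}

\bibliographystyle{alpha}
\bibliography{vs}  % resolves \cite keys against vs.bib above

\end{document}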