├── .gitignore ├── README.md ├── __init__.py ├── c ├── Makefile ├── common.c ├── common.h ├── frequentDirections.c ├── frequentDirections.h ├── randomProjection.c ├── randomProjection.h ├── randomSum.c ├── randomSum.h ├── rowSampler.c ├── rowSampler.h ├── singleItemSampler.c ├── singleItemSampler.h ├── sparseMatrix.c ├── sparseMatrix.h ├── sparseSketcher.c ├── sparseSketcher.h ├── sparseVector.c ├── sparseVector.h ├── testAll.c ├── test_vs_d.c ├── test_vs_ell.c ├── test_vs_n.c └── test_vs_sparsity.c ├── experiments ├── __init__.py ├── compareApproximationErrors.py └── compareRunningTimes.py ├── frequent_directions_experiments ├── __init__.py ├── blockPower.py ├── bruteForce.py ├── entrySampler.py ├── exampleUsage.py ├── frequentDirections.py ├── matrixSketcherBase.py ├── randomProjections.py ├── randomSums.py ├── rowSampler.py ├── sparseMatrix.py ├── sparseSketcher.py ├── sparseSketcher_sparseMat.py ├── sparseVector.py └── utils │ ├── __init__.py │ ├── common.py │ ├── reservoirSampler.py │ └── syntheticDataMaker.py ├── setup.py ├── test.sh └── test ├── __init__.py ├── runtests.sh ├── testBruteForce.py ├── testEntrySampler.py ├── testFrequentDirections.py ├── testRandomProjections.py ├── testRandomSums.py ├── testRowSampler.py └── testSparseSketcher.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # mac folder file 60 | *.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Matrix Sketching 2 | This repo was created by [Edo Liberty](http://www.edoliberty.com) and [Mina Ghashami](http://www.cs.utah.edu/~ghashami/). 3 | It builds all common streaming matrix sketching algorithms in Python. 4 | It is developed for academic use only and for reproducibility of the results in the following papers: 5 | * [Simple and Deterministic Matrix Sketches](http://www.cs.yale.edu/homes/el327/papers/simpleMatrixSketching.pdf) Edo Liberty 6 | * [Relative Errors for Deterministic Low-Rank Matrix Approximations](http://www.cs.utah.edu/~ghashami/papers/relative_err_soda.pdf) Mina Ghashami, Jeff M. Phillips 7 | * [Frequent Directions: Simple and Deterministic Matrix Sketching](http://www.cs.utah.edu/~ghashami/papers/fd_journal.pdf) Mina Ghashami, Edo Liberty, Jeff M. Phillips, David P. Woodruff 8 | * [Efficient Frequent Directions Algorithm for Sparse Matrices](http://arxiv.org/abs/1602.00412) Mina Ghashami, Edo Liberty, Jeff M. Phillips 9 | 10 | 11 | #### Usage 12 | If you are only using the library, you will only need the "python" folder. 13 | It contains an exampleUsage.py file for your convenience. 14 | 15 | 16 | #### Running tests and experiments 17 | Running tests requires using the -m flag which is standard in python unittesting. 
18 | For example, to run the bruteForce sketcher test, go to the parent directory (outside frequentdirections/) and run 19 | ``` 20 | python -m frequentdirections.test.testBruteForce 21 | ``` 22 | 23 | #### Contributing 24 | Please feel free to send me pull requests. The test package is minimal. 25 | So, if you make changes to the core classes, please also include tests to cover your changes. -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edoliberty/frequent-directions-experiments/06ecc4a1513c9b83c0bda3de1d2cb5ded468e3a0/__init__.py -------------------------------------------------------------------------------- /c/Makefile: -------------------------------------------------------------------------------- 1 | 2 | main: 3 | gcc -c -O3 -g -std=c99 -o sparseVector.o sparseVector.c 4 | gcc -c -O3 -g -std=c99 -o sparseMatrix.o sparseMatrix.c 5 | gcc -c -O3 -g -std=c99 -o sparseSketcher.o sparseSketcher.c 6 | gcc -c -O3 -g -std=c99 -o frequentDirections.o frequentDirections.c 7 | gcc -c -O3 -g -std=c99 -o testAll.o testAll.c 8 | gcc -c -O3 -g -std=c99 -o common.o common.c 9 | gfortran common.o frequentDirections.o sparseVector.o sparseMatrix.o sparseSketcher.o testAll.o /usr/lib/liblapacke.a /usr/lib/liblapack.a /usr/lib/libblas.a -o testAll.exe 10 | 11 | 12 | -------------------------------------------------------------------------------- /c/common.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | /* 4 | * QR decomposition of G 5 | * Q is returned in G, stored in the row-wise format 6 | * R is not returned 7 | */ 8 | void qrDecomp(double* G, lapack_int d, lapack_int ell) { 9 | /* if G is a vector */ 10 | if(d == 1){ 11 | normalizeVector(G, ell); 12 | return; 13 | } 14 | 15 | double tau[ell]; 16 | for(int i=0; i 5 | #include 6 | #include 7 | 
#include 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | #define max(x, y) (x>y ? x : y) 14 | #define min(x, y) (xclass_name = "FrequentDirections"; 5 | self->dimension = dim; 6 | self->ell = ell; 7 | self->m = 2*ell; 8 | self->sketch = (double*) malloc(dim * (self->m) * sizeof(double)); 9 | self->nextRow = 0; 10 | } 11 | 12 | 13 | void append_to_fd(FrequentDirections* self, SparseVector* sv){ 14 | 15 | if (self->nextRow == self->m) 16 | rotate_fd(self); 17 | 18 | 19 | int j = 0; 20 | int rid = (self->nextRow) * (self->dimension); 21 | 22 | double* vec = densify_sparseVector(sv); 23 | 24 | 25 | for(int i = 0; i < sv->dimension; i++) 26 | self->sketch[rid + i] = vec[i]; 27 | 28 | self->nextRow ++; 29 | free(vec); 30 | } 31 | 32 | 33 | void rotate_fd(FrequentDirections* self){ 34 | double* S = (double*) malloc(sizeof(double) * self->m); 35 | double* U = (double*) malloc(sizeof(double) * self->m * self->m); 36 | double* Vt = (double*) malloc(sizeof(double) * self->m * self->dimension); 37 | 38 | 39 | int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'S', self->m, self->dimension, self->sketch, self->dimension, S, U, self->m, Vt, self->dimension); 40 | 41 | 42 | // compute S*Vt 43 | for(int i=0; i < self->ell; i++){ 44 | S[i] = sqrt( pow(S[i],2) - pow(S[self->ell - 1],2) ); 45 | for(int j=0; j < self->dimension; j++) 46 | self->sketch[i * self->dimension + j] = Vt[i * self->dimension + j] * S[i] ; 47 | } 48 | 49 | memset(&self->sketch[self->ell * self->dimension], 0, self->ell * self->dimension * sizeof(double)); 50 | 51 | 52 | self->nextRow = self->ell; 53 | free(S); free(U); free(Vt); 54 | } 55 | 56 | void get_fdSketch(FrequentDirections* self) {} 57 | -------------------------------------------------------------------------------- /c/frequentDirections.h: -------------------------------------------------------------------------------- 1 | #ifndef FD_H 2 | #define FD_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "sparseVector.h" 
10 | #include 11 | #include "common.h" 12 | 13 | typedef struct { 14 | char* class_name; 15 | int dimension; 16 | int ell; 17 | int m; 18 | int nextRow; 19 | double* sketch; 20 | 21 | } FrequentDirections; 22 | 23 | 24 | void init_fd(FrequentDirections* self, int ell, int dim ); 25 | void append_to_fd(FrequentDirections* self, SparseVector* sv); 26 | void get_fdSketch(FrequentDirections* self); 27 | void rotate_fd(FrequentDirections* self); 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /c/randomProjection.c: -------------------------------------------------------------------------------- 1 | #include "randomProjection.h" 2 | 3 | void init_randomProj(RandomProjection* self, int ell, int dim ){ 4 | self->class_name = "RandomProjection"; 5 | self->dimension = dim; 6 | self->ell = ell; 7 | self->sketch = (double*) malloc(dim * ell * sizeof(double)); 8 | memset(self->sketch, 0, sizeof(double) * ell * dim); 9 | srand(time(NULL)); 10 | } 11 | 12 | 13 | void append_to_randomProj(RandomProjection* self, SparseVector* sv){ 14 | int sign, index; 15 | 16 | for(int i=0; i < self->ell; i++){ 17 | sign = (-2) * (rand() % 2) + 1; 18 | for(int j=0; j < sv->nnz; j++){ 19 | index = i * self->dimension + sv->cols[j]; 20 | self->sketch[index] += (sign/sqrt(self->ell)) * (sv->values[j]); 21 | } 22 | } 23 | 24 | } 25 | 26 | -------------------------------------------------------------------------------- /c/randomProjection.h: -------------------------------------------------------------------------------- 1 | #ifndef RANDPROJ_H 2 | #define RANDPROJ_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "sparseVector.h" 10 | 11 | typedef struct { 12 | char* class_name; 13 | int dimension; 14 | int ell; 15 | double* sketch; 16 | 17 | } RandomProjection; 18 | 19 | 20 | void init_randomProj(RandomProjection* self, int ell, int dim ); 21 | void append_to_randomProj(RandomProjection* self, SparseVector* sv); 
22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /c/randomSum.c: -------------------------------------------------------------------------------- 1 | #include "randomSum.h" 2 | 3 | void init_randomSum(RandomSum* self, int ell, int dim ){ 4 | self->class_name = "RandomSum"; 5 | self->dimension = dim; 6 | self->ell = ell; 7 | self->sketch = (double*) malloc(dim * ell * sizeof(double)); 8 | memset(self->sketch, 0, sizeof(double) * ell * dim); 9 | srand(time(NULL)); 10 | } 11 | 12 | 13 | void append_to_randomSum(RandomSum* self, SparseVector* sv){ 14 | int rid = rand() % (self->ell); 15 | int sign = (-2) * (rand() % 2) + 1; 16 | int index; 17 | for(int i=0; innz; i++){ 18 | index = rid * self->dimension + sv->cols[i]; 19 | self->sketch[index] += sign * (sv->values[i]); 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /c/randomSum.h: -------------------------------------------------------------------------------- 1 | #ifndef RANDSUM_H 2 | #define RANDSUM_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "sparseVector.h" 10 | 11 | typedef struct { 12 | char* class_name; 13 | double* sketch; 14 | int dimension; 15 | int ell; 16 | 17 | } RandomSum; 18 | 19 | 20 | void init_randomSum(RandomSum* self, int ell, int dim ); 21 | void append_to_randomSum(RandomSum* self, SparseVector* sv); 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /c/rowSampler.c: -------------------------------------------------------------------------------- 1 | #include "rowSampler.h" 2 | 3 | void init_rowSampler(RowSampler* self, int ell, int dim ){ 4 | self-> class_name = "RowSampler"; 5 | self-> dimension = dim; 6 | self-> ell = ell; 7 | self-> sketch = (double*) malloc(sizeof(double) * ell * dim); 8 | self-> samplers = (SingleItemSampler*) malloc(sizeof(SingleItemSampler) * ell); 9 | memset(self-> 
sketch, 0 , sizeof(double) * ell * dim); 10 | } 11 | 12 | void append_to_rowSampler(RowSampler* self, SparseVector* sv){ 13 | int i; 14 | for(i=0; i < self-> ell; i++) 15 | add_itemSampler(&(self-> samplers[i]), sv); 16 | } 17 | 18 | 19 | void get_rowSamplerSketch(RowSampler* self){ 20 | 21 | SparseVector* item; 22 | double item_prob; 23 | 24 | for(int i=0; i < self-> ell; i++){ 25 | item = (self-> samplers[i]).item; 26 | item_prob = (self-> samplers[i]).item_probability; 27 | for(int j=0; j< item-> nnz; j++) 28 | self-> sketch[i * self-> dimension + item-> cols[j]] = (item-> values[j]) / sqrt(item_prob * self-> ell); 29 | } 30 | } 31 | 32 | -------------------------------------------------------------------------------- /c/rowSampler.h: -------------------------------------------------------------------------------- 1 | #ifndef ROWSAMPLER_H 2 | #define ROWSAMPLER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "sparseVector.h" 10 | #include "singleItemSampler.h" 11 | 12 | typedef struct { 13 | char* class_name; 14 | int dimension; 15 | int ell; 16 | double* sketch; 17 | SingleItemSampler* samplers; 18 | 19 | } RowSampler; 20 | 21 | 22 | void init_rowSampler(RowSampler* self, int ell, int dim ); 23 | void append_to_rowSampler(RowSampler* self, SparseVector* sv); 24 | void get_rowSamplerSketch(RowSampler* self); 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /c/singleItemSampler.c: -------------------------------------------------------------------------------- 1 | #include "singleItemSampler.h" 2 | 3 | void init_itemSampler(SingleItemSampler* self){ 4 | self-> item = NULL; 5 | self-> item_weight = 0; 6 | self-> item_probability = 0; 7 | self-> sum_w = 0; 8 | self-> machine_precision = 1e-10; 9 | srand(time(NULL)); 10 | } 11 | 12 | 13 | void add_itemSampler(SingleItemSampler* self, SparseVector* sv){ 14 | self-> sum_w += sv-> squaredNorm; 15 | double p = sv-> squaredNorm / 
max(self-> sum_w , self-> machine_precision); 16 | 17 | double randomVal = rand()/(RAND_MAX+1.0); 18 | 19 | if (randomVal < p){ 20 | self-> item = sv; 21 | self-> item_weight = sv-> squaredNorm; 22 | self-> item_probability = p; 23 | }else{ 24 | self-> item_probability = self-> item_probability * (1.0-p); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /c/singleItemSampler.h: -------------------------------------------------------------------------------- 1 | #ifndef ITEMSAMPLER_H 2 | #define ITEMSAMPLER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "sparseVector.h" 10 | #include "common.h" 11 | 12 | 13 | typedef struct { 14 | SparseVector* item; 15 | double item_weight; 16 | double item_probability; 17 | double sum_w; 18 | double machine_precision; 19 | 20 | } SingleItemSampler; 21 | 22 | 23 | void init_itemSampler(SingleItemSampler* self); 24 | void add_itemSampler(SingleItemSampler* self, SparseVector* sv); 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /c/sparseMatrix.c: -------------------------------------------------------------------------------- 1 | #include "sparseMatrix.h" 2 | 3 | 4 | void init_sparseMatrix (SparseMatrix* self, int dim, int len){ 5 | 6 | self->current_nnz = 0; 7 | self->dimension = dim; 8 | self->nextRow = 0; 9 | self->squaredFrob = 0; 10 | self->vectors = (SparseVector*) malloc(sizeof(SparseVector) * len); 11 | } 12 | 13 | 14 | void append_to_sparseMatrix (SparseMatrix* self, SparseVector* sv){ 15 | 16 | self->vectors[self->nextRow] = *sv; 17 | self->nextRow ++; 18 | self->squaredFrob += sv->squaredNorm; 19 | self->current_nnz += sv->nnz; 20 | } 21 | 22 | 23 | void print_sparseMatrix(SparseMatrix* self){ 24 | for(int i=0; i < self->nextRow; i++) 25 | print_sparseVector(&(self->vectors[i])); 26 | 27 | } 28 | 29 | 30 | /* 31 | * computes A^TA.G where G is d*ell matrix 32 | * returns the result 
in G 33 | * temp is 1*ell working memory 34 | * product is d*ell working memory 35 | */ 36 | void covMultiply_sparseMatrix (SparseMatrix* self, int d, int ell, double** G, double* temp, double** product){ 37 | 38 | int rid; 39 | double val; 40 | SparseVector vec; 41 | 42 | for(int j=0; j < d * ell; j++) 43 | (*product)[j] = 0; 44 | 45 | for(int i = 0; i < self-> nextRow; i++){ 46 | for(int j=0; j vectors[i]; 49 | 50 | for(int j=0; j < vec.nnz; j++){ 51 | rid = vec.cols[j] * ell; 52 | val = vec.values[j]; 53 | 54 | for (int t=0; tnextRow) * ell; 78 | SparseVector vec; 79 | int rid, gidx; 80 | double val; 81 | 82 | for(int i=0; i < itr; i++) 83 | product[i] = 0; 84 | 85 | for(int i=0; i < self-> nextRow; i++){ 86 | vec = self-> vectors[i]; 87 | rid = i * ell; 88 | 89 | for (int t=0; t < vec.nnz ; t++){ 90 | val = vec.values[t]; 91 | gidx = vec.cols[t]*ell; 92 | 93 | for(int j=0; j < ell; j++){ 94 | product[rid + j] += val * G[gidx + j]; 95 | } 96 | } 97 | } 98 | 99 | } 100 | 101 | 102 | /* computes Gt*A 103 | * G has ell columns 104 | * output is returned in double* product 105 | */ 106 | void transposeRightMult (SparseMatrix* self, int ell, double* G, double* product){ 107 | int itr = (self->dimension) * ell; 108 | SparseVector vec; 109 | int rid, col; 110 | double val; 111 | 112 | for(int i=0; i nextRow; i++){ 117 | vec = self-> vectors[i]; 118 | rid = i*ell; 119 | 120 | for(int j=0; j < vec.nnz; j++){ 121 | val = vec.values[j]; 122 | col = vec.cols[j]; 123 | 124 | for (int t=0; tdimension) + col] += G[rid + t] * val; 126 | } 127 | } 128 | } 129 | } 130 | 131 | 132 | void blockPowerMethod(SparseMatrix *self, int ell, double epsilon, double* G, double* lsv, double* temp_vec, double* temp_mat){ 133 | int iterations = (int) ceil(1 * (log(self->dimension / epsilon) / epsilon)); 134 | 135 | for(int i=0; i < iterations; i++){ 136 | if(i % 10 == 0) 137 | qrDecomp(G, self->dimension, ell); 138 | covMultiply_sparseMatrix(self, self->dimension, ell, &G, temp_vec, &temp_mat); 
139 | } 140 | 141 | // approx left singular vectors 142 | leftMult (self, ell, G, lsv); 143 | qrDecomp(lsv, self->nextRow, ell); 144 | 145 | } 146 | 147 | 148 | /* returns covariance matrix, i.e. AtA 149 | */ 150 | double* getCovariance_sparseMatrix(SparseMatrix* self){ 151 | 152 | double* cov = (double*) malloc(sizeof(double) * self->dimension * self-> dimension); 153 | memset(cov, 0 , self->dimension * self-> dimension * sizeof(double)); 154 | 155 | 156 | int elemIndex; 157 | double val; 158 | SparseVector vec; 159 | 160 | for(int t=0; t < self->nextRow; t++){ 161 | vec = self->vectors[t]; 162 | for(int i=0; i< vec.nnz; i++){ 163 | for(int j=0; j< vec.nnz; j++){ 164 | elemIndex = vec.cols[i] * self-> dimension + vec.cols[j]; 165 | val = (vec.values[i]) * (vec.values[j]); 166 | cov[elemIndex] += val; 167 | } 168 | } 169 | } 170 | 171 | return cov; 172 | } 173 | 174 | void densify_sparseMatrix(SparseMatrix* self, double* output){ 175 | 176 | int rid; 177 | SparseVector vec; 178 | 179 | int itr = self->nextRow * self->dimension; 180 | for(int i=0; inextRow; t++){ 185 | vec = self->vectors[t]; 186 | rid = t * self->dimension; 187 | 188 | for(int i=0; i < vec.nnz; i++) 189 | output[ rid + vec.cols[i] ] = vec.values[i]; 190 | } 191 | } 192 | 193 | 194 | double computeCovErr(SparseMatrix* A, double* B, int ell, int d){ 195 | double* AtA = getCovariance_sparseMatrix(A); 196 | double* BtB = getDenseCovariance(B, ell, d); 197 | subtract(AtA, BtB, d, d); 198 | return getSpectralNorm(AtA, d, d); 199 | } 200 | 201 | double computeRelCovErr(SparseMatrix* A, double* B, int ell, int d){ 202 | double s = computeCovErr(A,B,ell,d); 203 | return s / A-> squaredFrob; 204 | 205 | } 206 | 207 | double topRank_cov(double* AtA, int d, int k){ 208 | 209 | double* S = (double*) malloc(sizeof(double) * d); 210 | double* U = (double*) malloc(sizeof(double) * d * d); 211 | double* Vt = (double*) malloc(sizeof(double) * d * d); 212 | 213 | int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'N', d, d, 
AtA, d, S, U, d, Vt, d); 214 | 215 | free(U); free(Vt); 216 | double tailSquaredFrob = 0; 217 | 218 | for(int i = k; i < d ; i++) 219 | tailSquaredFrob += S[i]; 220 | 221 | free(S); 222 | return tailSquaredFrob; 223 | 224 | } 225 | 226 | 227 | /* computes top rank k of A, returns it in Vt, returns tail norm of A too */ 228 | double topRank(SparseMatrix* A, int k){ 229 | 230 | double* Adense = (double*) malloc(sizeof(double) * A->nextRow * A->dimension); 231 | densify_sparseMatrix(A, Adense); 232 | 233 | double* S = (double*) malloc(sizeof(double) * A->nextRow); 234 | double* U = (double*) malloc(sizeof(double) * A->nextRow * A->nextRow); 235 | double* Vt = (double*) malloc(sizeof(double) * A->dimension * A->dimension); 236 | 237 | int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'N', A->nextRow, A->dimension, Adense, A->dimension, S, U, A->nextRow, Vt, A->dimension); 238 | 239 | free(U); free(Adense); free(Vt); 240 | int itr = min(A->nextRow, A->dimension); 241 | double tailSquaredFrob = 0; 242 | 243 | for(int i = k; i < itr ; i++) 244 | tailSquaredFrob += pow(S[i],2); 245 | 246 | free(S); 247 | return tailSquaredFrob; 248 | } 249 | 250 | 251 | double computeRelProjErr(SparseMatrix* A, double* B, int ell, int d, int k, double tailSquaredFrob){ 252 | 253 | 254 | double projNorm = 0, projErr = 0; 255 | double projVec[k]; 256 | SparseVector vec; 257 | int rid; 258 | 259 | 260 | double* S = (double*) malloc(sizeof(double) * 2 * ell); 261 | double* U = (double*) malloc(sizeof(double) * 4 * ell * ell); 262 | double* Vt = (double*) malloc(sizeof(double) * d * d); 263 | 264 | int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'A', 2*ell, d, B, d, S, U, 2*ell, Vt, d); 265 | 266 | 267 | for(int t=0; t< A->nextRow; t++){ 268 | vec = A->vectors[t]; 269 | projNorm = 0; 270 | 271 | for(int i=0; innz; i++ ) 286 | dp += sv->values[i] * Vt[rid*dim + sv->cols[i]]; 287 | 288 | return dp; 289 | } 290 | -------------------------------------------------------------------------------- 
/c/sparseMatrix.h: -------------------------------------------------------------------------------- 1 | #ifndef SPARSEMATRIX_H 2 | #define SPARSEMATRIX_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "sparseVector.h" 12 | #include "common.h" 13 | 14 | typedef struct { 15 | SparseVector* vectors; 16 | int nextRow; 17 | int dimension; 18 | int current_nnz; 19 | double squaredFrob; 20 | 21 | } SparseMatrix; 22 | 23 | 24 | double topRank_cov(double* AtA, int d, int k); 25 | 26 | void init_sparseMatrix (SparseMatrix* self, int dim, int len); 27 | void append_to_sparseMatrix (SparseMatrix *self, SparseVector *sv); 28 | void print_sparseMatrix(SparseMatrix* self); 29 | void covMultiply_sparseMatrix (SparseMatrix *self, int dimension, int ell, double** G, double* temp, double** product); 30 | void leftMult (SparseMatrix *self, int ell, double* G, double* product); 31 | void transposeRightMult (SparseMatrix *self, int ell, double* G, double* product); 32 | void blockPowerMethod(SparseMatrix *self, int ell, double epsilon, double* G, double* lsv, double* temp_vec, double* temp_mat); 33 | double* getCovariance_sparseMatrix(SparseMatrix *self); 34 | void densify_sparseMatrix(SparseMatrix* self, double* output); 35 | double computeCovErr(SparseMatrix* A, double* B, int ell, int d); 36 | double computeRelCovErr(SparseMatrix* A, double* B, int ell, int d); 37 | double topRank(SparseMatrix* A, int k); 38 | double computeRelProjErr(SparseMatrix* A, double* B, int ell, int d, int k, double tailSquaredFrob); 39 | double dotproduct(SparseVector* sv, double* Vt, int rid, int dim); 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /c/sparseSketcher.c: -------------------------------------------------------------------------------- 1 | #include "sparseSketcher.h" 2 | 3 | void init_sparseSketcher(SparseSketcher* self, int ell, int dim ){ 4 | self->class_name = 
"sparseSketcher"; 5 | self->dimension = dim; 6 | self->ell = ell; 7 | self->m = 2*ell; 8 | self->sketch = (double*) malloc(sizeof(double) * (self->m) * dim); 9 | init_sparseMatrix(&(self->buffer), dim, dim); 10 | self->nnz_threshold = ell * dim; 11 | 12 | for(int i=0; i < ell*dim; i++) 13 | self->sketch[i] = 0; 14 | 15 | } 16 | 17 | 18 | void append_to_sparseSketcher(SparseSketcher* self, SparseVector* sv){ 19 | if((self->buffer).current_nnz >= self->nnz_threshold || (self->buffer).nextRow >= self->dimension) 20 | rotate_sparseSketcher(self); 21 | append_to_sparseMatrix(&(self->buffer), sv); 22 | } 23 | 24 | void rotate_sparseSketcher(SparseSketcher *self){ 25 | sparseShrink(self); 26 | denseShrink(self); 27 | } 28 | 29 | 30 | void get_sparseSketch(SparseSketcher *self){ 31 | sparseShrink(self); 32 | //rotate_sparseSketcher(self); 33 | } 34 | 35 | void sparseShrink(SparseSketcher *self){ 36 | if((self->buffer).nextRow > self->ell){ 37 | double* temp_vec = (double*) malloc(sizeof(double) * self->ell); 38 | double* temp_mat = (double*) malloc(sizeof(double) * self->ell * self->dimension); 39 | double* G = (double*) malloc(self->ell * self->dimension * sizeof(double)); 40 | double* Z = (double*) malloc(self->ell * (self->buffer).nextRow * sizeof(double)); 41 | 42 | for(int i=0; i < self->ell * self->dimension; i++) 43 | G[i] = ( (float)rand() / (float)(RAND_MAX) ); 44 | 45 | blockPowerMethod(&(self->buffer), self->ell, 1, G, Z, temp_vec, temp_mat); 46 | free(temp_vec); 47 | free(G); 48 | 49 | //computing P = ZtA, temp_mat is P 50 | transposeRightMult(&(self->buffer), self->ell, Z, temp_mat); 51 | free(Z); 52 | 53 | // svd(ZtA) 54 | double* S = (double*) malloc(sizeof(double) * self->ell); 55 | double* U = (double*) malloc(sizeof(double) * self->ell * self->ell); 56 | double* Vt = (double*) malloc(sizeof(double) * self->dimension * self->ell); 57 | 58 | int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'S', self->ell, self->dimension, temp_mat, self->dimension, S, U, 
self->ell, Vt, self->dimension); 59 | free(temp_mat); 60 | 61 | 62 | // shrink S and compute S*Vt 63 | for(int i=0; i < self->ell; i++){ 64 | S[i] = sqrt( pow(S[i],2) - pow(S[self->ell-1],2) ); 65 | for(int j=0; j < self->dimension; j++) 66 | self->sketch[(self->ell + i) * self->dimension + j] = Vt[i * self->dimension + j] * S[i] ; 67 | } 68 | }else{ // self->buffer has atmost ell rows 69 | 70 | SparseVector temp; 71 | 72 | for(int i=0; i < (self->buffer).nextRow; i++){ 73 | temp = (self->buffer).vectors[i]; 74 | for(int j=0; j < temp.nnz; j++) 75 | self->sketch[(self->ell + i) * self->dimension + temp.cols[j]] = temp.values[j]; 76 | } 77 | } 78 | 79 | // reset buffer 80 | (self->buffer).current_nnz = 0; 81 | (self->buffer).nextRow = 0; 82 | (self->buffer).squaredFrob = 0; 83 | } 84 | 85 | 86 | void denseShrink(SparseSketcher* self){ 87 | double* S = (double*) malloc(sizeof(double) * 2 * self->ell); 88 | double* U = (double*) malloc(sizeof(double) * 4 * self->ell * self->ell); 89 | double* Vt = (double*) malloc(sizeof(double) * 2 * self->dimension * self->ell); 90 | 91 | int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'S', 2*self->ell, self->dimension, self->sketch, self->dimension, S, U, 2*self->ell, Vt, self->dimension); 92 | 93 | for(int i=0; i < self->ell; i++){ 94 | S[i] = sqrt( pow(S[i],2) - pow(S[self->ell-1],2) ); 95 | for(int j=0; j < self->dimension; j++) 96 | self->sketch[i * self->dimension + j] = Vt[i * self->dimension + j] * S[i] ; 97 | } 98 | 99 | memset(&self->sketch[self->ell * self->dimension], 0, self->ell * self->dimension * sizeof(double)); 100 | } 101 | -------------------------------------------------------------------------------- /c/sparseSketcher.h: -------------------------------------------------------------------------------- 1 | #ifndef SPARSESKETCHER_H 2 | #define SPARSESKETCHER_H 3 | 4 | #include "sparseMatrix.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | typedef struct { 12 | char* class_name; 13 | int 
dimension; 14 | int ell; 15 | int m; 16 | double* sketch; 17 | SparseMatrix buffer; 18 | int nnz_threshold; 19 | 20 | } SparseSketcher; 21 | 22 | 23 | void init_sparseSketcher(SparseSketcher* self, int ell, int dim ); 24 | void append_to_sparseSketcher(SparseSketcher* self, SparseVector* sv); 25 | void sparseShrink(SparseSketcher* self); 26 | void denseShrink(SparseSketcher* self); 27 | void rotate_sparseSketcher(SparseSketcher* self); 28 | void get_sparseSketch(SparseSketcher* self); 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /c/sparseVector.c: -------------------------------------------------------------------------------- 1 | #include "sparseVector.h" 2 | 3 | 4 | void init_sparseVector(SparseVector* self, int dim, int cols[], double vals[], int nnz){ 5 | self-> nnz = nnz; 6 | self-> dimension = dim; 7 | self-> cols = (int*) malloc(sizeof(int) * self-> nnz); 8 | self-> values = (double*) malloc(sizeof(double) * self-> nnz); 9 | self-> squaredNorm = 0; 10 | 11 | for (int i=0; i < self-> nnz; i++){ 12 | self-> cols[i] = cols[i]; 13 | self-> values[i] = vals[i]; 14 | self-> squaredNorm += pow(vals[i] , 2); 15 | } 16 | } 17 | 18 | 19 | /* it generates a vector of dim dimension, 20 | with only nnz non-zeros 21 | first jlen columns have threshold_prob probability of getting a non-zero 22 | non-zeros are picked from [-10, 10] uniformly at random 23 | */ 24 | void skew_init_sparseVector(SparseVector* self, int dim, int nnz, int jlen, double threshold_prob){ 25 | self-> dimension = dim; 26 | self-> nnz = nnz; 27 | self-> cols = (int*) malloc(sizeof(int) * self-> nnz); 28 | self-> values = (double*) malloc(sizeof(double) * self-> nnz); 29 | self-> squaredNorm = 0; 30 | 31 | double randomVal; 32 | int flag, col_id, t; 33 | 34 | for (int i=0; i < self-> nnz; i++){ 35 | randomVal = rand()/(RAND_MAX+1.0); 36 | 37 | if(randomVal < threshold_prob){ // goes to first "jlen" columns 38 | col_id = (int) rand() % jlen; 39 | 
flag = 1; 40 | 41 | while (flag == 1){ 42 | for (t=0; t < i; t++) 43 | if (col_id == self-> cols[t]) 44 | break; 45 | if (t == i) 46 | flag = 0; 47 | else 48 | col_id = rand() % jlen; 49 | } 50 | } 51 | 52 | else{// goes to the rest of columns 53 | col_id = jlen + (int) rand() % (dim-jlen); 54 | flag = 1; 55 | 56 | while (flag == 1){ 57 | for (t=0; t < i; t++) 58 | if (col_id == self-> cols[t]) 59 | break; 60 | if (t == i) 61 | flag = 0; 62 | else 63 | col_id = jlen + (int) rand() % (dim-jlen); 64 | } 65 | } 66 | self-> cols[i] = col_id; 67 | int tempr = 2 * (rand()%2) - 1; 68 | self-> values[i] = tempr * (int)ceil( ((double)rand()/(double)(RAND_MAX)) * 10); 69 | 70 | self-> squaredNorm += pow(self-> values[i] , 2); 71 | } 72 | } 73 | 74 | 75 | void random_init_sparseVector(SparseVector* self, int dim, int nnz){ 76 | self-> dimension = dim; 77 | self-> nnz = nnz; 78 | self-> cols = (int*) malloc(sizeof(int) * self-> nnz); 79 | self-> values = (double*) malloc(sizeof(double) * self-> nnz); 80 | self-> squaredNorm = 0; 81 | int i; 82 | 83 | for (i=0; i < self-> nnz; i++){ 84 | double newly_gen = rand() % dim; 85 | int flag = 1; 86 | int j= 0; 87 | 88 | while (flag == 1){ 89 | for (j=0; j < i; j++) 90 | if (newly_gen == self-> cols[j]) 91 | break; 92 | if (j == i) 93 | flag = 0; 94 | else 95 | newly_gen = rand() % dim; 96 | } 97 | self-> cols[i] = newly_gen; 98 | self-> values[i] = (int)ceil( ((double)rand()/(double)(RAND_MAX)) * 10); 99 | self-> squaredNorm += pow(self-> values[i] , 2); 100 | } 101 | } 102 | 103 | void print_sparseVector(SparseVector* self){ 104 | 105 | for (int i=0; i< self-> nnz; i++) 106 | printf("(%d, %.2f)", self-> cols[i], self-> values[i] ); 107 | printf("\n"); 108 | } 109 | 110 | 111 | double* densify_sparseVector(SparseVector* self){ 112 | double* vec = (double*) malloc(sizeof(double) * self->dimension); 113 | 114 | for(int i=0; i < self->dimension ; i++) 115 | vec[i] = 0; 116 | for(int i=0; i < self->nnz; i++) 117 | vec[self->cols[i]] = 
self->values[i]; 118 | return vec; 119 | } 120 | 121 | -------------------------------------------------------------------------------- /c/sparseVector.h: -------------------------------------------------------------------------------- 1 | #ifndef SPARSEVEC_H 2 | #define SPARSEVEC_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | typedef struct{ 10 | double squaredNorm; 11 | double* values; 12 | int dimension; 13 | int* cols; 14 | int nnz; 15 | 16 | } SparseVector; 17 | 18 | void init_sparseVector(SparseVector* self, int dim, int cols[], double vals[], int nnz); 19 | void random_init_sparseVector(SparseVector* self, int dim, int nnz); 20 | void print_sparseVector(SparseVector* self); 21 | void skew_init_sparseVector(SparseVector* self, int dim, int nnz, int jlen, double threshold_prob); 22 | double* densify_sparseVector(SparseVector* self); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /c/testAll.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "common.h" 5 | #include "sparseSketcher.h" 6 | #include "frequentDirections.h" 7 | #include "test_vs_ell.c" 8 | #include "test_vs_d.c" 9 | #include "test_vs_n.c" 10 | #include "test_vs_sparsity.c" 11 | 12 | 13 | 14 | int main(){ 15 | test_vs_ell(); 16 | test_vs_sparsity(); 17 | test_vs_d(); 18 | test_vs_n(); 19 | } 20 | 21 | -------------------------------------------------------------------------------- /c/test_vs_d.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "frequentDirections.h" 3 | #include "sparseSketcher.h" 4 | #include 5 | #include 6 | #include 7 | 8 | void test_vs_d(){ 9 | int n = 10000; 10 | int dim_set[] = {1000,2000,3000,4000,5000,6000}; 11 | int k = 10; 12 | int exp_no = 6; 13 | int nnz = 100; 14 | int ell = 50; 15 | 16 | double start, end, cpu_time_used; 17 | SparseMatrix A; 18 | 
SparseVector arr[n]; 19 | SparseSketcher sfd; 20 | FrequentDirections fd; 21 | 22 | double sfd_cov_err[exp_no], sfd_proj_err[exp_no], sfd_time[exp_no]; 23 | double fd_cov_err[exp_no], fd_proj_err[exp_no], fd_time[exp_no]; 24 | 25 | double tailSquaredFrob; 26 | 27 | 28 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | void test_vs_ell(){ 9 | int n = 10000; 10 | int dim = 1000; 11 | int k = 10; 12 | int exp_no = 6; 13 | int nnz = 100; 14 | int ell_set[] = {5, 10, 15, 20, 50, 100}; 15 | 16 | double start, end, cpu_time_used; 17 | SparseMatrix A; 18 | SparseVector arr[n]; 19 | SparseSketcher sfd; 20 | FrequentDirections fd; 21 | 22 | double sfd_cov_err[exp_no], sfd_proj_err[exp_no], sfd_time[exp_no]; 23 | double fd_cov_err[exp_no], fd_proj_err[exp_no], fd_time[exp_no]; 24 | 25 | double tailSquaredFrob; 26 | 27 | 28 | // input matrix 29 | init_sparseMatrix(&A, dim, n); 30 | for (int j=0; j < n; j++){ 31 | skew_init_sparseVector(&arr[j], dim, nnz, (int) (1.5 * nnz), 0.9); 32 | append_to_sparseMatrix(&A, &arr[j]); 33 | } 34 | tailSquaredFrob = topRank(&A, k); 35 | 36 | 37 | // expr 38 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | void test_vs_n(){ 9 | int n_set[] = {10000,20000,30000,40000,50000, 60000}; 10 | int dim = 1000; 11 | int k = 10; 12 | int exp_no = 6; 13 | int nnz = 100; 14 | int ell = 50; 15 | 16 | double start, end, cpu_time_used; 17 | SparseMatrix A; 18 | SparseVector arr[60000]; 19 | SparseSketcher sfd; 20 | FrequentDirections fd; 21 | 22 | double sfd_cov_err[exp_no], sfd_proj_err[exp_no], sfd_time[exp_no]; 23 | double fd_cov_err[exp_no], fd_proj_err[exp_no], fd_time[exp_no]; 24 | 25 | double tailSquaredFrob; 26 | 27 | 28 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | 9 | void test_vs_sparsity(){ 10 | int n = 10000; 11 | int dim = 1000; 12 | int ell = 50; 13 | int k = 10; 14 | int exp_no = 6; 15 | int var_set[] = {0.01 * dim, 0.05 * dim, 0.1*dim, 0.3*dim, 0.5*dim, 0.7*dim}; 16 | //{0.005 * dim, 0.01 * dim, 0.05 * dim, 0.1*dim, 0.3*dim, 0.5*dim}; 
17 | 18 | double start, end, cpu_time_used; 19 | SparseMatrix A; 20 | SparseVector arr[n]; 21 | SparseSketcher sfd; 22 | FrequentDirections fd; 23 | 24 | double sfd_cov_err[exp_no], sfd_proj_err[exp_no], sfd_time[exp_no]; 25 | double fd_cov_err[exp_no], fd_proj_err[exp_no], fd_time[exp_no]; 26 | 27 | double tailSquaredFrob; 28 | 29 | init_sparseSketcher(&sfd, ell, dim); 30 | init_fd(&fd, ell, dim); 31 | 32 | 33 | for(int i=0; i d / 2: 41 | continue 42 | 43 | sketcher = sketcherClass(d, ell) 44 | for row in A: 45 | sketcher.append(row) 46 | 47 | sketch = sketcher.get() 48 | 49 | diff = ATA - dot(sketch.transpose(), sketch) 50 | relative_cov_err = norm(diff, 2) / squared_frob_A 51 | 52 | [u, s, vt] = svd(sketch, full_matrices=False) 53 | vt = vt[:k, :] 54 | projection = dot(A, dot(vt.transpose(), vt)) 55 | proj_err = norm(A - projection, "fro") ** 2 56 | opt_rank_k_err = norm(A - A_rank_k, "fro") ** 2 57 | relative_proj_err = float(proj_err) / float(opt_rank_k_err) 58 | 59 | print(sketcher.class_name, relative_cov_err, relative_proj_err) 60 | -------------------------------------------------------------------------------- /experiments/compareRunningTimes.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from itertools import product 3 | from time import time as timer 4 | from numpy import cov as covariance 5 | 6 | sys.path.append("../sketch") # needed for imports 7 | from utils.syntheticDataMaker import SyntheticDataMaker 8 | import bruteForce, frequentDirections, rowSampler, randomProjections, randomSums 9 | 10 | if __name__ == "__main__": 11 | sketcherClasses = [ 12 | bruteForce.BruteForce, 13 | rowSampler.RowSampler, 14 | randomProjections.RandomProjections, 15 | randomSums.RandomSums, 16 | frequentDirections.FrequentDirections, 17 | ] 18 | ns = [1000] 19 | ds = [100] 20 | ells = range(10, 101, 10) 21 | ks = [5] 22 | rounds = 1 23 | 24 | for (n, d, k) in product(ns, ds, ks): 25 | data_maker = 
SyntheticDataMaker() 26 | data_maker.initBeforeMake(d, k, signal_to_noise_ratio=10.0) 27 | A = data_maker.makeMatrix(n) # n * d matrix 28 | 29 | for (sketcherClass, ell, r) in product(sketcherClasses, ells, range(rounds)): 30 | if ell > d / 2: 31 | continue 32 | 33 | sketcher = sketcherClass(d, ell) 34 | t_start = timer() 35 | for row in A: 36 | sketcher.append(row) 37 | t_end = timer() 38 | 39 | totalSketchTime = t_end - t_start 40 | print(sketcher.class_name, totalSketchTime) 41 | -------------------------------------------------------------------------------- /frequent_directions_experiments/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from . import blockPower 3 | from . import bruteForce 4 | from . import entrySampler 5 | from . import exampleUsage 6 | from . import frequentDirections 7 | from . import matrixSketcherBase 8 | from . import randomProjections 9 | from . import randomSums 10 | from . import rowSampler 11 | from . import sparseMatrix 12 | from . import sparseSketcher_sparseMat 13 | from . import sparseSketcher 14 | from . import sparseVector 15 | from . 
def blockpower(sparseMat, ell, eps=1):
    """Approximate the top ``ell`` right singular directions of a sparse matrix.

    Runs block power (simultaneous) iteration using only the matrix products
    that ``sparseMat`` exposes, so the input is never densified.

    Args:
        sparseMat: object providing ``getShape()`` returning ``(n, d)``,
            ``covarianceMult(G)`` (``A^T A G``), ``leftMult(G)`` (``A G``) and
            ``transposeRightMult(G)`` (``G^T A``).
        ell: number of directions to extract.
        eps: accuracy parameter; smaller values run more iterations.

    Returns:
        A pair ``(S, Vt)``: ``S`` holds the singular values of the projected
        matrix and ``Vt`` has one row per approximate right singular vector
        (``ell`` rows of length ``d`` when ``ell <= d``).
    """
    n, d = sparseMat.getShape()
    init_mat = randn(d, ell)
    # constant 10 should be found experimentally based on eps
    num_of_iter = int(10 * ceil(log(n / eps) / eps))

    for _iteration in range(num_of_iter):
        # Re-orthonormalise before every multiplication; otherwise all columns
        # collapse onto the dominant direction and the entries grow unboundedly.
        [init_mat, _r] = qr(init_mat)
        init_mat = sparseMat.covarianceMult(init_mat)

    K = sparseMat.leftMult(init_mat)  # A * G
    [Q, _r] = qr(K)

    # A^T Q, obtained as the transpose of Q^T A.
    M = sparseMat.transposeRightMult(Q).transpose()

    [U, S, _vt] = svd(M, full_matrices=False)

    return S, U[:, :ell].transpose()
class EntrySampler:
    """Sketch a matrix by sampling individual entries.

    Every entry of every appended row is offered to a reservoir sampler with
    weight ``abs(value)``; :meth:`get` rebuilds a sparse estimate of the input
    from the retained entries and returns an ``ell``-row sketch of it.
    """

    def __init__(self, d, ell):
        """Create a sampler for rows of length ``d`` and a sketch of ``ell`` rows."""
        self.class_name = "EntrySampler"
        self.d = d
        self.ell = ell
        self.nnz = d * ell  # number of entry samples kept
        self.rows = 0  # number of rows appended so far
        self.sampler = ReservoirSampler(self.nnz)

    def append(self, v):
        """Offer every entry of row ``v`` to the sampler, weighted by magnitude."""
        for (col, val) in enumerate(v):
            self.sampler.add((self.rows, col, val), abs(val))
        self.rows += 1

    def get(self):
        """Return the sketch built from the currently sampled entries.

        The sampled entries are rescaled by ``1 / (p * nnz)`` and accumulated
        into a sparse ``rows x d`` matrix ``B``; the sketch is
        ``diag(sqrt(s)) * Vt`` for the ``ell`` leading singular triplets of
        ``B^T B``.
        """
        B = dok_matrix((self.rows, self.d), dtype=float32)
        for ((row, col, val), p) in self.sampler.get(with_probabilities=True):
            B[row, col] += val / (p * self.nnz)

        # numpy.dot does not understand SciPy sparse matrices; use the sparse
        # matrix's own product so the result is a genuine sparse B^T B.
        B = B.tocsr()
        covariance = B.transpose().dot(B)
        (_, s, Vt) = svds(
            covariance, k=self.ell, maxiter=50, return_singular_vectors=True
        )
        return dot(diag(sqrt(s[: self.ell])), Vt[: self.ell, :])
class MatrixSketcherBase:
    """Minimal interface shared by the matrix sketchers.

    Holds an ``ell x d`` sketch initialised to zeros.  ``append`` does nothing
    here; ``extend`` and ``+=`` are conveniences that route through ``append``.
    """

    def __init__(self, d, ell):
        self.d, self.ell = d, ell
        self._sketch = zeros((self.ell, self.d))

    def append(self, vector):
        """Consume one row vector; intentionally a no-op in the base class."""
        return None

    def extend(self, vectors):
        """Feed an iterable of rows (e.g. a 2-D array) through ``append``."""
        for row in vectors:
            self.append(row)

    def get(self):
        """Return the sketch matrix."""
        return self._sketch

    def __iadd__(self, vector):
        """Support ``sketcher += row`` as a synonym for ``append(row)``."""
        self.append(vector)
        return self
class RandomProjections(MatrixSketcherBase):
    """Sketch that adds each row into every sketch row with a random sign.

    For each appended row, ``ell`` coefficients are drawn from
    ``{-1, +1} / sqrt(ell)`` and their outer product with the row is added to
    the sketch.
    """

    def __init__(self, d, ell):
        MatrixSketcherBase.__init__(self, d, ell)
        self.class_name = "RandomProjections"
        # The two admissible coefficients, already scaled by 1/sqrt(ell).
        self.rescaled_signs = [-1.0, 1.0] / sqrt(self.ell)

    def append(self, vector):
        """Add ``outer(random coefficients, vector)`` to the sketch."""
        coefficients = choice(self.rescaled_signs, self.ell)
        update = outer(coefficients, vector)
        self._sketch += update
class singleItemSampler:
    """Weighted sampler that retains a single item from a stream.

    Each offered item replaces the stored one with probability
    ``w / (sum of weights so far)``; ``item_probability`` is updated on every
    offer, whether or not the stored item changes.
    """

    def __init__(self):
        self.item = None
        self.item_weight = 0.0
        self.item_probability = 0.0
        self.sum_w = 0.0
        # Lower bound on the denominator used when computing p.
        self.machine_precision = 1e-10

    def add(self, item, w=1):
        """Offer ``item`` with weight ``w``; non-positive weights are ignored."""
        weight = float(w)
        if weight <= 0.0:
            return

        self.sum_w += weight
        p = weight / max(self.sum_w, self.machine_precision)

        # The random draw happens on every offer, including the very first.
        draw = random()
        if self.item is not None and draw >= p:
            # Stored item survives this round.
            self.item_probability *= 1.0 - p
            return

        self.item = item
        self.item_weight = weight
        self.item_probability = p

    def get(self):
        """Return ``(item, item_weight, item_probability)``."""
        return (self.item, self.item_weight, self.item_probability)
self.nextRow = 0 33 | self.pointer = 0 34 | self.dimension = dim 35 | 36 | def append(self, vector): 37 | if vector.d != self.dimension: 38 | print("dimension mismatch: can not append this vector to the matrix") 39 | return 40 | 41 | # extend arrays 42 | if self.pointer + vector.nnz > self.rows.shape[1]: 43 | z = empty((1, self.rows.shape[1])) 44 | self.rows = concatenate((self.rows, z), axis=1) 45 | self.cols = concatenate((self.cols, z), axis=1) 46 | self.values = concatenate((self.values, z), axis=1) 47 | 48 | for i in range(vector.nnz): 49 | self.rows[0, self.pointer] = self.nextRow 50 | self.cols[0, self.pointer] = vector.cols[i] 51 | self.values[0, self.pointer] = vector.values[i] 52 | self.pointer += 1 53 | 54 | self.nextRow += 1 55 | self.nnz += vector.nnz 56 | 57 | def getShape(self): 58 | return self.nextRow, self.dimension 59 | 60 | def toDense(self): 61 | denseMat = zeros((self.nextRow, self.dimension)) 62 | rowIndex = self.rows[0, 0] 63 | headPtr = 0 64 | 65 | for ptr in range(self.pointer): 66 | if ptr != self.pointer - 1 and self.rows[0, ptr] != rowIndex: 67 | for j in range(headPtr, ptr): 68 | denseMat[rowIndex, self.cols[0, j]] = self.values[0, j] 69 | # resetting 70 | rowIndex = self.rows[0, ptr] 71 | headPtr = ptr 72 | 73 | elif ptr == self.pointer - 1: 74 | if self.rows[0, ptr] == rowIndex: 75 | for j in range(headPtr, ptr + 1): 76 | denseMat[rowIndex, self.cols[0, j]] = self.values[0, j] 77 | elif self.rows[0, ptr] != rowIndex: 78 | for j in range(headPtr, ptr): 79 | denseMat[rowIndex, self.cols[0, j]] = self.values[0, j] 80 | rowIndex = self.rows[0, ptr] 81 | denseMat[rowIndex, self.cols[0, ptr]] = self.values[0, ptr] 82 | 83 | return denseMat 84 | 85 | def sparseShrink(self, ell): 86 | Z = self.blockpower(ell, 0.25) 87 | ZtA = self.transposeRightMult(Z) 88 | [u, s, vt] = svd(ZtA, full_matrices=False) 89 | for i in range(len(s)): 90 | s[i] = sqrt(s[i] ** 2 - s[-1] ** 2) 91 | return diag(s).dot(vt) 92 | 93 | def blockpower(self, ell, eps=1): 94 
| n, d = self.getShape() 95 | init_mat = randn(d, ell) 96 | num_of_iter = int(10 * ceil(log(d / eps) / eps)) 97 | 98 | for i in range(num_of_iter): 99 | [init_mat, _] = qr(init_mat) 100 | init_mat = self.covarianceMult(init_mat) 101 | 102 | K = self.leftMult(init_mat) 103 | [Q, _] = qr(K) 104 | del K 105 | del init_mat 106 | return Q 107 | 108 | ## A^TA * denseMat 109 | def covarianceMult(self, denseMat): 110 | rowIndex = self.rows[0, 0] 111 | headPtr = 0 112 | ptr = 0 113 | d, ell = denseMat.shape 114 | temp = zeros((1, ell)) 115 | product = zeros((d, ell)) 116 | 117 | while ptr != self.pointer: 118 | headPtr = ptr 119 | rowIndex = self.rows[0, headPtr] 120 | del temp 121 | temp = zeros((1, ell)) 122 | 123 | while ptr != self.pointer and self.rows[0, ptr] == rowIndex: 124 | temp += denseMat[self.cols[0, ptr], :] * self.values[0, ptr] 125 | ptr += 1 126 | 127 | for j in range(headPtr, ptr): 128 | product[self.cols[0, j], :] += self.values[0, j] * temp[0, :] 129 | 130 | return product 131 | 132 | ## computes G^tA 133 | def transposeRightMult(self, denseMat): 134 | ptr = 0 135 | rowIndex = self.rows[0, ptr] 136 | m, ell = denseMat.shape 137 | product = zeros((ell, self.dimension)) 138 | 139 | while ptr != self.pointer: 140 | rowIndex = self.rows[0, ptr] 141 | while ptr != self.pointer and rowIndex == self.rows[0, ptr]: 142 | for t in range(ell): 143 | product[t, self.cols[0, ptr]] += ( 144 | self.values[0, ptr] * denseMat[rowIndex, t] 145 | ) 146 | ptr += 1 147 | 148 | return product 149 | 150 | ## computes A*G 151 | def leftMult(self, denseMat): 152 | rowIndex = self.rows[0, 0] 153 | headPtr = 0 154 | d, ell = denseMat.shape 155 | product = zeros((self.nextRow, ell)) 156 | 157 | for ptr in range(self.pointer): 158 | # case 1 159 | if ( 160 | self.rows[0, ptr] != rowIndex and ptr != self.pointer - 1 161 | ): # headPtr -> ptr-1 is one row 162 | for j in range(headPtr, ptr): 163 | product[rowIndex, :] += ( 164 | denseMat[self.cols[0, j], :] * self.values[0, j] 165 | ) 
def simIter(matrix, ell):
    """Simultaneous (block power) iteration for the leading singular directions.

    Args:
        matrix: ``m x d`` input; converted to ``csc_matrix`` internally.
        ell: number of right singular vectors wanted.

    Returns:
        ``(Vt, s)``: ``Vt`` has one approximate right singular vector per row
        (``ell x d`` when ``ell <= min(m, d)``) and ``s`` the matching
        singular values, so that ``norm(matrix.dot(Vt[i])) == s[i]``.
    """
    [m, d] = matrix.shape
    num_of_iter = int(ceil(4 * log(m)))
    init_vectors = randn(m, ell)
    matrix = csc_matrix(matrix)
    matrix_trans = matrix.transpose()

    for _iteration in range(num_of_iter):
        init_vectors = matrix.dot(matrix_trans.dot(init_vectors))

    # Orthonormal basis Q of the approximate dominant row space.
    [Q, _r] = qr(matrix_trans.dot(init_vectors))
    M = matrix.dot(Q)

    # numpy's svd returns V already transposed (M = U diag(S) Wt).  Since
    # matrix ~= M Q^T = U diag(S) (Wt Q^T), the rows paired with S are Wt Q^T;
    # transposing Wt again would pair the singular values with wrong directions.
    [_u, S, Wt] = svd(M, full_matrices=False)

    return Wt[:ell, :].dot(Q.transpose()), S[:ell]
# sparse frequent directions sketcher
class SparseSketcher(MatrixSketcherBase):
    """Frequent-directions sketcher that buffers rows in a ``SparseMatrix``.

    Rows are collected in a sparse buffer until it holds at least
    ``2 * ell * d`` non-zeros; the buffer is then compressed with
    ``blockpower`` into the lower half of a dense ``2*ell x d`` sketch, which
    is in turn shrunk back into its upper ``ell`` rows.
    """

    def __init__(self, d, ell):
        self.class_name = "SparseSketcher_sparseMatrix"
        self.d = d
        self.ell = ell
        self._sketch = zeros((2 * self.ell, self.d))

        self.buffer_nnz_threshold = 2 * self.ell * self.d
        # SparseMatrix takes the row dimension; passing the nnz threshold here
        # would make it reject every appended vector as a dimension mismatch.
        self.buffer = SparseMatrix(self.d)

    def append(self, vector):
        """Buffer one sparse row, compressing the buffer first when it is full."""
        if self.buffer.nnz >= self.buffer_nnz_threshold:
            self.__rotate__()

        self.buffer.append(vector)

    def __rotate__(self):
        """Fold the buffer into the sketch and shrink the sketch to ``ell`` rows."""
        # An empty buffer has nothing to contribute, and blockpower cannot
        # handle a matrix with zero rows (it takes the log of the row count).
        if self.buffer.nnz > 0:
            # First shrink the buffer
            [s, vt] = blockpower(self.buffer, self.ell)

            # insert the shrunk part into the sketch
            if len(s) < self.ell:
                self._sketch[self.ell : self.ell + len(s), :] = dot(
                    diag(s), vt[: len(s), :]
                )
            else:  # len(s) == self.ell
                sShrunk = sqrt(s[: self.ell] ** 2 - s[self.ell - 1] ** 2)
                self._sketch[self.ell :, :] = dot(diag(sShrunk), vt[: self.ell, :])

            # Reset the buffer to the same type append() relies on.
            self.buffer = SparseMatrix(self.d)

        # A dense shrink of the sketch
        [_, s, vt] = svd(self._sketch, full_matrices=False)
        if len(s) >= self.ell:
            sShrunk = sqrt(s[: self.ell] ** 2 - s[self.ell - 1] ** 2)
            self._sketch[: self.ell, :] = dot(diag(sShrunk), vt[: self.ell, :])
            self._sketch[self.ell :, :] = 0
        else:
            self._sketch[: len(s), :] = dot(diag(s), vt[: len(s), :])
            self._sketch[len(s) :, :] = 0

    def get(self):
        """Flush the buffer and return the ``ell x d`` sketch."""
        self.__rotate__()
        return self._sketch[: self.ell, :]
class SparseVector:
    """Sparse row vector stored as parallel, column-sorted index/value arrays."""

    def __init__(self, d, kvList):
        """Build a vector of length ``d`` from ``(column, value)`` pairs.

        Pairs whose column lies outside ``[0, d)`` are dropped; the rest are
        sorted by column.  The caller's list is not modified.
        """
        self.d = d
        kvList = [kv for kv in kvList if kv[0] >= 0 and kv[0] < self.d]
        kvList.sort()
        self.cols = array([kv[0] for kv in kvList], dtype=uint32)
        self.values = array([kv[1] for kv in kvList], dtype=float32)
        self.shape = (1, self.d)
        self.nnz = len(self.cols)

        self._normSquare = sum(self.values**2)

    def todense(self):
        """Return the vector as a dense ``1 x d`` array."""
        v = zeros(self.shape)
        for i in range(self.nnz):
            v[0, self.cols[i]] = self.values[i]
        return v

    def getNnz(self):
        """Return the number of stored entries."""
        return self.nnz

    def getNormSquare(self):
        """Return the sum of squares of the stored values."""
        return self._normSquare

    def dot(self, other):
        """Return the inner product with another ``SparseVector``.

        Only columns stored in both vectors contribute.  If a column is
        stored more than once, its last value is used, as in ``todense``.
        """
        # Explicit accumulation: the name ``sum`` in this module is numpy's.
        mine = dict(zip(self.cols.tolist(), self.values.tolist()))
        theirs = dict(zip(other.cols.tolist(), other.values.tolist()))
        total = 0.0
        for col, val in mine.items():
            total += val * theirs.get(col, 0.0)
        return total

    def distSquare(self, other):
        """Return the squared Euclidean distance to ``other``."""
        return self._normSquare + other._normSquare - 2 * self.dot(other)
class SyntheticDataMaker:
    """Generate synthetic rows made of a low-dimensional signal plus noise.

    Call ``initBeforeMake`` once to choose the dimensions and the signal
    spectrum, then ``makeRow`` / ``makeMatrix`` to draw data.  The class also
    offers small helpers for reading and writing vectors as comma-separated
    text lines.
    """

    def __init__(self):
        # Set to True by initBeforeMake(); makeRow() refuses to run before.
        self.wasInitForMake = False

    def initBeforeMake(
        self,
        dimension,
        signal_dimension=0,
        signal_to_noise_ratio=0,
        signal_singular_value_decay_factor=0,
        signal_singular_value_decay_type="exp",
    ):
        """Configure the generator.

        Args:
            dimension: Length of every generated row.
            signal_dimension: Number of signal directions.
            signal_to_noise_ratio: Divisor applied to the noise in
                ``makeRow``.  It is used as a plain divisor, so it should be
                non-zero for the rows to be finite.
            signal_singular_value_decay_factor: Decay rate ``eta`` of the
                signal singular values.
            signal_singular_value_decay_type: ``"exp"`` for
                ``exp(-10 * eta * i / signal_dimension)``, ``"lin"`` for
                ``max(1 - eta * i / signal_dimension, 0)``; any other value
                gives all-ones singular values.
        """
        self.dimension = dimension
        self.signal_dimension = signal_dimension
        self.signal_to_noise_ratio = signal_to_noise_ratio
        self.signal_singular_value_decay_factor = signal_singular_value_decay_factor
        self.signal_singular_value_decay_type = signal_singular_value_decay_type

        # Random signal row space: the Q factor of a Gaussian matrix,
        # transposed so that each row is one signal direction.
        Q, _ = qr(randn(self.dimension, self.signal_dimension))
        self.signal_row_space = Q.transpose()

        # Singular values.  float(i) keeps the division a true division in
        # both branches, even when eta is an int under Python 2 semantics.
        eta = self.signal_singular_value_decay_factor
        k = self.signal_dimension
        if self.signal_singular_value_decay_type == "exp":
            values = [numpy.exp(-10 * eta * float(i) / k) for i in range(k)]
        elif self.signal_singular_value_decay_type == "lin":
            values = [max(1.0 - eta * float(i) / k, 0.0) for i in range(k)]
        else:
            values = numpy.ones(k)
        # Always store an ndarray so every decay type yields the same type.
        self.signal_singular_values = numpy.array(values, dtype=float)

        # done initializing
        self.wasInitForMake = True

    def makeRow(self):
        """Return one random row: signal plus noise / signal_to_noise_ratio.

        If ``initBeforeMake`` has not been called, an error message is
        written to stderr and ``None`` is returned.
        """
        if not self.wasInitForMake:
            sys.stderr.write("ERROR: must run initBeforeMake(...) before makeRow()")
            return
        noise = randn(self.dimension)
        signal_coeffs = randn(self.signal_dimension)
        # Scale the random coefficients by the singular values, then map
        # them into the ambient space through the signal row space.
        signal = numpy.dot(
            self.signal_singular_values * signal_coeffs, self.signal_row_space
        )
        return signal + noise / self.signal_to_noise_ratio

    def makeMatrix(self, n):
        """Return an ``(n, dimension)`` matrix whose rows come from makeRow."""
        matrix = numpy.zeros((n, self.dimension))
        for i in range(n):
            matrix[i, :] = self.makeRow()
        return matrix

    def getSignalRowSpace(self):
        """Return the signal row space chosen by ``initBeforeMake``."""
        return self.signal_row_space

    def __vector_to_string__(self, v):
        """Format ``v`` as one comma-separated line of ``%.2E`` numbers."""
        s = "%s\n" % (",".join("%.2E" % x for x in v.flatten()))
        return s

    def __vector_from_string(self, s):
        """Parse one comma-separated line back into a 1-D float array."""
        v = numpy.array([float(x) for x in s.strip("\n").split(",")])
        return v

    def readFromFileIter(self, f=None):
        """Yield one vector per line of ``f`` (``sys.stdin`` when omitted)."""
        # Resolve the default at call time so a redirected stdin is honoured.
        if f is None:
            f = sys.stdin
        for line in f:
            yield self.__vector_from_string(line)

    def writeToFile(self, v, f=None):
        """Write vector ``v`` as one line to ``f`` (``sys.stdout`` when omitted)."""
        if f is None:
            f = sys.stdout
        f.write(self.__vector_to_string__(v))

    def writeToFileIter(self, vs, f=None):
        """Write every vector in ``vs`` as a line to ``f`` (default stdout)."""
        if f is None:
            f = sys.stdout
        for v in vs:
            f.write(self.__vector_to_string__(v))
"""Frequent directions experiments python module."""

from setuptools import find_packages, setup

setup(
    name="frequent_directions_experiments",
    version="0.1.0",
    # List the package and its subpackages explicitly: the repository root
    # also holds other top-level directories, so relying on automatic
    # discovery is ambiguous and may install nothing.
    packages=find_packages(
        include=[
            "frequent_directions_experiments",
            "frequent_directions_experiments.*",
        ]
    ),
)
class testEntrySampler(unittest.TestCase):
    """Smoke test: EntrySampler returns a sketch of shape (ell, d)."""

    def test_running(self):
        # Problem sizes: rows streamed, row dimension, sketch rows.
        num_rows, dim, sketch_rows = 100, 20, 5

        maker = fde.utils.syntheticDataMaker.SyntheticDataMaker()
        maker_settings = {
            "signal_dimension": 10,
            "signal_to_noise_ratio": 5,
            "signal_singular_value_decay_factor": 1,
            "signal_singular_value_decay_type": "lin",
        }
        maker.initBeforeMake(dim, **maker_settings)

        sketcher = fde.entrySampler.EntrySampler(dim, sketch_rows)

        # Stream the synthetic rows into the sketcher one at a time.
        for _ in range(num_rows):
            row = maker.makeRow()
            sketcher.append(row)

        result = sketcher.get()
        self.assertEqual(result.shape, (sketch_rows, dim))
class testRandomProjection(unittest.TestCase):
    """Smoke test: RandomProjections returns a sketch of shape (ell, d)."""

    def test_running(self):
        row_count = 100
        width = 20
        sketch_size = 5

        # Synthetic data source: 10 signal directions with linearly
        # decaying singular values.
        data_maker = fde.utils.syntheticDataMaker.SyntheticDataMaker()
        data_maker.initBeforeMake(
            width,
            signal_dimension=10,
            signal_to_noise_ratio=5,
            signal_singular_value_decay_factor=1,
            signal_singular_value_decay_type="lin",
        )

        sketcher = fde.randomProjections.RandomProjections(width, sketch_size)

        remaining = row_count
        while remaining > 0:
            sketcher.append(data_maker.makeRow())
            remaining -= 1

        expected_shape = (sketch_size, width)
        self.assertEqual(sketcher.get().shape, expected_shape)
class testRowSampler(unittest.TestCase):
    """Smoke test: RowSampler returns a sketch of shape (ell, d)."""

    def test_running(self):
        shape_in = (100, 20)  # (rows streamed, row dimension)
        ell = 5

        source = fde.utils.syntheticDataMaker.SyntheticDataMaker()
        source.initBeforeMake(
            shape_in[1],
            signal_dimension=10,
            signal_to_noise_ratio=5,
            signal_singular_value_decay_factor=1,
            signal_singular_value_decay_type="lin",
        )

        sampler = fde.rowSampler.RowSampler(shape_in[1], ell)

        # Feed every generated row to the sampler, then inspect the sketch.
        for _row_index in range(shape_in[0]):
            sampler.append(source.makeRow())

        sketch = sampler.get()
        self.assertEqual(sketch.shape, (ell, shape_in[1]))
--------------------------------------------------------------------------------