├── .gitignore
├── README.md
├── __init__.py
├── c
    ├── Makefile
    ├── common.c
    ├── common.h
    ├── frequentDirections.c
    ├── frequentDirections.h
    ├── randomProjection.c
    ├── randomProjection.h
    ├── randomSum.c
    ├── randomSum.h
    ├── rowSampler.c
    ├── rowSampler.h
    ├── singleItemSampler.c
    ├── singleItemSampler.h
    ├── sparseMatrix.c
    ├── sparseMatrix.h
    ├── sparseSketcher.c
    ├── sparseSketcher.h
    ├── sparseVector.c
    ├── sparseVector.h
    ├── testAll.c
    ├── test_vs_d.c
    ├── test_vs_ell.c
    ├── test_vs_n.c
    └── test_vs_sparsity.c
├── experiments
    ├── __init__.py
    ├── compareApproximationErrors.py
    └── compareRunningTimes.py
├── frequent_directions_experiments
    ├── __init__.py
    ├── blockPower.py
    ├── bruteForce.py
    ├── entrySampler.py
    ├── exampleUsage.py
    ├── frequentDirections.py
    ├── matrixSketcherBase.py
    ├── randomProjections.py
    ├── randomSums.py
    ├── rowSampler.py
    ├── sparseMatrix.py
    ├── sparseSketcher.py
    ├── sparseSketcher_sparseMat.py
    ├── sparseVector.py
    └── utils
    │   ├── __init__.py
    │   ├── common.py
    │   ├── reservoirSampler.py
    │   └── syntheticDataMaker.py
├── setup.py
├── test.sh
└── test
    ├── __init__.py
    ├── runtests.sh
    ├── testBruteForce.py
    ├── testEntrySampler.py
    ├── testFrequentDirections.py
    ├── testRandomProjections.py
    ├── testRandomSums.py
    ├── testRowSampler.py
    └── testSparseSketcher.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
58 | 
59 | # mac folder file
60 | *.DS_Store


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ### Matrix Sketching
 2 | This repo was created by [Edo Liberty](http://www.edoliberty.com) and [Mina Ghashami](http://www.cs.utah.edu/~ghashami/).
 3 | It builds all common streaming matrix sketching algorithms in Python.
 4 | It is developed for academic use only and for reproducibility of the results in the following papers:
 5 | * [Simple and Deterministic Matrix Sketches](http://www.cs.yale.edu/homes/el327/papers/simpleMatrixSketching.pdf) Edo Liberty
 6 | * [Relative Errors for Deterministic Low-Rank Matrix Approximations](http://www.cs.utah.edu/~ghashami/papers/relative_err_soda.pdf) Mina Ghashami, Jeff M. Phillips
 7 | * [Frequent Directions: Simple and Deterministic Matrix Sketching](http://www.cs.utah.edu/~ghashami/papers/fd_journal.pdf) Mina Ghashami, Edo Liberty, Jeff M. Phillips, David P. Woodruff
 8 | * [Efficient Frequent Directions Algorithm for Sparse Matrices](http://arxiv.org/abs/1602.00412) Mina Ghashami, Edo Liberty, Jeff M. Phillips
 9 | 
10 | 
11 | #### Usage
12 | If you are only using the library, you will only need the "python" folder.
13 | It contains an exampleUsage.py file for your convenience.
14 | 
15 |  
16 | #### Running tests and experiments 
17 | Running tests requires using the -m flag which is standard in python unittesting. 
18 | For example, to run the bruteForce sketcher test, go to the parent directory (outside frequentdirection/) and run
19 | ```
20 | python -m frequentdirections.test.testBruteForce
21 | ```
22 | 
23 | #### Contributing
24 | Please feel free to send me pull requests. The test package is minimal. 
25 | So, if you make changes to the core classes, please also include tests that cover your changes. 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/frequent-directions-experiments/06ecc4a1513c9b83c0bda3de1d2cb5ded468e3a0/__init__.py


--------------------------------------------------------------------------------
/c/Makefile:
--------------------------------------------------------------------------------
 1 | 
# Builds the C test driver testAll.exe.
# Each listed source is compiled separately with gcc (-O3, debug symbols, C99).
# The final link is done through gfortran and statically links the LAPACKE,
# LAPACK and BLAS archives found under /usr/lib.
# NOTE(review): only the sources listed here are built; other .c files in this
# directory are not part of this target.
main: 
	gcc -c -O3 -g -std=c99 -o sparseVector.o sparseVector.c
	gcc -c -O3 -g -std=c99 -o sparseMatrix.o sparseMatrix.c
	gcc -c -O3 -g -std=c99 -o sparseSketcher.o sparseSketcher.c
	gcc -c -O3 -g -std=c99 -o frequentDirections.o frequentDirections.c	
	gcc -c -O3 -g -std=c99 -o testAll.o testAll.c
	gcc -c -O3 -g -std=c99 -o common.o common.c
	gfortran common.o frequentDirections.o sparseVector.o sparseMatrix.o sparseSketcher.o testAll.o /usr/lib/liblapacke.a /usr/lib/liblapack.a /usr/lib/libblas.a -o testAll.exe
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/c/common.c:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | 
  3 | /*
  4 |  * QR decomposition of G
  5 |  * Q is returned in G, stored in the row-wise format
  6 |  * R is not returned 
  7 | */
  8 | void qrDecomp(double* G, lapack_int d, lapack_int ell) {
  9 |   /* if G is a vector */
 10 |   if(d == 1){ 
 11 |     normalizeVector(G, ell);
 12 |     return;
 13 |   }
 14 | 
 15 |   double tau[ell];
 16 |   for(int i=0; i<ell; i++)
 17 |     tau[i] = 0;
 18 | 
 19 |   lapack_int x = LAPACKE_dgeqrf(LAPACK_ROW_MAJOR, d, ell, G, ell, tau);
 20 |   x = LAPACKE_dorgqr(LAPACK_ROW_MAJOR, d, ell, ell, G, ell, tau);
 21 | }
 22 | 
 23 | 
 24 | void print_two_dim(char* desc, double* mat, int m, int n) {
 25 |   
 26 |   printf("%s \n",desc);
 27 |   for(int i = 0; i < m; i++ ) {
 28 |     for(int j=0; j < n; j++)
 29 |       printf( " %.10e", mat[i*n+j] );
 30 |     printf("\n ");
 31 |   }
 32 | }
 33 | 
 34 | 
 35 | void print_one_dim_double(char* desc, double* mat, int length){
 36 |   printf("%s",desc);
 37 |   printf("[");
 38 |   for(int i = 0; i < length; i++ )
 39 |     if (i < length - 1)
 40 |       printf("%f , ",mat[i]);
 41 |     else
 42 |       printf("%f ",mat[i]);
 43 |   printf("],\n");
 44 | }
 45 | 
 46 | void print_one_dim_int(char* desc, int* mat, int length){
 47 |   
 48 |   printf("%s",desc);
 49 |   printf("[");
 50 |   for(int i = 0; i < length; i++ )
 51 |     printf("%d ,",mat[i]);
 52 |   printf("]\n");
 53 | }
 54 | 
 55 | void dot_product(double* C, int m, int n){
 56 |   double dotproduct = 0;
 57 | 
 58 |   for(int j = 0; j < n-1; j++ ) {
 59 |     dotproduct = 0;
 60 |     for(int i = 0; i < m; i++ ) 
 61 |       dotproduct += C[i*n+j] * C[i*n+j+1];
 62 |     printf("dot product of columns %d and %d = %f", j, j+1, dotproduct);
 63 |     printf("\n");
 64 |   }
 65 | }
 66 | 
 67 | void normalizeVector(double* vec, int len){
 68 |   int squaredNorm = 0;
 69 | 
 70 |   for(int i=0; i<len; i++)
 71 |     squaredNorm += pow(vec[i],2);
 72 | 
 73 |   for(int i=0; i<len; i++)
 74 |     vec[i] = vec[i] / sqrt(squaredNorm); 
 75 |   
 76 | }
 77 | 
 78 | void column_norm (double* C, int m, int n){
 79 |   double temp = 0;
 80 |   
 81 |   for(int j = 0; j < n; j++ ) {
 82 |     temp = 0;
 83 |     for(int i = 0; i < m; i++ ) 
 84 |       temp += pow(C[i*n+j],2);
 85 |     printf("column %d norm = %f", j, temp);
 86 |     printf("\n");
 87 |   }
 88 | }
 89 | 
 90 | void printline(){
 91 |   printf("---------------------\n");
 92 | }
 93 | 
 94 | // computes AtA of dimensions d*d
 95 | double* getDenseCovariance(double* mat, int ell, int d){
 96 |   double* cov = (double*) malloc(sizeof(double) * d * d);
 97 | 
 98 |   for(int i=0; i<d; i++)
 99 |     for(int j=0; j<d; j++)
100 |       cov[i*d+j] = 0;
101 | 
102 |   
103 |  
104 |   for(int i=0; i<ell; i++)
105 |     for(int j=0; j<d; j++)
106 |       for(int k=0; k<d; k++){
107 | 	cov[j*d+k] += mat[i*d+j] * mat[i*d+k];
108 |       }
109 | 
110 |   return cov;
111 | }
112 | 
// Computes mat1 = mat1 - mat2 in place for two row-major n x d matrices.
// Nothing is returned; the difference overwrites mat1.
//
// The sizes of the buffers cannot be recovered from the pointers, so the
// caller is responsible for passing matrices that both hold n * d entries.
// (A sizeof() comparison of the two pointers only compares pointer sizes
// and can never detect a mismatch.)
// A NULL argument makes the call a no-op.
void subtract(double* mat1, double* mat2, int n, int d){
  if (mat1 == NULL || mat2 == NULL)
    return;

  for(int i=0; i<n; i++)
    for(int j=0; j<d; j++)
      mat1[i*d+j] -= mat2[i*d+j];
}
125 | 
126 | // computes spectral norm of mat
127 | double getSpectralNorm(double* mat, int ell, int d){
128 | 
129 |   double* S = (double*) malloc(sizeof(double) * ell);
130 |   double* U = (double*) malloc(sizeof(double) * ell * 1);
131 |   double* Vt = (double*) malloc(sizeof(double) * d * 1);
132 | 
133 |   int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'N', ell, d, mat, d, S, U, ell, Vt, d);
134 | 
135 |   free(U); free(Vt);
136 |   double sing = S[0];
137 |   free(S);
138 |   return sing;
139 | }
140 | 
141 | 
/*
 * Writes the row-major n x d matrix mat to the text file "CtC.txt" in the
 * current directory, one matrix row per line, entries in %f format
 * separated by spaces. An existing file is overwritten.
 *
 * If the file cannot be opened an error is reported on stderr and nothing
 * is written.
 */
void write_to_file(double* mat, int n, int d){
  printf("in Write to file \n");

  FILE* fp = fopen("CtC.txt","w");
  if (fp == NULL) {
    /* without this check fprintf/fclose would be called on a NULL stream */
    perror("CtC.txt");
    return;
  }

  for(int i=0; i<n; i++){
    for(int j=0; j<d; j++){
      fprintf(fp, "%f ",mat[i*d +  j]);
    }
    fprintf(fp,"%s","\n");
  }

  fclose(fp);
}
157 | 
158 | 


--------------------------------------------------------------------------------
/c/common.h:
--------------------------------------------------------------------------------
 1 | #ifndef COMMON
 2 | #define COMMON
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <math.h>
 7 | #include <string.h>
 8 | #include <time.h>
 9 | #include <stdint.h>
10 | #include <lapacke.h>
11 | 
12 | 
/*
 * Generic maximum / minimum of two expressions.
 * Arguments and the whole expansion are parenthesized so that operands
 * containing low-precedence operators (e.g. a & b, a | b, a ? b : c) and
 * uses inside larger expressions evaluate as expected.
 * Each argument may be evaluated twice: do not pass expressions with side
 * effects.
 */
#define max(x, y) ((x) > (y) ? (x) : (y))
#define min(x, y) ((x) < (y) ? (x) : (y))
15 | 
16 | 
17 | void write_to_file(double* mat, int n, int d);
18 | void qrDecomp(double* G, lapack_int d, lapack_int ell);
19 | void dot_product(double* C, int m, int n);
20 | void column_norm (double* C, int m, int n);
21 | double getSpectralNorm(double* mat, int ell, int d);
22 | void subtract(double* mat1, double* mat2, int n, int d);
23 | double* getDenseCovariance(double* mat, int ell, int d);
24 | void normalizeVector(double* vec, int len);
25 | 
26 | void print_two_dim(char* desc, double* mat, int m, int n);
27 | void print_one_dim_double(char* desc, double* mat, int length);
28 | void print_one_dim_int(char* desc, int* mat, int length);
29 | void printline(void);
30 | 
31 | #endif
32 | 
33 | 


--------------------------------------------------------------------------------
/c/frequentDirections.c:
--------------------------------------------------------------------------------
 1 | #include "frequentDirections.h"
 2 | 
 3 | void init_fd(FrequentDirections* self, int ell, int dim ){
 4 |   self->class_name = "FrequentDirections";
 5 |   self->dimension = dim;
 6 |   self->ell = ell;
 7 |   self->m = 2*ell;
 8 |   self->sketch = (double*) malloc(dim * (self->m) * sizeof(double));
 9 |   self->nextRow = 0;
10 | }
11 | 
12 | 
13 | void append_to_fd(FrequentDirections* self, SparseVector* sv){
14 | 
15 |   if (self->nextRow == self->m)
16 |     rotate_fd(self);
17 |   
18 | 
19 |   int j = 0;
20 |   int rid = (self->nextRow) * (self->dimension);
21 | 
22 |   double* vec = densify_sparseVector(sv);
23 |   
24 | 
25 |   for(int i = 0; i < sv->dimension; i++) 
26 |     self->sketch[rid + i] = vec[i];
27 | 
28 |   self->nextRow ++;
29 |   free(vec);  
30 | }
31 | 
32 | 
33 | void rotate_fd(FrequentDirections* self){
34 |   double* S = (double*) malloc(sizeof(double) * self->m);
35 |   double* U = (double*) malloc(sizeof(double) * self->m * self->m);
36 |   double* Vt = (double*) malloc(sizeof(double) * self->m * self->dimension);
37 | 
38 | 
39 |   int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'S', self->m, self->dimension, self->sketch, self->dimension, S, U, self->m, Vt, self->dimension);
40 | 
41 | 
42 |   // compute S*Vt
43 |   for(int i=0; i < self->ell; i++){
44 |     S[i] = sqrt( pow(S[i],2) - pow(S[self->ell - 1],2) );
45 |     for(int j=0; j < self->dimension; j++)
46 |       self->sketch[i * self->dimension + j] = Vt[i * self->dimension + j] * S[i] ;
47 |   }
48 | 
49 |   memset(&self->sketch[self->ell * self->dimension], 0, self->ell * self->dimension * sizeof(double));
50 | 
51 | 
52 |   self->nextRow = self->ell;  
53 |   free(S); free(U); free(Vt); 
54 | }
55 | 
/*
 * Intentionally empty: no finalization is performed here.
 * The current sketch is read directly from self->sketch.
 */
void get_fdSketch(FrequentDirections* self) {}
57 | 


--------------------------------------------------------------------------------
/c/frequentDirections.h:
--------------------------------------------------------------------------------
 1 | #ifndef FD_H
 2 | #define FD_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <math.h>
 7 | #include <time.h>
 8 | #include <string.h>
 9 | #include "sparseVector.h"
10 | #include <lapacke.h>
11 | #include "common.h"
12 | 
/* State of a Frequent Directions sketcher; set up by init_fd. */
typedef struct {
  char* class_name;   /* name tag of the sketcher */
  int dimension;      /* length of each input row / sketch row */
  int ell;            /* sketch size parameter */
  int m;              /* number of buffer rows */
  int nextRow;        /* index of the next buffer row to fill */
  double* sketch;     /* row-major buffer of m x dimension doubles */

} FrequentDirections;
22 | 
23 | 
24 | void init_fd(FrequentDirections* self, int ell, int dim );
25 | void append_to_fd(FrequentDirections* self, SparseVector* sv);
26 | void get_fdSketch(FrequentDirections* self);
27 | void rotate_fd(FrequentDirections* self);
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/c/randomProjection.c:
--------------------------------------------------------------------------------
 1 | #include "randomProjection.h"
 2 | 
 3 | void init_randomProj(RandomProjection* self, int ell, int dim ){
 4 |   self->class_name = "RandomProjection";
 5 |   self->dimension = dim;
 6 |   self->ell = ell;
 7 |   self->sketch = (double*) malloc(dim * ell * sizeof(double));
 8 |   memset(self->sketch, 0, sizeof(double) * ell * dim);
 9 |   srand(time(NULL));
10 | }
11 | 
12 | 
13 | void append_to_randomProj(RandomProjection* self, SparseVector* sv){
14 |   int sign, index;
15 | 
16 |   for(int i=0; i < self->ell; i++){
17 |     sign = (-2) * (rand() % 2) + 1;
18 |     for(int j=0; j < sv->nnz; j++){
19 |       index = i * self->dimension + sv->cols[j];
20 |       self->sketch[index] += (sign/sqrt(self->ell)) * (sv->values[j]);
21 |     }
22 |   }
23 | 
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/c/randomProjection.h:
--------------------------------------------------------------------------------
 1 | #ifndef RANDPROJ_H
 2 | #define RANDPROJ_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <math.h>
 7 | #include <time.h>
 8 | #include <string.h>
 9 | #include "sparseVector.h"
10 | 
/* State of a random-projection sketcher; set up by init_randomProj. */
typedef struct {
  char* class_name;   /* name tag of the sketcher */
  int dimension;      /* length of each input row / sketch row */
  int ell;            /* number of sketch rows */
  double* sketch;     /* row-major ell x dimension buffer */

} RandomProjection;
18 | 
19 | 
20 | void init_randomProj(RandomProjection* self, int ell, int dim );
21 | void append_to_randomProj(RandomProjection* self, SparseVector* sv);
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/c/randomSum.c:
--------------------------------------------------------------------------------
 1 | #include "randomSum.h"
 2 | 
 3 | void init_randomSum(RandomSum* self, int ell, int dim ){
 4 |   self->class_name = "RandomSum";
 5 |   self->dimension = dim;
 6 |   self->ell = ell;
 7 |   self->sketch = (double*) malloc(dim * ell * sizeof(double));
 8 |   memset(self->sketch, 0, sizeof(double) * ell * dim);
 9 |   srand(time(NULL));
10 | }
11 | 
12 | 
13 | void append_to_randomSum(RandomSum* self, SparseVector* sv){
14 |   int rid = rand() % (self->ell);
15 |   int sign = (-2) * (rand() % 2) + 1;
16 |   int index;
17 |   for(int i=0; i<sv->nnz; i++){
18 |     index = rid * self->dimension + sv->cols[i];
19 |     self->sketch[index] += sign * (sv->values[i]);
20 |   }
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/c/randomSum.h:
--------------------------------------------------------------------------------
 1 | #ifndef RANDSUM_H
 2 | #define RANDSUM_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <math.h>
 7 | #include <time.h>
 8 | #include <string.h>
 9 | #include "sparseVector.h"
10 | 
/* State of a random-sum sketcher; set up by init_randomSum. */
typedef struct {
  char* class_name;   /* name tag of the sketcher */
  double* sketch;     /* row-major ell x dimension buffer */
  int dimension;      /* length of each input row / sketch row */
  int ell;            /* number of sketch rows */

} RandomSum;
18 | 
19 | 
20 | void init_randomSum(RandomSum* self, int ell, int dim );
21 | void append_to_randomSum(RandomSum* self, SparseVector* sv);
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/c/rowSampler.c:
--------------------------------------------------------------------------------
 1 | #include "rowSampler.h"
 2 | 
 3 | void init_rowSampler(RowSampler* self, int ell, int dim ){
 4 |   self-> class_name = "RowSampler";
 5 |   self-> dimension = dim;
 6 |   self-> ell = ell;
 7 |   self-> sketch = (double*) malloc(sizeof(double) * ell * dim);
 8 |   self-> samplers = (SingleItemSampler*) malloc(sizeof(SingleItemSampler) * ell);
 9 |   memset(self-> sketch, 0 , sizeof(double) * ell * dim);
10 | }
11 | 
12 | void append_to_rowSampler(RowSampler* self, SparseVector* sv){
13 |   int i;
14 |   for(i=0; i < self-> ell; i++)
15 |     add_itemSampler(&(self-> samplers[i]), sv);
16 | }
17 | 
18 | 
19 | void get_rowSamplerSketch(RowSampler* self){
20 | 
21 |   SparseVector* item;
22 |   double item_prob;
23 | 
24 |   for(int i=0; i < self-> ell; i++){
25 |     item = (self-> samplers[i]).item;
26 |     item_prob = (self-> samplers[i]).item_probability;
27 |     for(int j=0; j< item-> nnz; j++)
28 |       self-> sketch[i * self-> dimension + item-> cols[j]] = (item-> values[j]) / sqrt(item_prob * self-> ell);
29 |   }
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/c/rowSampler.h:
--------------------------------------------------------------------------------
 1 | #ifndef ROWSAMPLER_H
 2 | #define ROWSAMPLER_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <math.h>
 7 | #include <time.h>
 8 | #include <string.h>
 9 | #include "sparseVector.h"
10 | #include "singleItemSampler.h"
11 | 
/* State of a row-sampling sketcher; set up by init_rowSampler. */
typedef struct {
  char* class_name;             /* name tag of the sketcher */
  int dimension;                /* length of each input row / sketch row */
  int ell;                      /* number of sketch rows and of samplers */
  double* sketch;               /* row-major ell x dimension buffer */
  SingleItemSampler* samplers;  /* array of ell samplers, one per sketch row */

} RowSampler;
20 | 
21 | 
22 | void init_rowSampler(RowSampler* self, int ell, int dim );
23 | void append_to_rowSampler(RowSampler* self, SparseVector* sv);
24 | void get_rowSamplerSketch(RowSampler* self);
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/c/singleItemSampler.c:
--------------------------------------------------------------------------------
 1 | #include "singleItemSampler.h"
 2 | 
 3 | void init_itemSampler(SingleItemSampler* self){
 4 |   self-> item = NULL;
 5 |   self-> item_weight = 0;
 6 |   self-> item_probability = 0;
 7 |   self-> sum_w = 0;
 8 |   self-> machine_precision = 1e-10;
 9 |   srand(time(NULL));
10 | }
11 | 
12 | 
13 | void add_itemSampler(SingleItemSampler* self, SparseVector* sv){
14 |   self-> sum_w += sv-> squaredNorm;
15 |   double p = sv-> squaredNorm / max(self-> sum_w , self-> machine_precision);
16 | 
17 |   double randomVal = rand()/(RAND_MAX+1.0);
18 | 
19 |   if (randomVal < p){
20 |     self-> item = sv;
21 |     self-> item_weight = sv-> squaredNorm;
22 |     self-> item_probability = p;
23 |   }else{
24 |     self-> item_probability = self-> item_probability * (1.0-p);
25 |   }
26 | }
27 | 


--------------------------------------------------------------------------------
/c/singleItemSampler.h:
--------------------------------------------------------------------------------
 1 | #ifndef ITEMSAMPLER_H
 2 | #define ITEMSAMPLER_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <math.h>
 7 | #include <time.h>
 8 | #include <string.h>
 9 | #include "sparseVector.h"
10 | #include "common.h"
11 | 
12 | 
/* Weighted single-item sampler over a stream of sparse vectors. */
typedef struct {
  SparseVector* item;        /* currently held vector (pointer, not a copy); NULL after init */
  double item_weight;        /* squaredNorm of the held vector */
  double item_probability;   /* running probability associated with the held vector */
  double sum_w;              /* sum of squaredNorm over all vectors offered so far */
  double machine_precision;  /* lower bound used for the denominator sum_w */

} SingleItemSampler;
21 | 
22 | 
23 | void init_itemSampler(SingleItemSampler* self);
24 | void add_itemSampler(SingleItemSampler* self, SparseVector* sv);
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/c/sparseMatrix.c:
--------------------------------------------------------------------------------
  1 | #include "sparseMatrix.h"
  2 | 
  3 | 
  4 | void init_sparseMatrix (SparseMatrix* self, int dim, int len){
  5 | 
  6 |   self->current_nnz = 0;
  7 |   self->dimension = dim;
  8 |   self->nextRow = 0;
  9 |   self->squaredFrob = 0;
 10 |   self->vectors = (SparseVector*) malloc(sizeof(SparseVector) * len);
 11 | }
 12 | 
 13 | 
 14 | void append_to_sparseMatrix (SparseMatrix* self, SparseVector* sv){
 15 | 
 16 |   self->vectors[self->nextRow] = *sv;
 17 |   self->nextRow ++;
 18 |   self->squaredFrob += sv->squaredNorm;
 19 |   self->current_nnz += sv->nnz;
 20 | }
 21 | 
 22 | 
 23 | void print_sparseMatrix(SparseMatrix* self){
 24 |   for(int i=0; i < self->nextRow; i++)
 25 |     print_sparseVector(&(self->vectors[i]));
 26 |   
 27 | }
 28 | 
 29 | 
 30 | /*
 31 |  * computes A^TA.G where G is d*ell matrix
 32 |  * returns the result in G 
 33 |  * temp is 1*ell working memory
 34 |  * product is d*ell working memory
 35 | */
 36 | void covMultiply_sparseMatrix (SparseMatrix* self, int d, int ell, double** G, double* temp, double** product){
 37 | 
 38 |   int rid;
 39 |   double val;
 40 |   SparseVector vec;
 41 | 
 42 |   for(int j=0; j < d * ell; j++)
 43 |     (*product)[j] = 0;
 44 |   
 45 |   for(int i = 0; i < self-> nextRow; i++){
 46 |     for(int j=0; j<ell; j++)
 47 |       temp[j] = 0;
 48 |     vec = self-> vectors[i];
 49 | 
 50 |     for(int j=0; j < vec.nnz; j++){
 51 |       rid = vec.cols[j] * ell;
 52 |       val = vec.values[j];
 53 | 
 54 |       for (int t=0; t<ell ; t++)
 55 | 	temp[t] += (*G)[rid + t] * val;
 56 |     }
 57 |     
 58 |     for(int j=0; j < vec.nnz; j++){
 59 |       rid = vec.cols[j] * ell;
 60 |       val = vec.values[j];
 61 |       for (int t=0; t<ell; t++)
 62 | 	(*product) [rid + t] += temp[t] * val;
 63 |     }
 64 |   }
 65 | 
 66 |   double* G_addr = *G;
 67 |   *G = *product;
 68 |   *product = G_addr;
 69 | 
 70 | }
 71 | 
 72 | 
 73 | /* computes A*G for G being d*ell matrix
 74 |  * output is stored in double* product
 75 |  */
 76 | void leftMult (SparseMatrix* self, int ell, double* G, double* product){
 77 |   int itr = (self->nextRow) * ell;
 78 |   SparseVector vec;
 79 |   int rid, gidx;
 80 |   double val;
 81 | 
 82 |   for(int i=0; i < itr; i++)
 83 |     product[i] = 0;
 84 | 
 85 |   for(int i=0; i < self-> nextRow; i++){
 86 |     vec = self-> vectors[i];
 87 |     rid = i * ell;
 88 | 
 89 |     for (int t=0; t < vec.nnz ; t++){
 90 |       val = vec.values[t];
 91 |       gidx = vec.cols[t]*ell;
 92 | 
 93 |       for(int j=0; j < ell; j++){
 94 | 	product[rid + j] += val * G[gidx + j];
 95 |       } 
 96 |     }
 97 |   }
 98 | 
 99 | }
100 | 
101 | 
102 | /* computes Gt*A
103 |  * G has ell columns
104 |  * output is returned in double* product
105 |  */
106 | void transposeRightMult (SparseMatrix* self, int ell, double* G, double* product){
107 |   int itr = (self->dimension) * ell;
108 |   SparseVector vec;
109 |   int rid, col;
110 |   double val;
111 | 
112 |   for(int i=0; i<itr; i++)
113 |     product[i] = 0;
114 | 
115 | 
116 |   for(int i=0; i < self-> nextRow; i++){
117 |     vec = self-> vectors[i];
118 |     rid = i*ell;
119 | 
120 |     for(int j=0; j < vec.nnz; j++){
121 |       val = vec.values[j];  
122 |       col = vec.cols[j];
123 | 
124 |       for (int t=0; t<ell; t++){
125 | 	product[t * (self->dimension) + col] += G[rid + t] * val;
126 |       }
127 |     }
128 |   }
129 | }
130 | 
131 | 
132 | void blockPowerMethod(SparseMatrix *self, int ell, double epsilon, double* G, double* lsv, double* temp_vec, double* temp_mat){
133 |   int iterations = (int) ceil(1 * (log(self->dimension / epsilon) / epsilon));
134 | 
135 |   for(int i=0; i < iterations; i++){
136 |     if(i % 10 == 0)
137 |       qrDecomp(G, self->dimension, ell);
138 |     covMultiply_sparseMatrix(self, self->dimension, ell, &G, temp_vec, &temp_mat); 
139 |   }
140 | 
141 |   // approx left singular vectors
142 |   leftMult (self, ell, G, lsv);
143 |   qrDecomp(lsv, self->nextRow, ell);
144 | 
145 | }
146 | 
147 | 
148 | /* returns covariance matrix, i.e. AtA
149 |  */
150 | double* getCovariance_sparseMatrix(SparseMatrix* self){
151 | 
152 |   double* cov = (double*) malloc(sizeof(double) * self->dimension * self-> dimension);
153 |   memset(cov, 0 , self->dimension * self-> dimension * sizeof(double));
154 | 
155 |   
156 |   int elemIndex;
157 |   double val;
158 |   SparseVector vec;
159 | 
160 |   for(int t=0; t < self->nextRow; t++){
161 |     vec = self->vectors[t];
162 |     for(int i=0; i< vec.nnz; i++){
163 |       for(int j=0; j< vec.nnz; j++){
164 | 	elemIndex = vec.cols[i] * self-> dimension + vec.cols[j];
165 | 	val = (vec.values[i]) * (vec.values[j]);
166 | 	cov[elemIndex] += val;
167 |       }
168 |     }
169 |   }
170 |   
171 |   return cov;
172 | }
173 | 
174 | void densify_sparseMatrix(SparseMatrix* self, double* output){
175 | 
176 |   int rid;
177 |   SparseVector vec;
178 | 
179 |   int itr = self->nextRow * self->dimension;
180 |   for(int i=0; i<itr; i++)
181 |     output[i] = 0;
182 | 
183 |   
184 |   for(int t=0; t < self->nextRow; t++){
185 |     vec = self->vectors[t];
186 |     rid = t * self->dimension;
187 | 
188 |     for(int i=0; i < vec.nnz; i++)
189 |       output[ rid + vec.cols[i] ] = vec.values[i];
190 |   }
191 | }
192 | 
193 | 
194 | double computeCovErr(SparseMatrix* A, double* B, int ell, int d){
195 |   double* AtA = getCovariance_sparseMatrix(A);
196 |   double* BtB = getDenseCovariance(B, ell, d);
197 |   subtract(AtA, BtB, d, d);
198 |   return getSpectralNorm(AtA, d, d);
199 | }
200 | 
201 | double computeRelCovErr(SparseMatrix* A, double* B, int ell, int d){
202 |   double s = computeCovErr(A,B,ell,d);
203 |   return s / A-> squaredFrob;
204 | 
205 | }
206 | 
207 | double topRank_cov(double* AtA, int d, int k){
208 |   
209 |   double* S = (double*) malloc(sizeof(double) * d);
210 |   double* U = (double*) malloc(sizeof(double) * d * d);
211 |   double* Vt = (double*) malloc(sizeof(double) * d * d);
212 |   
213 |   int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'N', d, d, AtA, d, S, U, d, Vt, d);
214 | 
215 |   free(U); free(Vt);
216 |   double tailSquaredFrob = 0;
217 | 
218 |   for(int i = k; i < d ; i++)
219 |     tailSquaredFrob += S[i];
220 | 
221 |   free(S);
222 |   return tailSquaredFrob;
223 | 
224 | }
225 | 
226 | 
227 | /* computes top rank k of A, returns it in Vt, returns tail norm of A too */
228 | double topRank(SparseMatrix* A, int k){
229 | 
230 |   double* Adense = (double*) malloc(sizeof(double) * A->nextRow * A->dimension);
231 |   densify_sparseMatrix(A, Adense);
232 | 
233 |   double* S = (double*) malloc(sizeof(double) * A->nextRow);
234 |   double* U = (double*) malloc(sizeof(double) * A->nextRow * A->nextRow);
235 |   double* Vt = (double*) malloc(sizeof(double) * A->dimension * A->dimension);
236 |   
237 |   int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'N', A->nextRow, A->dimension, Adense, A->dimension, S, U, A->nextRow, Vt, A->dimension);
238 | 
239 |   free(U); free(Adense); free(Vt);
240 |   int itr = min(A->nextRow, A->dimension);
241 |   double tailSquaredFrob = 0;
242 | 
243 |   for(int i = k; i < itr ; i++)
244 |     tailSquaredFrob += pow(S[i],2);
245 |   
246 |   free(S);
247 |   return tailSquaredFrob;
248 | }
249 | 
250 | 
251 | double computeRelProjErr(SparseMatrix* A, double* B, int ell, int d, int k, double tailSquaredFrob){
252 |  
253 | 
254 |   double projNorm = 0, projErr = 0;
255 |   double projVec[k];
256 |   SparseVector vec;
257 |   int rid;
258 | 
259 | 
260 |   double* S = (double*) malloc(sizeof(double) * 2 * ell);
261 |   double* U = (double*) malloc(sizeof(double) * 4 * ell * ell);
262 |   double* Vt = (double*) malloc(sizeof(double) * d * d);
263 | 
264 |   int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'A', 2*ell, d, B, d, S, U, 2*ell, Vt, d);
265 |  
266 | 
267 |   for(int t=0; t< A->nextRow; t++){
268 |     vec = A->vectors[t];
269 |     projNorm = 0;
270 | 
271 |     for(int i=0; i<k; i++){
272 |       projVec[i] = dotproduct(&vec, Vt, i, d);
273 |       projNorm += pow(projVec[i],2);
274 |     }
275 | 
276 |     projErr += (vec.squaredNorm - projNorm);
277 |   }
278 | 
279 |   return projErr / tailSquaredFrob;
280 | }
281 | 
282 | 
283 | double dotproduct(SparseVector* sv, double* Vt, int rid, int dim){
284 |   double dp = 0;
285 |   for(int i=0; i < sv->nnz; i++ )
286 |     dp += sv->values[i] * Vt[rid*dim + sv->cols[i]];
287 | 
288 |   return dp;
289 | }
290 | 


--------------------------------------------------------------------------------
/c/sparseMatrix.h:
--------------------------------------------------------------------------------
 1 | #ifndef SPARSEMATRIX_H
 2 | #define SPARSEMATRIX_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <math.h>
 7 | #include <lapacke.h>
 8 | #include <string.h>
 9 | #include <time.h>
10 | #include <stdint.h>
11 | #include "sparseVector.h"
12 | #include "common.h"
13 | 
/* Row-wise sparse matrix: an array of sparse vectors plus running totals. */
typedef struct {
  SparseVector* vectors;  /* row array, allocated by init_sparseMatrix */
  int nextRow;            /* number of rows stored so far / index of the next free slot */
  int dimension;          /* number of columns */
  int current_nnz;        /* sum of nnz over the stored rows */
  double squaredFrob;     /* sum of squaredNorm over the stored rows */

} SparseMatrix;
22 | 
23 | 
24 | double topRank_cov(double* AtA, int d, int k);
25 | 
26 | void init_sparseMatrix (SparseMatrix* self, int dim, int len);
27 | void append_to_sparseMatrix (SparseMatrix *self, SparseVector *sv);
28 | void print_sparseMatrix(SparseMatrix* self);
29 | void covMultiply_sparseMatrix (SparseMatrix *self, int dimension, int ell, double** G, double* temp, double** product);
30 | void leftMult (SparseMatrix *self, int ell, double* G, double* product);
31 | void transposeRightMult (SparseMatrix *self, int ell, double* G, double* product);
32 | void blockPowerMethod(SparseMatrix *self, int ell, double epsilon, double* G, double* lsv, double* temp_vec, double* temp_mat);
33 | double* getCovariance_sparseMatrix(SparseMatrix *self);
34 | void densify_sparseMatrix(SparseMatrix* self, double* output);
35 | double computeCovErr(SparseMatrix* A, double* B, int ell, int d);
36 | double computeRelCovErr(SparseMatrix* A, double* B, int ell, int d);
37 | double topRank(SparseMatrix* A, int k);
38 | double computeRelProjErr(SparseMatrix* A, double* B, int ell, int d, int k, double tailSquaredFrob);
39 | double dotproduct(SparseVector* sv, double* Vt, int rid, int dim);
40 | 
41 | #endif
42 | 


--------------------------------------------------------------------------------
/c/sparseSketcher.c:
--------------------------------------------------------------------------------
  1 | #include "sparseSketcher.h"
  2 | 
  3 | void init_sparseSketcher(SparseSketcher* self, int ell, int dim ){
  4 |   self->class_name = "sparseSketcher";
  5 |   self->dimension = dim;
  6 |   self->ell = ell;
  7 |   self->m = 2*ell;
  8 |   self->sketch = (double*) malloc(sizeof(double) * (self->m) * dim);
  9 |   init_sparseMatrix(&(self->buffer), dim, dim);
 10 |   self->nnz_threshold = ell * dim;
 11 | 
 12 |   for(int i=0; i < ell*dim; i++)
 13 |     self->sketch[i] = 0;
 14 | 
 15 | }  
 16 | 
 17 | 
 18 | void append_to_sparseSketcher(SparseSketcher* self, SparseVector* sv){
 19 |   if((self->buffer).current_nnz >= self->nnz_threshold || (self->buffer).nextRow >= self->dimension)
 20 |     rotate_sparseSketcher(self);
 21 |   append_to_sparseMatrix(&(self->buffer), sv);
 22 | }
 23 | 
 24 | void rotate_sparseSketcher(SparseSketcher *self){
 25 |   sparseShrink(self);
 26 |   denseShrink(self);
 27 | }
 28 | 
 29 | 
 30 | void get_sparseSketch(SparseSketcher *self){
 31 |   sparseShrink(self);
 32 |   //rotate_sparseSketcher(self);
 33 | }
 34 | 
 35 | void sparseShrink(SparseSketcher *self){
 36 |   if((self->buffer).nextRow > self->ell){
 37 |     double* temp_vec = (double*) malloc(sizeof(double) * self->ell);
 38 |     double* temp_mat = (double*) malloc(sizeof(double) * self->ell * self->dimension);
 39 |     double* G = (double*) malloc(self->ell * self->dimension * sizeof(double));
 40 |     double* Z = (double*) malloc(self->ell * (self->buffer).nextRow * sizeof(double));
 41 | 
 42 |     for(int i=0; i < self->ell * self->dimension; i++)
 43 |       G[i] = ( (float)rand() / (float)(RAND_MAX) );
 44 | 
 45 |     blockPowerMethod(&(self->buffer), self->ell, 1, G, Z, temp_vec, temp_mat);
 46 |     free(temp_vec); 
 47 |     free(G);
 48 | 
 49 |     //computing P = ZtA, temp_mat is P
 50 |     transposeRightMult(&(self->buffer), self->ell, Z, temp_mat);
 51 |     free(Z);
 52 | 
 53 |     // svd(ZtA)
 54 |     double* S = (double*) malloc(sizeof(double) * self->ell);
 55 |     double* U = (double*) malloc(sizeof(double) * self->ell * self->ell);
 56 |     double* Vt = (double*) malloc(sizeof(double) * self->dimension * self->ell);
 57 | 
 58 |     int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'S', self->ell, self->dimension, temp_mat, self->dimension, S, U, self->ell, Vt, self->dimension);
 59 |     free(temp_mat);
 60 | 
 61 | 
 62 |     // shrink S and compute S*Vt
 63 |     for(int i=0; i < self->ell; i++){
 64 |       S[i] = sqrt( pow(S[i],2) - pow(S[self->ell-1],2) );
 65 |       for(int j=0; j < self->dimension; j++)
 66 | 	self->sketch[(self->ell + i) * self->dimension + j] = Vt[i * self->dimension + j] * S[i] ;
 67 |     }
 68 |   }else{ // self->buffer has atmost ell rows
 69 |     
 70 |     SparseVector temp;
 71 | 
 72 |     for(int i=0; i < (self->buffer).nextRow; i++){
 73 |       temp = (self->buffer).vectors[i];
 74 |       for(int j=0; j < temp.nnz; j++)
 75 | 	self->sketch[(self->ell + i) * self->dimension + temp.cols[j]] = temp.values[j];
 76 |     }
 77 |   }
 78 | 
 79 |   // reset buffer
 80 |   (self->buffer).current_nnz = 0;
 81 |   (self->buffer).nextRow = 0;
 82 |   (self->buffer).squaredFrob = 0;
 83 | }
 84 | 
 85 | 
 86 | void denseShrink(SparseSketcher* self){
 87 |   double* S = (double*) malloc(sizeof(double) * 2 * self->ell);
 88 |   double* U = (double*) malloc(sizeof(double) * 4 * self->ell * self->ell);
 89 |   double* Vt = (double*) malloc(sizeof(double) * 2 * self->dimension * self->ell);
 90 | 
 91 |   int info = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'S', 2*self->ell, self->dimension, self->sketch, self->dimension, S, U, 2*self->ell, Vt, self->dimension);
 92 | 
 93 |   for(int i=0; i < self->ell; i++){
 94 |     S[i] = sqrt( pow(S[i],2) - pow(S[self->ell-1],2) );
 95 |     for(int j=0; j < self->dimension; j++)
 96 |       self->sketch[i * self->dimension + j] = Vt[i * self->dimension + j] * S[i] ;
 97 |   }
 98 | 
 99 |   memset(&self->sketch[self->ell * self->dimension], 0, self->ell * self->dimension * sizeof(double));
100 | }
101 | 


--------------------------------------------------------------------------------
/c/sparseSketcher.h:
--------------------------------------------------------------------------------
 1 | #ifndef SPARSESKETCHER_H
 2 | #define SPARSESKETCHER_H
 3 | 
 4 | #include "sparseMatrix.h"
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <math.h>
 8 | #include <lapacke.h>
 9 | #include <string.h>
10 | 
/* State of the sparse Frequent Directions sketcher (see sparseSketcher.c). */
typedef struct {
  char* class_name;    /* set to the literal "sparseSketcher" by init_sparseSketcher */
  int dimension;       /* number of columns of the input rows */
  int ell;             /* sketch size */
  int m;               /* rows allocated for `sketch`: 2*ell */
  double* sketch;      /* row-major m x dimension dense sketch */
  SparseMatrix buffer; /* incoming sparse rows waiting to be folded into the sketch */
  int nnz_threshold;   /* ell*dimension; buffer non-zero count that triggers a rotation */

} SparseSketcher;
21 | 
22 | 
/* Set up a sketcher of size ell for rows with dim columns. */
void init_sparseSketcher(SparseSketcher* self, int ell, int dim );
/* Add one sparse row, rotating the sketch first if the buffer is full. */
void append_to_sparseSketcher(SparseSketcher* self, SparseVector* sv);
/* Fold the sparse buffer into sketch rows [ell, 2*ell) and empty the buffer. */
void sparseShrink(SparseSketcher* self);
/* SVD-shrink the 2*ell-row sketch into rows [0, ell) and zero the rest. */
void denseShrink(SparseSketcher* self);
/* sparseShrink followed by denseShrink. */
void rotate_sparseSketcher(SparseSketcher* self);
/* Flush the buffer into the sketch (sparseShrink only); read the result from self->sketch. */
void get_sparseSketch(SparseSketcher* self);
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/c/sparseVector.c:
--------------------------------------------------------------------------------
  1 | #include "sparseVector.h"
  2 | 
  3 | 
  4 | void init_sparseVector(SparseVector* self, int dim, int cols[], double vals[], int nnz){
  5 |   self-> nnz = nnz;
  6 |   self-> dimension = dim;
  7 |   self-> cols = (int*) malloc(sizeof(int) * self-> nnz);
  8 |   self-> values = (double*) malloc(sizeof(double) * self-> nnz);
  9 |   self-> squaredNorm = 0;
 10 |   
 11 |   for (int i=0; i < self-> nnz; i++){
 12 |     self-> cols[i] = cols[i];
 13 |     self-> values[i] = vals[i];
 14 |     self-> squaredNorm += pow(vals[i] , 2);
 15 |   }
 16 | }
 17 | 
 18 | 
 19 | /* it generates a vector of dim dimension, 
 20 |    with only nnz non-zeros
 21 |    first jlen columns have threshold_prob probability of getting a non-zero
 22 |    non-zeros are picked from [-10, 10] uniformly at random
 23 |  */
 24 | void skew_init_sparseVector(SparseVector* self, int dim, int nnz, int jlen, double threshold_prob){
 25 |   self-> dimension = dim;  
 26 |   self-> nnz = nnz;
 27 |   self-> cols = (int*) malloc(sizeof(int) * self-> nnz);
 28 |   self-> values = (double*) malloc(sizeof(double) * self-> nnz);
 29 |   self-> squaredNorm = 0;
 30 | 
 31 |   double randomVal;  
 32 |   int flag, col_id, t;
 33 | 
 34 |   for (int i=0; i < self-> nnz; i++){
 35 |     randomVal = rand()/(RAND_MAX+1.0); 
 36 | 
 37 |     if(randomVal < threshold_prob){ // goes to first "jlen" columns
 38 |       col_id = (int) rand() % jlen;
 39 |       flag = 1;
 40 | 
 41 |       while (flag == 1){
 42 | 	for (t=0; t < i; t++)
 43 | 	  if (col_id == self-> cols[t])
 44 | 	    break;
 45 | 	if (t == i)
 46 | 	  flag = 0; 
 47 | 	else
 48 | 	  col_id = rand() % jlen;
 49 |       }
 50 |     }
 51 | 
 52 |     else{// goes to the rest of columns
 53 |       col_id = jlen + (int) rand() % (dim-jlen);
 54 |       flag = 1;
 55 | 
 56 |       while (flag == 1){
 57 | 	for (t=0; t < i; t++)
 58 | 	  if (col_id == self-> cols[t])
 59 | 	    break;
 60 | 	if (t == i)
 61 | 	  flag = 0; 
 62 | 	else
 63 | 	  col_id = jlen + (int) rand() % (dim-jlen);
 64 |       }
 65 |     }
 66 |     self-> cols[i] = col_id;
 67 |     int tempr = 2 * (rand()%2) - 1;
 68 |     self-> values[i] = tempr * (int)ceil( ((double)rand()/(double)(RAND_MAX)) * 10);
 69 |   
 70 |     self-> squaredNorm += pow(self-> values[i] , 2);
 71 |   }
 72 | }
 73 | 
 74 | 
 75 | void random_init_sparseVector(SparseVector* self, int dim, int nnz){
 76 |   self-> dimension = dim;  
 77 |   self-> nnz = nnz;
 78 |   self-> cols = (int*) malloc(sizeof(int) * self-> nnz);
 79 |   self-> values = (double*) malloc(sizeof(double) * self-> nnz);
 80 |   self-> squaredNorm = 0;
 81 |   int i;
 82 | 
 83 |   for (i=0; i < self-> nnz; i++){
 84 |     double newly_gen = rand() % dim;
 85 |     int flag = 1;
 86 |     int j= 0;
 87 | 
 88 |     while (flag == 1){
 89 |       for (j=0; j < i; j++)
 90 | 	if (newly_gen == self-> cols[j])
 91 | 	  break;
 92 |       if (j == i)
 93 | 	flag = 0; 
 94 |       else
 95 | 	newly_gen = rand() % dim;
 96 |     }
 97 |     self-> cols[i] = newly_gen;
 98 |     self-> values[i] = (int)ceil( ((double)rand()/(double)(RAND_MAX)) * 10);
 99 |     self-> squaredNorm += pow(self-> values[i] , 2);
100 |   }
101 | }
102 | 
103 | void print_sparseVector(SparseVector* self){
104 |   
105 |   for (int i=0; i< self-> nnz; i++)
106 |     printf("(%d, %.2f)", self-> cols[i], self-> values[i]  );
107 |   printf("\n");
108 | }
109 | 
110 | 
111 | double* densify_sparseVector(SparseVector* self){
112 |   double* vec = (double*) malloc(sizeof(double) * self->dimension);
113 |   
114 |   for(int i=0; i < self->dimension ; i++)
115 |     vec[i] = 0;
116 |   for(int i=0; i < self->nnz; i++)
117 |     vec[self->cols[i]] = self->values[i];
118 |   return vec;
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/c/sparseVector.h:
--------------------------------------------------------------------------------
 1 | #ifndef SPARSEVEC_H
 2 | #define SPARSEVEC_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <math.h>
 7 | 
 8 | 
/* Sparse vector stored as parallel (column, value) arrays. */
typedef struct{
  double squaredNorm;  /* sum of squares of `values`, maintained by the init functions */
  double* values;      /* nnz stored values, malloc'ed by the init functions */
  int dimension;       /* length of the dense vector this represents */
  int* cols;           /* nnz column indices, parallel to `values` */
  int nnz;             /* number of stored entries */

} SparseVector;
17 | 
/* Copy nnz (column, value) pairs into a new vector of dimension dim. */
void init_sparseVector(SparseVector* self, int dim, int cols[], double vals[], int nnz);
/* Random vector with nnz distinct columns and values from ceil(10 * U[0,1]). */
void random_init_sparseVector(SparseVector* self, int dim, int nnz);
/* Print the "(column, value)" pairs on one line. */
void print_sparseVector(SparseVector* self);
/* Random vector whose entries fall in the first jlen columns with probability threshold_prob. */
void skew_init_sparseVector(SparseVector* self, int dim, int nnz, int jlen, double threshold_prob);
/* Newly malloc'ed dense copy of length `dimension`; the caller owns it. */
double* densify_sparseVector(SparseVector* self);
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/c/testAll.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <time.h>
 4 | #include "common.h"
 5 | #include "sparseSketcher.h"
 6 | #include "frequentDirections.h"
 7 | #include "test_vs_ell.c"
 8 | #include "test_vs_d.c"
 9 | #include "test_vs_n.c"
10 | #include "test_vs_sparsity.c"
11 | 
12 | 
13 | 
/* Run every experiment in turn: sketch size, sparsity, dimension, then
   number of rows.  Each one prints its own results. */
int main(void)
{
  test_vs_ell();
  test_vs_sparsity();
  test_vs_d();
  test_vs_n();
  return 0;
}
20 | 
21 | 


--------------------------------------------------------------------------------
/c/test_vs_d.c:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "frequentDirections.h"
 3 | #include "sparseSketcher.h"
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <time.h>
 7 | 
 8 | void test_vs_d(){
 9 |   int n = 10000;
10 |   int dim_set[] = {1000,2000,3000,4000,5000,6000};
11 |   int k = 10;
12 |   int exp_no = 6;
13 |   int nnz = 100;
14 |   int ell = 50;
15 | 
16 |   double start, end, cpu_time_used;
17 |   SparseMatrix A;
18 |   SparseVector arr[n];
19 |   SparseSketcher sfd;
20 |   FrequentDirections fd;
21 | 
22 |   double sfd_cov_err[exp_no], sfd_proj_err[exp_no], sfd_time[exp_no];
23 |   double fd_cov_err[exp_no], fd_proj_err[exp_no], fd_time[exp_no];
24 | 
25 |   double tailSquaredFrob;
26 | 
27 | 
28 |   for(int i=0; i<exp_no; i++){
29 |     
30 |     printf("i = %d\n", i);
31 |     sfd_time[i] = 0;
32 |     fd_time[i] = 0;
33 | 
34 |     init_sparseSketcher(&sfd, ell, dim_set[i]);
35 |     init_fd(&fd, ell, dim_set[i]);
36 | 
37 |     // input matrix
38 |     init_sparseMatrix(&A, dim_set[i], n);
39 | 
40 |     for (int j=0; j < n; j++){
41 |       skew_init_sparseVector(&arr[j], dim_set[i], nnz, (int) (1.5 * nnz), 0.9);
42 |       append_to_sparseMatrix(&A, &arr[j]);
43 | 
44 |       // SFD
45 |       start = clock();
46 |       append_to_sparseSketcher(&sfd, &arr[j]);
47 |       sfd_time[i] += (double) (clock() - start);
48 | 
49 |       //FD
50 |       start = clock();
51 |       append_to_fd(&fd, &arr[j]);
52 |       fd_time[i] += (double) (clock() - start); 
53 |     }
54 | 
55 |     tailSquaredFrob = topRank(&A, k);
56 | 
57 |   
58 |     //SFD
59 |     start = clock();
60 |     get_sparseSketch(&sfd);
61 |     sfd_time[i] += (double) (clock() - start);
62 |     sfd_time[i] = sfd_time[i] / CLOCKS_PER_SEC;
63 |     sfd_cov_err[i] = computeRelCovErr(&A, sfd.sketch, ell, dim_set[i]);
64 |     sfd_proj_err[i] = computeRelProjErr(&A, sfd.sketch, ell, dim_set[i], k, tailSquaredFrob);
65 | 
66 |     
67 |     //FD
68 |     start = clock();
69 |     get_fdSketch(&fd);
70 |     fd_time[i] += (double) (clock() - start);
71 |     fd_time[i] = fd_time[i] / CLOCKS_PER_SEC;
72 |     fd_cov_err[i] = computeRelCovErr(&A, fd.sketch, ell, dim_set[i]);
73 |     fd_proj_err[i] = computeRelProjErr(&A, fd.sketch, ell, dim_set[i], k, tailSquaredFrob);  
74 | 
75 |   }
76 |     
77 | 
78 |   printf("SFD:\n");
79 |   print_one_dim_double("\'proj\':", sfd_proj_err, exp_no);
80 |   print_one_dim_double("\'cov\':", sfd_cov_err, exp_no);  
81 |   print_one_dim_double("\'time\':", sfd_time, exp_no);
82 | 
83 |   print_one_dim_double("\'proj\':", fd_proj_err, exp_no);    
84 |   print_one_dim_double("\'cov\':", fd_cov_err, exp_no);
85 |   print_one_dim_double("\'time\':", fd_time, exp_no);
86 | }
87 | 


--------------------------------------------------------------------------------
/c/test_vs_ell.c:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "frequentDirections.h"
 3 | #include "sparseSketcher.h"
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <time.h>
 7 | 
 8 | void test_vs_ell(){
 9 |   int n = 10000;
10 |   int dim = 1000;
11 |   int k = 10;
12 |   int exp_no = 6;
13 |   int nnz = 100;
14 |   int ell_set[] = {5, 10, 15, 20, 50, 100};
15 | 
16 |   double start, end, cpu_time_used;
17 |   SparseMatrix A;
18 |   SparseVector arr[n];
19 |   SparseSketcher sfd;
20 |   FrequentDirections fd;
21 | 
22 |   double sfd_cov_err[exp_no], sfd_proj_err[exp_no], sfd_time[exp_no];
23 |   double fd_cov_err[exp_no], fd_proj_err[exp_no], fd_time[exp_no];
24 | 
25 |   double tailSquaredFrob;
26 | 
27 | 
28 |   // input matrix
29 |   init_sparseMatrix(&A, dim, n);
30 |   for (int j=0; j < n; j++){
31 |     skew_init_sparseVector(&arr[j], dim, nnz, (int) (1.5 * nnz), 0.9);
32 |     append_to_sparseMatrix(&A, &arr[j]);
33 |   }
34 |   tailSquaredFrob = topRank(&A, k);
35 |     
36 | 
37 |   // expr
38 |   for(int i=0; i<exp_no; i++){
39 |     printf("i = %d\n", i);
40 |    
41 |     sfd_time[i] = 0;
42 |     fd_time[i] = 0;
43 | 
44 |     init_sparseSketcher(&sfd, ell_set[i], dim);
45 |     init_fd(&fd, ell_set[i], dim);
46 | 
47 |     for (int j=0; j < n; j++){
48 |       // SFD
49 |       start = clock();
50 |       append_to_sparseSketcher(&sfd, &arr[j]);
51 |       sfd_time[i] += (double) (clock() - start);
52 | 
53 |       //FD
54 |       start = clock();
55 |       append_to_fd(&fd, &arr[j]);
56 |       fd_time[i] += (double) (clock() - start); 
57 |     }
58 |     //SFD
59 |     start = clock();
60 |     get_sparseSketch(&sfd);
61 |     sfd_time[i] += (double) (clock() - start);
62 |     sfd_time[i] = sfd_time[i] / CLOCKS_PER_SEC;
63 |     sfd_cov_err[i] = computeRelCovErr(&A, sfd.sketch, ell_set[i], dim);
64 |     sfd_proj_err[i] = computeRelProjErr(&A, sfd.sketch, ell_set[i], dim, k, tailSquaredFrob);
65 | 
66 |     
67 |     //FD
68 |     start = clock();
69 |     get_fdSketch(&fd);
70 |     fd_time[i] += (double) (clock() - start);
71 |     fd_time[i] = fd_time[i] / CLOCKS_PER_SEC;
72 |     fd_cov_err[i] = computeRelCovErr(&A, fd.sketch, ell_set[i], dim);
73 |     fd_proj_err[i] = computeRelProjErr(&A, fd.sketch, ell_set[i], dim, k, tailSquaredFrob);  
74 | 
75 |   }
76 | 
77 |   printf("SFD:\n");
78 |   print_one_dim_double("\'proj\':", sfd_proj_err, exp_no);
79 |   print_one_dim_double("\'cov\':", sfd_cov_err, exp_no);  
80 |   print_one_dim_double("\'time\':", sfd_time, exp_no);
81 | 
82 |   print_one_dim_double("\'proj\':", fd_proj_err, exp_no);    
83 |   print_one_dim_double("\'cov\':", fd_cov_err, exp_no);
84 |   print_one_dim_double("\'time\':", fd_time, exp_no);
85 | 
86 | }
87 | 


--------------------------------------------------------------------------------
/c/test_vs_n.c:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "frequentDirections.h"
 3 | #include "sparseSketcher.h"
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <time.h>
 7 | 
 8 | void test_vs_n(){
 9 |   int n_set[] = {10000,20000,30000,40000,50000, 60000};
10 |   int dim = 1000;
11 |   int k = 10;
12 |   int exp_no = 6;
13 |   int nnz = 100;
14 |   int ell = 50;
15 | 
16 |   double start, end, cpu_time_used;
17 |   SparseMatrix A;
18 |   SparseVector arr[60000]; 
19 |   SparseSketcher sfd;
20 |   FrequentDirections fd;
21 | 
22 |   double sfd_cov_err[exp_no], sfd_proj_err[exp_no], sfd_time[exp_no];
23 |   double fd_cov_err[exp_no], fd_proj_err[exp_no], fd_time[exp_no];
24 | 
25 |   double tailSquaredFrob;
26 | 
27 | 
28 |   for(int i=0; i<exp_no; i++){
29 |     
30 |     printf("i = %d\n", i);
31 |     sfd_time[i] = 0;
32 |     fd_time[i] = 0;
33 | 
34 |     init_sparseSketcher(&sfd, ell, dim);
35 |     init_fd(&fd, ell, dim);
36 | 
37 |     // input matrix
38 |     init_sparseMatrix(&A, dim, n_set[i]);
39 | 
40 |     for (int j=0; j < n_set[i]; j++){
41 |       skew_init_sparseVector(&arr[j], dim, nnz, (int) (1.5 * nnz), 0.9);
42 |       append_to_sparseMatrix(&A, &arr[j]);
43 | 
44 |       // SFD
45 |       start = clock();
46 |       append_to_sparseSketcher(&sfd, &arr[j]);
47 |       sfd_time[i] += (double) (clock() - start);
48 | 
49 |       //FD
50 |       start = clock();
51 |       append_to_fd(&fd, &arr[j]);
52 |       fd_time[i] += (double) (clock() - start); 
53 |     }
54 | 
55 |     tailSquaredFrob = topRank(&A, k);
56 | 
57 |   
58 |     //SFD
59 |     start = clock();
60 |     get_sparseSketch(&sfd);
61 |     sfd_time[i] += (double) (clock() - start);
62 |     sfd_time[i] = sfd_time[i] / CLOCKS_PER_SEC;
63 |     sfd_cov_err[i] = computeRelCovErr(&A, sfd.sketch, ell, dim);
64 |     sfd_proj_err[i] = computeRelProjErr(&A, sfd.sketch, ell, dim, k, tailSquaredFrob);
65 | 
66 |     
67 |     //FD
68 |     start = clock();
69 |     get_fdSketch(&fd);
70 |     fd_time[i] += (double) (clock() - start);
71 |     fd_time[i] = fd_time[i] / CLOCKS_PER_SEC;
72 |     fd_cov_err[i] = computeRelCovErr(&A, fd.sketch, ell, dim);
73 |     fd_proj_err[i] = computeRelProjErr(&A, fd.sketch, ell, dim, k, tailSquaredFrob);  
74 | 
75 |   }
76 | 
77 |   printf("SFD:\n");
78 |   print_one_dim_double("\'proj\':", sfd_proj_err, exp_no);
79 |   print_one_dim_double("\'cov\':", sfd_cov_err, exp_no);  
80 |   print_one_dim_double("\'time\':", sfd_time, exp_no);
81 | 
82 |   print_one_dim_double("\'proj\':", fd_proj_err, exp_no);    
83 |   print_one_dim_double("\'cov\':", fd_cov_err, exp_no);
84 |   print_one_dim_double("\'time\':", fd_time, exp_no);
85 |     
86 | 
87 | }
88 | 


--------------------------------------------------------------------------------
/c/test_vs_sparsity.c:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "frequentDirections.h"
 3 | #include "sparseSketcher.h"
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <time.h>
 7 | 
 8 | 
 9 | void test_vs_sparsity(){
10 |   int n = 10000;
11 |   int dim = 1000;
12 |   int ell = 50;
13 |   int k = 10;
14 |   int exp_no = 6;
15 |   int var_set[] = {0.01 * dim, 0.05 * dim, 0.1*dim, 0.3*dim, 0.5*dim, 0.7*dim}; 
16 |   //{0.005 * dim, 0.01 * dim, 0.05 * dim, 0.1*dim, 0.3*dim, 0.5*dim};
17 | 
18 |   double start, end, cpu_time_used;
19 |   SparseMatrix A;
20 |   SparseVector arr[n];
21 |   SparseSketcher sfd;
22 |   FrequentDirections fd;
23 | 
24 |   double sfd_cov_err[exp_no], sfd_proj_err[exp_no], sfd_time[exp_no];
25 |   double fd_cov_err[exp_no], fd_proj_err[exp_no], fd_time[exp_no];
26 | 
27 |   double tailSquaredFrob;
28 | 
29 |   init_sparseSketcher(&sfd, ell, dim);
30 |   init_fd(&fd, ell, dim);
31 |   
32 | 
33 |   for(int i=0; i<exp_no; i++){
34 |     printf("i = %d\n", i);
35 | 
36 |     sfd_time[i] = 0;
37 |     fd_time[i] = 0;
38 |     init_sparseMatrix(&A, dim, n);
39 | 
40 |     for (int j=0; j < n; j++){
41 |       skew_init_sparseVector(&arr[j], dim, var_set[i], (int) (1.5 *var_set[i]), 0.9);
42 |       append_to_sparseMatrix(&A, &arr[j]);
43 | 
44 |       // SFD
45 |       start = clock();
46 |       append_to_sparseSketcher(&sfd, &arr[j]);
47 |       sfd_time[i] += (double) (clock() - start);
48 | 
49 |       //FD
50 |       start = clock();
51 |       append_to_fd(&fd, &arr[j]);
52 |       fd_time[i] += (double) (clock() - start); 
53 |     }
54 |     
55 |     tailSquaredFrob = topRank(&A, k);
56 |     
57 |     //SFD
58 |     start = clock();
59 |     get_sparseSketch(&sfd);
60 |     sfd_time[i] += (double) (clock() - start);
61 |     sfd_time[i] = sfd_time[i] / CLOCKS_PER_SEC;
62 |     sfd_cov_err[i] = computeRelCovErr(&A, sfd.sketch, ell, dim);
63 |     sfd_proj_err[i] = computeRelProjErr(&A, sfd.sketch, ell, dim, k, tailSquaredFrob);
64 | 
65 |     
66 |     //FD
67 |     start = clock();
68 |     get_fdSketch(&fd);
69 |     fd_time[i] += (double) (clock() - start);
70 |     fd_time[i] = fd_time[i] / CLOCKS_PER_SEC;
71 |     fd_cov_err[i] = computeRelCovErr(&A, fd.sketch, ell, dim);
72 |     fd_proj_err[i] = computeRelProjErr(&A, fd.sketch, ell, dim, k, tailSquaredFrob);  
73 |   }  
74 | 
75 |   printf("SFD:\n");
76 |   print_one_dim_double("\'proj\':", sfd_proj_err, exp_no);
77 |   print_one_dim_double("\'cov\':", sfd_cov_err, exp_no);  
78 |   print_one_dim_double("\'time\':", sfd_time, exp_no);
79 | 
80 |   print_one_dim_double("\'proj\':", fd_proj_err, exp_no);    
81 |   print_one_dim_double("\'cov\':", fd_cov_err, exp_no);
82 |   print_one_dim_double("\'time\':", fd_time, exp_no);
83 | 
84 | }
85 | 


--------------------------------------------------------------------------------
/experiments/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/frequent-directions-experiments/06ecc4a1513c9b83c0bda3de1d2cb5ded468e3a0/experiments/__init__.py


--------------------------------------------------------------------------------
/experiments/compareApproximationErrors.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from itertools import product
 3 | from time import time as timer
 4 | from numpy.linalg import svd
 5 | from numpy.linalg import norm
 6 | from numpy import dot
 7 | from numpy import zeros
 8 | from numpy import cov as covariance
 9 | 
10 | sys.path.append("../sketch")  # needed for imports
11 | from utils.syntheticDataMaker import SyntheticDataMaker
12 | from utils.common import truncateSVD
13 | import bruteForce, frequentDirections, rowSampler, randomProjections, randomSums
14 | 
15 | 
if __name__ == "__main__":
    # Compare the approximation quality of several streaming sketchers on
    # synthetic data.  For every (sketcher, ell) pair this prints the relative
    # covariance error ||A^T A - B^T B||_2 / ||A||_F^2 and the rank-k
    # projection error relative to the optimal rank-k approximation.
    sketcherClasses = [
        bruteForce.BruteForce,
        rowSampler.RowSampler,
        randomProjections.RandomProjections,
        randomSums.RandomSums,
        frequentDirections.FrequentDirections,
    ]
    ns = [500]
    ds = [100]
    ells = range(10, 101, 10)
    ks = [5]
    rounds = 1

    for (n, d, k) in product(ns, ds, ks):
        data_maker = SyntheticDataMaker()
        data_maker.initBeforeMake(d, k, signal_to_noise_ratio=10.0)
        A = data_maker.makeMatrix(n)  # n * d matrix

        # The sketch B approximates the Gram matrix A^T A.  numpy.cov would
        # mean-centre the data and divide by n - 1, which is a different
        # matrix, so the product is formed explicitly.
        ATA = dot(A.transpose(), A)
        squared_frob_A = norm(A, "fro") ** 2
        A_rank_k = truncateSVD(A, k)
        # Depends only on A and k, so compute it once per data set.
        opt_rank_k_err = norm(A - A_rank_k, "fro") ** 2

        for (sketcherClass, ell, r) in product(sketcherClasses, ells, range(rounds)):
            if ell > d / 2:
                continue

            sketcher = sketcherClass(d, ell)
            for row in A:
                sketcher.append(row)

            sketch = sketcher.get()

            diff = ATA - dot(sketch.transpose(), sketch)
            relative_cov_err = norm(diff, 2) / squared_frob_A

            # Project A onto the top-k right singular vectors of the sketch.
            [u, s, vt] = svd(sketch, full_matrices=False)
            vt = vt[:k, :]
            projection = dot(A, dot(vt.transpose(), vt))
            proj_err = norm(A - projection, "fro") ** 2
            relative_proj_err = float(proj_err) / float(opt_rank_k_err)

            print(sketcher.class_name, relative_cov_err, relative_proj_err)
60 | 


--------------------------------------------------------------------------------
/experiments/compareRunningTimes.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from itertools import product
 3 | from time import time as timer
 4 | from numpy import cov as covariance
 5 | 
 6 | sys.path.append("../sketch")  # needed for imports
 7 | from utils.syntheticDataMaker import SyntheticDataMaker
 8 | import bruteForce, frequentDirections, rowSampler, randomProjections, randomSums
 9 | 
if __name__ == "__main__":
    # Time how long each sketcher needs to consume all rows of a synthetic
    # matrix, for every sketch size that is at most half the dimension.
    sketcherClasses = [
        bruteForce.BruteForce,
        rowSampler.RowSampler,
        randomProjections.RandomProjections,
        randomSums.RandomSums,
        frequentDirections.FrequentDirections,
    ]
    ns = [1000]
    ds = [100]
    ells = range(10, 101, 10)
    ks = [5]
    rounds = 1

    for (n, d, k) in product(ns, ds, ks):
        data_maker = SyntheticDataMaker()
        data_maker.initBeforeMake(d, k, signal_to_noise_ratio=10.0)
        A = data_maker.makeMatrix(n)  # n * d matrix

        for (sketcherClass, ell, r) in product(sketcherClasses, ells, range(rounds)):
            if ell <= d / 2:
                sketcher = sketcherClass(d, ell)

                # only the append calls are timed
                t_start = timer()
                for row in A:
                    sketcher.append(row)
                t_end = timer()

                totalSketchTime = t_end - t_start
                print(sketcher.class_name, totalSketchTime)
41 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/__init__.py:
--------------------------------------------------------------------------------
# Package initialiser: imports every submodule so it is reachable as an
# attribute of the package, and lists them in __all__.
from __future__ import absolute_import
from . import blockPower
from . import bruteForce
from . import entrySampler
# NOTE(review): exampleUsage has no `if __name__ == "__main__"` guard, so
# importing the package runs the whole example (data generation and sketching).
from . import exampleUsage
from . import frequentDirections
from . import matrixSketcherBase
from . import randomProjections
from . import randomSums
from . import rowSampler
from . import sparseMatrix
from . import sparseSketcher_sparseMat
from . import sparseSketcher
from . import sparseVector
from . import utils

# Names exported by `from frequent_directions_experiments import *`.
__all__ = [
    "blockPower",
    "bruteForce",
    "entrySampler",
    "exampleUsage",
    "frequentDirections",
    "matrixSketcherBase",
    "randomProjections",
    "randomSums",
    "rowSampler",
    "sparseMatrix",
    "sparseSketcher_sparseMat",
    "sparseSketcher",
    "sparseVector",
    "utils",
]
33 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/blockPower.py:
--------------------------------------------------------------------------------
 1 | # import numpy, scipy
 2 | from __future__ import absolute_import
 3 | from __future__ import print_function
 4 | from numpy.random import randn
 5 | from numpy import ceil, log, zeros
 6 | from numpy.linalg import qr, svd
 7 | 
 8 | 
def blockpower(sparseMat, ell, eps=1):
    """Block power iteration on a sparse matrix.

    Starts from a random Gaussian d x ell block, applies
    ``sparseMat.covarianceMult`` to it ``10 * ceil(log(n / eps) / eps)``
    times, multiplies the result by the matrix via ``sparseMat.mult``,
    orthonormalises with QR and finishes with an SVD.

    Args:
        sparseMat: object providing ``getShape()`` (returning ``(n, d)``),
            ``covarianceMult(X)`` and ``mult(X)``.
        ell: number of columns in the iterated block.
        eps: accuracy parameter controlling the number of iterations.

    Returns:
        ``(S, U[:, :ell].transpose())`` from the final SVD.

    NOTE(review): the line computing ``M`` refers to ``mat``, which is not
    defined anywhere in this function or module, so calling the function
    raises NameError.  It needs the product of the transposed input matrix
    with ``Q``; confirm which method of ``sparseMat`` provides that.
    """
    n, d = sparseMat.getShape()
    init_mat = randn(d, ell)
    num_of_iter = int(
        10 * ceil(log(n / eps) / eps)
    )  # constant 10 should be found experimentally based on eps

    for i in range(num_of_iter):
        # presumably computes A^T (A X), as the two commented lines below spell out
        init_mat = sparseMat.covarianceMult(init_mat)
        # K = mat.dot(init_mat)
        # init_mat = (mat.transpose()).dot(K)

    K = sparseMat.mult(init_mat)

    [Q, _] = qr(K)
    # M = (Q.transpose()).dot(mat)
    M = (mat.transpose()).dot(Q)  # computing transpose of what we need

    [U, S, _] = svd(M, full_matrices=False)

    return S, U[:, :ell].transpose()  # U is ell*d
    # return (U[:,:ell].transpose()).dot(Q.transpose()), S[:ell] #this step might violate sing val bound we want
30 |     # return (U[:,:ell].transpose()).dot(Q.transpose()), S[:ell] #this step might violate sing val bound we want
31 | 
32 | 
if __name__ == "__main__":
    # Ad-hoc check: apply A^T A to the computed block V, renormalise the
    # columns and print the squared distance to V.
    # NOTE(review): this script cannot run as written.  `numpy` is never
    # imported in this module (only a commented-out import exists at the top)
    # and no `BlockPower` class is defined here; the module only defines the
    # function `blockpower`.
    A = numpy.random.randn(500, 300)
    bpm = BlockPower()
    V = bpm.svds(A, 20)

    Vnew = numpy.dot(numpy.transpose(A), numpy.dot(A, V))

    # normalise each column of Vnew to unit length
    for j in range(20):
        z = numpy.linalg.norm(Vnew[:, j])
        Vnew[:, j] = Vnew[:, j] / z

    print(numpy.linalg.norm(Vnew - V) ** 2)
45 | 
46 |     # print numpy.dot(numpy.transpose(V),V)
47 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/bruteForce.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from numpy import zeros, dot, outer, diag, sqrt
 3 | from numpy.linalg import svd
 4 | from .matrixSketcherBase import MatrixSketcherBase
 5 | 
 6 | 
 7 | class BruteForce(MatrixSketcherBase):
 8 |     def __init__(self, d, ell):
 9 |         self.d = d
10 |         self.ell = ell
11 |         self.class_name = "BruteForce"
12 |         self.covariance = zeros((self.d, self.d))
13 | 
14 |     def append(self, vector):
15 |         self.covariance += outer(vector, vector)
16 | 
17 |     def get(self):
18 |         (U, s, Vt) = svd(self.covariance)
19 |         return dot(diag(sqrt(s[: self.ell])), Vt[: self.ell, :])
20 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/entrySampler.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from scipy.sparse import dok_matrix
 3 | from scipy import float32
 4 | from scipy.sparse.linalg import svds
 5 | from numpy import dot, diag, sqrt
 6 | 
 7 | from .utils.reservoirSampler import ReservoirSampler
 8 | 
 9 | 
class EntrySampler:
    """Sketcher that keeps a weighted sample of individual matrix entries.

    Every entry of every appended row is offered to a reservoir sampler of
    size ``d * ell`` with weight ``abs(value)``.  ``get`` rebuilds a sparse
    matrix from the sampled entries and returns an ell x d sketch computed
    from the top singular triplets of its Gram matrix.
    """

    def __init__(self, d, ell):
        self.class_name = "EntrySampler"
        self.d = d
        self.ell = ell
        self.nnz = d * ell  # number of entries the reservoir keeps
        self.rows = 0  # number of rows appended so far
        self.sampler = ReservoirSampler(self.nnz)

    def append(self, v):
        """Offer every entry of row ``v`` to the sampler, weighted by its magnitude."""
        for (col, val) in enumerate(v):
            self.sampler.add((self.rows, col, val), abs(val))
        self.rows += 1

    def get(self):
        """Return the ell x d sketch built from the sampled entries."""
        B = dok_matrix((self.rows, self.d), dtype=float32)
        for ((row, col, val), p) in self.sampler.get(with_probabilities=True):
            # rescale each sampled value by its sampling probability and sample size
            B[row, col] += val / (p * self.nnz)

        # numpy.dot is not aware of scipy sparse matrices; use the sparse
        # product method on a CSR copy, which is the supported way to multiply.
        B = B.tocsr()
        covariance = B.transpose().dot(B)

        (_, s, Vt) = svds(
            covariance, k=self.ell, maxiter=50, return_singular_vectors=True
        )
        return dot(diag(sqrt(s[: self.ell])), Vt[: self.ell, :])
33 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/exampleUsage.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | import sys
 3 | from numpy.linalg import norm
 4 | from numpy import dot
 5 | 
 6 | from .utils.syntheticDataMaker import SyntheticDataMaker
 7 | from .frequentDirections import FrequentDirections
 8 | 
# Example: stream n synthetic rows through a FrequentDirections sketcher.
n = 500  # number of rows fed to the sketcher
d = 100  # length of each row
ell = 20  # sketch size handed to FrequentDirections
k = 5  # signal dimension handed to the synthetic data maker

# this is only needed for generating input vectors
dataMaker = SyntheticDataMaker()
dataMaker.initBeforeMake(d, k, signal_to_noise_ratio=10.0)

# This is where the sketching actually happens
sketcher = FrequentDirections(d, ell)
for i in range(n):
    row = dataMaker.makeRow()
    sketcher.append(row)
sketch = sketcher.get()

# Here is where you do something with the sketch.
# The sketch is an ell by d matrix.
# For example, you can compute an approximate covariance of the input
# matrix like this:

approxCovarianceMatrix = dot(sketch.transpose(), sketch)
31 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/frequentDirections.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from numpy import zeros, max, sqrt, isnan, isinf, dot, diag, count_nonzero
 3 | from numpy.linalg import svd, linalg
 4 | from scipy.linalg import svd as scipy_svd
 5 | from scipy.sparse.linalg import svds as scipy_svds
 6 | 
 7 | from .matrixSketcherBase import MatrixSketcherBase
 8 | 
 9 | 
class FrequentDirections(MatrixSketcherBase):
    """Frequent Directions sketch kept in a ``2 * ell`` by ``d`` buffer.

    Rows are written into the buffer one by one. When the buffer is full it
    is "rotated": an SVD is taken, the squared singular values are reduced by
    the ``ell``-th squared singular value and the result is written back into
    the first ``ell`` rows, freeing the remaining rows for new input.
    """

    def __init__(self, d, ell):
        """Create an empty sketch for rows of length ``d`` with ``ell`` sketch rows."""
        self.class_name = "FrequentDirections"
        self.d = d
        self.ell = ell
        self.m = 2 * self.ell  # number of buffer rows
        self._sketch = zeros((self.m, self.d))
        self.nextZeroRow = 0  # index of the next free buffer row

    def append(self, vector):
        """Insert one row; all-zero rows are ignored."""
        if count_nonzero(vector) == 0:
            return

        if self.nextZeroRow >= self.m:
            self.__rotate__()

        self._sketch[self.nextZeroRow, :] = vector
        self.nextZeroRow += 1

    def __rotate__(self):
        """Compress the buffer into at most ``ell`` rows via a shrunk SVD."""
        try:
            [_, s, Vt] = svd(self._sketch, full_matrices=False)
        except linalg.LinAlgError:
            # Fall back to scipy's SVD when numpy's fails.
            [_, s, Vt] = scipy_svd(self._sketch, full_matrices=False)

        if len(s) >= self.ell:
            sShrunk = sqrt(s[: self.ell] ** 2 - s[self.ell - 1] ** 2)
            self._sketch[: self.ell, :] = dot(diag(sShrunk), Vt[: self.ell, :])
            self._sketch[self.ell :, :] = 0
            self.nextZeroRow = self.ell
        else:
            # Fewer singular values than ell: keep them all, unshrunk.
            self._sketch[: len(s), :] = dot(diag(s), Vt[: len(s), :])
            self._sketch[len(s) :, :] = 0
            self.nextZeroRow = len(s)

    def get(self):
        """Return the sketch as an ``ell`` by ``d`` array.

        Rows appended since the last rotation may sit in the lower half of
        the buffer. They are folded in with one more rotation first, so that
        slicing off the top ``ell`` rows does not silently discard them.
        """
        if self.nextZeroRow > self.ell:
            self.__rotate__()
        return self._sketch[: self.ell, :]
48 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/matrixSketcherBase.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from numpy import zeros
 3 | 
 4 | 
 5 | class MatrixSketcherBase:
 6 |     def __init__(self, d, ell):
 7 |         self.d = d
 8 |         self.ell = ell
 9 |         self._sketch = zeros((self.ell, self.d))
10 | 
11 |     # Appending a row vector to sketch
12 |     def append(self, vector):
13 |         pass
14 | 
15 |     # Convenient looping numpy matrices row by row
16 |     def extend(self, vectors):
17 |         for vector in vectors:
18 |             self.append(vector)
19 | 
20 |     # returns the sketch matrix
21 |     def get(self):
22 |         return self._sketch
23 | 
24 |     # Convenience support for the += operator  append
25 |     def __iadd__(self, vector):
26 |         self.append(vector)
27 |         return self
28 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/randomProjections.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from numpy import outer, sqrt
 3 | from numpy.random import choice
 4 | from .matrixSketcherBase import MatrixSketcherBase
 5 | 
 6 | 
 7 | class RandomProjections(MatrixSketcherBase):
 8 |     def __init__(self, d, ell):
 9 |         MatrixSketcherBase.__init__(self, d, ell)
10 |         self.class_name = "RandomProjections"
11 |         self.rescaled_signs = [-1.0, 1.0] / sqrt(self.ell)
12 | 
13 |     def append(self, vector):
14 |         randomVector = choice(self.rescaled_signs, self.ell)
15 |         self._sketch += outer(randomVector, vector)
16 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/randomSums.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from numpy.random import randint, choice
 3 | from .matrixSketcherBase import MatrixSketcherBase
 4 | 
 5 | 
 6 | class RandomSums(MatrixSketcherBase):
 7 |     def __init__(self, d, ell):
 8 |         MatrixSketcherBase.__init__(self, d, ell)
 9 |         self.class_name = "RandomSums"
10 |         self.signs = [1.0, -1.0]
11 | 
12 |     def append(self, vector):
13 |         row = randint(self.ell)
14 |         sign = choice(self.signs)
15 |         # v = (sign*vector).tolist()
16 |         # self._sketch[row,:] += v[0]
17 |         self._sketch[row, :] += sign * vector
18 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/rowSampler.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | import numpy
 3 | from numpy.linalg import norm
 4 | from random import random
 5 | from .matrixSketcherBase import MatrixSketcherBase
 6 | 
 7 | 
 8 | class RowSampler(MatrixSketcherBase):
 9 |     def __init__(self, d, ell):
10 |         MatrixSketcherBase.__init__(self, d, ell)
11 |         self.class_name = "RowSampler"
12 |         self.samplers = [singleItemSampler() for i in range(self.ell)]
13 | 
14 |     def append(self, vector):
15 |         row_norm_square = norm(vector) ** 2
16 |         for i in range(self.ell):
17 |             self.samplers[i].add(vector, row_norm_square)
18 | 
19 |     def get(self):
20 |         for (i, sampler) in enumerate(self.samplers):
21 |             p = sampler.item_probability
22 |             row = sampler.item
23 |             if row is not None:
24 |                 self._sketch[i, :] = row / (numpy.sqrt(p * float(self.ell)))
25 |         return self._sketch
26 | 
27 | 
class singleItemSampler:
    """Keep one item from a weighted stream, together with its probability."""

    def __init__(self):
        """Start empty: no item, zero weight, zero total weight."""
        self.item = None
        self.item_weight = 0.0
        self.item_probability = 0.0
        self.sum_w = 0.0
        self.machine_precision = 1e-10

    def add(self, item, w=1):
        """Offer ``item`` with weight ``w``; non-positive weights are ignored."""
        w = float(w)
        if w <= 0.0:
            return

        self.sum_w += w
        p = w / max(self.sum_w, self.machine_precision)

        # The random draw happens on every call, before the emptiness check.
        take_new = random() < p or self.item is None
        if not take_new:
            # The held item survives this round with probability 1 - p.
            self.item_probability = self.item_probability * (1.0 - p)
            return

        self.item = item
        self.item_weight = w
        self.item_probability = p

    def get(self):
        """Return ``(item, item_weight, item_probability)``."""
        return (self.item, self.item_weight, self.item_probability)
51 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/sparseMatrix.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import print_function
  3 | import sys
  4 | from numpy import (
  5 |     ceil,
  6 |     log,
  7 |     array,
  8 |     sum,
  9 |     float32,
 10 |     uint32,
 11 |     zeros,
 12 |     empty,
 13 |     arange,
 14 |     concatenate,
 15 |     sqrt,
 16 |     diag,
 17 | )
 18 | from .sparseVector import SparseVector
 19 | from scipy.sparse import rand
 20 | from numpy.random import randn
 21 | from .utils.common import *
 22 | from time import time as timer
 23 | from numpy.linalg import qr
 24 | 
 25 | 
 26 | class SparseMatrix:
 27 |     def __init__(self, dim):
 28 |         self.rows = empty((1, dim))
 29 |         self.cols = empty((1, dim))
 30 |         self.values = empty((1, dim))
 31 |         self.nnz = 0  # number of non-zeros
 32 |         self.nextRow = 0
 33 |         self.pointer = 0
 34 |         self.dimension = dim
 35 | 
 36 |     def append(self, vector):
 37 |         if vector.d != self.dimension:
 38 |             print("dimension mismatch: can not append this vector to the matrix")
 39 |             return
 40 | 
 41 |         # extend arrays
 42 |         if self.pointer + vector.nnz > self.rows.shape[1]:
 43 |             z = empty((1, self.rows.shape[1]))
 44 |             self.rows = concatenate((self.rows, z), axis=1)
 45 |             self.cols = concatenate((self.cols, z), axis=1)
 46 |             self.values = concatenate((self.values, z), axis=1)
 47 | 
 48 |         for i in range(vector.nnz):
 49 |             self.rows[0, self.pointer] = self.nextRow
 50 |             self.cols[0, self.pointer] = vector.cols[i]
 51 |             self.values[0, self.pointer] = vector.values[i]
 52 |             self.pointer += 1
 53 | 
 54 |         self.nextRow += 1
 55 |         self.nnz += vector.nnz
 56 | 
 57 |     def getShape(self):
 58 |         return self.nextRow, self.dimension
 59 | 
 60 |     def toDense(self):
 61 |         denseMat = zeros((self.nextRow, self.dimension))
 62 |         rowIndex = self.rows[0, 0]
 63 |         headPtr = 0
 64 | 
 65 |         for ptr in range(self.pointer):
 66 |             if ptr != self.pointer - 1 and self.rows[0, ptr] != rowIndex:
 67 |                 for j in range(headPtr, ptr):
 68 |                     denseMat[rowIndex, self.cols[0, j]] = self.values[0, j]
 69 |                 # resetting
 70 |                 rowIndex = self.rows[0, ptr]
 71 |                 headPtr = ptr
 72 | 
 73 |             elif ptr == self.pointer - 1:
 74 |                 if self.rows[0, ptr] == rowIndex:
 75 |                     for j in range(headPtr, ptr + 1):
 76 |                         denseMat[rowIndex, self.cols[0, j]] = self.values[0, j]
 77 |                 elif self.rows[0, ptr] != rowIndex:
 78 |                     for j in range(headPtr, ptr):
 79 |                         denseMat[rowIndex, self.cols[0, j]] = self.values[0, j]
 80 |                     rowIndex = self.rows[0, ptr]
 81 |                     denseMat[rowIndex, self.cols[0, ptr]] = self.values[0, ptr]
 82 | 
 83 |         return denseMat
 84 | 
 85 |     def sparseShrink(self, ell):
 86 |         Z = self.blockpower(ell, 0.25)
 87 |         ZtA = self.transposeRightMult(Z)
 88 |         [u, s, vt] = svd(ZtA, full_matrices=False)
 89 |         for i in range(len(s)):
 90 |             s[i] = sqrt(s[i] ** 2 - s[-1] ** 2)
 91 |         return diag(s).dot(vt)
 92 | 
 93 |     def blockpower(self, ell, eps=1):
 94 |         n, d = self.getShape()
 95 |         init_mat = randn(d, ell)
 96 |         num_of_iter = int(10 * ceil(log(d / eps) / eps))
 97 | 
 98 |         for i in range(num_of_iter):
 99 |             [init_mat, _] = qr(init_mat)
100 |             init_mat = self.covarianceMult(init_mat)
101 | 
102 |         K = self.leftMult(init_mat)
103 |         [Q, _] = qr(K)
104 |         del K
105 |         del init_mat
106 |         return Q
107 | 
108 |     ## A^TA * denseMat
109 |     def covarianceMult(self, denseMat):
110 |         rowIndex = self.rows[0, 0]
111 |         headPtr = 0
112 |         ptr = 0
113 |         d, ell = denseMat.shape
114 |         temp = zeros((1, ell))
115 |         product = zeros((d, ell))
116 | 
117 |         while ptr != self.pointer:
118 |             headPtr = ptr
119 |             rowIndex = self.rows[0, headPtr]
120 |             del temp
121 |             temp = zeros((1, ell))
122 | 
123 |             while ptr != self.pointer and self.rows[0, ptr] == rowIndex:
124 |                 temp += denseMat[self.cols[0, ptr], :] * self.values[0, ptr]
125 |                 ptr += 1
126 | 
127 |             for j in range(headPtr, ptr):
128 |                 product[self.cols[0, j], :] += self.values[0, j] * temp[0, :]
129 | 
130 |         return product
131 | 
132 |     ## computes G^tA
133 |     def transposeRightMult(self, denseMat):
134 |         ptr = 0
135 |         rowIndex = self.rows[0, ptr]
136 |         m, ell = denseMat.shape
137 |         product = zeros((ell, self.dimension))
138 | 
139 |         while ptr != self.pointer:
140 |             rowIndex = self.rows[0, ptr]
141 |             while ptr != self.pointer and rowIndex == self.rows[0, ptr]:
142 |                 for t in range(ell):
143 |                     product[t, self.cols[0, ptr]] += (
144 |                         self.values[0, ptr] * denseMat[rowIndex, t]
145 |                     )
146 |                 ptr += 1
147 | 
148 |         return product
149 | 
150 |     ## computes A*G
151 |     def leftMult(self, denseMat):
152 |         rowIndex = self.rows[0, 0]
153 |         headPtr = 0
154 |         d, ell = denseMat.shape
155 |         product = zeros((self.nextRow, ell))
156 | 
157 |         for ptr in range(self.pointer):
158 |             # case 1
159 |             if (
160 |                 self.rows[0, ptr] != rowIndex and ptr != self.pointer - 1
161 |             ):  # headPtr -> ptr-1 is one row
162 |                 for j in range(headPtr, ptr):
163 |                     product[rowIndex, :] += (
164 |                         denseMat[self.cols[0, j], :] * self.values[0, j]
165 |                     )
166 |                 # resetting
167 |                 rowIndex = self.rows[0, ptr]
168 |                 headPtr = ptr
169 | 
170 |                 # case 2 and 3
171 |             elif ptr == self.pointer - 1:
172 |                 # case 2
173 |                 if self.rows[0, ptr] == rowIndex:
174 |                     for j in range(headPtr, ptr + 1):
175 |                         product[rowIndex, :] += (
176 |                             denseMat[self.cols[0, j], :] * self.values[0, j]
177 |                         )
178 | 
179 |                     # case 3
180 |                 elif self.rows[0, ptr] != rowIndex:
181 |                     for j in range(headPtr, ptr):
182 |                         product[rowIndex, :] += (
183 |                             denseMat[self.cols[0, j], :] * self.values[0, j]
184 |                         )
185 | 
186 |                     # resetting
187 |                     rowIndex = self.rows[0, ptr]
188 |                     product[rowIndex, :] = (
189 |                         denseMat[self.cols[0, ptr], :] * self.values[0, ptr]
190 |                     )
191 | 
192 |         return product
193 | 
194 | 
# Timing demo: build a random sparse matrix, load it row by row into a
# SparseMatrix and time one sparseShrink call.
if __name__ == "__main__":
    N = 10000  # number of rows
    dimension = 1000  # number of columns
    density = 0.1  # fraction of non-zero entries requested from rand()
    ell = 10  # sketch size passed to sparseShrink

    A = rand(N, dimension, density, format="coo")
    # NOTE(review): cooToSparseVectorsList is expected to arrive through the
    # star import from .utils.common, but the visible version of that module
    # only defines truncateSVD -- confirm where this helper lives.
    svList, flag = cooToSparseVectorsList(A)
    sparseMat = SparseMatrix(dimension)
    for sv in svList:
        sparseMat.append(sv)

    s = timer()
    B = sparseMat.sparseShrink(ell)
    e = timer()
    print("elapsed time in python is ", e - s)

    print(B)
213 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/sparseSketcher.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from numpy import zeros, sqrt, dot, diag, ceil, log
 3 | from numpy.random import randn
 4 | from numpy.linalg import norm, svd, qr, eigh
 5 | from scipy.sparse import lil_matrix as sparse_matrix
 6 | from scipy.sparse import csc_matrix, rand
 7 | 
 8 | from .matrixSketcherBase import MatrixSketcherBase
 9 | 
10 | 
def simIter(matrix, ell):
    """Approximate the top right singular vectors by simultaneous iteration.

    Args:
        matrix: the ``m x d`` input matrix (anything ``csc_matrix`` accepts).
        ell: number of desired right singular vectors.

    Returns:
        ``(Vt, s)``: the transpose of the approximated top right singular
        vectors (one per row, at most ``ell`` rows) and the matching singular
        values, in the same order.
    """
    [m, d] = matrix.shape
    num_of_iter = int(ceil(4 * log(m)))
    init_vectors = randn(m, ell)
    matrix = csc_matrix(matrix)
    matrix_trans = matrix.transpose()

    for _ in range(num_of_iter):
        init_vectors = matrix.dot(matrix_trans.dot(init_vectors))

    [Q, _] = qr(matrix_trans.dot(init_vectors))
    M = matrix.dot(Q)

    # numpy's svd returns (U, s, Vh) with M = U * diag(s) * Vh. Since
    # A ~= M * Q^T = U * diag(s) * (Vh * Q^T), the transposed right singular
    # vectors are Vh * Q^T; Vh must not be transposed again.
    [_, S, Vh] = svd(M, full_matrices=False)

    return Vh[:ell, :].dot(Q.transpose()), S[:ell]
30 | 
31 | 
# sparse frequent directions sketcher
class SparseSketcher(MatrixSketcherBase):
    """Frequent-directions sketcher that buffers sparse rows before shrinking.

    Incoming rows are collected in a ``d x d`` sparse buffer. When the buffer
    runs out of rows or reaches ``2 * ell * d`` non-zeros it is compressed
    with ``simIter`` into the lower half of the dense ``2 * ell x d`` sketch,
    which is then shrunk back into its upper half.
    """

    def __init__(self, d, ell):
        """Allocate the dense sketch and an empty sparse buffer."""
        self.class_name = "SparseSketcher"
        self.d = d
        self.ell = ell
        self._sketch = zeros((2 * self.ell, self.d))
        self.sketch_nextZeroRow = 0

        # Buffer bookkeeping.
        self.buffer_ell = self.d
        self.buffer = sparse_matrix((self.buffer_ell, self.d))
        self.buffer_nnz = 0
        self.buffer_nextZeroRow = 0
        self.buffer_nnz_threshold = 2 * self.ell * self.d

    def append(self, vector):
        """Store one sparse row in the buffer, rotating first if it is full."""
        if vector.nnz == 0:
            return

        out_of_rows = self.buffer_nextZeroRow >= self.buffer_ell
        too_many_nonzeros = self.buffer_nnz >= self.buffer_nnz_threshold
        if out_of_rows or too_many_nonzeros:
            self.__rotate__()

        self.buffer[self.buffer_nextZeroRow, :] = vector
        self.buffer_nnz += vector.nnz
        self.buffer_nextZeroRow += 1

    def __rotate__(self):
        """Fold the buffer into the sketch, reset the buffer, shrink the sketch."""
        ell = self.ell

        # Step 1: compress the buffer and place it in the lower sketch half.
        [Vt, s] = simIter(self.buffer, ell)
        found = len(s)
        if found < ell:
            self._sketch[ell : ell + found, :] = dot(diag(s), Vt[:found, :])
        else:
            shrunk = sqrt(s[:ell] ** 2 - s[ell - 1] ** 2)
            self._sketch[ell:, :] = dot(diag(shrunk), Vt[:ell, :])

        # Step 2: start over with an empty buffer.
        self.buffer = sparse_matrix((self.buffer_ell, self.d))
        self.buffer_nnz = 0
        self.buffer_nextZeroRow = 0

        # Step 3: dense shrink of the whole sketch into its upper half.
        [_, s, Vt] = svd(self._sketch, full_matrices=False)
        found = len(s)
        if found < ell:
            self._sketch[:found, :] = dot(diag(s), Vt[:found, :])
            self._sketch[found:, :] = 0
        else:
            shrunk = sqrt(s[:ell] ** 2 - s[ell - 1] ** 2)
            self._sketch[:ell, :] = dot(diag(shrunk), Vt[:ell, :])
            self._sketch[ell:, :] = 0

    def get(self):
        """Rotate once more to absorb the buffer, then return the top ``ell`` rows."""
        self.__rotate__()
        return self._sketch[: self.ell, :]
93 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/sparseSketcher_sparseMat.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import print_function
  3 | from numpy import zeros, sqrt, dot, diag, ceil, log
  4 | from numpy.random import randn
  5 | from numpy import cov as covariance
  6 | from numpy.linalg import norm, svd, qr, eigh
  7 | from scipy.sparse import lil_matrix, csc_matrix, csr_matrix, dok_matrix, rand
  8 | from time import time as timer
  9 | import pickle
 10 | 
 11 | from .matrixSketcherBase import MatrixSketcherBase
 12 | from .utils.common import truncateSVD
 13 | from .blockPower import blockpower
 14 | from .sparseVector import SparseVector
 15 | from .frequentDirections import FrequentDirections as FD
 16 | 
 17 | from .sparseMatrix import SparseMatrix
 18 | 
 19 | # sparse frequent directions sketcher
 20 | class SparseSketcher(MatrixSketcherBase):
 21 |     def __init__(self, d, ell):
 22 |         self.class_name = "SparseSketcher_sparseMatrix"
 23 |         self.d = d
 24 |         self.ell = ell
 25 |         self._sketch = zeros((2 * self.ell, self.d))
 26 | 
 27 |         self.buffer_nnz_threshold = 2 * self.ell * self.d
 28 |         self.buffer = SparseMatrix(self.buffer_nnz_threshold)
 29 | 
 30 |     def append(self, vector):
 31 |         if self.buffer.nnz >= self.buffer_nnz_threshold:
 32 |             self.__rotate__()
 33 | 
 34 |         self.buffer.append(vector)
 35 | 
 36 |     def __rotate__(self):
 37 |         # First shrink the buffer
 38 |         [s, vt] = blockpower(self.buffer, self.ell)
 39 | 
 40 |         # insert the shrunk part into the sketch
 41 |         if len(s) < self.ell:
 42 |             self._sketch[self.ell : self.ell + len(s), :] = dot(
 43 |                 diag(s), vt[: len(s), :]
 44 |             )
 45 |         else:  # len(s) == self.ell
 46 |             sShrunk = sqrt(s[: self.ell] ** 2 - s[self.ell - 1] ** 2)
 47 |             self._sketch[self.ell :, :] = dot(diag(sShrunk), vt[: self.ell, :])
 48 | 
 49 |         # resetting the buffer matrix
 50 |         del self.buffer
 51 |         self.buffer = lil_matrix((self.d, self.d))
 52 |         self.buffer_nnz = 0
 53 |         self.buffer_nextRow = 0
 54 | 
 55 |         # A dense shrink of the sketch
 56 |         [_, s, vt] = svd(self._sketch, full_matrices=False)
 57 |         if len(s) >= self.ell:
 58 |             sShrunk = sqrt(s[: self.ell] ** 2 - s[self.ell - 1] ** 2)
 59 |             self._sketch[: self.ell, :] = dot(diag(sShrunk), vt[: self.ell, :])
 60 |             self._sketch[self.ell :, :] = 0
 61 |         else:
 62 |             self._sketch[: len(s), :] = dot(diag(s), vt[: len(s), :])
 63 |             self._sketch[len(s) :, :] = 0
 64 | 
 65 |     def get(self):
 66 |         self.__rotate__()
 67 |         return self._sketch[: self.ell, :]
 68 | 
 69 | 
 70 | if __name__ == "__main__":
 71 |     # make input data
 72 |     n = 1000
 73 |     d = 400
 74 |     k = 5
 75 |     ells = list(range(10, 21, 10))
 76 |     density = 0.1
 77 |     A = rand(n, d, density, format="lil")
 78 | 
 79 |     # error computation
 80 |     B = A.todense()
 81 |     ATA = covariance(B.T)
 82 |     squared_frob_A = norm(B, "fro") ** 2
 83 |     A_rank_k = truncateSVD(B, k)
 84 |     opt_rank_k_err = norm(B - A_rank_k, "fro") ** 2
 85 | 
 86 |     for ell in ells:
 87 |         sketcher = SparseSketcher(d, ell)
 88 | 
 89 |         t_start = timer()
 90 |         for sv in A:
 91 |             sketcher.append(sv)
 92 |         t_end = timer()
 93 |         totalSketchTime = t_end - t_start
 94 | 
 95 |         sketch = sketcher.get()
 96 |         #### cov-error #######
 97 |         diff = ATA - dot(sketch.transpose(), sketch)
 98 |         relative_cov_err = float(norm(diff, 2)) / float(squared_frob_A)
 99 | 
100 |         #### proj-error ######
101 |         [u, s, vt] = svd(sketch, full_matrices=False)
102 |         vt = vt[:k, :]
103 |         projection = dot(B, dot(vt.transpose(), vt))
104 |         proj_err = norm(B - projection, "fro") ** 2
105 |         relative_proj_err = float(proj_err) / float(opt_rank_k_err)
106 | 
107 |         print(
108 |             "sparse: ell=",
109 |             ell,
110 |             "time=",
111 |             totalSketchTime,
112 |             "cov-err=",
113 |             relative_cov_err,
114 |             "proj-err=",
115 |             relative_proj_err,
116 |         )
117 | 
118 |         ############### FD ###################################3
119 |         sketcher = FD(d, ell)
120 |         t_start = timer()
121 |         for sv in A:
122 |             sketcher.append(sv)
123 |         t_end = timer()
124 |         totalSketchTime = t_end - t_start
125 | 
126 |         sketch = sketcher.get()
127 |         #### cov-error #######
128 |         diff = ATA - dot(sketch.transpose(), sketch)
129 |         relative_cov_err = float(norm(diff, 2)) / float(squared_frob_A)
130 | 
131 |         #### proj-error ######
132 |         [u, s, vt] = svd(sketch, full_matrices=False)
133 |         vt = vt[:k, :]
134 |         projection = dot(B, dot(vt.transpose(), vt))
135 |         proj_err = norm(B - projection, "fro") ** 2
136 |         relative_proj_err = float(proj_err) / float(opt_rank_k_err)
137 | 
138 |         print(
139 |             "DenseFD: ell=",
140 |             ell,
141 |             "time=",
142 |             totalSketchTime,
143 |             "cov-err=",
144 |             relative_cov_err,
145 |             "proj-err=",
146 |             relative_proj_err,
147 |         )
148 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/sparseVector.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | import sys
 3 | from numpy import array, sum, float32, uint32, zeros
 4 | 
 5 | 
 6 | class SparseVector:
 7 |     def __init__(self, d, kvList):
 8 |         self.d = d
 9 |         kvList = [kv for kv in kvList if kv[0] >= 0 and kv[0] < self.d]
10 |         kvList.sort()
11 |         self.cols = array([kv[0] for kv in kvList], dtype=uint32)
12 |         self.values = array([kv[1] for kv in kvList], dtype=float32)
13 |         self.shape = (1, self.d)
14 |         self.nnz = len(self.cols)
15 | 
16 |         self._normSquare = sum(self.values**2)
17 | 
18 |     def todense(self):
19 |         v = zeros(self.shape)
20 |         for i in range(self.nnz):
21 |             v[0, self.cols[i]] = self.values[i]
22 |         return v
23 | 
24 |     def getNnz(self):
25 |         return self.nnz
26 | 
27 |     def getNormSquare(self):
28 |         return self._normSquare
29 | 
30 |     def distSquare(self, other):
31 |         return self._normSquare + other._normSquare - 2 * self.dot(other)
32 | 
33 | 
# Smoke test: build two sparse vectors and append them to a SparseMatrix.
if __name__ == "__main__":
    # Imported here: sparseMatrix itself imports this module, and the class
    # is only needed for this demo.
    from .sparseMatrix import SparseMatrix

    d = 30
    sv1 = SparseVector(d, [(1, 3.1), (23, 0.1), (13, 13)])
    # Column 43 lies outside [0, d) and is dropped by the constructor.
    sv2 = SparseVector(d, [(12, 3.1), (23, 0.1), (43, -0.4)])

    # SparseMatrix requires the row dimension.
    mat = SparseMatrix(d)
    mat.append(sv1)
    mat.append(sv2)
40 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from . import common
3 | from . import syntheticDataMaker
4 | from . import reservoirSampler
5 | 
6 | __all__ = ["common", "reservoirSampler", "syntheticDataMaker"]
7 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/utils/common.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from numpy.linalg import svd
 3 | from numpy import dot
 4 | from numpy import diagflat
 5 | 
 6 | 
 7 | def truncateSVD(A, k):
 8 |     U, s, Vt = svd(A, full_matrices=False)
 9 |     opt = dot(U[:, 0:k], dot(diagflat(s[0:k]), Vt[0:k, :]))
10 |     return opt
11 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/utils/reservoirSampler.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from __future__ import print_function
 3 | from numpy.random import binomial
 4 | from random import sample
 5 | 
 6 | 
 7 | class ReservoirSampler:
 8 |     def __init__(self, t_paralel_sampleres=1):
 9 |         self.t = t_paralel_sampleres
10 |         self.t_range = list(range(self.t))
11 |         self.items = [None] * self.t
12 |         self.items_weights = [0.0] * self.t
13 | 
14 |         self.item_probability = 0.0
15 |         self.sum_w = 0.0
16 |         self.machine_precision = 1e-10
17 | 
18 |     def add(self, item, w=1):
19 | 
20 |         w = float(w)
21 |         if w <= 0.0:
22 |             return
23 |         self.sum_w += w
24 |         p = w / max(self.sum_w, self.machine_precision)
25 | 
26 |         num_items_to_update = binomial(self.t, p)
27 |         items_to_update = sample(self.t_range, num_items_to_update)
28 | 
29 |         for i in items_to_update:
30 |             self.items[i] = item
31 |             self.items_weights[i] = w
32 | 
33 |     def get(self, with_probabilities=False):
34 |         if with_probabilities:
35 |             probs = [w / self.sum_w for w in self.items_weights]
36 |             return list(zip(self.items, probs))
37 |         else:
38 |             return self.items
39 | 
40 | 
# Demo: feed the items 0..n-1, each weighted by its own value, to a sampler
# with 1000 slots and print the sorted contents.
if __name__ == "__main__":
    n = 1000
    items = list(range(n))
    weights = list(range(n))

    rs = ReservoirSampler(1000)

    for item, weight in zip(items, weights):
        rs.add(item, weight)

    print(sorted(rs.get()))
52 | 


--------------------------------------------------------------------------------
/frequent_directions_experiments/utils/syntheticDataMaker.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | import sys
 3 | from numpy.random import randn
 4 | from numpy.linalg import qr
 5 | import numpy
 6 | 
 7 | 
 8 | class SyntheticDataMaker:
 9 |     def __init__(self):
10 |         self.wasInitForMake = False
11 | 
12 |     def initBeforeMake(
13 |         self,
14 |         dimension,
15 |         signal_dimension=0,
16 |         signal_to_noise_ratio=0,
17 |         signal_singular_value_decay_factor=0,
18 |         signal_singular_value_decay_type="exp",
19 |     ):
20 | 
21 |         self.dimension = dimension
22 |         self.signal_dimension = signal_dimension
23 |         self.signal_to_noise_ratio = signal_to_noise_ratio
24 |         self.signal_singular_value_decay_factor = signal_singular_value_decay_factor
25 |         self.signal_singular_value_decay_type = signal_singular_value_decay_type
26 | 
27 |         # setting a random singular space
28 |         [Q, R] = qr(randn(self.dimension, self.signal_dimension))
29 |         self.signal_row_space = Q.transpose()
30 |         del Q, R
31 | 
32 |         # setting the singular values
33 |         eta = self.signal_singular_value_decay_factor
34 |         if self.signal_singular_value_decay_type == "exp":
35 |             self.signal_singular_values = [
36 |                 numpy.exp(-10 * eta * i / self.signal_dimension)
37 |                 for i in range(self.signal_dimension)
38 |             ]
39 |         elif self.signal_singular_value_decay_type == "lin":
40 |             self.signal_singular_values = [
41 |                 max(1.0 - eta * float(i) / self.signal_dimension, 0.0)
42 |                 for i in range(self.signal_dimension)
43 |             ]
44 |         else:
45 |             self.signal_singular_values = numpy.ones(self.signal_dimension)
46 |         # done initializing
47 |         self.wasInitForMake = True
48 | 
49 |     def makeRow(self):
50 |         if not self.wasInitForMake:
51 |             sys.stderr.write("ERROR: must run initBeforeMake(...) before makeRow()")
52 |             return
53 |         noise = randn(self.dimension)
54 |         signal_coeffs = randn(self.signal_dimension)
55 |         signal = numpy.dot(
56 |             self.signal_singular_values * signal_coeffs, self.signal_row_space
57 |         )
58 |         return signal + noise / self.signal_to_noise_ratio
59 | 
60 |     def makeMatrix(self, n):
61 |         matrix = numpy.zeros((n, self.dimension))
62 |         for i in range(n):
63 |             matrix[i, :] = self.makeRow()
64 |         return matrix
65 | 
66 |     def getSignalRowSpace(self):
67 |         return self.signal_row_space
68 | 
69 |     def __vector_to_string__(self, v):
70 |         s = "%s\n" % (",".join("%.2E" % x for x in v.flatten()))
71 |         return s
72 | 
73 |     def __vector_from_string(self, s):
74 |         v = numpy.array([float(x) for x in s.strip("\n").split(",")])
75 |         return v
76 | 
77 |     def readFromFileIter(self, f=sys.stdin):
78 |         for line in f:
79 |             yield self.__vector_from_string(line)
80 | 
81 |     def writeToFile(self, v, f=sys.stdout):
82 |         f.write(self.__vector_to_string__(v))
83 | 
84 |     def writeToFileIter(self, vs, f=sys.stdout):
85 |         for v in vs:
86 |             f.write(self.__vector_to_string__(v))
87 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """Frequent directions experiments python module."""
2 | 
3 | from setuptools import setup
4 | 
# The packages are listed explicitly. Without a ``packages`` argument the
# outcome depends on the setuptools version: older releases install no
# packages at all, and newer ones fall back to automatic discovery, which is
# not reliable for a repository with several top-level directories.
setup(
    name="frequent_directions_experiments",
    version="0.1.0",
    packages=[
        "frequent_directions_experiments",
        "frequent_directions_experiments.utils",
    ],
)
9 | 


--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
# Run every test module one after another, in a fixed order.
for test_module in \
    testBruteForce \
    testEntrySampler \
    testFrequentDirections \
    testRandomProjections \
    testRandomSums \
    testRowSampler \
    testSparseSketcher
do
    python -m "frequent-directions-experiments.test.${test_module}"
done


--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edoliberty/frequent-directions-experiments/06ecc4a1513c9b83c0bda3de1d2cb5ded468e3a0/test/__init__.py


--------------------------------------------------------------------------------
/test/runtests.sh:
--------------------------------------------------------------------------------
1 | python -m unittest discover


--------------------------------------------------------------------------------
/test/testBruteForce.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import frequent_directions_experiments as fde
 3 | 
 4 | 
 5 | class testBruteForce(unittest.TestCase):
 6 |     def test_running(self):
 7 |         n = 100
 8 |         d = 20
 9 |         ell = 5
10 |         syntheticDataMaker = fde.utils.syntheticDataMaker.SyntheticDataMaker()
11 |         syntheticDataMaker.initBeforeMake(
12 |             d,
13 |             signal_dimension=10,
14 |             signal_to_noise_ratio=5,
15 |             signal_singular_value_decay_factor=1,
16 |             signal_singular_value_decay_type="lin",
17 |         )
18 | 
19 |         sketcher = fde.bruteForce.BruteForce(d, ell)
20 | 
21 |         for i in range(n):
22 |             v = syntheticDataMaker.makeRow()
23 |             sketcher.append(v)
24 |         sketch = sketcher.get()
25 |         self.assertEqual(sketch.shape, (ell, d))
26 | 
27 | 
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
30 | 


--------------------------------------------------------------------------------
/test/testEntrySampler.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import frequent_directions_experiments as fde
 3 | 
 4 | 
 5 | class testEntrySampler(unittest.TestCase):
 6 |     def test_running(self):
 7 |         n = 100
 8 |         d = 20
 9 |         ell = 5
10 |         syntheticDataMaker = fde.utils.syntheticDataMaker.SyntheticDataMaker()
11 |         syntheticDataMaker.initBeforeMake(
12 |             d,
13 |             signal_dimension=10,
14 |             signal_to_noise_ratio=5,
15 |             signal_singular_value_decay_factor=1,
16 |             signal_singular_value_decay_type="lin",
17 |         )
18 | 
19 |         sketcher = fde.entrySampler.EntrySampler(d, ell)
20 | 
21 |         for i in range(n):
22 |             v = syntheticDataMaker.makeRow()
23 |             sketcher.append(v)
24 |         sketch = sketcher.get()
25 |         self.assertEqual(sketch.shape, (ell, d))
26 | 
27 | 
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
30 | 


--------------------------------------------------------------------------------
/test/testFrequentDirections.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import frequent_directions_experiments as fde
 3 | 
 4 | 
 5 | class testFrequentDirections(unittest.TestCase):
 6 |     def test_running(self):
 7 |         n = 100
 8 |         d = 20
 9 |         ell = 5
10 |         syntheticDataMaker = fde.utils.syntheticDataMaker.SyntheticDataMaker()
11 |         syntheticDataMaker.initBeforeMake(
12 |             d,
13 |             signal_dimension=10,
14 |             signal_to_noise_ratio=5,
15 |             signal_singular_value_decay_factor=1,
16 |             signal_singular_value_decay_type="lin",
17 |         )
18 | 
19 |         sketcher = fde.frequentDirections.FrequentDirections(d, ell)
20 | 
21 |         for i in range(n):
22 |             v = syntheticDataMaker.makeRow()
23 |             sketcher.append(v)
24 |         sketch = sketcher.get()
25 |         self.assertEqual(sketch.shape, (ell, d))
26 | 
27 | 
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
30 | 


--------------------------------------------------------------------------------
/test/testRandomProjections.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import frequent_directions_experiments as fde
 3 | 
 4 | 
 5 | class testRandomProjection(unittest.TestCase):
 6 |     def test_running(self):
 7 |         n = 100
 8 |         d = 20
 9 |         ell = 5
10 |         syntheticDataMaker = fde.utils.syntheticDataMaker.SyntheticDataMaker()
11 |         syntheticDataMaker.initBeforeMake(
12 |             d,
13 |             signal_dimension=10,
14 |             signal_to_noise_ratio=5,
15 |             signal_singular_value_decay_factor=1,
16 |             signal_singular_value_decay_type="lin",
17 |         )
18 | 
19 |         sketcher = fde.randomProjections.RandomProjections(d, ell)
20 | 
21 |         for i in range(n):
22 |             v = syntheticDataMaker.makeRow()
23 |             sketcher.append(v)
24 | 
25 |         sketch = sketcher.get()
26 |         self.assertEqual(sketch.shape, (ell, d))
27 | 
28 | 
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
31 | 


--------------------------------------------------------------------------------
/test/testRandomSums.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import frequent_directions_experiments as fde
 3 | 
 4 | 
 5 | class testRandomSums(unittest.TestCase):
 6 |     def test_running(self):
 7 |         n = 100
 8 |         d = 20
 9 |         ell = 5
10 |         syntheticDataMaker = fde.utils.syntheticDataMaker.SyntheticDataMaker()
11 |         syntheticDataMaker.initBeforeMake(
12 |             d,
13 |             signal_dimension=10,
14 |             signal_to_noise_ratio=5,
15 |             signal_singular_value_decay_factor=1,
16 |             signal_singular_value_decay_type="lin",
17 |         )
18 | 
19 |         sketcher = fde.randomSums.RandomSums(d, ell)
20 | 
21 |         for i in range(n):
22 |             v = syntheticDataMaker.makeRow()
23 |             sketcher.append(v)
24 | 
25 |         sketch = sketcher.get()
26 | 
27 |         self.assertEqual(sketch.shape, (ell, d))
28 | 
29 | 
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
32 | 


--------------------------------------------------------------------------------
/test/testRowSampler.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import frequent_directions_experiments as fde
 3 | 
 4 | 
 5 | class testRowSampler(unittest.TestCase):
 6 |     def test_running(self):
 7 |         n = 100
 8 |         d = 20
 9 |         ell = 5
10 |         syntheticDataMaker = fde.utils.syntheticDataMaker.SyntheticDataMaker()
11 |         syntheticDataMaker.initBeforeMake(
12 |             d,
13 |             signal_dimension=10,
14 |             signal_to_noise_ratio=5,
15 |             signal_singular_value_decay_factor=1,
16 |             signal_singular_value_decay_type="lin",
17 |         )
18 | 
19 |         sketcher = fde.rowSampler.RowSampler(d, ell)
20 | 
21 |         for i in range(n):
22 |             v = syntheticDataMaker.makeRow()
23 |             sketcher.append(v)
24 | 
25 |         sketch = sketcher.get()
26 |         self.assertEqual(sketch.shape, (ell, d))
27 | 
28 | 
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
31 | 


--------------------------------------------------------------------------------
/test/testSparseSketcher.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import frequent_directions_experiments as fde
 3 | from scipy.sparse import rand
 4 | 
 5 | 
 6 | class testSparseSketcher(unittest.TestCase):
 7 |     def test_running(self):
 8 |         n = 100
 9 |         d = 20
10 |         ell = 5
11 |         A = rand(n, d, density=0.001, format="lil")
12 |         sketcher = fde.sparseSketcher.SparseSketcher(d, ell)
13 | 
14 |         for v in A:
15 |             sketcher.append(v)
16 |         sketch = sketcher.get()
17 |         self.assertEqual(sketch.shape, (ell, d))
18 | 
19 | 
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
22 | 


--------------------------------------------------------------------------------