├── .gitignore
├── LICENSE
├── README.md
├── examples
    ├── fractional_pca.py
    ├── hinge.py
    ├── huber_pca.py
    ├── likert.py
    ├── mixed.py
    ├── pca_nonneg.py
    ├── pca_nucnorm.py
    ├── pca_test.py
    ├── readme.md
    └── smiley.py
├── glrm
    ├── __init__.py
    ├── convergence.py
    ├── glrm.py
    ├── loss.py
    ├── reg.py
    └── util.py
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Under development
*TODO*

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014 Corinne Horn

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## GLRM

GLRM is a Python package for exploratory data analysis using Generalized Low
Rank Models (GLRMs).

A GLRM seeks factors X and Y such that XY approximates data table A
using an arbitrary error metric (i.e., loss function) for each column of A.
This framework allows for the generalization of principal components analysis
(PCA) to a heterogeneous dataset A, where columns of A contain data with
different data types (e.g., Boolean, ordinal, interval).
GLRM easily handles missing data by choosing a loss of zero for the missing
entries of A.
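
Concretely, if x_i denotes the i-th row of X and y_j the j-th column of Y, a
rank-k GLRM over the set Omega of observed entries solves

    minimize    sum_{(i,j) in Omega} L_j(x_i y_j, A_ij) + r(X) + r~(Y)

over the factors X (m-by-k) and Y (k-by-n), where L_j is the loss chosen for
column j and r, r~ are the regularizers on the factors.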

For more information on GLRMs, see [our
paper](http://www.stanford.edu/~boyd/papers/glrm.html).

This project provides a GLRM object for automatically computing factors X and Y,
decoding XY back into the appropriate domain, and imputing missing entries.

## Installation

    python setup.py install

## Basic usage
The source code for similar problems can be found in the 'examples' folder.

A GLRM model is specified by data table A, loss functions L, regularizers for X and Y, rank k,
and an (optional) list of missing entries.

    from glrm import GLRM

Consider a data table A that is approximately rank k, where the first n1 columns
contain Boolean data, and the next n2 columns contain numerical data.

    m, n1, n2, k = 50, 25, 25, 5
    eta = 0.1 # noise
    A = randn(m,k).dot(randn(k,n1+n2)) + eta*randn(m,n1+n2)
    A_bool = sign(A[:,:n1]) # Boolean data must be labeled as -1, 1
    A_real = A[:,n1:]

We decide to use hinge loss for the Boolean data, and quadratic loss
for the numerical data. The scaling of each loss function
is handled automatically during the initialization of the GLRM object.

    from glrm.loss import QuadraticLoss, HingeLoss

Data A is stored as a list of submatrices, where each submatrix
is associated with a data type. The loss functions associated with each
submatrix are stored similarly.

    A_list = [A_bool, A_real]
    loss_list = [HingeLoss, QuadraticLoss]

To improve generalization error, we choose to use quadratic regularization
on both factors X and Y with weight 0.1. (For no regularization on X and Y, use
ZeroReg.)

    from glrm.reg import QuadraticReg
    regX, regY = QuadraticReg(0.1), QuadraticReg(0.1)

If any entries are corrupted or missing, we store indices of the missing
entries *for each submatrix* in the list format shown above.
For example, if a 4x4 block of data is missing from the center of A,
this corresponds to rows 24-27 and columns 24-25 of submatrix 1,
and rows 24-27 and columns 1-2 of submatrix 2. (Python is 0-indexed.)

    missing1 = [(23, 23), (23, 24), (24, 23), (24, 24), \
                (25, 23), (25, 24), (26, 23), (26, 24)]
    missing2 = [(23, 0), (23, 1), (24, 0), (24, 1), \
                (25, 0), (25, 1), (26, 0), (26, 1)]
    missing_list = [missing1, missing2]

If a GLRM object is not provided a list of missing entries, then it is assumed
that no entries are missing.

[Optional] To specify the tolerance and maximum number of iterations
of the alternating minimization algorithm, create a Convergence object to pass
to the model. The default parameter values are shown below.

    from glrm.convergence import Convergence
    c = Convergence(TOL = 1e-2, max_iters = 1000)

All that remains is to initialize the GLRM model and call fit().

    model = GLRM(A_list, loss_list, regX, regY, k, missing_list = missing_list, converge = c)
    model.fit()

To extract the factors X, Y and impute missing values,

    X, Y = model.factors()
    A_hat = model.predict() # a horizontally concatenated matrix, not a list

To compare our prediction error,

    norm(A_hat - hstack(A_list)) # by hand
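
Since predict() imputes the missing entries, it is often more informative to
measure error over the observed entries only. A minimal sketch (the mask
construction below is ours, assuming the two-submatrix setup from above):

    from numpy import ones
    A_full = hstack(A_list)
    mask = ones(A_full.shape)
    offsets = [0, n1] # column offset of each submatrix within A
    for off, miss in zip(offsets, missing_list):
        for (i, j) in miss:
            mask[i, j + off] = 0
    norm(mask*(A_hat - A_full)) # error over observed entries only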

To view convergence history,

    ch = model.convergence() # grab convergence history of alt min problem
    ch.plot() # view convergence of objective

## Supported loss functions and regularizers

- QuadraticLoss
- HuberLoss
- HingeLoss
- OrdinalLoss
- ZeroReg
- LinearReg
- QuadraticReg

## Developing loss functions and regularizers (not guaranteed to work yet)

- FractionalLoss
- NonnegativeReg

To use NonnegativeReg on either X or Y, you must specify to use proximal
gradient descent on the corresponding subproblem. (Note that the glrm.algs
module referenced below is still under development and is not included in the
package tree.)

    # given A_list, loss_list, k from above
    from glrm.reg import NonnegativeReg
    from glrm.algs import ProxGD

    regX, regY = NonnegativeReg(1.0), NonnegativeReg(1.0)
    model = GLRM(A_list, loss_list, regX, regY, k, algX = ProxGD, algY = ProxGD)
    model.fit()

## Questions/concerns/feedback
Please send messages to Corinne Horn (cehorn at stanford.edu).

--------------------------------------------------------------------------------
/examples/fractional_pca.py:
--------------------------------------------------------------------------------
from glrm.loss import FractionalLoss
from glrm.reg import QuadraticReg
from glrm import GLRM
from glrm.util import pplot
from numpy.random import randn, choice, seed
from numpy import sign, exp
seed(2)

# Generate problem data
m, n, k = 50, 50, 5
eta = 0.1 # noise power
data = exp(randn(m,k).dot(randn(k,n)) + eta*randn(m,n)) + eta*randn(m,n) # noisy rank k

# Initialize model
A = data
loss = FractionalLoss
regX, regY = QuadraticReg(0.1), QuadraticReg(0.1)
glrm_frac = GLRM(A, loss, regX, regY, k)

# Fit
glrm_frac.fit()

# Results
X, Y = glrm_frac.factors()
A_hat = glrm_frac.predict() # returns decode(XY)
ch = glrm_frac.convergence() # convergence history
pplot([A, A_hat, A-A_hat], ["original", "glrm", "error"])

# Now with missing data
# from itertools import product
# missing = list(product(range(int(0.25*m), int(0.75*m)), range(int(0.25*n), int(0.75*n))))
#
# glrm_pca_nn_missing = GLRM(A, loss, regX, regY, k, missing)
# glrm_pca_nn_missing.fit()
# glrm_pca_nn_missing.compare()
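
# NB: examples/readme.md lists this example as not working; FractionalLoss is
# still under development and its implementation in glrm/loss.py is commented
# out, so the import at the top of this file will fail until it is finished.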

--------------------------------------------------------------------------------
/examples/hinge.py:
--------------------------------------------------------------------------------
from glrm.loss import HingeLoss
from glrm.reg import NonnegativeReg, QuadraticReg
from glrm import GLRM
from glrm.util import pplot
from glrm.convergence import Convergence
from numpy.random import randn, choice, seed
from itertools import product
from numpy import sign

# Generate problem data
m, n, k = 100, 100, 10
eta = 0.1 # noise power
X_true, Y_true = randn(m,k), randn(k,n)
data = sign(X_true.dot(Y_true) + eta*randn(m,n)) # noisy rank k

# Initialize model
A = data
loss = HingeLoss
regX, regY = QuadraticReg(0.01), QuadraticReg(0.01)
c = Convergence(TOL=1e-2)
model = GLRM(A, loss, regX, regY, k, converge=c)

# Fit
model.fit(eps=1e-4, max_iters = 1000) # want more precision for hinge loss problem

# Results
X, Y = model.factors()
A_hat = model.predict() # returns decode(XY)
ch = model.convergence() # convergence history
pplot([A, A_hat, A - A_hat], ["original", "glrm", "error"])
#
# # Now with missing data
# missing = list(product(range(int(0.25*m), int(0.75*m)), range(int(0.25*n), int(0.75*n))))
# glrm_nn_missing = GLRM(A, loss, regX, regY, k, missing)
# glrm_nn_missing.fit()
# A_hat = glrm_nn_missing.predict()
# pplot([A, missing, A_hat, A - A_hat], \
#       ["original", "missing", "glrm", "error"])
#

--------------------------------------------------------------------------------
/examples/huber_pca.py:
--------------------------------------------------------------------------------
from glrm.loss import HuberLoss
from glrm.reg import QuadraticReg
from glrm import GLRM
from glrm.util import pplot
from numpy.random import randn, choice, seed
from numpy import sign
from random import sample
from math import sqrt
from itertools import product
from matplotlib import pyplot as plt
seed(1)

# Generate problem data
m, n, k = 50, 50, 5
sym_noise = 0.2*sqrt(k)*randn(m,n)
asym_noise = sqrt(k)*randn(m,n) + 3*abs(sqrt(k)*randn(m,n)) # large, sparse noise
rate = 0.3 # percent of entries that are corrupted by large, outlier noise
corrupted_entries = sample(list(product(range(m), range(n))), int(m*n*rate))
data = randn(m,k).dot(randn(k,n))
A = data + sym_noise
for ij in corrupted_entries: A[ij] += asym_noise[ij]

# Initialize model
loss = HuberLoss
regX, regY = QuadraticReg(0.1), QuadraticReg(0.1)
glrm_huber = GLRM(A, loss, regX, regY, k)

# Fit
glrm_huber.fit()

# Results
X, Y = glrm_huber.factors()
A_hat = glrm_huber.predict() # returns decode(XY)
ch = glrm_huber.convergence() # convergence history
pplot([data, A, A_hat, data-A_hat], ["original", "corrupted", "glrm", "error"])


# Now with missing data
missing = list(product(range(int(0.25*m), int(0.75*m)), range(int(0.25*n), int(0.75*n))))

glrm_huber_missing = GLRM(A, loss, regX, regY, k, missing)
glrm_huber_missing.fit()
A_hat = glrm_huber_missing.predict()
pplot([data, A, missing, A_hat, data-A_hat], ["original", "corrupted", "missing", "glrm", "error"])

--------------------------------------------------------------------------------
/examples/likert.py:
--------------------------------------------------------------------------------
from glrm.loss import OrdinalLoss
from glrm.reg import QuadraticReg
from glrm import GLRM
from glrm.convergence import Convergence
from glrm.util import pplot
from numpy.random import randn, choice, seed
from numpy import sign
from itertools import product
from math import ceil
seed(1)

# Generate problem data
m, n, k = 100, 100, 10
data = randn(m,k).dot(randn(k,n))
data = data - data.min()
data = (data/data.max()*6).round() + 1 # approx rank k
#data = choice(range(7), (m,n)) + 1 # not inherently rank k

# Initialize model
A = data
loss = OrdinalLoss
regX, regY = QuadraticReg(0.1), QuadraticReg(0.1)
glrm_ord = GLRM(A, loss, regX, regY, k)

# Fit
glrm_ord.fit(eps=1e-3, max_iters=1000)

# Results
X, Y = glrm_ord.factors()
A_hat = glrm_ord.predict() # returns decode(XY)
ch = glrm_ord.convergence() # convergence history
pplot([A, A_hat, A-A_hat], ["original", "glrm", "error"])
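
# Note: OrdinalLoss.decode() rounds XY to the nearest integer and clips it to
# the observed range [A.min(), A.max()] (see glrm/loss.py), so A_hat contains
# only valid Likert levels (1-7 here).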

--------------------------------------------------------------------------------
/examples/mixed.py:
--------------------------------------------------------------------------------
from glrm.loss import QuadraticLoss, HingeLoss, OrdinalLoss
from glrm.reg import QuadraticReg
from glrm import GLRM
from glrm.convergence import Convergence
from glrm.util import pplot, unroll_missing
from numpy.random import randn, choice, seed
from itertools import product
from numpy import sign, ceil, hstack
seed(1)

# Generate problem data
m, k = 50, 10
n1 = 25 # cols of numerical data
n2 = 10 # cols of ordinal data
n3 = 25 # cols of boolean data
n = n1+n2+n3
data = randn(m,k).dot(randn(k,n))
data_real = data[:,:n1] # numerical data
data_ord = data[:,n1:n1+n2]
data_ord = data_ord - data_ord.min()
data_ord = (data_ord/data_ord.max()*6 + 1).round() # ordinal data, e.g., Likert scale
data_bool = sign(data[:,n1+n2:])

# Initialize model
A = [data_real, data_ord, data_bool]
loss = [QuadraticLoss, OrdinalLoss, HingeLoss]
regX, regY = QuadraticReg(0.01), QuadraticReg(0.01)
converge = Convergence(TOL = 1e-2, max_iters = 1000) # optional (default TOL = 1e-2)
glrm_mix = GLRM(A, loss, regX, regY, k, converge = converge)

# Fit
glrm_mix.fit()

# Results
X, Y = glrm_mix.factors()
A_hat = glrm_mix.predict() # returns decode(XY)
ch = glrm_mix.convergence() # convergence history
pplot([hstack(A), A_hat, hstack(A)-A_hat], ["original", "glrm", "error"])

# Now with missing data
missing = [list(product(range(35, 50), range(n1-5, n1))),
           list(product(range(35, 50), range(0, n2))),
           list(product(range(35, 50), range(0, n3-5)))]

glrm_mix_missing = GLRM(A, loss, regX, regY, k, missing)
glrm_mix_missing.fit()
A_hat = glrm_mix_missing.predict()

# translate missing list into something that we can plot
new_missing = unroll_missing(missing, [n1, n2, n3])
pplot([hstack(A), new_missing, A_hat, hstack(A)-A_hat], ["original", "missing", "glrm", "error"])
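
# A quick sanity check on imputation (a sketch; `hits` is ours, not part of the
# original example): fraction of held-out Boolean entries whose sign is
# recovered, where n1+n2 shifts submatrix-3 column indices into A_hat.
# hits = [data_bool[i,j] == A_hat[i, n1+n2+j] for (i,j) in missing[2]]
# print sum(hits)/float(len(hits))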

--------------------------------------------------------------------------------
/examples/pca_nonneg.py:
--------------------------------------------------------------------------------
from glrm.loss import QuadraticLoss
from glrm.reg import NonnegativeReg, QuadraticReg
from glrm import GLRM
from glrm.util import pplot
from numpy.random import randn, choice, seed
from itertools import product
from numpy import sign

# Generate problem data
m, n, k = 20, 20, 5
eta = 0.1 # noise power
X_true, Y_true = abs(randn(m,k)), abs(randn(k,n))
data = X_true.dot(Y_true) + eta*randn(m,n) # noisy rank k

# Initialize model
A = data
loss = QuadraticLoss
regX, regY = NonnegativeReg(0.1), NonnegativeReg(0.1)
glrm_nn = GLRM(A, loss, regX, regY, k)

# Fit
glrm_nn.fit()

# Results
X, Y = glrm_nn.factors()
A_hat = glrm_nn.predict() # returns decode(XY)
ch = glrm_nn.convergence() # convergence history
pplot([A, A_hat, A - A_hat], ["original", "glrm", "error"])

# Now with missing data
missing = list(product(range(int(0.25*m), int(0.75*m)), range(int(0.25*n), int(0.75*n))))
glrm_nn_missing = GLRM(A, loss, regX, regY, k, missing)
glrm_nn_missing.fit()
A_hat = glrm_nn_missing.predict()
pplot([A, missing, A_hat, A - A_hat], \
      ["original", "missing", "glrm", "error"])

--------------------------------------------------------------------------------
/examples/pca_nucnorm.py:
--------------------------------------------------------------------------------
from glrm.loss import QuadraticLoss
from glrm.reg import QuadraticReg, ZeroReg
from glrm import GLRM
from glrm.util import pplot
from numpy.random import randn, choice, seed
from itertools import product
from numpy import sign
seed(1)

# Generate problem data
m, n, k = 50, 50, 10
eta = 0.1 # noise power
data = randn(m,k).dot(randn(k,n)) + eta*randn(m,n) # noisy rank k

# Initialize model
A = data
loss = QuadraticLoss
regX, regY = QuadraticReg(0.0001), QuadraticReg(0.0001)
glrm_nn = GLRM(A, loss, regX, regY, k)

# Fit
glrm_nn.fit(eps=1e-4, max_iters=1000)

# Results
X, Y = glrm_nn.factors()
A_hat = glrm_nn.predict() # returns decode(XY)
ch = glrm_nn.convergence() # convergence history
pplot([A, A_hat, A - A_hat], ["original", "glrm", "error"])

# # Now with missing data
# missing = list(product(range(int(0.25*m), int(0.75*m)), range(int(0.25*n), int(0.75*n))))
# glrm_nn_missing = GLRM(A, loss, regX, regY, k, missing)
# glrm_nn_missing.fit()
# A_hat = glrm_nn_missing.predict()
# pplot([A, missing, A_hat, A - A_hat], \
#       ["original", "missing", "glrm", "error"])
#

--------------------------------------------------------------------------------
/examples/pca_test.py:
--------------------------------------------------------------------------------
from numpy import diag, sqrt, tile, hstack, vstack, ones
from numpy.linalg import svd, norm
from numpy.random import randn, seed
from glrm.loss import QuadraticLoss, HuberLoss, HingeLoss
from glrm import GLRM
from glrm.reg import QuadraticReg, LinearReg, NonnegativeReg
seed(1)

def PCA(A, k):
    mean_A = tile(A.mean(0), (A.shape[0],1))
    A0 = A - mean_A

    u, s, v = svd(A0, full_matrices = False)
    u, s, v = u[:,:k], diag(sqrt(s[:k])), v[:k,:]
    X = hstack((u.dot(s), ones((A.shape[0],1))))
    Y = vstack((s.dot(v), A.mean(0)))

    return X, Y
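
# With QuadraticLoss and (near-)zero regularization, the GLRM objective reduces
# to the squared-error criterion of classical PCA on mean-centered data, so the
# two residual norms printed under __main__ below should be close.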

def GLRMfit(A, k, missing=None):
    loss = QuadraticLoss
    regX, regY = LinearReg(0.001), LinearReg(0.001)
    model = GLRM(A, loss, regX, regY, k, missing)
    model.fit(eps=1e-4, max_iters=1000)
    model.converge.plot()
    return model.factors()

if __name__ == '__main__':
    m, n, k = 100, 50, 10
    A = randn(m,n)
    missing = [[(1,1), (3,5), (10, 10)]]
    X, Y = GLRMfit(A, k, missing)
    Xpca, Ypca = PCA(A, k)

    Z = A-X.dot(Y)
    Zpca = A-Xpca.dot(Ypca)
    for (i,j) in missing[0]: Z[i,j], Zpca[i,j] = 0,0
    print norm(Z)
    print norm(Zpca)

--------------------------------------------------------------------------------
/examples/readme.md:
--------------------------------------------------------------------------------
An overview of each of the simulations in this folder.

pca_nucnorm.py
- numerical data
- QuadraticLoss, QuadraticReg
- second example labels center as missing

pca_nonneg.py (still buggy with prox gradient descent tuning parameters)
- numerical data
- QuadraticLoss, NonnegativeReg

huber_pca.py
- approximately low rank numerical data with sparse, high-magnitude noise
- HuberLoss, QuadraticReg
- second example labels center as missing

likert.py
- data table with ordinal data (1-7 integer data)
- OrdinalLoss, QuadraticReg

mixed.py
- data table containing numerical, ordinal (1-7 integer), and Boolean data
- (QuadraticLoss, OrdinalLoss, HingeLoss), QuadraticReg
- second example labels lower right block as missing

smiley.py (somewhat slow)
- Boolean data (in the shape of a smiley face)
- HingeLoss, QuadraticReg

fractional_pca.py (does not work)
- for testing the development of FractionalLoss

--------------------------------------------------------------------------------
/examples/smiley.py:
--------------------------------------------------------------------------------
from glrm.loss import HingeLoss
from glrm.reg import QuadraticReg
from glrm import GLRM
from glrm.convergence import Convergence
from glrm.util import pplot
from numpy.random import randn, choice, seed
from numpy import sign, ones
from itertools import product
seed(1)

# Generate problem data (draw smiley with -1's, 1's)
m, n, k = 500, 500, 8
data = -ones((m, n))
for i,j in product(range(120, 190), range(120, 190)):
    d = (155-i)**2 + (155-j)**2
    if d <= 35**2:
        data[i,j] = 1
        data[i, n-j] = 1 # mirror across the vertical axis
for i,j in product(range(300, 451), range(100, 251)):
    d = (250 - i)**2 + (250-j)**2
    if d <= 200**2 and d >= 150**2:
        data[i,j] = 1
        data[i, n-j] = 1

# Initialize model
A = data
loss = HingeLoss
regX, regY = QuadraticReg(0.1), QuadraticReg(0.1)
converge = Convergence(TOL = 1e-2)
glrm_binary = GLRM(A, loss, regX, regY, k, converge = converge)

# Fit
glrm_binary.fit()

# Results
X, Y = glrm_binary.factors()
A_hat = glrm_binary.predict() # returns decode(XY)
ch = glrm_binary.convergence() # convergence history
pplot([A, A_hat, A - A_hat], ["original", "glrm", "error"])

--------------------------------------------------------------------------------
/glrm/__init__.py:
--------------------------------------------------------------------------------
from glrm import GLRM

--------------------------------------------------------------------------------
/glrm/convergence.py:
--------------------------------------------------------------------------------
from matplotlib import pyplot as plt

class Convergence(object):

    def __init__(self, TOL = 1e-2, max_iters = 1e3):
        self.TOL = TOL
        self.max_iters = max_iters
        self.reset()

    def reset(self):
        self.obj = []
        self.val = []

    def d(self): # return True if converged; use as: if converge.d(): ...
        if len(self) < 2: return False
        if len(self) > self.max_iters:
            print "hit max iters for convergence object"
            return True
        return abs(self.obj[-1] - self.obj[-2])/self.obj[-2] < self.TOL

    def __len__(self):
        return len(self.obj)

    def __str__(self):
        return str(self.obj)

    def __repr__(self):
        return str(self.obj)

    def plot(self):
        plt.plot(self.obj)
        plt.title("model error")
        plt.xlabel("iteration")
        plt.show()
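
# Typical use (see README): pass converge=Convergence(TOL=..., max_iters=...) to
# GLRM; after fit(), model.convergence() returns this object, and .plot() shows
# the objective value recorded at each outer iteration of the alternating
# minimization.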

--------------------------------------------------------------------------------
/glrm/glrm.py:
--------------------------------------------------------------------------------
from convergence import Convergence
from numpy import sqrt, repeat, tile, hstack, array, zeros, ones, diag, asarray, vstack, split, cumsum
from numpy.random import randn
from copy import copy
from numpy.linalg import svd
import cvxpy as cp

# XXX does not support splitting over samples yet (only over features to
# accommodate arbitrary losses by column).

class GLRM(object):

    def __init__(self, A, loss, regX, regY, k, missing_list = None, converge = None, scale=True):

        self.scale = scale
        # Turn everything into lists / convert to correct dimensions
        if not isinstance(A, list): A = [A]
        if not isinstance(loss, list): loss = [loss]
        if not isinstance(regY, list): regY = [regY]
        if len(regY) == 1 and len(regY) < len(loss):
            regY = [copy(regY[0]) for _ in range(len(loss))]
        if missing_list and not isinstance(missing_list[0], list): missing_list = [missing_list]

        loss = [L(Aj) for Aj, L in zip(A, loss)]

        # save necessary info
        self.A, self.k, self.L = A, k, loss
        if converge is None: self.converge = Convergence()
        else: self.converge = converge

        # initialize cvxpy problems
        self._initialize_probs(A, k, missing_list, regX, regY)


    def factors(self):
        # return X, Y as matrices (not lists of submatrices)
        return self.X, hstack(self.Y)

    def convergence(self):
        # convergence information for alternating minimization algorithm
        return self.converge

    def predict(self):
        # return decode(XY), low-rank approximation of A
        return hstack([L.decode(self.X.dot(yj)) for Aj, yj, L in zip(self.A, self.Y, self.L)])

    def fit(self, max_iters=100, eps=1e-2, use_indirect=False, warm_start=False):

        Xv, Yp, pX = self.probX
        Xp, Yv, pY = self.probY
        self.converge.reset()

        # alternating minimization
        while not self.converge.d():
            objX = pX.solve(solver=cp.SCS, eps=eps, max_iters=max_iters,
                    use_indirect=use_indirect, warm_start=warm_start)
            Xp.value[:,:-1] = copy(Xv.value)

            # can parallelize this
            for ypj, yvj, pyj in zip(Yp, Yv, pY):
                objY = pyj.solve(solver=cp.SCS, eps=eps, max_iters=max_iters,
                        use_indirect=use_indirect, warm_start=warm_start)
                ypj.value = copy(yvj.value)
            self.converge.obj.append(objX)

        self._finalize_XY(Xv, Yv)
        return self.X, self.Y
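
    # Note on fit() above: it alternates two convex subproblems. With Y fixed,
    # pX updates X; with X fixed, each problem in pY updates one block of Y.
    # The objective after each X-update is appended to converge.obj, and
    # Convergence.d() stops the loop once the relative decrease falls below
    # TOL (or max_iters is exceeded).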

    def _initialize_probs(self, A, k, missing_list, regX, regY):

        # useful parameters
        m = A[0].shape[0]
        ns = [a.shape[1] for a in A]
        if missing_list is None: missing_list = [[]]*len(self.L)

        # initialize A, X, Y
        B = self._initialize_A(A, missing_list)
        X0, Y0 = self._initialize_XY(B, k, missing_list)
        self.X0, self.Y0 = X0, Y0

        # cvxpy problems
        Xv, Yp = cp.Variable(m,k), [cp.Parameter(k+1,ni) for ni in ns]
        Xp, Yv = cp.Parameter(m,k+1), [cp.Variable(k+1,ni) for ni in ns]
        Xp.value = copy(X0)
        for yj, yj0 in zip(Yp, Y0): yj.value = copy(yj0)
        onesM = cp.Constant(ones((m,1)))

        obj = sum(L(Aj, cp.mul_elemwise(mask, Xv*yj[:-1,:] \
                + onesM*yj[-1:,:]) + offset) + ry(yj[:-1,:])\
                for L, Aj, yj, mask, offset, ry in \
                zip(self.L, A, Yp, self.masks, self.offsets, regY)) + regX(Xv)
        pX = cp.Problem(cp.Minimize(obj))
        pY = [cp.Problem(cp.Minimize(\
                L(Aj, cp.mul_elemwise(mask, Xp*yj) + offset) \
                + ry(yj[:-1,:]) + regX(Xp))) \
                for L, Aj, yj, mask, offset, ry in zip(self.L, A, Yv, self.masks, self.offsets, regY)]

        self.probX = (Xv, Yp, pX)
        self.probY = (Xp, Yv, pY)

    def _initialize_A(self, A, missing_list):
        """ Subtract out means of non-missing, standardize by std. """
        m = A[0].shape[0]
        ns = [a.shape[1] for a in A]
        mean, stdev = [zeros(ni) for ni in ns], [zeros(ni) for ni in ns]
        B, masks, offsets = [], [], []

        # compute stdev for entries that are not missing
        for ni, sv, mu, ai, missing, L in zip(ns, stdev, mean, A, missing_list, self.L):

            # collect non-missing terms
            for j in range(ni):
                elems = array([ai[i,j] for i in range(m) if (i,j) not in missing])
                alpha = cp.Variable()
                # calculate standardized energy per column: alpha is a
                # generalized column "mean" in the geometry of loss L, and
                # sv[j] the average minimal loss per entry (per-column scale)
                sv[j] = cp.Problem(cp.Minimize(\
                        L(elems, alpha*ones(elems.shape)))).solve()/len(elems)
                mu[j] = alpha.value

            offset, mask = tile(mu, (m,1)), tile(sv, (m,1))
            mask[mask == 0] = 1
            bi = (ai-offset)/mask # standardize

            # zero-out missing entries (for XY initialization)
            for (i,j) in missing: bi[i,j], mask[i,j] = 0, 0

            B.append(bi) # save
            masks.append(mask)
            offsets.append(offset)
        self.masks = masks
        self.offsets = offsets
        return B

    def _initialize_XY(self, B, k, missing_list):
        """ Scale by ratio of non-missing, SVD, append col of ones, add noise. """
        A = hstack(bi for bi in B)
        m, n = A.shape

        # normalize entries that are missing
        if self.scale: stdev = A.std(0)
        else: stdev = ones(n)
        mu = A.mean(0)
        C = sqrt(1e-2/k) # XXX may need to be adjusted for larger problems
        A = (A-mu)/stdev + C*randn(m,n)

        # SVD to get initial point
        u, s, v = svd(A, full_matrices = False)
        u, s, v = u[:,:k], diag(sqrt(s[:k])), v[:k,:]
        X0, Y0 = asarray(u.dot(s)), asarray(s.dot(v))*asarray(stdev)

        # append col of ones to X, row of zeros to Y
        X0 = hstack((X0, ones((m,1)))) + C*randn(m,k+1)
        Y0 = vstack((Y0, mu)) + C*randn(k+1,n)

        # split Y0
        ns = cumsum([bj.shape[1] for bj in B])
        if len(ns) == 1: Y0 = [Y0]
        else: Y0 = split(Y0, ns, 1)

        return X0, Y0

    def _finalize_XY(self, Xv, Yv):
        """ Multiply by std, offset by mean """
        m, k = Xv.size
        self.X = asarray(hstack((Xv.value, ones((m,1)))))
        self.Y = [asarray(yj.value)*tile(mask[0,:],(k+1,1)) \
                for yj, mask in zip(Yv, self.masks)]
        for offset, Y in zip(self.offsets, self.Y): Y[-1,:] += offset[0,:]

--------------------------------------------------------------------------------
/glrm/loss.py:
--------------------------------------------------------------------------------
import cvxpy as cp
from numpy import ones, maximum, minimum, sign, floor, ceil

"""
Abstract loss class and canonical loss functions.
"""

# Abstract Loss class
class Loss(object):
    def __init__(self, A): return
    def loss(self, A, U): raise NotImplementedError("Override me!")
    def encode(self, A): return A # default
    def decode(self, A): return A # default
    def __str__(self): return "GLRM Loss: override me!"
    def __call__(self, A, U): return self.loss(A, U)

# Canonical loss functions
class QuadraticLoss(Loss):
    def loss(self, A, U): return cp.norm(cp.Constant(A) - U, "fro")/2.0
    def __str__(self): return "quadratic loss"

class HuberLoss(Loss):
    a = 1.0 # XXX does the value of 'a' propagate if we update it?
    def loss(self, A, U): return cp.sum_entries(cp.huber(cp.Constant(A) - U, self.a))
    def __str__(self): return "huber loss"

# under development; intended elementwise loss: maximum((A - U)/U, (U - A)/A)
# class FractionalLoss(Loss):
#     PRECISION = 1e-10
#     def loss(self, A, U):
#         B = cp.Constant(A)
#         U = cp.max_elemwise(U, self.PRECISION) # to avoid dividing by zero
#         return cp.max_elemwise(cp.mul_elemwise(cp.inv_pos(cp.pos(U)), B-U), ...) # unfinished

class HingeLoss(Loss):
    def loss(self, A, U): return cp.sum_entries(cp.pos(ones(A.shape)-cp.mul_elemwise(cp.Constant(A), U)))
    def decode(self, A): return sign(A) # return back to Boolean
    def __str__(self): return "hinge loss"

class OrdinalLoss(Loss):
    def __init__(self, A):
        self.Amax, self.Amin = A.max(), A.min()
    def loss(self, A, U):
        return cp.sum_entries(sum(cp.mul_elemwise(1*(b >= A),\
                cp.pos(U-b*ones(A.shape))) + cp.mul_elemwise(1*(b < A), \
                cp.pos(-U + (b+1)*ones(A.shape))) for b in range(int(self.Amin), int(self.Amax))))
    def decode(self, A): return maximum(minimum(A.round(), self.Amax), self.Amin)
    def __str__(self): return "ordinal loss"
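
# A user-defined loss follows the same pattern: subclass Loss, implement loss()
# with cvxpy atoms (it must be convex in U), and override decode() if XY should
# be mapped back into the data's domain. An illustrative sketch (not part of
# the original API):
#
# class L1Loss(Loss):
#     def loss(self, A, U): return cp.sum_entries(cp.abs(cp.Constant(A) - U))
#     def __str__(self): return "l1 loss"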

--------------------------------------------------------------------------------
/glrm/reg.py:
--------------------------------------------------------------------------------
from numpy.linalg import norm
from numpy import sign, Inf
from util import shrinkage
import cvxpy as cp

"""
Abstract reg class and canonical regularizer functions.
"""

# Abstract Reg class
class Reg(object):
    # shape indicates how quickly it grows: 0 [flat], 1 [linear], 2 [quadratic+]
    def reg(self, X): raise NotImplementedError("Override me!")
    def __init__(self, nu=1): self.nu = nu # XXX think of a better way to handle nu?
    def __str__(self): return "GLRM Reg: override me!"
    def __call__(self, X): return self.reg(X)

class ZeroReg(Reg):
    def reg(self, X): return 0
    def __str__(self): return "zero reg"

class LinearReg(Reg):
    def reg(self, X): return self.nu*cp.norm1(X)
    def __str__(self): return "linear reg"

class QuadraticReg(Reg):
    def reg(self, X): return self.nu*cp.sum_squares(X)
    def __str__(self): return "quadratic reg"

class NonnegativeReg(Reg):
    # penalty formulation: a large weight on the negative part of X
    # approximates the constraint X >= 0
    def reg(self, X): return 1e10*cp.sum_entries(cp.neg(X))
    def __str__(self): return "nonnegative reg"

# XXX
# - k-indicator reg

--------------------------------------------------------------------------------
/glrm/util.py:
--------------------------------------------------------------------------------
from numpy import ones, round, zeros, expand_dims, Inf, tile, arange, repeat, array
from functools import wraps
from matplotlib import pyplot as plt
import matplotlib.cm as cm
from numpy.ma import masked_where
from numpy import maximum, minimum
import cvxpy as cp

def pplot(As, titles):
    # setup
    try: vmin = min([A.min() for A, t in zip(As[:-1], titles) if "missing" not in t]) # for pixel color reference
    except: vmin = As[0].min()
    try: vmax = max([A.max() for A, t in zip(As[:-1], titles) if "missing" not in t])
    except: vmax = As[0].max()
    my_dpi = 96
    plt.figure(figsize=(1.4*(250*len(As))/my_dpi, 250/my_dpi), dpi = my_dpi)
    for i, (A, title) in enumerate(zip(As, titles)):
        plt.subplot(1, len(As), i+1)
        if i == len(As)-1: vmin, vmax = A.min(), A.max()
        if "missing" in title:
            missing = A
            masked_data = ones(As[i-1].shape)
            for j,k in missing: masked_data[j,k] = 0
            masked_data = masked_where(masked_data > 0.5, masked_data)
            plt.imshow(As[i-1], interpolation = 'nearest', vmin = vmin, vmax = vmax)
            plt.colorbar()
            plt.imshow(masked_data, cmap = cm.binary, interpolation = "nearest")
        else:
            plt.imshow(A, interpolation = 'nearest', vmin = vmin, vmax = vmax)
            plt.colorbar()
        plt.title(title)
        plt.axis("off")

    plt.show()

def unroll_missing(missing, ns):
    # map per-submatrix (row, col) missing indices to column indices in the
    # horizontally concatenated matrix; used by examples/mixed.py for plotting
    missing_unrolled = []
    for i, (MM, n) in enumerate(zip(missing, ns)):
        for m in MM:
            n2 = m[1] + sum([ns[j] for j in range(i)])
            missing_unrolled.append((m[0], n2))
    return missing_unrolled

def shrinkage(a, kappa):
    """ Soft thresholding with parameter kappa. """
    try: return maximum(a - kappa*ones(a.shape), 0) - maximum(-a - kappa*ones(a.shape), 0)
    except: return max(a - kappa, 0) - max(-a - kappa, 0)

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name="GLRM",
    version="0.0.1",
    author="Corinne Horn",
    author_email="cehorn@stanford.edu",
    packages=["glrm"],
    package_dir={"glrm": "glrm"},
    url="http://github.com/cehorn/GLRM/",
    license="MIT",
    install_requires=["numpy >= 1.8",
                      "scipy >= 0.13"] # NB: glrm also imports cvxpy and matplotlib at runtime
)
--------------------------------------------------------------------------------