├── requirements.txt
├── test
│   ├── test_soft_impute.py
│   ├── test_iterative_svd.py
│   ├── test_matrix_factorization.py
│   ├── common.py
│   ├── test_similarity_weighted_averaging.py
│   ├── low_rank_data.py
│   ├── test_knn.py
│   ├── test_solver.py
│   ├── test_nuclear_norm_minimization.py
│   └── test_dictionary_helpers.py
├── .gitignore
├── fancyimpute
│   ├── __init__.py
│   ├── simple_fill.py
│   ├── iterative_svd.py
│   ├── common.py
│   ├── knn.py
│   ├── nuclear_norm_minimization.py
│   ├── matrix_factorization.py
│   ├── similarity_weighted_averaging.py
│   ├── soft_impute.py
│   ├── solver.py
│   ├── dictionary_helpers.py
│   └── scaler.py
├── setup.py
├── .travis.yml
├── experiments
│   ├── readme_example.py
│   └── complete_faces.py
├── README.md
└── LICENSE
/requirements.txt: -------------------------------------------------------------------------------- 1 | knnimpute>=0.1.0 2 | scikit-learn>=0.24.2 3 | # used by NuclearNormMinimization 4 | cvxpy 5 | cvxopt 6 | # for tests 7 | pytest 8 | nose -------------------------------------------------------------------------------- /test/test_soft_impute.py: -------------------------------------------------------------------------------- 1 | from low_rank_data import XY, XY_incomplete, missing_mask 2 | from common import reconstruction_error 3 | 4 | from fancyimpute import SoftImpute 5 | 6 | def test_soft_impute_with_low_rank_random_matrix(): 7 | solver = SoftImpute() 8 | XY_completed = solver.fit_transform(XY_incomplete) 9 | _, missing_mae = reconstruction_error( 10 | XY, 11 | XY_completed, 12 | missing_mask, 13 | name="SoftImpute") 14 | assert missing_mae < 0.1, "Error too high!" 15 | 16 | if __name__ == "__main__": 17 | test_soft_impute_with_low_rank_random_matrix() 18 | -------------------------------------------------------------------------------- /test/test_iterative_svd.py: -------------------------------------------------------------------------------- 1 | from low_rank_data import XY, XY_incomplete, missing_mask 2 | from common import reconstruction_error 3 | 4 | from fancyimpute import IterativeSVD 5 | 6 | def test_iterative_svd_with_low_rank_random_matrix(): 7 | solver = IterativeSVD(rank=3) 8 | XY_completed = solver.fit_transform(XY_incomplete) 9 | _, missing_mae = reconstruction_error( 10 | XY, 11 | XY_completed, 12 | missing_mask, 13 | name="IterativeSVD") 14 | assert missing_mae < 0.1, "Error too high!" 15 | 16 | if __name__ == "__main__": 17 | test_iterative_svd_with_low_rank_random_matrix() 18 | -------------------------------------------------------------------------------- /test/test_matrix_factorization.py: -------------------------------------------------------------------------------- 1 | from fancyimpute import MatrixFactorization 2 | 3 | from low_rank_data import XY, XY_incomplete, missing_mask 4 | from common import reconstruction_error 5 | 6 | 7 | def test_matrix_factorization_with_low_rank_random_matrix(): 8 | solver = MatrixFactorization(learning_rate=0.02, rank=5) 9 | XY_completed = solver.fit_transform(XY_incomplete) 10 | _, missing_mae = reconstruction_error(XY, XY_completed, missing_mask, name="MatrixFactorization") 11 | assert missing_mae < 0.1, "Error too high!"
12 | 13 | 14 | if __name__ == "__main__": 15 | test_matrix_factorization_with_low_rank_random_matrix() 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /test/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def reconstruction_error(XY, XY_completed, missing_mask, name=None): 5 | """ 6 | Returns mean squared error and mean absolute error for 7 | completed matrices. 8 | """ 9 | value_pairs = [ 10 | (i, j, XY[i, j], XY_completed[i, j]) 11 | for i in range(XY.shape[0]) 12 | for j in range(XY.shape[1]) 13 | if missing_mask[i, j] 14 | ] 15 | print("First 10 reconstructed values:") 16 | for (i, j, x, xr) in value_pairs[:10]: 17 | print(" (%d,%d) %0.4f ~= %0.4f" % (i, j, x, xr)) 18 | diffs = [actual - predicted for (_, _, actual, predicted) in value_pairs] 19 | missing_mse = np.mean([diff ** 2 for diff in diffs]) 20 | missing_mae = np.mean([np.abs(diff) for diff in diffs]) 21 | print("%sMSE: %0.4f, MAE: %0.4f" % ( 22 | "" if not name else name + " ", 23 | missing_mse, 24 | missing_mae)) 25 | return missing_mse, missing_mae 26 | -------------------------------------------------------------------------------- /test/test_similarity_weighted_averaging.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nose.tools import eq_ 3 | 4 | from fancyimpute import SimilarityWeightedAveraging 5 | 6 | 7 | def test_similarity_weighted_column_averaging(): 8 | X = np.array([ 9 | [0.1, 0.9, 0.2], 10 | [0.8, 0.1, 0.01], 11 | [0.95, 0.2, 0.3], 12 | [0.14, 0.85, 0.3], 13 | ]) 14 | X_incomplete = X.copy() 15 | X_incomplete[1, 1] = np.nan 16 | X_incomplete[3, 0] = np.nan 17 | missing_mask = np.isnan(X_incomplete) 18 | 19 | solver = SimilarityWeightedAveraging() 20 | X_filled = solver.fit_transform(X_incomplete) 21 | eq_(X_incomplete.shape, X_filled.shape) 22 | diff = (X - X_filled)[missing_mask] 23 | abs_diff = np.abs(diff) 24 | mae = np.mean(abs_diff) 25 | print("MAE", mae) 26 | assert mae < 0.1, "Difference between imputed values! 
MAE=%0.4f" % mae 27 | 28 | if __name__ == "__main__": 29 | test_similarity_weighted_column_averaging() 30 | -------------------------------------------------------------------------------- /fancyimpute/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function, division 2 | 3 | from .solver import Solver 4 | from .nuclear_norm_minimization import NuclearNormMinimization 5 | from .matrix_factorization import MatrixFactorization 6 | from .iterative_svd import IterativeSVD 7 | from .simple_fill import SimpleFill 8 | from .soft_impute import SoftImpute 9 | from .scaler import BiScaler 10 | from .knn import KNN 11 | from .similarity_weighted_averaging import SimilarityWeightedAveraging 12 | 13 | # while iterative imputer is experimental in sklearn, we need this 14 | from sklearn.experimental import enable_iterative_imputer 15 | from sklearn.impute import IterativeImputer 16 | 17 | __version__ = "0.7.0" 18 | 19 | __all__ = [ 20 | "Solver", 21 | "NuclearNormMinimization", 22 | "MatrixFactorization", 23 | "IterativeSVD", 24 | "SimpleFill", 25 | "SoftImpute", 26 | "BiScaler", 27 | "KNN", 28 | "SimilarityWeightedAveraging", 29 | "IterativeImputer", 30 | ] 31 | -------------------------------------------------------------------------------- /test/low_rank_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def create_rank_k_dataset( 5 | n_rows=5, 6 | n_cols=5, 7 | k=3, 8 | fraction_missing=0.1, 9 | symmetric=False, 10 | random_seed=0): 11 | np.random.seed(random_seed) 12 | x = np.random.randn(n_rows, k) 13 | y = np.random.randn(k, n_cols) 14 | 15 | XY = np.dot(x, y) 16 | 17 | if symmetric: 18 | assert n_rows == n_cols 19 | XY = 0.5 * XY + 0.5 * XY.T 20 | 21 | missing_raw_values = np.random.uniform(0, 1, (n_rows, n_cols)) 22 | missing_mask = missing_raw_values < fraction_missing 23 | 24 | XY_incomplete = XY.copy() 25 | # fill missing entries with NaN 26 | XY_incomplete[missing_mask] = np.nan 27 | 28 | return XY, XY_incomplete, missing_mask 29 | 30 | 31 | # create some default data to be shared across tests 32 | XY, XY_incomplete, missing_mask = create_rank_k_dataset( 33 | n_rows=500, 34 | n_cols=10, 35 | k=3, 36 | fraction_missing=0.25) 37 | -------------------------------------------------------------------------------- /test/test_knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nose.tools import eq_ 3 | 4 | from fancyimpute.knn import KNN 5 | 6 | from low_rank_data import XY, XY_incomplete, missing_mask 7 | 8 | 9 | def test_knn(): 10 | # get a baseline error from just zero-filling the missing entries 11 | sad_zero_fill = np.sum(np.abs(XY[missing_mask])) 12 | mad_zero_fill = sad_zero_fill / missing_mask.sum() 13 | print("MAD zero-fill = ", mad_zero_fill) 14 | for k in [5, 15, 30]: 15 | print("-- k=", k) 16 | XY_completed = KNN(k).fit_transform(XY_incomplete) 17 | mask = np.isfinite(XY_completed) 18 | eq_((~mask).sum(), 0) 19 | diff = (XY_completed - XY)[missing_mask] 20 | sad = np.sum(np.abs(diff)) 21 | print("Sum absolute differences", sad) 22 | mad = sad / missing_mask.sum() 23 | print("Mean absolute difference", mad) 24 | # knnImpute should be at least twice as good as just zero fill 25 | assert mad <= (mad_zero_fill / 2.0), \ 26 | "Expected knnImpute to be 2x better than zeroFill (%f) but got MAD=%f" % ( 27 | mad_zero_fill, 28 | mad) 29 | 
-------------------------------------------------------------------------------- /fancyimpute/simple_fill.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from .solver import Solver 14 | 15 | 16 | class SimpleFill(Solver): 17 | def __init__(self, fill_method="mean", min_value=None, max_value=None): 18 | """ 19 | Possible values for fill_method: 20 | "zero": fill missing entries with zeros 21 | "mean": fill with column means 22 | "median": fill with column medians 23 | "min": fill with min value per column 24 | "random": fill with gaussian noise according to mean/std of column 25 | """ 26 | Solver.__init__( 27 | self, 28 | fill_method=fill_method, 29 | min_value=min_value, 30 | max_value=max_value) 31 | 32 | def solve(self, X, missing_mask): 33 | """ 34 | Since X is given to us already filled, just return it. 35 | """ 36 | return X 37 | -------------------------------------------------------------------------------- /test/test_solver.py: -------------------------------------------------------------------------------- 1 | from fancyimpute import Solver, SimpleFill 2 | 3 | from low_rank_data import XY, XY_incomplete, missing_mask 4 | from common import reconstruction_error 5 | 6 | import numpy as np 7 | import warnings 8 | 9 | 10 | def test_prepare_input_data(): 11 | _solver = Solver() 12 | print(_solver) # for improved coverage 13 | # test that a complete matrix returns a warning 14 | X1 = np.zeros((5, 5)) 15 | with warnings.catch_warnings(record=True) as w: 16 | _solver.prepare_input_data(X1) 17 | assert str(w[0].message) == "Input matrix is not missing any values", "Warning is not generated for a complete matrix" 18 | # test that an incomplete matrix does not return a warning 19 | X2 = np.zeros((5, 5)) 20 | X2[2, 3] = None 21 | with warnings.catch_warnings(record=True) as w: 22 | _solver.prepare_input_data(X2) 23 | assert len(w) == 0, "Warning is generated for an incomplete matrix" 24 | 25 | 26 | def test_solver_fill_methods_with_low_rank_random_matrix(): 27 | for fill_method in ("zero", "mean", "median", "min", "random"): 28 | imputer = SimpleFill(fill_method=fill_method) 29 | XY_completed = imputer.fit_transform(XY_incomplete) 30 | _, missing_mae = reconstruction_error( 31 | XY, 32 | XY_completed, 33 | missing_mask, 34 | name="Solver with fill_method=%s" % fill_method) 35 | assert missing_mae < 5, "Error too high for Solver with %s fill method!"
%fill_method 36 | 37 | 38 | if __name__ == "__main__": 39 | test_prepare_input_data() 40 | test_solver_fill_methods_with_low_rank_random_matrix() -------------------------------------------------------------------------------- /test/test_nuclear_norm_minimization.py: -------------------------------------------------------------------------------- 1 | from fancyimpute import NuclearNormMinimization 2 | import numpy as np 3 | 4 | from low_rank_data import XY, XY_incomplete, missing_mask 5 | from common import reconstruction_error 6 | 7 | 8 | def create_rank1_data(symmetric=False): 9 | """ 10 | Returns 5x5 rank1 matrix with missing element at index (1, 2) 11 | """ 12 | x = np.array([1, 2, 3, 4, 5], dtype=float) 13 | y = np.array([0.1, -0.1, 0.2, -0.2, 0.02]) 14 | XY = np.outer(x, y) 15 | XY_missing = XY.copy() 16 | # drop one entry 17 | XY_missing[1, 2] = np.nan 18 | 19 | if not symmetric: 20 | return XY, XY_missing 21 | 22 | # make a symmetric matrix 23 | XYXY = XY.T.dot(XY) 24 | 25 | # drop one entry 26 | XYXY_missing = XYXY.copy() 27 | XYXY_missing[1, 2] = np.nan 28 | return XYXY, XYXY_missing 29 | 30 | 31 | def test_rank1_convex_solver(): 32 | XY_rank1, XY_missing_rank1 = create_rank1_data(symmetric=False) 33 | solver = NuclearNormMinimization(max_iters=50000) 34 | XY_completed_rank1 = solver.fit_transform(XY_missing_rank1) 35 | assert abs(XY_completed_rank1[1, 2] - XY_rank1[1, 2]) < 0.01, \ 36 | "Expected %0.4f but got %0.4f" % ( 37 | XY_rank1[1, 2], XY_completed_rank1[1, 2]) 38 | 39 | 40 | def test_rank1_symmetric_convex_solver(): 41 | XYXY_rank1, XYXY_missing_rank1 = create_rank1_data(symmetric=True) 42 | solver = NuclearNormMinimization(require_symmetric_solution=True) 43 | completed = solver.fit_transform(XYXY_missing_rank1) 44 | assert abs(completed[1, 2] - XYXY_rank1[1, 2]) < 0.01, \ 45 | "Expected %0.4f but got %0.4f" % ( 46 | XYXY_rank1[1, 2], completed[1, 2]) 47 | 48 | 49 | def test_nuclear_norm_minimization_with_low_rank_random_matrix(): 50 | solver = NuclearNormMinimization(max_iters=2000) 51 | XY_completed = solver.fit_transform(XY_incomplete[:100]) 52 | _, missing_mae = reconstruction_error( 53 | XY[:100], XY_completed, missing_mask[:100], name="NuclearNorm") 54 | assert missing_mae < 0.1, "Error too high!" 55 | 56 | if __name__ == "__main__": 57 | test_rank1_convex_solver() 58 | test_rank1_symmetric_convex_solver() 59 | test_nuclear_norm_minimization_with_low_rank_random_matrix() 60 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
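# Annotation (not part of the original file): setup.py reads the long
# description from README.md and parses the package version out of
# fancyimpute/__init__.py. As a sketch of the version regex used below,
#     re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', text, re.MULTILINE)
# applied to a line like
#     __version__ = "0.7.0"
# captures "0.7.0" in group(1).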
12 | 13 | import os 14 | import logging 15 | import re 16 | 17 | from setuptools import setup 18 | 19 | package_name = "fancyimpute" 20 | 21 | 22 | readme_dir = os.path.dirname(__file__) 23 | readme_filename = os.path.join(readme_dir, "README.md") 24 | 25 | try: 26 | with open(readme_filename, "r") as f: 27 | readme_markdown = f.read() 28 | except Exception: 29 | logging.warning("Failed to load %s" % readme_filename) 30 | readme_markdown = "" 31 | 32 | with open("%s/__init__.py" % package_name, "r") as f: 33 | version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(), re.MULTILINE).group(1) 34 | 35 | if __name__ == "__main__": 36 | setup( 37 | name=package_name, 38 | version=version, 39 | description="Matrix completion and feature imputation algorithms", 40 | author="Alex Rubinsteyn, Sergey Feldman", 41 | author_email="alex.rubinsteyn@gmail.com", 42 | url="https://github.com/iskandr/%s" % package_name, 43 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 44 | classifiers=[ 45 | "Development Status :: 4 - Beta", 46 | "Environment :: Console", 47 | "Operating System :: OS Independent", 48 | "Intended Audience :: Science/Research", 49 | "License :: OSI Approved :: Apache Software License", 50 | "Programming Language :: Python", 51 | "Topic :: Scientific/Engineering :: Bio-Informatics", 52 | ], 53 | install_requires=[ 54 | "knnimpute>=0.1.0", 55 | "scikit-learn>=0.24.2", 56 | # used by NuclearNormMinimization 57 | "cvxpy", 58 | "cvxopt", 59 | "pytest", 60 | "nose", 61 | ], 62 | long_description=readme_markdown, 63 | long_description_content_type="text/markdown", 64 | packages=[package_name], 65 | ) 66 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false # Use container-based infrastructure 2 | language: python 3 | env: 4 | global: 5 | - KERAS_BACKEND=tensorflow 6 | - CUDA_VISIBLE_DEVICES="" 7 | matrix: 8 | include: 9 | - python: 3.6 10 | before_install: 11 | # Commands below copied from: http://conda.pydata.org/docs/travis.html 12 | # We do this conditionally because it saves us some downloading if the 13 | # version is the same. 14 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 15 | - bash miniconda.sh -b -p $HOME/miniconda 16 | - export PATH="$HOME/miniconda/bin:$PATH" 17 | # reset the shell's lookup table for program name to path mappings 18 | - hash -r 19 | - conda config --set always_yes yes --set changeps1 no 20 | - conda update -q conda 21 | # Useful for debugging any issues with conda 22 | - conda info -a 23 | addons: 24 | apt: 25 | packages: 26 | # Even though I'm installing cvxopt via conda, still seem to need these: 27 | - liblapack-dev 28 | - libatlas-base-dev 29 | install: 30 | - > 31 | conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION 32 | numpy=1.19.5 keras=2.4.3 scipy nose pandas matplotlib cvxopt scikit-learn 33 | - source activate test-environment 34 | - conda install -c cvxgrp scs=1.2.6 35 | - pip install tensorflow==2.5 36 | - pip install -r requirements.txt 37 | - pip install .
38 | - pip install coveralls 39 | - export PACKAGE_DIR=`pwd` 40 | script: 41 | - cd $PACKAGE_DIR 42 | - nosetests test --with-coverage --cover-package=fancyimpute 43 | after_success: 44 | - coveralls 45 | deploy: 46 | provider: pypi 47 | distributions: sdist 48 | user: openvax 49 | password: # See http://docs.travis-ci.com/user/encryption-keys/ 50 | secure: "AAzTof2771B8tjg2PzCFfctNUbJ6BcQIkH3skpKJvoyWmL0U/fqnGF6zpK0QApJBqTX/xygYhLSfKWZ788FWwyaHW6Hgw8UQ1eHJPurjC9P8O/OWYRhK3r9J7dEPL4+uHfD67C7C+JGCl9BQk8+dRGYDOJ9kx32Eown8wtaoNY7ykLwq/mXsJcm+NjvfJzA7xE4TbGlL1RFDidUkwZ4YgWtGFcfEtVZlO+pEqeprLr/PBQap6K6WPA5yjQKziaqw5DSjMAU5TVDoZgIMu3/uxUJS6EGYs7FvRM961oEFXs9QvhDz+VtKr1kY8wGR1kJXes41NDr8fq9MqBAGcz3yxHeEP1wU1Aukfbw6QUQqQ7rUWFVKSqeVAq7Phirz7RHWslXl9dSoK2REQA3C8sXggmj198YhEq7QufxzTkD4KCDj+jutbMURZI5re6oetLqBz+8zExywXLKgVtTlUnokJ9R5Fnl0E1B4LMHXRvus71+vLQfv2gCt5OWRxzUfUFzpMdkXG2FDmjFGdBw6OWMhS1W+B19ht6Ho4SoN0Tj3YzvZt2AEwShm1i0LA8ITSN1lQdEucdz0kAhvXVRJtcGa4y48/uT9e8gzeyDyANvJ1RAbCsj3/kazucZH9I0b0lRyMiadtj7mfQwnU9MXCJzG7e912sGJDImyiTXqTQfw1Us=" 51 | on: 52 | branch: master 53 | condition: $TRAVIS_PYTHON_VERSION = "3.6" 54 | -------------------------------------------------------------------------------- /experiments/readme_example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from fancyimpute import ( 3 | BiScaler, 4 | KNN, 5 | NuclearNormMinimization, 6 | SoftImpute, 7 | SimpleFill 8 | ) 9 | 10 | n = 200 11 | m = 20 12 | inner_rank = 4 13 | X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m)) 14 | print("Mean squared element: %0.4f" % (X ** 2).mean()) 15 | 16 | # X is a data matrix which we're going to randomly drop entries from 17 | missing_mask = np.random.rand(*X.shape) < 0.1 18 | X_incomplete = X.copy() 19 | # missing entries indicated with NaN 20 | X_incomplete[missing_mask] = np.nan 21 | 22 | meanFill = SimpleFill("mean") 23 | X_filled_mean = meanFill.fit_transform(X_incomplete) 24 | 25 | # Use 3 nearest rows which have a feature to fill in each row's missing features 26 | knnImpute = KNN(k=3) 27 | X_filled_knn = knnImpute.fit_transform(X_incomplete) 28 | 29 | # matrix completion using convex optimization to find low-rank solution 30 | # that still matches observed values. Slow! 
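# (Sketch of what happens under the hood: NuclearNormMinimization asks cvxpy
# to solve
#     minimize ||S||_*   subject to   |S[i, j] - X[i, j]| <= error_tolerance
# over the observed entries, where ||.||_* is the nuclear norm, i.e. the sum
# of singular values, the standard convex surrogate for matrix rank.)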
31 | X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete) 32 | 33 | # Instead of solving the nuclear norm objective directly, instead 34 | # induce sparsity using singular value thresholding 35 | softImpute = SoftImpute() 36 | 37 | # simultaneously normalizes the rows and columns of your observed data, 38 | # sometimes useful for low-rank imputation methods 39 | biscaler = BiScaler() 40 | 41 | # rescale both rows and columns to have zero mean and unit variance 42 | X_incomplete_normalized = biscaler.fit_transform(X_incomplete) 43 | 44 | X_filled_softimpute_normalized = softImpute.fit_transform(X_incomplete_normalized) 45 | X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized) 46 | 47 | X_filled_softimpute_no_biscale = softImpute.fit_transform(X_incomplete) 48 | 49 | meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean() 50 | print("meanFill MSE: %f" % meanfill_mse) 51 | 52 | # print mean squared error for the imputation methods above 53 | nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean() 54 | print("Nuclear norm minimization MSE: %f" % nnm_mse) 55 | 56 | softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean() 57 | print("SoftImpute MSE: %f" % softImpute_mse) 58 | 59 | softImpute_no_biscale_mse = ( 60 | (X_filled_softimpute_no_biscale[missing_mask] - X[missing_mask]) ** 2).mean() 61 | print("SoftImpute without BiScale MSE: %f" % softImpute_no_biscale_mse) 62 | 63 | 64 | knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean() 65 | print("knnImpute MSE: %f" % knn_mse) 66 | -------------------------------------------------------------------------------- /test/test_dictionary_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from fancyimpute.dictionary_helpers import ( 4 | dense_matrix_from_pair_dictionary, 5 | dense_matrix_from_nested_dictionary, 6 | reverse_lookup_from_nested_dict, 7 | transpose_nested_dictionary, 8 | ) 9 | from nose.tools import eq_ 10 | 11 | 12 | def test_dense_matrix_from_nested_dictionary(): 13 | d = { 14 | "a": {"b": 10}, 15 | "b": {"c": 20} 16 | } 17 | X, rows, columns = dense_matrix_from_nested_dictionary(d) 18 | eq_(rows, ["a", "b"]) 19 | eq_(columns, ["b", "c"]) 20 | eq_(X[0, 0], 10) 21 | assert np.isnan(X[0, 1]) 22 | assert np.isnan(X[1, 0]) 23 | eq_(X[1, 1], 20) 24 | 25 | 26 | def test_dense_matrix_from_nested_dictionary_square(): 27 | d = { 28 | "a": {"b": 10}, 29 | "b": {"c": 20} 30 | } 31 | X, rows, columns = dense_matrix_from_nested_dictionary(d, square_result=True) 32 | eq_(rows, ["a", "b", "c"]) 33 | eq_(columns, ["a", "b", "c"]) 34 | assert np.isnan(X[0, 0]) 35 | eq_(X[0, 1], 10) 36 | assert np.isnan(X[0, 2]) 37 | assert np.isnan(X[1, 0]) 38 | assert np.isnan(X[1, 1]) 39 | eq_(X[1, 2], 20) 40 | assert np.isnan(X[2, 0]) 41 | assert np.isnan(X[2, 1]) 42 | assert np.isnan(X[2, 2]) 43 | 44 | 45 | def test_dense_matrix_from_pair_dictionary(): 46 | d = { 47 | ("a", "b"): 10, 48 | ("b", "c"): 20 49 | } 50 | X, rows, columns = dense_matrix_from_pair_dictionary(d) 51 | eq_(rows, ["a", "b"]) 52 | eq_(columns, ["b", "c"]) 53 | eq_(X[0, 0], 10) 54 | assert np.isnan(X[0, 1]) 55 | assert np.isnan(X[1, 0]) 56 | eq_(X[1, 1], 20) 57 | 58 | 59 | def test_dense_matrix_from_pair_dictionary_square(): 60 | d = { 61 | ("a", "b"): 10, 62 | ("b", "c"): 20 63 | } 64 | X, rows, columns = dense_matrix_from_pair_dictionary(d, square_result=True) 65 | eq_(rows, ["a", "b", "c"]) 66 | 
eq_(columns, ["a", "b", "c"]) 67 | assert np.isnan(X[0, 0]) 68 | eq_(X[0, 1], 10) 69 | assert np.isnan(X[0, 2]) 70 | assert np.isnan(X[1, 0]) 71 | assert np.isnan(X[1, 1]) 72 | eq_(X[1, 2], 20) 73 | assert np.isnan(X[2, 0]) 74 | assert np.isnan(X[2, 1]) 75 | assert np.isnan(X[2, 2]) 76 | 77 | 78 | def test_reverse_lookup_from_nested_dict(): 79 | d = { 80 | "a": {"b": 10, "c": 20}, 81 | "b": {"c": 5}, 82 | "z": {"c": 100} 83 | } 84 | reverse_dict = reverse_lookup_from_nested_dict(d) 85 | assert len(reverse_dict.keys()) == 2 86 | assert "c" in reverse_dict 87 | eq_(set(reverse_dict["c"]), {("a", 20), ("b", 5), ("z", 100)}) 88 | assert "b" in reverse_dict 89 | eq_(reverse_dict["b"], [("a", 10)]) 90 | 91 | 92 | def test_transpose_nested_dictionary(): 93 | d = {"a": {"b": 20, "c": 50}, "c": {"q": 500}} 94 | transposed = transpose_nested_dictionary(d) 95 | eq_(set(transposed.keys()), {"b", "c", "q"}) 96 | eq_(transposed["q"], {"c": 500}) 97 | eq_(transposed["c"], {"a": 50}) 98 | eq_(transposed["b"], {"a": 20}) 99 | -------------------------------------------------------------------------------- /fancyimpute/iterative_svd.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License.
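# Annotation (not part of the original module): with the default
# gradual_rank_increase=True, the solve() loop below fits a TruncatedSVD whose
# rank follows min(2 ** i, rank) over iterations i = 0, 1, 2, ... For rank=10
# that schedule is 1, 2, 4, 8, 10, 10, ..., so early iterations impute from a
# very coarse approximation and later ones refine it.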
12 | 13 | from sklearn.decomposition import TruncatedSVD 14 | from sklearn.utils import check_array 15 | import numpy as np 16 | 17 | from .solver import Solver 18 | from .common import masked_mae 19 | 20 | F32PREC = np.finfo(np.float32).eps 21 | 22 | 23 | class IterativeSVD(Solver): 24 | def __init__( 25 | self, 26 | rank=10, 27 | convergence_threshold=0.00001, 28 | max_iters=200, 29 | gradual_rank_increase=True, 30 | svd_algorithm="arpack", 31 | init_fill_method="zero", 32 | min_value=None, 33 | max_value=None, 34 | verbose=True): 35 | Solver.__init__( 36 | self, 37 | fill_method=init_fill_method, 38 | min_value=min_value, 39 | max_value=max_value) 40 | self.rank = rank 41 | self.max_iters = max_iters 42 | self.svd_algorithm = svd_algorithm 43 | self.convergence_threshold = convergence_threshold 44 | self.gradual_rank_increase = gradual_rank_increase 45 | self.verbose = verbose 46 | 47 | def _converged(self, X_old, X_new, missing_mask): 48 | # check for convergence 49 | old_missing_values = X_old[missing_mask] 50 | new_missing_values = X_new[missing_mask] 51 | difference = old_missing_values - new_missing_values 52 | ssd = np.sum(difference ** 2) 53 | old_norm_squared = (old_missing_values ** 2).sum() 54 | # edge cases 55 | if old_norm_squared == 0 or \ 56 | (old_norm_squared < F32PREC and ssd > F32PREC): 57 | return False 58 | else: 59 | return (ssd / old_norm_squared) < self.convergence_threshold 60 | 61 | def solve(self, X, missing_mask): 62 | X = check_array(X, force_all_finite=False) 63 | 64 | observed_mask = ~missing_mask 65 | X_filled = X 66 | for i in range(self.max_iters): 67 | # deviation from original svdImpute algorithm: 68 | # gradually increase the rank of our approximation 69 | if self.gradual_rank_increase: 70 | curr_rank = min(2 ** i, self.rank) 71 | else: 72 | curr_rank = self.rank 73 | tsvd = TruncatedSVD(curr_rank, algorithm=self.svd_algorithm) 74 | X_reduced = tsvd.fit_transform(X_filled) 75 | X_reconstructed = tsvd.inverse_transform(X_reduced) 76 | X_reconstructed = self.clip(X_reconstructed) 77 | mae = masked_mae( 78 | X_true=X, 79 | X_pred=X_reconstructed, 80 | mask=observed_mask) 81 | if self.verbose: 82 | print( 83 | "[IterativeSVD] Iter %d: observed MAE=%0.6f" % ( 84 | i + 1, mae)) 85 | converged = self._converged( 86 | X_old=X_filled, 87 | X_new=X_reconstructed, 88 | missing_mask=missing_mask) 89 | X_filled[missing_mask] = X_reconstructed[missing_mask] 90 | if converged: 91 | break 92 | return X_filled 93 | -------------------------------------------------------------------------------- /fancyimpute/common.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
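# Annotation (not part of the original module): the masked error helpers below
# compare two matrices only where a boolean mask is True. A quick worked
# example:
#     X_true = np.array([[1.0, 2.0], [3.0, 4.0]])
#     X_pred = np.array([[1.0, 2.5], [2.0, 4.0]])
#     mask = np.array([[False, True], [True, False]])
#     masked_mae(X_true, X_pred, mask)  # mean(|2.0 - 2.5|, |3.0 - 2.0|) = 0.75
#     masked_mse(X_true, X_pred, mask)  # mean(0.25, 1.0) = 0.625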
12 | 13 | import logging 14 | import importlib 15 | 16 | import numpy as np 17 | 18 | 19 | def import_from(module, name): 20 | ''' 21 | usage example: 22 | grid = import_from('sklearn.model_selection', 'GridSearchCV') 23 | is equivalent to: 24 | from sklearn.model_selection import GridSearchCV as grid 25 | ''' 26 | module = importlib.import_module(module) 27 | return getattr(module, name) 28 | 29 | 30 | def masked_mae(X_true, X_pred, mask): 31 | masked_diff = X_true[mask] - X_pred[mask] 32 | return np.mean(np.abs(masked_diff)) 33 | 34 | 35 | def masked_mse(X_true, X_pred, mask): 36 | masked_diff = X_true[mask] - X_pred[mask] 37 | return np.mean(masked_diff ** 2) 38 | 39 | 40 | def generate_random_column_samples(column): 41 | col_mask = np.isnan(column) 42 | n_missing = np.sum(col_mask) 43 | if n_missing == len(column): 44 | logging.warning("No observed values in column") 45 | return np.zeros_like(column) 46 | 47 | mean = np.nanmean(column) 48 | std = np.nanstd(column) 49 | 50 | if np.isclose(std, 0): 51 | return np.array([mean] * n_missing) 52 | else: 53 | return np.random.randn(n_missing) * std + mean 54 | 55 | 56 | def choose_solution_using_percentiles( 57 | X_original, 58 | solutions, 59 | parameters=None, 60 | verbose=False, 61 | percentiles=list(range(10, 100, 10))): 62 | """ 63 | It's tricky to pick a single matrix out of all the candidate 64 | solutions with differing shrinkage thresholds. 65 | Our heuristic is to pick the matrix whose percentiles match best 66 | between the missing and observed data. 67 | """ 68 | missing_mask = np.isnan(X_original) 69 | min_mse = np.inf 70 | best_solution = None 71 | for i, candidate in enumerate(solutions): 72 | for col_idx in range(X_original.shape[1]): 73 | col_data = candidate[:, col_idx] 74 | col_missing = missing_mask[:, col_idx] 75 | col_observed = ~col_missing 76 | if col_missing.sum() < 2: 77 | continue 78 | elif col_observed.sum() < 2: 79 | continue 80 | missing_data = col_data[col_missing] 81 | observed_data = col_data[col_observed] 82 | 83 | missing_percentiles = np.array([ 84 | np.percentile(missing_data, p) 85 | for p in percentiles]) 86 | 87 | observed_percentiles = np.array([ 88 | np.percentile(observed_data, p) 89 | for p in percentiles]) 90 | 91 | mse = np.mean((missing_percentiles - observed_percentiles) ** 2) 92 | if mse < min_mse: 93 | min_mse = mse 94 | best_solution = candidate 95 | if verbose: 96 | print("Candidate #%d/%d%s: %f" % ( 97 | i + 1, 98 | len(solutions), 99 | (" (parameter=%s) " % parameters[i] 100 | if parameters is not None 101 | else ""), 102 | mse)) 103 | return best_solution 104 | -------------------------------------------------------------------------------- /fancyimpute/knn.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License.
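# Usage sketch (annotation, not part of the original module): the class
# docstring below assumes features with mean 0 and variance 1, so KNN is
# commonly paired with BiScaler, e.g.:
#     from fancyimpute import BiScaler, KNN
#     X_normalized = BiScaler().fit_transform(X_incomplete)
#     X_filled = KNN(k=3).fit_transform(X_normalized)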
12 | 13 | import numpy as np 14 | 15 | from knnimpute import knn_impute_few_observed, knn_impute_with_argpartition 16 | from sklearn.utils import check_array 17 | 18 | from .solver import Solver 19 | 20 | 21 | class KNN(Solver): 22 | """ 23 | k-Nearest Neighbors imputation for arrays with missing data. 24 | Works only on dense arrays with at most a few thousand rows. 25 | 26 | Assumes that each feature has been centered and rescaled to have 27 | mean 0 and variance 1. 28 | 29 | Inspired by the implementation of kNNImpute from the R package 30 | imputation. 31 | See here: 32 | https://www.rdocumentation.org/packages/imputation/versions/2.0.3/topics/kNNImpute 33 | """ 34 | def __init__( 35 | self, 36 | k=5, 37 | orientation="rows", 38 | use_argpartition=False, 39 | print_interval=100, 40 | min_value=None, 41 | max_value=None, 42 | normalizer=None, 43 | verbose=True): 44 | """ 45 | Parameters 46 | ---------- 47 | k : int 48 | Number of neighboring rows to use for imputation. 49 | 50 | orientation : str 51 | Which axis of the input matrix should be treated as a sample 52 | (default is "rows" but can also be "columns") 53 | 54 | use_argpartition : bool 55 | Use a more naive implementation of kNN imputation which calls 56 | numpy.argpartition for each row/column pair. May give NaN if fewer 57 | than k neighbors are available for a missing value. 58 | 59 | print_interval : int 60 | How often (in rows) progress is printed during imputation. 61 | min_value : float 62 | Minimum possible imputed value 63 | 64 | max_value : float 65 | Maximum possible imputed value 66 | 67 | normalizer : object 68 | Any object (such as BiScaler) with fit() and transform() methods 69 | 70 | verbose : bool 71 | """ 72 | Solver.__init__( 73 | self, 74 | min_value=min_value, 75 | max_value=max_value, 76 | normalizer=normalizer) 77 | self.k = k 78 | self.verbose = verbose 79 | self.orientation = orientation 80 | self.print_interval = print_interval 81 | if use_argpartition: 82 | self._impute_fn = knn_impute_with_argpartition 83 | else: 84 | self._impute_fn = knn_impute_few_observed 85 | 86 | def solve(self, X, missing_mask): 87 | X = check_array(X, force_all_finite=False) 88 | 89 | if self.orientation == "columns": 90 | X = X.T 91 | missing_mask = missing_mask.T 92 | 93 | elif self.orientation != "rows": 94 | raise ValueError( 95 | "Orientation must be either 'rows' or 'columns', got: %s" % ( 96 | self.orientation,)) 97 | 98 | X_imputed = self._impute_fn( 99 | X=X, 100 | missing_mask=missing_mask, 101 | k=self.k, 102 | verbose=self.verbose, 103 | print_interval=self.print_interval) 104 | 105 | failed_to_impute = np.isnan(X_imputed) 106 | n_missing_after_imputation = failed_to_impute.sum() 107 | if n_missing_after_imputation != 0: 108 | if self.verbose: 109 | print("[KNN] Warning: %d/%d still missing after imputation, replacing with 0" % ( 110 | n_missing_after_imputation, 111 | X.shape[0] * X.shape[1])) 112 | X_imputed[failed_to_impute] = X[failed_to_impute] 113 | 114 | if self.orientation == "columns": 115 | X_imputed = X_imputed.T 116 | 117 | return X_imputed 118 | -------------------------------------------------------------------------------- /fancyimpute/nuclear_norm_minimization.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import cvxpy 14 | 15 | from .solver import Solver 16 | 17 | from sklearn.utils import check_array 18 | 19 | 20 | class NuclearNormMinimization(Solver): 21 | """ 22 | Simple implementation of "Exact Matrix Completion via Convex Optimization" 23 | by Emmanuel Candes and Benjamin Recht using cvxpy. 24 | """ 25 | 26 | def __init__( 27 | self, 28 | require_symmetric_solution=False, 29 | min_value=None, 30 | max_value=None, 31 | error_tolerance=0.0001, 32 | max_iters=50000, 33 | verbose=True): 34 | """ 35 | Parameters 36 | ---------- 37 | require_symmetric_solution : bool 38 | Add symmetry constraint to convex problem 39 | 40 | min_value : float 41 | Smallest possible imputed value 42 | 43 | max_value : float 44 | Largest possible imputed value 45 | 46 | error_tolerance : float 47 | Degree of error allowed on reconstructed values. If omitted then 48 | defaults to 0.0001 49 | 50 | max_iters : int 51 | Maximum number of iterations for the convex solver 52 | 53 | verbose : bool 54 | Print debug info 55 | """ 56 | Solver.__init__( 57 | self, 58 | min_value=min_value, 59 | max_value=max_value) 60 | self.require_symmetric_solution = require_symmetric_solution 61 | self.error_tolerance = error_tolerance 62 | self.max_iters = max_iters 63 | self.verbose = verbose 64 | 65 | def _constraints(self, X, missing_mask, S, error_tolerance): 66 | """ 67 | Parameters 68 | ---------- 69 | X : np.array 70 | Data matrix with missing values filled in 71 | 72 | missing_mask : np.array 73 | Boolean array indicating where missing values were 74 | 75 | S : cvxpy.Variable 76 | Representation of solution variable 77 | """ 78 | ok_mask = ~missing_mask 79 | masked_X = cvxpy.multiply(ok_mask, X) 80 | masked_S = cvxpy.multiply(ok_mask, S) 81 | abs_diff = cvxpy.abs(masked_S - masked_X) 82 | close_to_data = abs_diff <= error_tolerance 83 | constraints = [close_to_data] 84 | if self.require_symmetric_solution: 85 | constraints.append(S == S.T) 86 | 87 | if self.min_value is not None: 88 | constraints.append(S >= self.min_value) 89 | 90 | if self.max_value is not None: 91 | constraints.append(S <= self.max_value) 92 | 93 | return constraints 94 | 95 | def _create_objective(self, m, n): 96 | """ 97 | Parameters 98 | ---------- 99 | m, n : int 100 | Dimensions of the solution matrix 101 | Returns the objective function and a variable representing the 102 | solution to the convex optimization problem.
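The objective value is the nuclear norm ||S||_* (the sum of the singular values of S), the standard convex surrogate for minimizing matrix rank.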
103 | """ 104 | # S is the completed matrix 105 | shape = (m, n) 106 | S = cvxpy.Variable(shape, name="S") 107 | norm = cvxpy.norm(S, "nuc") 108 | objective = cvxpy.Minimize(norm) 109 | return S, objective 110 | 111 | def solve(self, X, missing_mask): 112 | X = check_array(X, force_all_finite=False) 113 | 114 | m, n = X.shape 115 | S, objective = self._create_objective(m, n) 116 | constraints = self._constraints( 117 | X=X, 118 | missing_mask=missing_mask, 119 | S=S, 120 | error_tolerance=self.error_tolerance) 121 | problem = cvxpy.Problem(objective, constraints) 122 | problem.solve( 123 | verbose=self.verbose, 124 | solver=cvxpy.CVXOPT, 125 | max_iters=self.max_iters, 126 | # use_indirect, see: https://github.com/cvxgrp/cvxpy/issues/547 127 | use_indirect=False) 128 | return S.value 129 | -------------------------------------------------------------------------------- /fancyimpute/matrix_factorization.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import numpy as np 14 | from sklearn.utils import check_array 15 | from .solver import Solver 16 | from .common import masked_mae 17 | 18 | 19 | class MatrixFactorization(Solver): 20 | def __init__( 21 | self, 22 | rank=40, 23 | learning_rate=0.01, 24 | max_iters=50, 25 | shrinkage_value=0, 26 | min_value=None, 27 | max_value=None, 28 | verbose=True, 29 | ): 30 | """ 31 | Train a matrix factorization model to predict empty 32 | entries in a matrix. 
Mostly copied (with permission) from: 33 | https://blog.insightdatascience.com/explicit-matrix-factorization-als-sgd-and-all-that-jazz-b00e4d9b21ea 34 | 35 | Params 36 | ====== 37 | rank : (int) 38 | Number of latent factors to use in matrix 39 | factorization model 40 | 41 | learning_rate : (float) 42 | Learning rate for optimizer 43 | 44 | max_iters : (int) 45 | Number of iterations to train for 46 | 47 | shrinkage_value : (float) 48 | Regularization term for sgd penalty 49 | 50 | min_value : float 51 | Smallest possible imputed value 52 | 53 | max_value : float 54 | Largest possible imputed value 55 | 56 | verbose : (bool) 57 | Whether or not to print out training progress 58 | """ 59 | Solver.__init__(self, min_value=min_value, max_value=max_value) 60 | self.rank = rank 61 | self.learning_rate = learning_rate 62 | self.max_iters = max_iters 63 | self.shrinkage_value = shrinkage_value 64 | self._v = verbose 65 | 66 | def solve(self, X, missing_mask): 67 | """ Train model for max_iters iterations from scratch.""" 68 | X = check_array(X, force_all_finite=False) 69 | 70 | # unpack data dimensions 71 | (n_samples, n_features) = X.shape 72 | observed_mask = ~missing_mask 73 | training_indices = list(zip(*np.where(observed_mask))) 74 | 75 | self.user_vecs = np.random.normal(scale=1.0 / self.rank, size=(n_samples, self.rank)) 76 | self.item_vecs = np.random.normal(scale=1.0 / self.rank, size=(n_features, self.rank)) 77 | 78 | self.user_bias = np.zeros(n_samples) 79 | self.item_bias = np.zeros(n_features) 80 | self.global_bias = np.mean(X[observed_mask]) 81 | 82 | for i in range(self.max_iters): 83 | # to do: early stopping 84 | if (i + 1) % 10 == 0 and self._v: 85 | X_reconstruction = self.clip(self.predict_all()) 86 | mae = masked_mae(X_true=X, X_pred=X_reconstruction, mask=observed_mask) 87 | print("[MatrixFactorization] Iter %d: observed MAE=%0.6f rank=%d" % (i + 1, mae, self.rank)) 88 | 89 | np.random.shuffle(training_indices) 90 | self.sgd(X, training_indices) 92 | 93 | X_filled = X.copy() 94 | X_filled[missing_mask] = self.clip(self.predict_all()[missing_mask]) 95 | return X_filled 96 | 97 | def sgd(self, X, training_indices): 98 | # to do: batch learning 99 | for (u, i) in training_indices: 100 | prediction = self.predict(u, i) 101 | e = X[u, i] - prediction # error 102 | 103 | # Update biases 104 | self.user_bias[u] += self.learning_rate * (e - self.shrinkage_value * self.user_bias[u]) 105 | self.item_bias[i] += self.learning_rate * (e - self.shrinkage_value * self.item_bias[i]) 106 | 107 | # Update latent factors 108 | self.user_vecs[u, :] += self.learning_rate * ( 109 | e * self.item_vecs[i, :] - self.shrinkage_value * self.user_vecs[u, :] 110 | ) 111 | self.item_vecs[i, :] += self.learning_rate * ( 112 | e * self.user_vecs[u, :] - self.shrinkage_value * self.item_vecs[i, :] 113 | ) 114 | 115 | def predict(self, u, i): 116 | """ Single user and item prediction.""" 117 | prediction = self.global_bias + self.user_bias[u] + self.item_bias[i] 118 | prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T) 119 | return prediction 120 | 121 | def predict_all(self): 122 | """ Predict ratings for every user and item.""" 123 | predictions = self.user_vecs.dot(self.item_vecs.T) 124 | predictions += self.global_bias + self.user_bias[:, np.newaxis] + self.item_bias[np.newaxis, :] 125 | return predictions 126 | -------------------------------------------------------------------------------- /README.md:
-------------------------------------------------------------------------------- 1 | [Build Status](https://travis-ci.org/iskandr/fancyimpute) [Coverage Status](https://coveralls.io/github/iskandr/fancyimpute?branch=master) [DOI](http://dx.doi.org/10.5281/zenodo.51773) 2 | 3 | 4 | 5 | 6 | 7 | A variety of matrix completion and imputation algorithms implemented in Python 3.6. 8 | 9 | To install: 10 | 11 | `pip install fancyimpute` 12 | 13 | If you run into `tensorflow` problems and use anaconda, you can try to fix them with `conda install cudatoolkit`. 14 | 15 | ## Important Caveats 16 | 17 | (1) This project is in "bare maintenance" mode. That means we are not planning on adding more imputation algorithms or features (but might if we get inspired). Please do report bugs, and we'll try to fix them. Also, we are happy to take pull requests for more algorithms and/or features. 18 | 19 | (2) `IterativeImputer` started its life as a `fancyimpute` original, but was then merged into `scikit-learn` and we deleted it from `fancyimpute` in favor of the better-tested `sklearn` version. As a convenience, you can still `from fancyimpute import IterativeImputer`, but under the hood it's just doing `from sklearn.impute import IterativeImputer`. That means if you update `scikit-learn` in the future, you may also change the behavior of `IterativeImputer`. 20 | 21 | 22 | ## Usage 23 | 24 | ```python 25 | from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, BiScaler 26 | 27 | # X is the complete data matrix 28 | # X_incomplete has the same values as X except a subset has been replaced with NaN 29 | 30 | # Use 3 nearest rows which have a feature to fill in each row's missing features 31 | X_filled_knn = KNN(k=3).fit_transform(X_incomplete) 32 | 33 | # matrix completion using convex optimization to find low-rank solution 34 | # that still matches observed values. Slow! 35 | X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete) 36 | 37 | # Instead of solving the nuclear norm objective directly, 38 | # induce sparsity using singular value thresholding 39 | X_incomplete_normalized = BiScaler().fit_transform(X_incomplete) 40 | X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized) 41 | 42 | # print mean squared error for the imputation methods above 43 | nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean() 44 | print("Nuclear norm minimization MSE: %f" % nnm_mse) 45 | 46 | softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean() 47 | print("SoftImpute MSE: %f" % softImpute_mse) 48 | 49 | knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean() 50 | print("knnImpute MSE: %f" % knn_mse) 51 | ``` 52 | 53 | ## Algorithms 54 | 55 | * `SimpleFill`: Replaces missing entries with the mean or median of each column. 56 | 57 | * `KNN`: Nearest neighbor imputation which weights samples using the mean squared difference 58 | on features for which two rows both have observed data. 59 | 60 | * `SoftImpute`: Matrix completion by iterative soft thresholding of SVD decompositions. Inspired by the [softImpute](https://web.stanford.edu/~hastie/swData/softImpute/vignette.html) package for R, which is based on [Spectral Regularization Algorithms for Learning Large Incomplete Matrices](http://web.stanford.edu/~hastie/Papers/mazumder10a.pdf) by Mazumder et al. 61 | 62 | * `IterativeImputer`: A strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion.
A stub that links to `scikit-learn`'s [IterativeImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html). 63 | 64 | * `IterativeSVD`: Matrix completion by iterative low-rank SVD decomposition. Should be similar to SVDimpute from [Missing value estimation methods for DNA microarrays](http://www.ncbi.nlm.nih.gov/pubmed/11395428) by Troyanskaya et al. 65 | 66 | * `MatrixFactorization`: Direct factorization of the incomplete matrix into low-rank `U` and `V`, with per-row and per-column biases, as well as a global bias. Solved by SGD in pure numpy. 67 | 68 | * `NuclearNormMinimization`: Simple implementation of [Exact Matrix Completion via Convex Optimization](http://statweb.stanford.edu/~candes/papers/MatrixCompletion.pdf) by Emmanuel Candes and Benjamin Recht using [cvxpy](http://www.cvxpy.org). Too slow for large matrices. 70 | 71 | * `BiScaler`: Iterative estimation of row/column means and standard deviations to get doubly normalized 72 | matrix. Not guaranteed to converge but works well in practice. Taken from [Matrix Completion and Low-Rank SVD via Fast Alternating Least Squares](http://arxiv.org/abs/1410.2596). 73 | 74 | ## Citation 75 | 76 | If you use `fancyimpute` in your academic publication, please cite it as follows: 77 | ```bibtex 78 | @software{fancyimpute, 79 | author = {Alex Rubinsteyn and Sergey Feldman}, 80 | title = {fancyimpute: An Imputation Library for Python}, 81 | url = {https://github.com/iskandr/fancyimpute}, 82 | version = {0.7.0}, 83 | date = {2016}, 84 | } 85 | ``` 86 | -------------------------------------------------------------------------------- /fancyimpute/similarity_weighted_averaging.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from collections import defaultdict 14 | 15 | from sklearn.utils import check_array 16 | 17 | import numpy as np 18 | 19 | from .dictionary_helpers import ( 20 | collect_nested_keys, 21 | reverse_lookup_from_nested_dict, 22 | matrix_to_nested_dictionary, 23 | transpose_nested_dictionary, 24 | ) 25 | 26 | 27 | class SimilarityWeightedAveraging(object): 28 | """ 29 | Fill in each missing row/column value by averaging across the 30 | most similar columns (taking into account missing data when 31 | computing column similarities and choosing which neighbors to inspect). 32 | 33 | Currently does not inherit from Solver since it expects sparse inputs in 34 | the form of nested dictionaries. 35 | """ 36 | 37 | def __init__( 38 | self, 39 | min_weight_for_similarity=0.1, 40 | min_count_for_similarity=2, 41 | similarity_exponent=4.0, 42 | shrinkage_value=0.0001, 43 | orientation="rows", 44 | verbose=False, 45 | ): 46 | """ 47 | Parameters 48 | ---------- 49 | min_weight_for_similarity : float 50 | If sum of values in shared rows between two columns falls below this 51 | threshold then similarity can't be computed between those columns.
52 | 53 | min_count_for_similarity : int 54 | If number of overlapping rows between two columns falls below this 55 | threshold then similarity can't be computed between those columns. 56 | 57 | similarity_exponent : float 58 | Exponent for turning similarities into weights on values of other 59 | columns. 60 | 61 | shrinkage_value : float 62 | Shrinks reconstructed values toward 0 63 | 64 | orientation : str 65 | Whether to compute similarities along rows or columns 66 | 67 | verbose : bool 68 | """ 69 | self.min_weight_for_similarity = min_weight_for_similarity 70 | self.min_count_for_similarity = min_count_for_similarity 71 | self.similarity_exponent = similarity_exponent 72 | self.shrinkage_value = shrinkage_value 73 | self.orientation = orientation 74 | self.verbose = verbose 75 | 76 | def jacard_similarity_from_nested_dicts(self, nested_dictionaries): 77 | """ 78 | Compute the continuous Jaccard similarity between all pairs 79 | of keys in the dictionary-of-dictionaries given as an input. 80 | 81 | Returns a three-element tuple: 82 | - similarity dictionary: (key, key) -> float 83 | - overlap count dictionary: (key, key) -> int 84 | - weight dictionary: (key, key) -> float 85 | """ 86 | sims = {} 87 | overlaps = {} 88 | weights = {} 89 | for a, column_dict_a in nested_dictionaries.items(): 90 | row_set_a = set(column_dict_a.keys()) 91 | for b, column_dict_b in nested_dictionaries.items(): 92 | row_set_b = set(column_dict_b.keys()) 93 | common_rows = row_set_a.intersection(row_set_b) 94 | n_overlap = len(common_rows) 95 | overlaps[(a, b)] = n_overlap 96 | total = 0.0 97 | weight = 0.0 98 | for row_name in common_rows: 99 | value_a = column_dict_a[row_name] 100 | value_b = column_dict_b[row_name] 101 | minval = min(value_a, value_b) 102 | maxval = max(value_a, value_b) 103 | total += minval 104 | weight += maxval 105 | weights[(a, b)] = weight 106 | if weight < self.min_weight_for_similarity: 107 | continue 108 | if n_overlap < self.min_count_for_similarity: 109 | continue 110 | sims[(a, b)] = total / weight 111 | return sims, overlaps, weights 112 | 113 | def complete_dict(self, values_dict): 114 | """ 115 | Keys of nested dictionaries can be arbitrary objects.
116 | """ 117 | if self.orientation != "rows": 118 | values_dict = transpose_nested_dictionary(values_dict) 119 | 120 | row_keys, column_keys = collect_nested_keys(values_dict) 121 | if self.verbose: 122 | print("[SimilarityWeightedAveraging] # rows = %d" % (len(row_keys))) 123 | print("[SimilarityWeightedAveraging] # columns = %d" % (len(column_keys))) 124 | similarities, overlaps, weights = self.jacard_similarity_from_nested_dicts(values_dict) 125 | if self.verbose: 126 | print("[SimilarityWeightedAveraging] Computed %d similarities between rows" % (len(similarities),)) 127 | column_to_row_values = reverse_lookup_from_nested_dict(values_dict) 128 | 129 | result = defaultdict(dict) 130 | 131 | exponent = self.similarity_exponent 132 | shrinkage_value = self.shrinkage_value 133 | for i, row_key in enumerate(row_keys): 134 | for column_key, value_triplets in column_to_row_values.items(): 135 | total = 0 136 | denom = shrinkage_value 137 | for (other_row_key, y) in value_triplets: 138 | sample_weight = 1.0 139 | sim = similarities.get((row_key, other_row_key), 0) 140 | combined_weight = sim ** exponent 141 | combined_weight *= sample_weight 142 | total += combined_weight * y 143 | denom += combined_weight 144 | if denom > shrinkage_value: 145 | result[row_key][column_key] = total / denom 146 | if self.orientation != "rows": 147 | result = transpose_nested_dictionary(result) 148 | return result 149 | 150 | def fit_transform(self, X): 151 | X = check_array(X, force_all_finite=False) 152 | 153 | if self.verbose: 154 | print(("[SimilarityWeightedAveraging] Creating dictionary from matrix " " with shape %s") % (X.shape,)) 155 | missing_mask = np.isnan(X) 156 | observed_mask = ~missing_mask 157 | sparse_dict = matrix_to_nested_dictionary(X, filter_fn=np.isfinite) 158 | 159 | completed_dict = self.complete_dict(sparse_dict) 160 | array_result = np.zeros_like(X) 161 | for row_idx, row_dict in completed_dict.items(): 162 | for col_idx, value in row_dict.items(): 163 | array_result[row_idx, col_idx] = value 164 | array_result[observed_mask] = X[observed_mask] 165 | return array_result 166 | -------------------------------------------------------------------------------- /fancyimpute/soft_impute.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import numpy as np 14 | from sklearn.utils.extmath import randomized_svd 15 | from sklearn.utils import check_array 16 | 17 | from .common import masked_mae 18 | from .solver import Solver 19 | 20 | F32PREC = np.finfo(np.float32).eps 21 | 22 | 23 | class SoftImpute(Solver): 24 | """ 25 | Implementation of the SoftImpute algorithm from: 26 | "Spectral Regularization Algorithms for Learning Large Incomplete Matrices" 27 | by Mazumder, Hastie, and Tibshirani. 
28 |     """
29 |     def __init__(
30 |             self,
31 |             shrinkage_value=None,
32 |             convergence_threshold=0.001,
33 |             max_iters=100,
34 |             max_rank=None,
35 |             n_power_iterations=1,
36 |             init_fill_method="zero",
37 |             min_value=None,
38 |             max_value=None,
39 |             normalizer=None,
40 |             verbose=True):
41 |         """
42 |         Parameters
43 |         ----------
44 |         shrinkage_value : float
45 |             Value by which we shrink singular values on each iteration. If
46 |             omitted then the default value will be the maximum singular
47 |             value of the initialized matrix (zeros for missing values) divided
48 |             by 50.
49 | 
50 |         convergence_threshold : float
51 |             Stop when the change between iterations, as a fraction of the
52 |             Frobenius norm of the current solution, falls below this threshold.
53 | 
54 |         max_iters : int
55 |             Maximum number of SVD iterations
56 | 
57 |         max_rank : int, optional
58 |             Perform a truncated SVD on each iteration with this value as its
59 |             rank.
60 | 
61 |         n_power_iterations : int
62 |             Number of power iterations to perform with randomized SVD
63 | 
64 |         init_fill_method : str
65 |             How to initialize missing values of data matrix, default is
66 |             to fill them with zeros.
67 | 
68 |         min_value : float
69 |             Smallest allowable value in the solution
70 | 
71 |         max_value : float
72 |             Largest allowable value in the solution
73 | 
74 |         normalizer : object
75 |             Any object (such as BiScaler) with fit_transform() and inverse_transform() methods
76 | 
77 |         verbose : bool
78 |             Print debugging info
79 |         """
80 |         Solver.__init__(
81 |             self,
82 |             fill_method=init_fill_method,
83 |             min_value=min_value,
84 |             max_value=max_value,
85 |             normalizer=normalizer)
86 |         self.shrinkage_value = shrinkage_value
87 |         self.convergence_threshold = convergence_threshold
88 |         self.max_iters = max_iters
89 |         self.max_rank = max_rank
90 |         self.n_power_iterations = n_power_iterations
91 |         self.verbose = verbose
92 | 
93 |     def _converged(self, X_old, X_new, missing_mask):
94 |         # check for convergence
95 |         old_missing_values = X_old[missing_mask]
96 |         new_missing_values = X_new[missing_mask]
97 |         difference = old_missing_values - new_missing_values
98 |         ssd = np.sum(difference ** 2)
99 |         old_norm = np.sqrt((old_missing_values ** 2).sum())
100 |         # edge cases
101 |         if old_norm == 0 or (old_norm < F32PREC and np.sqrt(ssd) > F32PREC):
102 |             return False
103 |         else:
104 |             return (np.sqrt(ssd) / old_norm) < self.convergence_threshold
105 | 
106 |     def _svd_step(self, X, shrinkage_value, max_rank=None):
107 |         """
108 |         Returns reconstructed X from low-rank thresholded SVD and
109 |         the rank achieved.
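
        The returned rank is the number of singular values that remain
        positive after subtracting shrinkage_value; components at or below
        the threshold are dropped from the reconstruction.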
110 |         """
111 |         if max_rank:
112 |             # if we have a max rank then perform the faster randomized SVD
113 |             (U, s, V) = randomized_svd(
114 |                 X,
115 |                 max_rank,
116 |                 n_iter=self.n_power_iterations,
117 |                 random_state=None)
118 |         else:
119 |             # otherwise perform a full-rank SVD using LAPACK
120 |             (U, s, V) = np.linalg.svd(
121 |                 X,
122 |                 full_matrices=False,
123 |                 compute_uv=True)
124 |         s_thresh = np.maximum(s - shrinkage_value, 0)
125 |         rank = (s_thresh > 0).sum()
126 |         s_thresh = s_thresh[:rank]
127 |         U_thresh = U[:, :rank]
128 |         V_thresh = V[:rank, :]
129 |         S_thresh = np.diag(s_thresh)
130 |         X_reconstruction = np.dot(U_thresh, np.dot(S_thresh, V_thresh))
131 |         return X_reconstruction, rank
132 | 
133 |     def _max_singular_value(self, X_filled):
134 |         # quick decomposition of X_filled into rank-1 SVD
135 |         _, s, _ = randomized_svd(
136 |             X_filled,
137 |             1,
138 |             n_iter=5,
139 |             random_state=None)
140 |         return s[0]
141 | 
142 |     def solve(self, X, missing_mask):
143 |         X = check_array(X, force_all_finite=False)
144 | 
145 |         X_init = X.copy()
146 | 
147 |         X_filled = X
148 |         observed_mask = ~missing_mask
149 |         max_singular_value = self._max_singular_value(X_filled)
150 |         if self.verbose:
151 |             print("[SoftImpute] Max Singular Value of X_init = %f" % (
152 |                 max_singular_value))
153 | 
154 |         if self.shrinkage_value:
155 |             shrinkage_value = self.shrinkage_value
156 |         else:
157 |             # totally hackish heuristic: keep only components
158 |             # with at least 1/50th the max singular value
159 |             shrinkage_value = max_singular_value / 50.0
160 | 
161 |         for i in range(self.max_iters):
162 |             X_reconstruction, rank = self._svd_step(
163 |                 X_filled,
164 |                 shrinkage_value,
165 |                 max_rank=self.max_rank)
166 |             X_reconstruction = self.clip(X_reconstruction)
167 | 
168 |             # print error on observed data
169 |             if self.verbose:
170 |                 mae = masked_mae(
171 |                     X_true=X_init,
172 |                     X_pred=X_reconstruction,
173 |                     mask=observed_mask)
174 |                 print(
175 |                     "[SoftImpute] Iter %d: observed MAE=%0.6f rank=%d" % (
176 |                         i + 1,
177 |                         mae,
178 |                         rank))
179 | 
180 |             converged = self._converged(
181 |                 X_old=X_filled,
182 |                 X_new=X_reconstruction,
183 |                 missing_mask=missing_mask)
184 |             X_filled[missing_mask] = X_reconstruction[missing_mask]
185 |             if converged:
186 |                 break
187 |         if self.verbose:
188 |             print("[SoftImpute] Stopped after iteration %d for lambda=%f" % (
189 |                 i + 1,
190 |                 shrinkage_value))
191 | 
192 |         return X_filled
193 | 
--------------------------------------------------------------------------------
/fancyimpute/solver.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | #     http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
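
# Solver is the shared base class for the imputation algorithms in this
# package: it validates the input, initializes missing entries with a simple
# fill, delegates the real work to a subclass's solve() method, and then
# restores the observed values in the final result (see fit_transform below).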
12 | 
13 | import warnings
14 | 
15 | import numpy as np
16 | 
17 | from sklearn.utils import check_array
18 | 
19 | from .common import generate_random_column_samples
20 | 
21 | 
22 | class Solver(object):
23 |     def __init__(
24 |             self,
25 |             fill_method="zero",
26 |             min_value=None,
27 |             max_value=None,
28 |             normalizer=None):
29 |         self.fill_method = fill_method
30 |         self.min_value = min_value
31 |         self.max_value = max_value
32 |         self.normalizer = normalizer
33 | 
34 |     def __repr__(self):
35 |         return str(self)
36 | 
37 |     def __str__(self):
38 |         field_list = []
39 |         for (k, v) in sorted(self.__dict__.items()):
40 |             if v is None or isinstance(v, (float, int)):
41 |                 field_list.append("%s=%s" % (k, v))
42 |             elif isinstance(v, str):
43 |                 field_list.append("%s='%s'" % (k, v))
44 |         return "%s(%s)" % (
45 |             self.__class__.__name__,
46 |             ", ".join(field_list))
47 | 
48 |     def _check_input(self, X):
49 |         if len(X.shape) != 2:
50 |             raise ValueError("Expected 2d matrix, got %s array" % (X.shape,))
51 | 
52 |     def _check_missing_value_mask(self, missing):
53 |         if not missing.any():
54 |             warnings.simplefilter("always")
55 |             warnings.warn("Input matrix is not missing any values")
56 |         if missing.all():
57 |             raise ValueError("Input matrix must have some non-missing values")
58 | 
59 |     def _fill_columns_with_fn(self, X, missing_mask, col_fn):
60 |         for col_idx in range(X.shape[1]):
61 |             missing_col = missing_mask[:, col_idx]
62 |             n_missing = missing_col.sum()
63 |             if n_missing == 0:
64 |                 continue
65 |             col_data = X[:, col_idx]
66 |             fill_values = col_fn(col_data)
67 |             if np.all(np.isnan(fill_values)):
68 |                 fill_values = 0
69 |             X[missing_col, col_idx] = fill_values
70 | 
71 |     def fill(
72 |             self,
73 |             X,
74 |             missing_mask,
75 |             fill_method=None,
76 |             inplace=False):
77 |         """
78 |         Parameters
79 |         ----------
80 |         X : np.array
81 |             Data array containing NaN entries
82 | 
83 |         missing_mask : np.array
84 |             Boolean array indicating where NaN entries are
85 | 
86 |         fill_method : str
87 |             "zero": fill missing entries with zeros
88 |             "mean": fill with column means
89 |             "median": fill with column medians
90 |             "min": fill with min value per column
91 |             "random": fill with Gaussian samples according to mean/std of column
92 | 
93 |         inplace : bool
94 |             Modify matrix or fill a copy
95 |         """
96 |         X = check_array(X, force_all_finite=False)
97 | 
98 |         if not inplace:
99 |             X = X.copy()
100 | 
101 |         if not fill_method:
102 |             fill_method = self.fill_method
103 | 
104 |         if fill_method not in ("zero", "mean", "median", "min", "random"):
105 |             raise ValueError("Invalid fill method: '%s'" % (fill_method))
106 |         elif fill_method == "zero":
107 |             # replace NaN's with 0
108 |             X[missing_mask] = 0
109 |         elif fill_method == "mean":
110 |             self._fill_columns_with_fn(X, missing_mask, np.nanmean)
111 |         elif fill_method == "median":
112 |             self._fill_columns_with_fn(X, missing_mask, np.nanmedian)
113 |         elif fill_method == "min":
114 |             self._fill_columns_with_fn(X, missing_mask, np.nanmin)
115 |         elif fill_method == "random":
116 |             self._fill_columns_with_fn(
117 |                 X,
118 |                 missing_mask,
119 |                 col_fn=generate_random_column_samples)
120 |         return X
121 | 
122 |     def prepare_input_data(self, X):
123 |         """
124 |         Check to make sure that the input matrix and its mask of missing
125 |         values are valid. Returns X and missing mask.
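
        Raises a ValueError if the input is not a 2-D array or if every
        entry is missing, and warns if no entries are missing at all.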
126 |         """
127 |         X = check_array(X, force_all_finite=False)
128 |         if X.dtype != "f" and X.dtype != "d":
129 |             X = X.astype(float)
130 | 
131 |         self._check_input(X)
132 |         missing_mask = np.isnan(X)
133 |         self._check_missing_value_mask(missing_mask)
134 |         return X, missing_mask
135 | 
136 |     def clip(self, X):
137 |         """
138 |         Clip values to fall within the user-specified min/max constraints
139 |         """
140 |         X = np.asarray(X)
141 |         if self.min_value is not None:
142 |             X[X < self.min_value] = self.min_value
143 |         if self.max_value is not None:
144 |             X[X > self.max_value] = self.max_value
145 |         return X
146 | 
147 |     def project_result(self, X):
148 |         """
149 |         First undo normalization and then clip to the user-specified min/max
150 |         range.
151 |         """
152 |         X = np.asarray(X)
153 |         if self.normalizer is not None:
154 |             X = self.normalizer.inverse_transform(X)
155 |         return self.clip(X)
156 | 
157 |     def solve(self, X, missing_mask):
158 |         """
159 |         Given an initialized matrix X and a mask of where its missing values
160 |         had been, return a completion of X.
161 |         """
162 |         raise ValueError("%s.solve not yet implemented!" % (
163 |             self.__class__.__name__,))
164 | 
165 |     def fit_transform(self, X, y=None):
166 |         """
167 |         Fit the imputer and then transform input `X`
168 | 
169 |         Note: all imputers should have a `fit_transform` method,
170 |         but only some (like IterativeImputer in sklearn) also support inductive
171 |         mode using `fit` or `fit_transform` on `X_train` and then `transform`
172 |         on new `X_test`.
173 |         """
174 |         X_original, missing_mask = self.prepare_input_data(X)
175 |         observed_mask = ~missing_mask
176 |         X = X_original.copy()
177 |         if self.normalizer is not None:
178 |             X = self.normalizer.fit_transform(X)
179 |         X_filled = self.fill(X, missing_mask, inplace=True)
180 |         if not isinstance(X_filled, np.ndarray):
181 |             raise TypeError(
182 |                 "Expected %s.fill() to return NumPy array but got %s" % (
183 |                     self.__class__.__name__,
184 |                     type(X_filled)))
185 | 
186 |         X_result = self.solve(X_filled, missing_mask)
187 |         if not isinstance(X_result, np.ndarray):
188 |             raise TypeError(
189 |                 "Expected %s.solve() to return NumPy array but got %s" % (
190 |                     self.__class__.__name__,
191 |                     type(X_result)))
192 | 
193 |         X_result = self.project_result(X=X_result)
194 |         X_result[observed_mask] = X_original[observed_mask]
195 |         return X_result
196 | 
197 |     def fit(self, X, y=None):
198 |         """
199 |         Fit the imputer on input `X`.
200 | 
201 |         Note: all imputers should have a `fit_transform` method,
202 |         but only some (like IterativeImputer in sklearn) also support inductive
203 |         mode using `fit` or `fit_transform` on `X_train` and then `transform`
204 |         on new `X_test`.
205 |         """
206 |         raise ValueError(
207 |             "%s.fit not implemented! This imputation algorithm likely "
208 |             "doesn't support inductive mode. Only fit_transform is "
209 |             "supported at this time." % (
210 |                 self.__class__.__name__,))
211 | 
212 |     def transform(self, X, y=None):
213 |         """
214 |         Transform input `X`.
215 | 
216 |         Note: all imputers should have a `fit_transform` method,
217 |         but only some (like IterativeImputer in sklearn) also support inductive
218 |         mode using `fit` or `fit_transform` on `X_train` and then `transform`
219 |         on new `X_test`.
220 |         """
221 |         raise ValueError(
222 |             "%s.transform not implemented! This imputation algorithm likely "
223 |             "doesn't support inductive mode. Only %s.fit_transform is "
224 |             "supported at this time." % (
225 |                 self.__class__.__name__, self.__class__.__name__))
226 | 
--------------------------------------------------------------------------------
/fancyimpute/dictionary_helpers.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | #     http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | """
14 | Helper functions for incomplete matrices represented using dictionaries.
15 | """
16 | 
17 | from collections import defaultdict
18 | 
19 | import numpy as np
20 | from scipy.sparse import dok_matrix
21 | 
22 | 
23 | def dense_nan_matrix(shape, dtype):
24 |     return np.ones(shape, dtype=dtype) * np.nan
25 | 
26 | 
27 | def collect_nested_keys(nested_dict):
28 |     outer_key_list = list(sorted(nested_dict.keys()))
29 |     inner_key_set = set()
30 |     for k in outer_key_list:
31 |         inner_dict = nested_dict[k]
32 |         inner_key_set = inner_key_set.union(inner_dict.keys())
33 |     inner_key_list = list(sorted(inner_key_set))
34 |     return outer_key_list, inner_key_list
35 | 
36 | 
37 | def nested_key_indices(nested_dict):
38 |     """
39 |     Assign an ordering to the outer and inner keys used in a dictionary that
40 |     maps to dictionaries.
41 |     """
42 |     outer_keys, inner_keys = collect_nested_keys(nested_dict)
43 |     outer_key_indices = {k: i for (i, k) in enumerate(outer_keys)}
44 |     inner_key_indices = {k: i for (i, k) in enumerate(inner_keys)}
45 |     return outer_key_indices, inner_key_indices
46 | 
47 | 
48 | def flattened_nested_key_indices(nested_dict):
49 |     """
50 |     Combine the outer and inner keys of nested dictionaries into a single
51 |     ordering.
52 |     """
53 |     outer_keys, inner_keys = collect_nested_keys(nested_dict)
54 |     combined_keys = list(sorted(set(outer_keys + inner_keys)))
55 |     return {k: i for (i, k) in enumerate(combined_keys)}
56 | 
57 | 
58 | def index_dict_to_sorted_list(key_indices):
59 |     sorted_list = [None] * len(key_indices)
60 |     for (key, index) in key_indices.items():
61 |         sorted_list[index] = key
62 |     return sorted_list
63 | 
64 | 
65 | def array_from_nested_dictionary(
66 |         nested_dict,
67 |         array_fn,
68 |         dtype="float32",
69 |         square_result=False):
70 |     """
71 |     Parameters
72 |     ----------
73 |     nested_dict : dict
74 |         Dictionary which contains dictionaries
75 | 
76 |     array_fn : function
77 |         Takes shape and dtype as arguments, returns an array to fill in.
78 | 
79 |     dtype : dtype
80 |         NumPy dtype of result array
81 | 
82 |     square_result : bool
83 |         Combine keys from outer and inner dictionaries.
84 | 
85 |     Returns array and sorted lists of the outer and inner keys.
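
    For example, {"a": {"x": 1.0}, "b": {"y": 2.0}} with dense_nan_matrix
    as array_fn yields the 2x2 array [[1.0, nan], [nan, 2.0]] along with
    outer keys ["a", "b"] and inner keys ["x", "y"].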
86 |     """
87 |     if square_result:
88 |         outer_key_indices = inner_key_indices = flattened_nested_key_indices(
89 |             nested_dict)
90 |     else:
91 |         outer_key_indices, inner_key_indices = nested_key_indices(
92 |             nested_dict)
93 | 
94 |     n_rows = len(outer_key_indices)
95 |     n_cols = len(inner_key_indices)
96 |     shape = (n_rows, n_cols)
97 |     result = array_fn(shape, dtype)
98 |     for outer_key, sub_dictionary in nested_dict.items():
99 |         i = outer_key_indices[outer_key]
100 |         for inner_key, value in sub_dictionary.items():
101 |             j = inner_key_indices[inner_key]
102 |             result[i, j] = value
103 |     outer_key_list = index_dict_to_sorted_list(outer_key_indices)
104 |     inner_key_list = index_dict_to_sorted_list(inner_key_indices)
105 |     return result, outer_key_list, inner_key_list
106 | 
107 | 
108 | def sparse_dok_matrix_from_nested_dictionary(
109 |         nested_dict,
110 |         dtype="float32",
111 |         square_result=False):
112 |     return array_from_nested_dictionary(
113 |         nested_dict,
114 |         array_fn=dok_matrix,
115 |         dtype=dtype,
116 |         square_result=square_result)
117 | 
118 | 
119 | def dense_matrix_from_nested_dictionary(
120 |         nested_dict,
121 |         dtype="float32",
122 |         square_result=False):
123 |     return array_from_nested_dictionary(
124 |         nested_dict,
125 |         array_fn=dense_nan_matrix,
126 |         dtype=dtype,
127 |         square_result=square_result)
128 | 
129 | 
130 | def matrix_to_pair_dictionary(
131 |         X, row_keys=None, column_keys=None, filter_fn=None):
132 |     """
133 |     X : numpy.ndarray
134 | 
135 |     row_keys : dict
136 |         Dictionary mapping row indices to row names. If omitted then each
137 |         row index is used as its own key.
138 | 
139 |     column_keys : dict
140 |         If omitted and the matrix is square, reuse the row dictionary;
141 |         otherwise each column index is used as its own key.
142 | 
143 |     filter_fn : function
144 |         If given then only add elements for which this function returns True.
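
    Returns a flat dictionary mapping (row_key, column_key) pairs to the
    corresponding matrix entries.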
145 |     """
146 |     n_rows, n_cols = X.shape
147 | 
148 |     if row_keys is None:
149 |         row_keys = {i: i for i in range(n_rows)}
150 | 
151 |     if column_keys is None:
152 |         if n_rows == n_cols:
153 |             column_keys = row_keys
154 |         else:
155 |             column_keys = {j: j for j in range(n_cols)}
156 | 
157 |     if len(row_keys) != n_rows:
158 |         raise ValueError("Need %d row keys but got %d" % (
159 |             n_rows,
160 |             len(row_keys)))
161 | 
162 |     if len(column_keys) != n_cols:
163 |         raise ValueError("Need %d column keys but got %d" % (
164 |             n_cols,
165 |             len(column_keys)))
166 | 
167 |     result_dict = {}
168 |     for i, X_i in enumerate(X):
169 |         row_key = row_keys[i]
170 |         for j, X_ij in enumerate(X_i):
171 |             if filter_fn and not filter_fn(X_ij):
172 |                 continue
173 |             column_key = column_keys[j]
174 |             key_pair = (row_key, column_key)
175 |             result_dict[key_pair] = X_ij
176 |     return result_dict
177 | 
178 | 
179 | def curry_pair_dictionary(key_pair_dict):
180 |     """
181 |     Transform dictionary from pairs of keys to dict -> dict -> float
182 |     """
183 |     result = defaultdict(dict)
184 |     for (a, b), value in key_pair_dict.items():
185 |         result[a][b] = value
186 |     return result
187 | 
188 | 
189 | def uncurry_nested_dictionary(curried_dict):
190 |     """
191 |     Transform dictionary from (key_a -> key_b -> float) to
192 |     (key_a, key_b) -> float
193 |     """
194 |     result = {}
195 |     for a, a_dict in curried_dict.items():
196 |         for b, value in a_dict.items():
197 |             result[(a, b)] = value
198 |     return result
199 | 
200 | 
201 | def matrix_to_nested_dictionary(
202 |         X,
203 |         row_keys=None,
204 |         column_keys=None,
205 |         filter_fn=None):
206 |     pair_dict = matrix_to_pair_dictionary(
207 |         X,
208 |         row_keys=row_keys,
209 |         column_keys=column_keys,
210 |         filter_fn=filter_fn)
211 |     return curry_pair_dictionary(pair_dict)
212 | 
213 | 
214 | def pair_dict_key_sets(pair_dict):
215 |     row_keys = set()
216 |     column_keys = set()
217 |     for (row_key, column_key) in pair_dict.keys():
218 |         row_keys.add(row_key)
219 |         column_keys.add(column_key)
220 |     return row_keys, column_keys
221 | 
222 | 
223 | def array_from_pair_dictionary(
224 |         pair_dict,
225 |         array_fn,
226 |         dtype="float32",
227 |         square_result=False):
228 |     """
229 |     Convert a dictionary whose keys are pairs (k1, k2) into a sparse
230 |     or incomplete array.
231 | 
232 |     Parameters
233 |     ----------
234 |     pair_dict : dict
235 |         Dictionary from pairs of keys to values.
236 | 
237 |     array_fn : function
238 |         Takes shape and dtype as arguments, returns an array to fill in.
239 | 
240 |     dtype : dtype
241 |         NumPy dtype of result array
242 | 
243 |     square_result : bool
244 |         Combine keys from rows and columns
245 | 
246 |     Returns array and sorted lists of the row and column keys.
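
    Entries without a corresponding key pair keep whatever value array_fn
    used to initialize the result (NaN for dense_nan_matrix, implicit zeros
    for a sparse dok_matrix).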
247 |     """
248 |     row_key_set, column_key_set = pair_dict_key_sets(pair_dict)
249 | 
250 |     if square_result:
251 |         combined_key_set = row_key_set.union(column_key_set)
252 |         row_key_list = column_key_list = list(sorted(combined_key_set))
253 |         row_key_indices = column_key_indices = {
254 |             k: i for (i, k) in enumerate(row_key_list)
255 |         }
256 |     else:
257 |         row_key_list = list(sorted(row_key_set))
258 |         column_key_list = list(sorted(column_key_set))
259 |         row_key_indices = {k: i for (i, k) in enumerate(row_key_list)}
260 |         column_key_indices = {k: i for (i, k) in enumerate(column_key_list)}
261 | 
262 |     n_rows = len(row_key_indices)
263 |     n_cols = len(column_key_indices)
264 |     shape = (n_rows, n_cols)
265 |     result = array_fn(shape, dtype)
266 |     for (row_key, column_key), value in pair_dict.items():
267 |         i = row_key_indices[row_key]
268 |         j = column_key_indices[column_key]
269 |         result[i, j] = value
270 |     return result, row_key_list, column_key_list
271 | 
272 | 
273 | def sparse_dok_matrix_from_pair_dictionary(
274 |         pair_dict,
275 |         dtype="float32",
276 |         square_result=False):
277 |     return array_from_pair_dictionary(
278 |         pair_dict,
279 |         array_fn=dok_matrix,
280 |         dtype=dtype,
281 |         square_result=square_result)
282 | 
283 | 
284 | def dense_matrix_from_pair_dictionary(
285 |         pair_dict,
286 |         dtype="float32",
287 |         square_result=False):
288 |     return array_from_pair_dictionary(
289 |         pair_dict,
290 |         array_fn=dense_nan_matrix,
291 |         dtype=dtype,
292 |         square_result=square_result)
293 | 
294 | 
295 | def transpose_nested_dictionary(nested_dict):
296 |     """
297 |     Given a nested dictionary from k1 -> k2 -> value,
298 |     transpose its outer and inner keys so it maps
299 |     k2 -> k1 -> value.
300 |     """
301 |     result = defaultdict(dict)
302 |     for k1, d in nested_dict.items():
303 |         for k2, v in d.items():
304 |             result[k2][k1] = v
305 |     return result
306 | 
307 | 
308 | def reverse_lookup_from_nested_dict(values_dict):
309 |     """
310 |     Create reverse-lookup dictionary mapping each row key to a list of pairs:
311 |     [(column key, value), ...]
312 | 
313 |     Parameters
314 |     ----------
315 |     values_dict : dict
316 |         Nested dictionary mapping
317 |         column_key -> row_key -> value.
318 | 
319 |     Returns
320 |     -------
321 |     Dictionary mapping row_key -> [(column key, value), ...]
322 |     """
323 |     reverse_lookup = defaultdict(list)
324 |     for column_key, column_dict in values_dict.items():
325 |         for row_key, value in column_dict.items():
326 |             entry = (column_key, value)
327 |             reverse_lookup[row_key].append(entry)
328 |     return reverse_lookup
329 | 
--------------------------------------------------------------------------------
/experiments/complete_faces.py:
--------------------------------------------------------------------------------
1 | from os import mkdir
2 | from os.path import exists, join
3 | from collections import defaultdict
4 | 
5 | import pylab
6 | from sklearn.datasets import fetch_lfw_people
7 | from sklearn.impute import IterativeImputer
8 | import numpy as np
9 | 
10 | from fancyimpute import (
11 |     SimpleFill,
12 |     IterativeSVD,
13 |     SoftImpute,
14 |     BiScaler,
15 |     KNN
16 | )
17 | 
18 | from fancyimpute.common import masked_mae, masked_mse
19 | 
20 | 
21 | def remove_pixels(
22 |         full_images,
23 |         missing_square_size=32,
24 |         random_seed=0):
25 |     np.random.seed(random_seed)
26 |     incomplete_faces = []
27 |     n_faces = len(full_images)
28 |     height, width = full_images[0].shape[:2]
29 |     for i in range(n_faces):
30 |         image = full_images[i].copy()
31 |         start_x = np.random.randint(
32 |             low=0,
33 |             high=height - missing_square_size + 1)
34 |         start_y = np.random.randint(
35 |             low=0,
36 |             high=width - missing_square_size + 1)
37 |         image[
38 |             start_x: start_x + missing_square_size,
39 |             start_y: start_y + missing_square_size] = np.nan
40 |         incomplete_faces.append(image)
41 |     return np.array(incomplete_faces, dtype=np.float32)
42 | 
43 | 
44 | def rescale_pixel_values(images, order="C"):
45 |     """
46 |     Rescale the range of values in images to be between [0, 1]
47 |     """
48 |     images = np.asarray(images, order=order).astype("float32")
49 |     images -= images.min()
50 |     images /= images.max()
51 |     return images
52 | 
53 | 
54 | def color_balance(images):
55 |     images = images.astype("float32")
56 |     red = images[:, :, :, 0]
57 |     green = images[:, :, :, 1]
58 |     blue = images[:, :, :, 2]
59 |     combined = (red + green + blue)
60 |     total_color = combined.sum()
61 |     overall_fraction_red = red.sum() / total_color
62 |     overall_fraction_green = green.sum() / total_color
63 |     overall_fraction_blue = blue.sum() / total_color
64 | 
65 |     for i in range(images.shape[0]):
66 |         image = images[i]
67 |         image_total = combined[i].sum()
68 |         red_scale = overall_fraction_red / (red[i].sum() / image_total)
69 |         green_scale = overall_fraction_green / (green[i].sum() / image_total)
70 |         blue_scale = overall_fraction_blue / (blue[i].sum() / image_total)
71 |         image[:, :, 0] *= red_scale
72 |         image[:, :, 1] *= green_scale
73 |         image[:, :, 2] *= blue_scale
74 |         image[image < 0] = 0
75 |         image[image > 255] = 255
76 |     return images
77 | 
78 | 
79 | class ResultsTable(object):
80 | 
81 |     def __init__(
82 |             self,
83 |             images_dict,
84 |             percent_missing=0.25,
85 |             saved_image_stride=25,
86 |             dirname="face_images",
87 |             scale_rows=False,
88 |             center_rows=False):
89 |         self.images_dict = images_dict
90 |         self.labels = list(sorted(images_dict.keys()))
91 |         self.images_array = np.array(
92 |             [images_dict[k] for k in self.labels]).astype("float32")
93 |         self.image_shape = self.images_array[0].shape
94 |         self.height, self.width = self.image_shape[:2]
95 |         self.color = (len(self.image_shape) == 3) and (self.image_shape[2] == 3)
96 |         if self.color:
97 |             self.images_array = color_balance(self.images_array)
98 |         self.n_pixels = self.width * self.height
99 |         self.n_features = self.n_pixels * (3 if self.color else 1)
100 |         self.n_images = len(self.images_array)
101 |         print("[ResultsTable] # images = %d, color=%s, # features = %d, shape = %s" % (
102 |             self.n_images, self.color, self.n_features, self.image_shape))
103 | 
104 |         self.flattened_array_shape = (self.n_images, self.n_features)
105 | 
106 |         self.flattened_images = self.images_array.reshape(self.flattened_array_shape)
107 | 
108 |         n_missing_pixels = int(self.n_pixels * percent_missing)
109 | 
110 |         missing_square_size = int(np.sqrt(n_missing_pixels))
111 |         print("[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" % (
112 |             n_missing_pixels, missing_square_size))
113 |         self.incomplete_images = remove_pixels(
114 |             self.images_array,
115 |             missing_square_size=missing_square_size)
116 |         print("[ResultsTable] Incomplete images shape = %s" % (
117 |             self.incomplete_images.shape,))
118 |         self.flattened_incomplete_images = self.incomplete_images.reshape(
119 |             self.flattened_array_shape)
120 |         self.missing_mask = np.isnan(self.flattened_incomplete_images)
121 |         self.normalizer = BiScaler(
122 |             scale_rows=scale_rows,
123 |             center_rows=center_rows,
124 |             min_value=self.images_array.min(),
125 |             max_value=self.images_array.max())
126 |         self.incomplete_normalized = self.normalizer.fit_transform(
127 |             self.flattened_incomplete_images)
128 | 
129 |         self.saved_image_indices = list(
130 |             range(0, self.n_images, saved_image_stride))
131 |         self.saved_images = defaultdict(dict)
132 |         self.dirname = dirname
133 |         self.mse_dict = {}
134 |         self.mae_dict = {}
135 | 
136 |         self.save_images(self.images_array, "original", flattened=False)
137 |         self.save_images(self.incomplete_images, "incomplete", flattened=False)
138 | 
139 |     def ensure_dir(self, dirname):
140 |         if not exists(dirname):
141 |             print("Creating directory: %s" % dirname)
142 |             mkdir(dirname)
143 | 
144 |     def save_images(self, images, base_filename, flattened=True):
145 |         self.ensure_dir(self.dirname)
146 |         for i in self.saved_image_indices:
147 |             label = self.labels[i].lower().replace(" ", "_")
148 |             image = images[i, :].copy()
149 |             if flattened:
150 |                 image = image.reshape(self.image_shape)
151 |             image[np.isnan(image)] = 0
152 |             figure = pylab.gcf()
153 |             axes = pylab.gca()
154 |             extra_kwargs = {}
155 |             if not self.color:  # grayscale needs an explicit colormap; RGB ignores cmap
156 |                 extra_kwargs["cmap"] = "gray"
157 |             assert image.min() >= 0, "Image can't contain negative numbers"
158 |             if image.max() <= 1:
159 |                 image *= 255
160 |             image[image > 255] = 255
161 |             axes.imshow(image.astype("uint8"), **extra_kwargs)
162 |             axes.get_xaxis().set_visible(False)
163 |             axes.get_yaxis().set_visible(False)
164 |             filename = base_filename + ".png"
165 |             subdir = join(self.dirname, label)
166 |             self.ensure_dir(subdir)
167 |             path = join(subdir, filename)
168 |             figure.savefig(
169 |                 path,
170 |                 bbox_inches='tight')
171 |             self.saved_images[i][base_filename] = path
172 | 
173 |     def add_entry(self, solver, name):
174 |         print("Running %s" % name)
175 |         completed_normalized = solver.fit_transform(self.incomplete_normalized)
176 |         completed = self.normalizer.inverse_transform(completed_normalized)
177 | 
178 |         mae = masked_mae(
179 |             X_true=self.flattened_images,
180 |             X_pred=completed,
181 |             mask=self.missing_mask)
182 |         mse = masked_mse(
183 |             X_true=self.flattened_images,
184 |             X_pred=completed,
185 |             mask=self.missing_mask)
186 |         print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae))
print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae)) 187 | self.mse_dict[name] = mse 188 | self.mae_dict[name] = mae 189 | self.save_images(completed, base_filename=name) 190 | 191 | def sorted_errors(self): 192 | """ 193 | Generator for (rank, name, MSE, MAE) sorted by increasing MAE 194 | """ 195 | for i, (name, mae) in enumerate( 196 | sorted(self.mae_dict.items(), key=lambda x: x[1])): 197 | yield(i + 1, name, self.mse_dict[name], self.mae_dict[name],) 198 | 199 | def print_sorted_errors(self): 200 | for (rank, name, mse, mae) in self.sorted_errors(): 201 | print("%d) %s: MSE=%0.4f MAE=%0.4f" % ( 202 | rank, 203 | name, 204 | mse, 205 | mae)) 206 | 207 | def save_html_table(self, filename="results_table.html"): 208 | html = """ 209 |
| 211 | | Rank | 212 |Name | 213 |Mean Squared Error | 214 |Mean Absolute Error | 215 | 216 | """ 217 | for (rank, name, mse, mae) in self.sorted_errors(): 218 | html += """ 219 |
|---|---|---|---|
| %d | 221 |%s | 222 |%0.4f | 223 |%0.4f | 224 |