├── requirements.txt
├── test
│   ├── test_soft_impute.py
│   ├── test_iterative_svd.py
│   ├── test_matrix_factorization.py
│   ├── common.py
│   ├── test_similarity_weighted_averaging.py
│   ├── low_rank_data.py
│   ├── test_knn.py
│   ├── test_solver.py
│   ├── test_nuclear_norm_minimization.py
│   └── test_dictionary_helpers.py
├── .gitignore
├── fancyimpute
│   ├── __init__.py
│   ├── simple_fill.py
│   ├── iterative_svd.py
│   ├── common.py
│   ├── knn.py
│   ├── nuclear_norm_minimization.py
│   ├── matrix_factorization.py
│   ├── similarity_weighted_averaging.py
│   ├── soft_impute.py
│   ├── solver.py
│   ├── dictionary_helpers.py
│   └── scaler.py
├── setup.py
├── .travis.yml
├── experiments
│   ├── readme_example.py
│   └── complete_faces.py
├── README.md
└── LICENSE
/requirements.txt: -------------------------------------------------------------------------------- 1 | knnimpute>=0.1.0 2 | scikit-learn>=0.24.2 3 | # used by NuclearNormMinimization 4 | cvxpy 5 | cvxopt 6 | # for tests 7 | pytest 8 | nose -------------------------------------------------------------------------------- /test/test_soft_impute.py: -------------------------------------------------------------------------------- 1 | from low_rank_data import XY, XY_incomplete, missing_mask 2 | from common import reconstruction_error 3 | 4 | from fancyimpute import SoftImpute 5 | 6 | def test_soft_impute_with_low_rank_random_matrix(): 7 | solver = SoftImpute() 8 | XY_completed = solver.fit_transform(XY_incomplete) 9 | _, missing_mae = reconstruction_error( 10 | XY, 11 | XY_completed, 12 | missing_mask, 13 | name="SoftImpute") 14 | assert missing_mae < 0.1, "Error too high!" 15 | 16 | if __name__ == "__main__": 17 | test_soft_impute_with_low_rank_random_matrix() 18 | -------------------------------------------------------------------------------- /test/test_iterative_svd.py: -------------------------------------------------------------------------------- 1 | from low_rank_data import XY, XY_incomplete, missing_mask 2 | from common import reconstruction_error 3 | 4 | from fancyimpute import IterativeSVD 5 | 6 | def test_iterative_svd_with_low_rank_random_matrix(): 7 | solver = IterativeSVD(rank=3) 8 | XY_completed = solver.fit_transform(XY_incomplete) 9 | _, missing_mae = reconstruction_error( 10 | XY, 11 | XY_completed, 12 | missing_mask, 13 | name="IterativeSVD") 14 | assert missing_mae < 0.1, "Error too high!" 15 | 16 | if __name__ == "__main__": 17 | test_iterative_svd_with_low_rank_random_matrix() 18 | -------------------------------------------------------------------------------- /test/test_matrix_factorization.py: -------------------------------------------------------------------------------- 1 | from fancyimpute import MatrixFactorization 2 | 3 | from low_rank_data import XY, XY_incomplete, missing_mask 4 | from common import reconstruction_error 5 | 6 | 7 | def test_matrix_factorization_with_low_rank_random_matrix(): 8 | solver = MatrixFactorization(learning_rate=0.02, rank=5) 9 | XY_completed = solver.fit_transform(XY_incomplete) 10 | _, missing_mae = reconstruction_error(XY, XY_completed, missing_mask, name="MatrixFactorization") 11 | assert missing_mae < 0.1, "Error too high!"
12 | 13 | 14 | if __name__ == "__main__": 15 | test_matrix_factorization_with_low_rank_random_matrix() 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /test/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def reconstruction_error(XY, XY_completed, missing_mask, name=None): 5 | """ 6 | Returns mean squared error and mean absolute error for 7 | completed matrices. 8 | """ 9 | value_pairs = [ 10 | (i, j, XY[i, j], XY_completed[i, j]) 11 | for i in range(XY.shape[0]) 12 | for j in range(XY.shape[1]) 13 | if missing_mask[i, j] 14 | ] 15 | print("First 10 reconstructed values:") 16 | for (i, j, x, xr) in value_pairs[:10]: 17 | print(" (%d,%d) %0.4f ~= %0.4f" % (i, j, x, xr)) 18 | diffs = [actual - predicted for (_, _, actual, predicted) in value_pairs] 19 | missing_mse = np.mean([diff ** 2 for diff in diffs]) 20 | missing_mae = np.mean([np.abs(diff) for diff in diffs]) 21 | print("%sMSE: %0.4f, MAE: %0.4f" % ( 22 | "" if not name else name + " ", 23 | missing_mse, 24 | missing_mae)) 25 | return missing_mse, missing_mae 26 | -------------------------------------------------------------------------------- /test/test_similarity_weighted_averaging.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nose.tools import eq_ 3 | 4 | from fancyimpute import SimilarityWeightedAveraging 5 | 6 | 7 | def test_similarity_weighted_column_averaging(): 8 | X = np.array([ 9 | [0.1, 0.9, 0.2], 10 | [0.8, 0.1, 0.01], 11 | [0.95, 0.2, 0.3], 12 | [0.14, 0.85, 0.3], 13 | ]) 14 | X_incomplete = X.copy() 15 | X_incomplete[1, 1] = np.nan 16 | X_incomplete[3, 0] = np.nan 17 | missing_mask = np.isnan(X_incomplete) 18 | 19 | solver = SimilarityWeightedAveraging() 20 | X_filled = solver.fit_transform(X_incomplete) 21 | eq_(X_incomplete.shape, X_filled.shape) 22 | diff = (X - X_filled)[missing_mask] 23 | abs_diff = np.abs(diff) 24 | mae = np.mean(abs_diff) 25 | print("MAE", mae) 26 | assert mae < 0.1, "Difference between imputed values! 
MAE=%0.4f" % mae 27 | 28 | if __name__ == "__main__": 29 | test_similarity_weighted_column_averaging() 30 | -------------------------------------------------------------------------------- /fancyimpute/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function, division 2 | 3 | from .solver import Solver 4 | from .nuclear_norm_minimization import NuclearNormMinimization 5 | from .matrix_factorization import MatrixFactorization 6 | from .iterative_svd import IterativeSVD 7 | from .simple_fill import SimpleFill 8 | from .soft_impute import SoftImpute 9 | from .scaler import BiScaler 10 | from .knn import KNN 11 | from .similarity_weighted_averaging import SimilarityWeightedAveraging 12 | 13 | # while iterative imputer is experimental in sklearn, we need this 14 | from sklearn.experimental import enable_iterative_imputer 15 | from sklearn.impute import IterativeImputer 16 | 17 | __version__ = "0.7.0" 18 | 19 | __all__ = [ 20 | "Solver", 21 | "NuclearNormMinimization", 22 | "MatrixFactorization", 23 | "IterativeSVD", 24 | "SimpleFill", 25 | "SoftImpute", 26 | "BiScaler", 27 | "KNN", 28 | "SimilarityWeightedAveraging", 29 | "IterativeImputer", 30 | ] 31 | -------------------------------------------------------------------------------- /test/low_rank_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def create_rank_k_dataset( 5 | n_rows=5, 6 | n_cols=5, 7 | k=3, 8 | fraction_missing=0.1, 9 | symmetric=False, 10 | random_seed=0): 11 | np.random.seed(random_seed) 12 | x = np.random.randn(n_rows, k) 13 | y = np.random.randn(k, n_cols) 14 | 15 | XY = np.dot(x, y) 16 | 17 | if symmetric: 18 | assert n_rows == n_cols 19 | XY = 0.5 * XY + 0.5 * XY.T 20 | 21 | missing_raw_values = np.random.uniform(0, 1, (n_rows, n_cols)) 22 | missing_mask = missing_raw_values < fraction_missing 23 | 24 | XY_incomplete = XY.copy() 25 | # fill missing entries with NaN 26 | XY_incomplete[missing_mask] = np.nan 27 | 28 | return XY, XY_incomplete, missing_mask 29 | 30 | 31 | # create some default data to be shared across tests 32 | XY, XY_incomplete, missing_mask = create_rank_k_dataset( 33 | n_rows=500, 34 | n_cols=10, 35 | k=3, 36 | fraction_missing=0.25) 37 | -------------------------------------------------------------------------------- /test/test_knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nose.tools import eq_ 3 | 4 | from fancyimpute.knn import KNN 5 | 6 | from low_rank_data import XY, XY_incomplete, missing_mask 7 | 8 | 9 | def test_knn(): 10 | # get a baseline error from just zero-filling the missing entries 11 | sad_zero_fill = np.sum(np.abs(XY[missing_mask])) 12 | mad_zero_fill = sad_zero_fill / missing_mask.sum() 13 | print("MAD zero-fill = ", mad_zero_fill) 14 | for k in [5, 15, 30]: 15 | print("-- k=", k) 16 | XY_completed = KNN(k).fit_transform(XY_incomplete) 17 | mask = np.isfinite(XY_completed) 18 | eq_((~mask).sum(), 0) 19 | diff = (XY_completed - XY)[missing_mask] 20 | sad = np.sum(np.abs(diff)) 21 | print("Sum absolute differences", sad) 22 | mad = sad / missing_mask.sum() 23 | print("Mean absolute difference", mad) 24 | # knnImpute should be at least twice as good as just zero fill 25 | assert mad <= (mad_zero_fill / 2.0), \ 26 | "Expected knnImpute to be 2x better than zeroFill (%f) but got MAD=%f" % ( 27 | mad_zero_fill, 28 | mad) 29 | 
-------------------------------------------------------------------------------- /fancyimpute/simple_fill.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from .solver import Solver 14 | 15 | 16 | class SimpleFill(Solver): 17 | def __init__(self, fill_method="mean", min_value=None, max_value=None): 18 | """ 19 | Possible values for fill_method: 20 | "zero": fill missing entries with zeros 21 | "mean": fill with column means 22 | "median": fill with column medians 23 | "min": fill with min value per column 24 | "random": fill with gaussian noise according to mean/std of column 25 | """ 26 | Solver.__init__( 27 | self, 28 | fill_method=fill_method, 29 | min_value=min_value, 30 | max_value=max_value) 31 | 32 | def solve(self, X, missing_mask): 33 | """ 34 | Since X is given to us already filled, just return it. 35 | """ 36 | return X 37 | -------------------------------------------------------------------------------- /test/test_solver.py: -------------------------------------------------------------------------------- 1 | from fancyimpute import Solver, SimpleFill 2 | 3 | from low_rank_data import XY, XY_incomplete, missing_mask 4 | from common import reconstruction_error 5 | 6 | import numpy as np 7 | import warnings 8 | 9 | 10 | def test_prepare_input_data(): 11 | _solver = Solver() 12 | print(_solver) # for improved coverage 13 | # test that a complete matrix returns a warning 14 | X1 = np.zeros((5, 5)) 15 | with warnings.catch_warnings(record=True) as w: 16 | _solver.prepare_input_data(X1) 17 | assert str(w[0].message) == "Input matrix is not missing any values", "Warning is not generated for a complete matrix" 18 | # test that an incomplete matrix does not return a warning 19 | X2 = np.zeros((5, 5)) 20 | X2[2, 3] = None 21 | with warnings.catch_warnings(record=True) as w: 22 | _solver.prepare_input_data(X2) 23 | assert len(w) == 0, "Warning is generated for an incomplete matrix" 24 | 25 | 26 | def test_solver_fill_methods_with_low_rank_random_matrix(): 27 | for fill_method in ("zero", "mean", "median", "min", "random"): 28 | imputer = SimpleFill(fill_method=fill_method) 29 | XY_completed = imputer.fit_transform(XY_incomplete) 30 | _, missing_mae = reconstruction_error( 31 | XY, 32 | XY_completed, 33 | missing_mask, 34 | name="Solver with fill_method=%s" % fill_method) 35 | assert missing_mae < 5, "Error too high for Solver with %s fill method!"
%fill_method 36 | 37 | 38 | if __name__ == "__main__": 39 | test_prepare_input_data() 40 | test_solver_fill_methods_with_low_rank_random_matrix() -------------------------------------------------------------------------------- /test/test_nuclear_norm_minimization.py: -------------------------------------------------------------------------------- 1 | from fancyimpute import NuclearNormMinimization 2 | import numpy as np 3 | 4 | from low_rank_data import XY, XY_incomplete, missing_mask 5 | from common import reconstruction_error 6 | 7 | 8 | def create_rank1_data(symmetric=False): 9 | """ 10 | Returns 5x5 rank1 matrix with missing element at index (1, 2) 11 | """ 12 | x = np.array([1, 2, 3, 4, 5], dtype=float) 13 | y = np.array([0.1, -0.1, 0.2, -0.2, 0.02]) 14 | XY = np.outer(x, y) 15 | XY_missing = XY.copy() 16 | # drop one entry 17 | XY_missing[1, 2] = np.nan 18 | 19 | if not symmetric: 20 | return XY, XY_missing 21 | 22 | # make a symmetric matrix 23 | XYXY = XY.T.dot(XY) 24 | 25 | # drop one entry 26 | XYXY_missing = XYXY.copy() 27 | XYXY_missing[1, 2] = np.nan 28 | return XYXY, XYXY_missing 29 | 30 | 31 | def test_rank1_convex_solver(): 32 | XY_rank1, XY_missing_rank1 = create_rank1_data(symmetric=False) 33 | solver = NuclearNormMinimization(max_iters=50000) 34 | XY_completed_rank1 = solver.fit_transform(XY_missing_rank1) 35 | assert abs(XY_completed_rank1[1, 2] - XY_rank1[1, 2]) < 0.01, \ 36 | "Expected %0.4f but got %0.4f" % ( 37 | XY_rank1[1, 2], XY_completed_rank1[1, 2]) 38 | 39 | 40 | def test_rank1_symmetric_convex_solver(): 41 | XYXY_rank1, XYXY_missing_rank1 = create_rank1_data(symmetric=True) 42 | solver = NuclearNormMinimization(require_symmetric_solution=True) 43 | completed = solver.fit_transform(XYXY_missing_rank1) 44 | assert abs(completed[1, 2] - XYXY_rank1[1, 2]) < 0.01, \ 45 | "Expected %0.4f but got %0.4f" % ( 46 | XYXY_rank1[1, 2], completed[1, 2]) 47 | 48 | 49 | def test_nuclear_norm_minimization_with_low_rank_random_matrix(): 50 | solver = NuclearNormMinimization(max_iters=2000) 51 | XY_completed = solver.fit_transform(XY_incomplete[:100]) 52 | _, missing_mae = reconstruction_error( 53 | XY[:100], XY_completed, missing_mask[:100], name="NuclearNorm") 54 | assert missing_mae < 0.1, "Error too high!" 55 | 56 | if __name__ == "__main__": 57 | test_rank1_convex_solver() 58 | test_rank1_symmetric_convex_solver() 59 | test_nuclear_norm_minimization_with_low_rank_random_matrix() 60 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
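# Annotation (not part of the original file): setup.py reads the long
# description from README.md and parses the package version out of
# fancyimpute/__init__.py. As a sketch of the version regex used below,
#     re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', text, re.MULTILINE)
# applied to a line like
#     __version__ = "0.7.0"
# captures "0.7.0" in group(1).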
12 | 13 | import os 14 | import logging 15 | import re 16 | 17 | from setuptools import setup 18 | 19 | package_name = "fancyimpute" 20 | 21 | 22 | readme_dir = os.path.dirname(__file__) 23 | readme_filename = os.path.join(readme_dir, "README.md") 24 | 25 | try: 26 | with open(readme_filename, "r") as f: 27 | readme_markdown = f.read() 28 | except Exception: 29 | logging.warning("Failed to load %s" % readme_filename) 30 | readme_markdown = "" 31 | 32 | with open("%s/__init__.py" % package_name, "r") as f: 33 | version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(), re.MULTILINE).group(1) 34 | 35 | if __name__ == "__main__": 36 | setup( 37 | name=package_name, 38 | version=version, 39 | description="Matrix completion and feature imputation algorithms", 40 | author="Alex Rubinsteyn, Sergey Feldman", 41 | author_email="alex.rubinsteyn@gmail.com", 42 | url="https://github.com/iskandr/%s" % package_name, 43 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 44 | classifiers=[ 45 | "Development Status :: 4 - Beta", 46 | "Environment :: Console", 47 | "Operating System :: OS Independent", 48 | "Intended Audience :: Science/Research", 49 | "License :: OSI Approved :: Apache Software License", 50 | "Programming Language :: Python", 51 | "Topic :: Scientific/Engineering :: Bio-Informatics", 52 | ], 53 | install_requires=[ 54 | "knnimpute>=0.1.0", 55 | "scikit-learn>=0.24.2", 56 | # used by NuclearNormMinimization 57 | "cvxpy", 58 | "cvxopt", 59 | "pytest", 60 | "nose", 61 | ], 62 | long_description=readme_markdown, 63 | long_description_content_type="text/markdown", 64 | packages=[package_name], 65 | ) 66 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false # Use container-based infrastructure 2 | language: python 3 | env: 4 | global: 5 | - KERAS_BACKEND=tensorflow 6 | - CUDA_VISIBLE_DEVICES="" 7 | matrix: 8 | include: 9 | - python: 3.6 10 | before_install: 11 | # Commands below copied from: http://conda.pydata.org/docs/travis.html 12 | # We do this conditionally because it saves us some downloading if the 13 | # version is the same. 14 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 15 | - bash miniconda.sh -b -p $HOME/miniconda 16 | - export PATH="$HOME/miniconda/bin:$PATH" 17 | # reset the shell's lookup table for program name to path mappings 18 | - hash -r 19 | - conda config --set always_yes yes --set changeps1 no 20 | - conda update -q conda 21 | # Useful for debugging any issues with conda 22 | - conda info -a 23 | addons: 24 | apt: 25 | packages: 26 | # Even though I'm installing cvxopt via conda, still seem to need these: 27 | - liblapack-dev 28 | - libatlas-base-dev 29 | install: 30 | - > 31 | conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION 32 | numpy=1.19.5 keras=2.4.3 scipy nose pandas matplotlib cvxopt scikit-learn 33 | - source activate test-environment 34 | - conda install -c cvxgrp scs=1.2.6 35 | - pip install tensorflow==2.5 36 | - pip install -r requirements.txt 37 | - pip install .
38 | - pip install coveralls 39 | - export PACKAGE_DIR=`pwd` 40 | script: 41 | - cd $PACKAGE_DIR 42 | - nosetests test --with-coverage --cover-package=fancyimpute 43 | after_success: 44 | - coveralls 45 | deploy: 46 | provider: pypi 47 | distributions: sdist 48 | user: openvax 49 | password: # See http://docs.travis-ci.com/user/encryption-keys/ 50 | secure: "AAzTof2771B8tjg2PzCFfctNUbJ6BcQIkH3skpKJvoyWmL0U/fqnGF6zpK0QApJBqTX/xygYhLSfKWZ788FWwyaHW6Hgw8UQ1eHJPurjC9P8O/OWYRhK3r9J7dEPL4+uHfD67C7C+JGCl9BQk8+dRGYDOJ9kx32Eown8wtaoNY7ykLwq/mXsJcm+NjvfJzA7xE4TbGlL1RFDidUkwZ4YgWtGFcfEtVZlO+pEqeprLr/PBQap6K6WPA5yjQKziaqw5DSjMAU5TVDoZgIMu3/uxUJS6EGYs7FvRM961oEFXs9QvhDz+VtKr1kY8wGR1kJXes41NDr8fq9MqBAGcz3yxHeEP1wU1Aukfbw6QUQqQ7rUWFVKSqeVAq7Phirz7RHWslXl9dSoK2REQA3C8sXggmj198YhEq7QufxzTkD4KCDj+jutbMURZI5re6oetLqBz+8zExywXLKgVtTlUnokJ9R5Fnl0E1B4LMHXRvus71+vLQfv2gCt5OWRxzUfUFzpMdkXG2FDmjFGdBw6OWMhS1W+B19ht6Ho4SoN0Tj3YzvZt2AEwShm1i0LA8ITSN1lQdEucdz0kAhvXVRJtcGa4y48/uT9e8gzeyDyANvJ1RAbCsj3/kazucZH9I0b0lRyMiadtj7mfQwnU9MXCJzG7e912sGJDImyiTXqTQfw1Us=" 51 | on: 52 | branch: master 53 | condition: $TRAVIS_PYTHON_VERSION = "3.6" 54 | -------------------------------------------------------------------------------- /experiments/readme_example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from fancyimpute import ( 3 | BiScaler, 4 | KNN, 5 | NuclearNormMinimization, 6 | SoftImpute, 7 | SimpleFill 8 | ) 9 | 10 | n = 200 11 | m = 20 12 | inner_rank = 4 13 | X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m)) 14 | print("Mean squared element: %0.4f" % (X ** 2).mean()) 15 | 16 | # X is a data matrix which we're going to randomly drop entries from 17 | missing_mask = np.random.rand(*X.shape) < 0.1 18 | X_incomplete = X.copy() 19 | # missing entries indicated with NaN 20 | X_incomplete[missing_mask] = np.nan 21 | 22 | meanFill = SimpleFill("mean") 23 | X_filled_mean = meanFill.fit_transform(X_incomplete) 24 | 25 | # Use 3 nearest rows which have a feature to fill in each row's missing features 26 | knnImpute = KNN(k=3) 27 | X_filled_knn = knnImpute.fit_transform(X_incomplete) 28 | 29 | # matrix completion using convex optimization to find low-rank solution 30 | # that still matches observed values. Slow! 
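# (Sketch of what happens under the hood: NuclearNormMinimization asks cvxpy
# to solve
#     minimize ||S||_*   subject to   |S[i, j] - X[i, j]| <= error_tolerance
# over the observed entries, where ||.||_* is the nuclear norm, i.e. the sum
# of singular values, the standard convex surrogate for matrix rank.)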
31 | X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete) 32 | 33 | # Instead of solving the nuclear norm objective directly, instead 34 | # induce sparsity using singular value thresholding 35 | softImpute = SoftImpute() 36 | 37 | # simultaneously normalizes the rows and columns of your observed data, 38 | # sometimes useful for low-rank imputation methods 39 | biscaler = BiScaler() 40 | 41 | # rescale both rows and columns to have zero mean and unit variance 42 | X_incomplete_normalized = biscaler.fit_transform(X_incomplete) 43 | 44 | X_filled_softimpute_normalized = softImpute.fit_transform(X_incomplete_normalized) 45 | X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized) 46 | 47 | X_filled_softimpute_no_biscale = softImpute.fit_transform(X_incomplete) 48 | 49 | meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean() 50 | print("meanFill MSE: %f" % meanfill_mse) 51 | 52 | # print mean squared error for the imputation methods above 53 | nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean() 54 | print("Nuclear norm minimization MSE: %f" % nnm_mse) 55 | 56 | softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean() 57 | print("SoftImpute MSE: %f" % softImpute_mse) 58 | 59 | softImpute_no_biscale_mse = ( 60 | (X_filled_softimpute_no_biscale[missing_mask] - X[missing_mask]) ** 2).mean() 61 | print("SoftImpute without BiScale MSE: %f" % softImpute_no_biscale_mse) 62 | 63 | 64 | knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean() 65 | print("knnImpute MSE: %f" % knn_mse) 66 | -------------------------------------------------------------------------------- /test/test_dictionary_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from fancyimpute.dictionary_helpers import ( 4 | dense_matrix_from_pair_dictionary, 5 | dense_matrix_from_nested_dictionary, 6 | reverse_lookup_from_nested_dict, 7 | transpose_nested_dictionary, 8 | ) 9 | from nose.tools import eq_ 10 | 11 | 12 | def test_dense_matrix_from_nested_dictionary(): 13 | d = { 14 | "a": {"b": 10}, 15 | "b": {"c": 20} 16 | } 17 | X, rows, columns = dense_matrix_from_nested_dictionary(d) 18 | eq_(rows, ["a", "b"]) 19 | eq_(columns, ["b", "c"]) 20 | eq_(X[0, 0], 10) 21 | assert np.isnan(X[0, 1]) 22 | assert np.isnan(X[1, 0]) 23 | eq_(X[1, 1], 20) 24 | 25 | 26 | def test_dense_matrix_from_nested_dictionary_square(): 27 | d = { 28 | "a": {"b": 10}, 29 | "b": {"c": 20} 30 | } 31 | X, rows, columns = dense_matrix_from_nested_dictionary(d, square_result=True) 32 | eq_(rows, ["a", "b", "c"]) 33 | eq_(columns, ["a", "b", "c"]) 34 | assert np.isnan(X[0, 0]) 35 | eq_(X[0, 1], 10) 36 | assert np.isnan(X[0, 2]) 37 | assert np.isnan(X[1, 0]) 38 | assert np.isnan(X[1, 1]) 39 | eq_(X[1, 2], 20) 40 | assert np.isnan(X[2, 0]) 41 | assert np.isnan(X[2, 1]) 42 | assert np.isnan(X[2, 2]) 43 | 44 | 45 | def test_dense_matrix_from_pair_dictionary(): 46 | d = { 47 | ("a", "b"): 10, 48 | ("b", "c"): 20 49 | } 50 | X, rows, columns = dense_matrix_from_pair_dictionary(d) 51 | eq_(rows, ["a", "b"]) 52 | eq_(columns, ["b", "c"]) 53 | eq_(X[0, 0], 10) 54 | assert np.isnan(X[0, 1]) 55 | assert np.isnan(X[1, 0]) 56 | eq_(X[1, 1], 20) 57 | 58 | 59 | def test_dense_matrix_from_pair_dictionary_square(): 60 | d = { 61 | ("a", "b"): 10, 62 | ("b", "c"): 20 63 | } 64 | X, rows, columns = dense_matrix_from_pair_dictionary(d, square_result=True) 65 | eq_(rows, ["a", "b", "c"]) 66 | 
eq_(columns, ["a", "b", "c"]) 67 | assert np.isnan(X[0, 0]) 68 | eq_(X[0, 1], 10) 69 | assert np.isnan(X[0, 2]) 70 | assert np.isnan(X[1, 0]) 71 | assert np.isnan(X[1, 1]) 72 | eq_(X[1, 2], 20) 73 | assert np.isnan(X[2, 0]) 74 | assert np.isnan(X[2, 1]) 75 | assert np.isnan(X[2, 2]) 76 | 77 | 78 | def test_reverse_lookup_from_nested_dict(): 79 | d = { 80 | "a": {"b": 10, "c": 20}, 81 | "b": {"c": 5}, 82 | "z": {"c": 100} 83 | } 84 | reverse_dict = reverse_lookup_from_nested_dict(d) 85 | assert len(reverse_dict.keys()) == 2 86 | assert "c" in reverse_dict 87 | eq_(set(reverse_dict["c"]), {("a", 20), ("b", 5), ("z", 100)}) 88 | assert "b" in reverse_dict 89 | eq_(reverse_dict["b"], [("a", 10)]) 90 | 91 | 92 | def test_transpose_nested_dictionary(): 93 | d = {"a": {"b": 20, "c": 50}, "c": {"q": 500}} 94 | transposed = transpose_nested_dictionary(d) 95 | eq_(set(transposed.keys()), {"b", "c", "q"}) 96 | eq_(transposed["q"], {"c": 500}) 97 | eq_(transposed["c"], {"a": 50}) 98 | eq_(transposed["b"], {"a": 20}) 99 | -------------------------------------------------------------------------------- /fancyimpute/iterative_svd.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License.
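# Annotation (not part of the original module): with the default
# gradual_rank_increase=True, the solve() loop below fits a TruncatedSVD whose
# rank follows min(2 ** i, rank) over iterations i = 0, 1, 2, ... For rank=10
# that schedule is 1, 2, 4, 8, 10, 10, ..., so early iterations impute from a
# very coarse approximation and later ones refine it.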
12 | 13 | from sklearn.decomposition import TruncatedSVD 14 | from sklearn.utils import check_array 15 | import numpy as np 16 | 17 | from .solver import Solver 18 | from .common import masked_mae 19 | 20 | F32PREC = np.finfo(np.float32).eps 21 | 22 | 23 | class IterativeSVD(Solver): 24 | def __init__( 25 | self, 26 | rank=10, 27 | convergence_threshold=0.00001, 28 | max_iters=200, 29 | gradual_rank_increase=True, 30 | svd_algorithm="arpack", 31 | init_fill_method="zero", 32 | min_value=None, 33 | max_value=None, 34 | verbose=True): 35 | Solver.__init__( 36 | self, 37 | fill_method=init_fill_method, 38 | min_value=min_value, 39 | max_value=max_value) 40 | self.rank = rank 41 | self.max_iters = max_iters 42 | self.svd_algorithm = svd_algorithm 43 | self.convergence_threshold = convergence_threshold 44 | self.gradual_rank_increase = gradual_rank_increase 45 | self.verbose = verbose 46 | 47 | def _converged(self, X_old, X_new, missing_mask): 48 | # check for convergence 49 | old_missing_values = X_old[missing_mask] 50 | new_missing_values = X_new[missing_mask] 51 | difference = old_missing_values - new_missing_values 52 | ssd = np.sum(difference ** 2) 53 | old_norm_squared = (old_missing_values ** 2).sum() 54 | # edge cases 55 | if old_norm_squared == 0 or \ 56 | (old_norm_squared < F32PREC and ssd > F32PREC): 57 | return False 58 | else: 59 | return (ssd / old_norm_squared) < self.convergence_threshold 60 | 61 | def solve(self, X, missing_mask): 62 | X = check_array(X, force_all_finite=False) 63 | 64 | observed_mask = ~missing_mask 65 | X_filled = X 66 | for i in range(self.max_iters): 67 | # deviation from original svdImpute algorithm: 68 | # gradually increase the rank of our approximation 69 | if self.gradual_rank_increase: 70 | curr_rank = min(2 ** i, self.rank) 71 | else: 72 | curr_rank = self.rank 73 | tsvd = TruncatedSVD(curr_rank, algorithm=self.svd_algorithm) 74 | X_reduced = tsvd.fit_transform(X_filled) 75 | X_reconstructed = tsvd.inverse_transform(X_reduced) 76 | X_reconstructed = self.clip(X_reconstructed) 77 | mae = masked_mae( 78 | X_true=X, 79 | X_pred=X_reconstructed, 80 | mask=observed_mask) 81 | if self.verbose: 82 | print( 83 | "[IterativeSVD] Iter %d: observed MAE=%0.6f" % ( 84 | i + 1, mae)) 85 | converged = self._converged( 86 | X_old=X_filled, 87 | X_new=X_reconstructed, 88 | missing_mask=missing_mask) 89 | X_filled[missing_mask] = X_reconstructed[missing_mask] 90 | if converged: 91 | break 92 | return X_filled 93 | -------------------------------------------------------------------------------- /fancyimpute/common.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
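# Annotation (not part of the original module): the masked error helpers below
# compare two matrices only where a boolean mask is True. A quick worked
# example:
#     X_true = np.array([[1.0, 2.0], [3.0, 4.0]])
#     X_pred = np.array([[1.0, 2.5], [2.0, 4.0]])
#     mask = np.array([[False, True], [True, False]])
#     masked_mae(X_true, X_pred, mask)  # mean(|2.0 - 2.5|, |3.0 - 2.0|) = 0.75
#     masked_mse(X_true, X_pred, mask)  # mean(0.25, 1.0) = 0.625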
12 | 13 | import logging 14 | import importlib 15 | 16 | import numpy as np 17 | 18 | 19 | def import_from(module, name): 20 | ''' 21 | usage example: 22 | grid = import_from('sklearn.model_selection', 'GridSearchCV') 23 | is equivalent to: 24 | from sklearn.model_selection import GridSearchCV as grid 25 | ''' 26 | module = importlib.import_module(module) 27 | return getattr(module, name) 28 | 29 | 30 | def masked_mae(X_true, X_pred, mask): 31 | masked_diff = X_true[mask] - X_pred[mask] 32 | return np.mean(np.abs(masked_diff)) 33 | 34 | 35 | def masked_mse(X_true, X_pred, mask): 36 | masked_diff = X_true[mask] - X_pred[mask] 37 | return np.mean(masked_diff ** 2) 38 | 39 | 40 | def generate_random_column_samples(column): 41 | col_mask = np.isnan(column) 42 | n_missing = np.sum(col_mask) 43 | if n_missing == len(column): 44 | logging.warning("No observed values in column") 45 | return np.zeros_like(column) 46 | 47 | mean = np.nanmean(column) 48 | std = np.nanstd(column) 49 | 50 | if np.isclose(std, 0): 51 | return np.array([mean] * n_missing) 52 | else: 53 | return np.random.randn(n_missing) * std + mean 54 | 55 | 56 | def choose_solution_using_percentiles( 57 | X_original, 58 | solutions, 59 | parameters=None, 60 | verbose=False, 61 | percentiles=list(range(10, 100, 10))): 62 | """ 63 | It's tricky to pick a single matrix out of all the candidate 64 | solutions with differing shrinkage thresholds. 65 | Our heuristic is to pick the matrix whose percentiles match best 66 | between the missing and observed data. 67 | """ 68 | missing_mask = np.isnan(X_original) 69 | min_mse = np.inf 70 | best_solution = None 71 | for i, candidate in enumerate(solutions): 72 | for col_idx in range(X_original.shape[1]): 73 | col_data = candidate[:, col_idx] 74 | col_missing = missing_mask[:, col_idx] 75 | col_observed = ~col_missing 76 | if col_missing.sum() < 2: 77 | continue 78 | elif col_observed.sum() < 2: 79 | continue 80 | missing_data = col_data[col_missing] 81 | observed_data = col_data[col_observed] 82 | 83 | missing_percentiles = np.array([ 84 | np.percentile(missing_data, p) 85 | for p in percentiles]) 86 | 87 | observed_percentiles = np.array([ 88 | np.percentile(observed_data, p) 89 | for p in percentiles]) 90 | 91 | mse = np.mean((missing_percentiles - observed_percentiles) ** 2) 92 | if mse < min_mse: 93 | min_mse = mse 94 | best_solution = candidate 95 | if verbose: 96 | print("Candidate #%d/%d%s: %f" % ( 97 | i + 1, 98 | len(solutions), 99 | (" (parameter=%s) " % parameters[i] 100 | if parameters is not None 101 | else ""), 102 | mse)) 103 | return best_solution 104 | -------------------------------------------------------------------------------- /fancyimpute/knn.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License.
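# Usage sketch (annotation, not part of the original module): the class
# docstring below assumes features with mean 0 and variance 1, so KNN is
# commonly paired with BiScaler, e.g.:
#     from fancyimpute import BiScaler, KNN
#     X_normalized = BiScaler().fit_transform(X_incomplete)
#     X_filled = KNN(k=3).fit_transform(X_normalized)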
12 | 13 | import numpy as np 14 | 15 | from knnimpute import knn_impute_few_observed, knn_impute_with_argpartition 16 | from sklearn.utils import check_array 17 | 18 | from .solver import Solver 19 | 20 | 21 | class KNN(Solver): 22 | """ 23 | k-Nearest Neighbors imputation for arrays with missing data. 24 | Works only on dense arrays with at most a few thousand rows. 25 | 26 | Assumes that each feature has been centered and rescaled to have 27 | mean 0 and variance 1. 28 | 29 | Inspired by the implementation of kNNImpute from the R package 30 | imputation. 31 | See here: 32 | https://www.rdocumentation.org/packages/imputation/versions/2.0.3/topics/kNNImpute 33 | """ 34 | def __init__( 35 | self, 36 | k=5, 37 | orientation="rows", 38 | use_argpartition=False, 39 | print_interval=100, 40 | min_value=None, 41 | max_value=None, 42 | normalizer=None, 43 | verbose=True): 44 | """ 45 | Parameters 46 | ---------- 47 | k : int 48 | Number of neighboring rows to use for imputation. 49 | 50 | orientation : str 51 | Which axis of the input matrix should be treated as a sample 52 | (default is "rows" but can also be "columns") 53 | 54 | use_argpartition : bool 55 | Use a more naive implementation of kNN imputation which calls 56 | numpy.argpartition for each row/column pair. May give NaN if fewer 57 | than k neighbors are available for a missing value. 58 | 59 | print_interval : int 60 | How often (in rows) progress is printed during imputation. 61 | min_value : float 62 | Minimum possible imputed value 63 | 64 | max_value : float 65 | Maximum possible imputed value 66 | 67 | normalizer : object 68 | Any object (such as BiScaler) with fit() and transform() methods 69 | 70 | verbose : bool 71 | """ 72 | Solver.__init__( 73 | self, 74 | min_value=min_value, 75 | max_value=max_value, 76 | normalizer=normalizer) 77 | self.k = k 78 | self.verbose = verbose 79 | self.orientation = orientation 80 | self.print_interval = print_interval 81 | if use_argpartition: 82 | self._impute_fn = knn_impute_with_argpartition 83 | else: 84 | self._impute_fn = knn_impute_few_observed 85 | 86 | def solve(self, X, missing_mask): 87 | X = check_array(X, force_all_finite=False) 88 | 89 | if self.orientation == "columns": 90 | X = X.T 91 | missing_mask = missing_mask.T 92 | 93 | elif self.orientation != "rows": 94 | raise ValueError( 95 | "Orientation must be either 'rows' or 'columns', got: %s" % ( 96 | self.orientation,)) 97 | 98 | X_imputed = self._impute_fn( 99 | X=X, 100 | missing_mask=missing_mask, 101 | k=self.k, 102 | verbose=self.verbose, 103 | print_interval=self.print_interval) 104 | 105 | failed_to_impute = np.isnan(X_imputed) 106 | n_missing_after_imputation = failed_to_impute.sum() 107 | if n_missing_after_imputation != 0: 108 | if self.verbose: 109 | print("[KNN] Warning: %d/%d still missing after imputation, replacing with 0" % ( 110 | n_missing_after_imputation, 111 | X.shape[0] * X.shape[1])) 112 | X_imputed[failed_to_impute] = X[failed_to_impute] 113 | 114 | if self.orientation == "columns": 115 | X_imputed = X_imputed.T 116 | 117 | return X_imputed 118 | -------------------------------------------------------------------------------- /fancyimpute/nuclear_norm_minimization.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import cvxpy 14 | 15 | from .solver import Solver 16 | 17 | from sklearn.utils import check_array 18 | 19 | 20 | class NuclearNormMinimization(Solver): 21 | """ 22 | Simple implementation of "Exact Matrix Completion via Convex Optimization" 23 | by Emmanuel Candes and Benjamin Recht using cvxpy. 24 | """ 25 | 26 | def __init__( 27 | self, 28 | require_symmetric_solution=False, 29 | min_value=None, 30 | max_value=None, 31 | error_tolerance=0.0001, 32 | max_iters=50000, 33 | verbose=True): 34 | """ 35 | Parameters 36 | ---------- 37 | require_symmetric_solution : bool 38 | Add symmetry constraint to convex problem 39 | 40 | min_value : float 41 | Smallest possible imputed value 42 | 43 | max_value : float 44 | Largest possible imputed value 45 | 46 | error_tolerance : float 47 | Degree of error allowed on reconstructed values. If omitted then 48 | defaults to 0.0001 49 | 50 | max_iters : int 51 | Maximum number of iterations for the convex solver 52 | 53 | verbose : bool 54 | Print debug info 55 | """ 56 | Solver.__init__( 57 | self, 58 | min_value=min_value, 59 | max_value=max_value) 60 | self.require_symmetric_solution = require_symmetric_solution 61 | self.error_tolerance = error_tolerance 62 | self.max_iters = max_iters 63 | self.verbose = verbose 64 | 65 | def _constraints(self, X, missing_mask, S, error_tolerance): 66 | """ 67 | Parameters 68 | ---------- 69 | X : np.array 70 | Data matrix with missing values filled in 71 | 72 | missing_mask : np.array 73 | Boolean array indicating where missing values were 74 | 75 | S : cvxpy.Variable 76 | Representation of solution variable 77 | """ 78 | ok_mask = ~missing_mask 79 | masked_X = cvxpy.multiply(ok_mask, X) 80 | masked_S = cvxpy.multiply(ok_mask, S) 81 | abs_diff = cvxpy.abs(masked_S - masked_X) 82 | close_to_data = abs_diff <= error_tolerance 83 | constraints = [close_to_data] 84 | if self.require_symmetric_solution: 85 | constraints.append(S == S.T) 86 | 87 | if self.min_value is not None: 88 | constraints.append(S >= self.min_value) 89 | 90 | if self.max_value is not None: 91 | constraints.append(S <= self.max_value) 92 | 93 | return constraints 94 | 95 | def _create_objective(self, m, n): 96 | """ 97 | Parameters 98 | ---------- 99 | m, n : int 100 | Dimensions of the solution matrix 101 | Returns the objective function and a variable representing the 102 | solution to the convex optimization problem.
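The objective value is the nuclear norm ||S||_* (the sum of the singular values of S), the standard convex surrogate for minimizing matrix rank.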
103 | """ 104 | # S is the completed matrix 105 | shape = (m, n) 106 | S = cvxpy.Variable(shape, name="S") 107 | norm = cvxpy.norm(S, "nuc") 108 | objective = cvxpy.Minimize(norm) 109 | return S, objective 110 | 111 | def solve(self, X, missing_mask): 112 | X = check_array(X, force_all_finite=False) 113 | 114 | m, n = X.shape 115 | S, objective = self._create_objective(m, n) 116 | constraints = self._constraints( 117 | X=X, 118 | missing_mask=missing_mask, 119 | S=S, 120 | error_tolerance=self.error_tolerance) 121 | problem = cvxpy.Problem(objective, constraints) 122 | problem.solve( 123 | verbose=self.verbose, 124 | solver=cvxpy.CVXOPT, 125 | max_iters=self.max_iters, 126 | # use_indirect, see: https://github.com/cvxgrp/cvxpy/issues/547 127 | use_indirect=False) 128 | return S.value 129 | -------------------------------------------------------------------------------- /fancyimpute/matrix_factorization.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import numpy as np 14 | from sklearn.utils import check_array 15 | from .solver import Solver 16 | from .common import masked_mae 17 | 18 | 19 | class MatrixFactorization(Solver): 20 | def __init__( 21 | self, 22 | rank=40, 23 | learning_rate=0.01, 24 | max_iters=50, 25 | shrinkage_value=0, 26 | min_value=None, 27 | max_value=None, 28 | verbose=True, 29 | ): 30 | """ 31 | Train a matrix factorization model to predict empty 32 | entries in a matrix. 
Mostly copied (with permission) from: 33 | https://blog.insightdatascience.com/explicit-matrix-factorization-als-sgd-and-all-that-jazz-b00e4d9b21ea 34 | 35 | Params 36 | ====== 37 | rank : (int) 38 | Number of latent factors to use in matrix 39 | factorization model 40 | 41 | learning_rate : (float) 42 | Learning rate for optimizer 43 | 44 | max_iters : (int) 45 | Number of iterations to train for 46 | 47 | shrinkage_value : (float) 48 | Regularization term for sgd penalty 49 | 50 | min_value : float 51 | Smallest possible imputed value 52 | 53 | max_value : float 54 | Largest possible imputed value 55 | 56 | verbose : (bool) 57 | Whether or not to print out training progress 58 | """ 59 | Solver.__init__(self, min_value=min_value, max_value=max_value) 60 | self.rank = rank 61 | self.learning_rate = learning_rate 62 | self.max_iters = max_iters 63 | self.shrinkage_value = shrinkage_value 64 | self._v = verbose 65 | 66 | def solve(self, X, missing_mask): 67 | """ Train model for max_iters iterations from scratch.""" 68 | X = check_array(X, force_all_finite=False) 69 | 70 | # unpack data dimensions 71 | (n_samples, n_features) = X.shape 72 | observed_mask = ~missing_mask 73 | training_indices = list(zip(*np.where(observed_mask))) 74 | 75 | self.user_vecs = np.random.normal(scale=1.0 / self.rank, size=(n_samples, self.rank)) 76 | self.item_vecs = np.random.normal(scale=1.0 / self.rank, size=(n_features, self.rank)) 77 | 78 | self.user_bias = np.zeros(n_samples) 79 | self.item_bias = np.zeros(n_features) 80 | self.global_bias = np.mean(X[observed_mask]) 81 | 82 | for i in range(self.max_iters): 83 | # to do: early stopping 84 | if (i + 1) % 10 == 0 and self._v: 85 | X_reconstruction = self.clip(self.predict_all()) 86 | mae = masked_mae(X_true=X, X_pred=X_reconstruction, mask=observed_mask) 87 | print("[MatrixFactorization] Iter %d: observed MAE=%0.6f rank=%d" % (i + 1, mae, self.rank)) 88 | 89 | np.random.shuffle(training_indices) 90 | self.sgd(X, training_indices) 92 | 93 | X_filled = X.copy() 94 | X_filled[missing_mask] = self.clip(self.predict_all()[missing_mask]) 95 | return X_filled 96 | 97 | def sgd(self, X, training_indices): 98 | # to do: batch learning 99 | for (u, i) in training_indices: 100 | prediction = self.predict(u, i) 101 | e = X[u, i] - prediction # error 102 | 103 | # Update biases 104 | self.user_bias[u] += self.learning_rate * (e - self.shrinkage_value * self.user_bias[u]) 105 | self.item_bias[i] += self.learning_rate * (e - self.shrinkage_value * self.item_bias[i]) 106 | 107 | # Update latent factors 108 | self.user_vecs[u, :] += self.learning_rate * ( 109 | e * self.item_vecs[i, :] - self.shrinkage_value * self.user_vecs[u, :] 110 | ) 111 | self.item_vecs[i, :] += self.learning_rate * ( 112 | e * self.user_vecs[u, :] - self.shrinkage_value * self.item_vecs[i, :] 113 | ) 114 | 115 | def predict(self, u, i): 116 | """ Single user and item prediction.""" 117 | prediction = self.global_bias + self.user_bias[u] + self.item_bias[i] 118 | prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T) 119 | return prediction 120 | 121 | def predict_all(self): 122 | """ Predict ratings for every user and item.""" 123 | predictions = self.user_vecs.dot(self.item_vecs.T) 124 | predictions += self.global_bias + self.user_bias[:, np.newaxis] + self.item_bias[np.newaxis, :] 125 | return predictions 126 | -------------------------------------------------------------------------------- /README.md:
-------------------------------------------------------------------------------- 1 | [Build Status](https://travis-ci.org/iskandr/fancyimpute) [Coverage Status](https://coveralls.io/github/iskandr/fancyimpute?branch=master) [DOI](http://dx.doi.org/10.5281/zenodo.51773) 2 | 3 | 4 | 5 | 6 | 7 | A variety of matrix completion and imputation algorithms implemented in Python 3.6. 8 | 9 | To install: 10 | 11 | `pip install fancyimpute` 12 | 13 | If you run into `tensorflow` problems and use anaconda, you can try to fix them with `conda install cudatoolkit`. 14 | 15 | ## Important Caveats 16 | 17 | (1) This project is in "bare maintenance" mode. That means we are not planning on adding more imputation algorithms or features (but might if we get inspired). Please do report bugs, and we'll try to fix them. Also, we are happy to take pull requests for more algorithms and/or features. 18 | 19 | (2) `IterativeImputer` started its life as a `fancyimpute` original, but was then merged into `scikit-learn` and we deleted it from `fancyimpute` in favor of the better-tested `sklearn` version. As a convenience, you can still `from fancyimpute import IterativeImputer`, but under the hood it's just doing `from sklearn.impute import IterativeImputer`. That means if you update `scikit-learn` in the future, you may also change the behavior of `IterativeImputer`. 20 | 21 | 22 | ## Usage 23 | 24 | ```python 25 | from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, BiScaler 26 | 27 | # X is the complete data matrix 28 | # X_incomplete has the same values as X except a subset has been replaced with NaN 29 | 30 | # Use 3 nearest rows which have a feature to fill in each row's missing features 31 | X_filled_knn = KNN(k=3).fit_transform(X_incomplete) 32 | 33 | # matrix completion using convex optimization to find low-rank solution 34 | # that still matches observed values. Slow! 35 | X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete) 36 | 37 | # Instead of solving the nuclear norm objective directly, 38 | # induce sparsity using singular value thresholding 39 | X_incomplete_normalized = BiScaler().fit_transform(X_incomplete) 40 | X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized) 41 | 42 | # print mean squared error for the imputation methods above 43 | nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean() 44 | print("Nuclear norm minimization MSE: %f" % nnm_mse) 45 | 46 | softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean() 47 | print("SoftImpute MSE: %f" % softImpute_mse) 48 | 49 | knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean() 50 | print("knnImpute MSE: %f" % knn_mse) 51 | ``` 52 | 53 | ## Algorithms 54 | 55 | * `SimpleFill`: Replaces missing entries with the mean or median of each column. 56 | 57 | * `KNN`: Nearest neighbor imputation which weights samples using the mean squared difference 58 | on features for which two rows both have observed data. 59 | 60 | * `SoftImpute`: Matrix completion by iterative soft thresholding of SVD decompositions. Inspired by the [softImpute](https://web.stanford.edu/~hastie/swData/softImpute/vignette.html) package for R, which is based on [Spectral Regularization Algorithms for Learning Large Incomplete Matrices](http://web.stanford.edu/~hastie/Papers/mazumder10a.pdf) by Mazumder et al. 61 | 62 | * `IterativeImputer`: A strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion.
A stub that links to `scikit-learn`'s [IterativeImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html). 63 | 64 | * `IterativeSVD`: Matrix completion by iterative low-rank SVD decomposition. Should be similar to SVDimpute from [Missing value estimation methods for DNA microarrays](http://www.ncbi.nlm.nih.gov/pubmed/11395428) by Troyanskaya et al. 65 | 66 | * `MatrixFactorization`: Direct factorization of the incomplete matrix into low-rank `U` and `V`, with per-row and per-column biases, as well as a global bias. Solved by SGD in pure numpy. 67 | 68 | * `NuclearNormMinimization`: Simple implementation of [Exact Matrix Completion via Convex Optimization](http://statweb.stanford.edu/~candes/papers/MatrixCompletion.pdf) by Emmanuel Candes and Benjamin Recht using [cvxpy](http://www.cvxpy.org). Too slow for large matrices. 70 | 71 | * `BiScaler`: Iterative estimation of row/column means and standard deviations to get doubly normalized 72 | matrix. Not guaranteed to converge but works well in practice. Taken from [Matrix Completion and Low-Rank SVD via Fast Alternating Least Squares](http://arxiv.org/abs/1410.2596). 73 | 74 | ## Citation 75 | 76 | If you use `fancyimpute` in your academic publication, please cite it as follows: 77 | ```bibtex 78 | @software{fancyimpute, 79 | author = {Alex Rubinsteyn and Sergey Feldman}, 80 | title = {fancyimpute: An Imputation Library for Python}, 81 | url = {https://github.com/iskandr/fancyimpute}, 82 | version = {0.7.0}, 83 | date = {2016}, 84 | } 85 | ``` 86 | -------------------------------------------------------------------------------- /fancyimpute/similarity_weighted_averaging.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from collections import defaultdict 14 | 15 | from sklearn.utils import check_array 16 | 17 | import numpy as np 18 | 19 | from .dictionary_helpers import ( 20 | collect_nested_keys, 21 | reverse_lookup_from_nested_dict, 22 | matrix_to_nested_dictionary, 23 | transpose_nested_dictionary, 24 | ) 25 | 26 | 27 | class SimilarityWeightedAveraging(object): 28 | """ 29 | Fill in each missing row/column value by averaging across the 30 | most similar columns (taking into account missing data when 31 | computing column similarities and choosing which neighbors to inspect). 32 | 33 | Currently does not inherit from Solver since it expects sparse inputs in 34 | the form of nested dictionaries. 35 | """ 36 | 37 | def __init__( 38 | self, 39 | min_weight_for_similarity=0.1, 40 | min_count_for_similarity=2, 41 | similarity_exponent=4.0, 42 | shrinkage_value=0.0001, 43 | orientation="rows", 44 | verbose=False, 45 | ): 46 | """ 47 | Parameters 48 | ---------- 49 | min_weight_for_similarity : float 50 | If sum of values in shared rows between two columns falls below this 51 | threshold then similarity can't be computed between those columns.
52 | 53 | min_count_for_similarity : int 54 | If number of overlapping rows between two columns falls below this 55 | threshold then similarity can't be computed between those columns. 56 | 57 | similarity_exponent : float 58 | Exponent for turning similarities into weights on values of other 59 | columns. 60 | 61 | shrinkage_value : float 62 | Shrinks reconstructed values toward 0 63 | 64 | orientation : str 65 | Whether to compute similarities along rows or columns 66 | 67 | verbose : bool 68 | """ 69 | self.min_weight_for_similarity = min_weight_for_similarity 70 | self.min_count_for_similarity = min_count_for_similarity 71 | self.similarity_exponent = similarity_exponent 72 | self.shrinkage_value = shrinkage_value 73 | self.orientation = orientation 74 | self.verbose = verbose 75 | 76 | def jacard_similarity_from_nested_dicts(self, nested_dictionaries): 77 | """ 78 | Compute the continuous Jaccard similarity between all pairs 79 | of keys in the dictionary-of-dictionaries given as an input. 80 | 81 | Returns a three-element tuple: 82 | - similarity dictionary: (key, key) -> float 83 | - overlap count dictionary: (key, key) -> int 84 | - weight dictionary: (key, key) -> float 85 | """ 86 | sims = {} 87 | overlaps = {} 88 | weights = {} 89 | for a, column_dict_a in nested_dictionaries.items(): 90 | row_set_a = set(column_dict_a.keys()) 91 | for b, column_dict_b in nested_dictionaries.items(): 92 | row_set_b = set(column_dict_b.keys()) 93 | common_rows = row_set_a.intersection(row_set_b) 94 | n_overlap = len(common_rows) 95 | overlaps[(a, b)] = n_overlap 96 | total = 0.0 97 | weight = 0.0 98 | for row_name in common_rows: 99 | value_a = column_dict_a[row_name] 100 | value_b = column_dict_b[row_name] 101 | minval = min(value_a, value_b) 102 | maxval = max(value_a, value_b) 103 | total += minval 104 | weight += maxval 105 | weights[(a, b)] = weight 106 | if weight < self.min_weight_for_similarity: 107 | continue 108 | if n_overlap < self.min_count_for_similarity: 109 | continue 110 | sims[(a, b)] = total / weight 111 | return sims, overlaps, weights 112 | 113 | def complete_dict(self, values_dict): 114 | """ 115 | Keys of nested dictionaries can be arbitrary objects.
116 | """ 117 | if self.orientation != "rows": 118 | values_dict = transpose_nested_dictionary(values_dict) 119 | 120 | row_keys, column_keys = collect_nested_keys(values_dict) 121 | if self.verbose: 122 | print("[SimilarityWeightedAveraging] # rows = %d" % (len(row_keys))) 123 | print("[SimilarityWeightedAveraging] # columns = %d" % (len(column_keys))) 124 | similarities, overlaps, weights = self.jacard_similarity_from_nested_dicts(values_dict) 125 | if self.verbose: 126 | print("[SimilarityWeightedAveraging] Computed %d similarities between rows" % (len(similarities),)) 127 | column_to_row_values = reverse_lookup_from_nested_dict(values_dict) 128 | 129 | result = defaultdict(dict) 130 | 131 | exponent = self.similarity_exponent 132 | shrinkage_value = self.shrinkage_value 133 | for i, row_key in enumerate(row_keys): 134 | for column_key, value_triplets in column_to_row_values.items(): 135 | total = 0 136 | denom = shrinkage_value 137 | for (other_row_key, y) in value_triplets: 138 | sample_weight = 1.0 139 | sim = similarities.get((row_key, other_row_key), 0) 140 | combined_weight = sim ** exponent 141 | combined_weight *= sample_weight 142 | total += combined_weight * y 143 | denom += combined_weight 144 | if denom > shrinkage_value: 145 | result[row_key][column_key] = total / denom 146 | if self.orientation != "rows": 147 | result = transpose_nested_dictionary(result) 148 | return result 149 | 150 | def fit_transform(self, X): 151 | X = check_array(X, force_all_finite=False) 152 | 153 | if self.verbose: 154 | print(("[SimilarityWeightedAveraging] Creating dictionary from matrix " " with shape %s") % (X.shape,)) 155 | missing_mask = np.isnan(X) 156 | observed_mask = ~missing_mask 157 | sparse_dict = matrix_to_nested_dictionary(X, filter_fn=np.isfinite) 158 | 159 | completed_dict = self.complete_dict(sparse_dict) 160 | array_result = np.zeros_like(X) 161 | for row_idx, row_dict in completed_dict.items(): 162 | for col_idx, value in row_dict.items(): 163 | array_result[row_idx, col_idx] = value 164 | array_result[observed_mask] = X[observed_mask] 165 | return array_result 166 | -------------------------------------------------------------------------------- /fancyimpute/soft_impute.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import numpy as np 14 | from sklearn.utils.extmath import randomized_svd 15 | from sklearn.utils import check_array 16 | 17 | from .common import masked_mae 18 | from .solver import Solver 19 | 20 | F32PREC = np.finfo(np.float32).eps 21 | 22 | 23 | class SoftImpute(Solver): 24 | """ 25 | Implementation of the SoftImpute algorithm from: 26 | "Spectral Regularization Algorithms for Learning Large Incomplete Matrices" 27 | by Mazumder, Hastie, and Tibshirani. 
28 |     """
29 |     def __init__(
30 |             self,
31 |             shrinkage_value=None,
32 |             convergence_threshold=0.001,
33 |             max_iters=100,
34 |             max_rank=None,
35 |             n_power_iterations=1,
36 |             init_fill_method="zero",
37 |             min_value=None,
38 |             max_value=None,
39 |             normalizer=None,
40 |             verbose=True):
41 |         """
42 |         Parameters
43 |         ----------
44 |         shrinkage_value : float
45 |             Value by which we shrink singular values on each iteration. If
46 |             omitted then the default value will be the maximum singular
47 |             value of the initialized matrix (zeros for missing values) divided
48 |             by 50.
49 | 
50 |         convergence_threshold : float
51 |             Stop when the change between iterations, as a fraction of the
52 |             Frobenius norm of the current solution, falls below this threshold.
53 | 
54 |         max_iters : int
55 |             Maximum number of SVD iterations
56 | 
57 |         max_rank : int, optional
58 |             Perform a truncated SVD on each iteration with this value as its
59 |             rank.
60 | 
61 |         n_power_iterations : int
62 |             Number of power iterations to perform with randomized SVD
63 | 
64 |         init_fill_method : str
65 |             How to initialize missing values of data matrix, default is
66 |             to fill them with zeros.
67 | 
68 |         min_value : float
69 |             Smallest allowable value in the solution
70 | 
71 |         max_value : float
72 |             Largest allowable value in the solution
73 | 
74 |         normalizer : object
75 |             Any object (such as BiScaler) with fit_transform() and inverse_transform() methods
76 | 
77 |         verbose : bool
78 |             Print debugging info
79 |         """
80 |         Solver.__init__(
81 |             self,
82 |             fill_method=init_fill_method,
83 |             min_value=min_value,
84 |             max_value=max_value,
85 |             normalizer=normalizer)
86 |         self.shrinkage_value = shrinkage_value
87 |         self.convergence_threshold = convergence_threshold
88 |         self.max_iters = max_iters
89 |         self.max_rank = max_rank
90 |         self.n_power_iterations = n_power_iterations
91 |         self.verbose = verbose
92 | 
93 |     def _converged(self, X_old, X_new, missing_mask):
94 |         # check for convergence
95 |         old_missing_values = X_old[missing_mask]
96 |         new_missing_values = X_new[missing_mask]
97 |         difference = old_missing_values - new_missing_values
98 |         ssd = np.sum(difference ** 2)
99 |         old_norm = np.sqrt((old_missing_values ** 2).sum())
100 |         # edge cases
101 |         if old_norm == 0 or (old_norm < F32PREC and np.sqrt(ssd) > F32PREC):
102 |             return False
103 |         else:
104 |             return (np.sqrt(ssd) / old_norm) < self.convergence_threshold
105 | 
106 |     def _svd_step(self, X, shrinkage_value, max_rank=None):
107 |         """
108 |         Returns reconstructed X from low-rank thresholded SVD and
109 |         the rank achieved.
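
        The returned rank is the number of singular values that remain
        positive after subtracting shrinkage_value; components at or below
        the threshold are dropped from the reconstruction.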
110 |         """
111 |         if max_rank:
112 |             # if we have a max rank then perform the faster randomized SVD
113 |             (U, s, V) = randomized_svd(
114 |                 X,
115 |                 max_rank,
116 |                 n_iter=self.n_power_iterations,
117 |                 random_state=None)
118 |         else:
119 |             # otherwise perform a full-rank SVD using LAPACK
120 |             (U, s, V) = np.linalg.svd(
121 |                 X,
122 |                 full_matrices=False,
123 |                 compute_uv=True)
124 |         s_thresh = np.maximum(s - shrinkage_value, 0)
125 |         rank = (s_thresh > 0).sum()
126 |         s_thresh = s_thresh[:rank]
127 |         U_thresh = U[:, :rank]
128 |         V_thresh = V[:rank, :]
129 |         S_thresh = np.diag(s_thresh)
130 |         X_reconstruction = np.dot(U_thresh, np.dot(S_thresh, V_thresh))
131 |         return X_reconstruction, rank
132 | 
133 |     def _max_singular_value(self, X_filled):
134 |         # quick decomposition of X_filled into rank-1 SVD
135 |         _, s, _ = randomized_svd(
136 |             X_filled,
137 |             1,
138 |             n_iter=5,
139 |             random_state=None)
140 |         return s[0]
141 | 
142 |     def solve(self, X, missing_mask):
143 |         X = check_array(X, force_all_finite=False)
144 | 
145 |         X_init = X.copy()
146 | 
147 |         X_filled = X
148 |         observed_mask = ~missing_mask
149 |         max_singular_value = self._max_singular_value(X_filled)
150 |         if self.verbose:
151 |             print("[SoftImpute] Max Singular Value of X_init = %f" % (
152 |                 max_singular_value))
153 | 
154 |         if self.shrinkage_value:
155 |             shrinkage_value = self.shrinkage_value
156 |         else:
157 |             # totally hackish heuristic: keep only components
158 |             # with at least 1/50th the max singular value
159 |             shrinkage_value = max_singular_value / 50.0
160 | 
161 |         for i in range(self.max_iters):
162 |             X_reconstruction, rank = self._svd_step(
163 |                 X_filled,
164 |                 shrinkage_value,
165 |                 max_rank=self.max_rank)
166 |             X_reconstruction = self.clip(X_reconstruction)
167 | 
168 |             # print error on observed data
169 |             if self.verbose:
170 |                 mae = masked_mae(
171 |                     X_true=X_init,
172 |                     X_pred=X_reconstruction,
173 |                     mask=observed_mask)
174 |                 print(
175 |                     "[SoftImpute] Iter %d: observed MAE=%0.6f rank=%d" % (
176 |                         i + 1,
177 |                         mae,
178 |                         rank))
179 | 
180 |             converged = self._converged(
181 |                 X_old=X_filled,
182 |                 X_new=X_reconstruction,
183 |                 missing_mask=missing_mask)
184 |             X_filled[missing_mask] = X_reconstruction[missing_mask]
185 |             if converged:
186 |                 break
187 |         if self.verbose:
188 |             print("[SoftImpute] Stopped after iteration %d for lambda=%f" % (
189 |                 i + 1,
190 |                 shrinkage_value))
191 | 
192 |         return X_filled
193 | 
--------------------------------------------------------------------------------
/fancyimpute/solver.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | #     http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
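
# Solver is the shared base class for the imputation algorithms in this
# package: it validates the input, initializes missing entries with a simple
# fill, delegates the real work to a subclass's solve() method, and then
# restores the observed values in the final result (see fit_transform below).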
12 | 
13 | import warnings
14 | 
15 | import numpy as np
16 | 
17 | from sklearn.utils import check_array
18 | 
19 | from .common import generate_random_column_samples
20 | 
21 | 
22 | class Solver(object):
23 |     def __init__(
24 |             self,
25 |             fill_method="zero",
26 |             min_value=None,
27 |             max_value=None,
28 |             normalizer=None):
29 |         self.fill_method = fill_method
30 |         self.min_value = min_value
31 |         self.max_value = max_value
32 |         self.normalizer = normalizer
33 | 
34 |     def __repr__(self):
35 |         return str(self)
36 | 
37 |     def __str__(self):
38 |         field_list = []
39 |         for (k, v) in sorted(self.__dict__.items()):
40 |             if v is None or isinstance(v, (float, int)):
41 |                 field_list.append("%s=%s" % (k, v))
42 |             elif isinstance(v, str):
43 |                 field_list.append("%s='%s'" % (k, v))
44 |         return "%s(%s)" % (
45 |             self.__class__.__name__,
46 |             ", ".join(field_list))
47 | 
48 |     def _check_input(self, X):
49 |         if len(X.shape) != 2:
50 |             raise ValueError("Expected 2d matrix, got %s array" % (X.shape,))
51 | 
52 |     def _check_missing_value_mask(self, missing):
53 |         if not missing.any():
54 |             warnings.simplefilter("always")
55 |             warnings.warn("Input matrix is not missing any values")
56 |         if missing.all():
57 |             raise ValueError("Input matrix must have some non-missing values")
58 | 
59 |     def _fill_columns_with_fn(self, X, missing_mask, col_fn):
60 |         for col_idx in range(X.shape[1]):
61 |             missing_col = missing_mask[:, col_idx]
62 |             n_missing = missing_col.sum()
63 |             if n_missing == 0:
64 |                 continue
65 |             col_data = X[:, col_idx]
66 |             fill_values = col_fn(col_data)
67 |             if np.all(np.isnan(fill_values)):
68 |                 fill_values = 0
69 |             X[missing_col, col_idx] = fill_values
70 | 
71 |     def fill(
72 |             self,
73 |             X,
74 |             missing_mask,
75 |             fill_method=None,
76 |             inplace=False):
77 |         """
78 |         Parameters
79 |         ----------
80 |         X : np.array
81 |             Data array containing NaN entries
82 | 
83 |         missing_mask : np.array
84 |             Boolean array indicating where NaN entries are
85 | 
86 |         fill_method : str
87 |             "zero": fill missing entries with zeros
88 |             "mean": fill with column means
89 |             "median": fill with column medians
90 |             "min": fill with min value per column
91 |             "random": fill with Gaussian samples according to mean/std of column
92 | 
93 |         inplace : bool
94 |             Modify matrix or fill a copy
95 |         """
96 |         X = check_array(X, force_all_finite=False)
97 | 
98 |         if not inplace:
99 |             X = X.copy()
100 | 
101 |         if not fill_method:
102 |             fill_method = self.fill_method
103 | 
104 |         if fill_method not in ("zero", "mean", "median", "min", "random"):
105 |             raise ValueError("Invalid fill method: '%s'" % (fill_method))
106 |         elif fill_method == "zero":
107 |             # replace NaN's with 0
108 |             X[missing_mask] = 0
109 |         elif fill_method == "mean":
110 |             self._fill_columns_with_fn(X, missing_mask, np.nanmean)
111 |         elif fill_method == "median":
112 |             self._fill_columns_with_fn(X, missing_mask, np.nanmedian)
113 |         elif fill_method == "min":
114 |             self._fill_columns_with_fn(X, missing_mask, np.nanmin)
115 |         elif fill_method == "random":
116 |             self._fill_columns_with_fn(
117 |                 X,
118 |                 missing_mask,
119 |                 col_fn=generate_random_column_samples)
120 |         return X
121 | 
122 |     def prepare_input_data(self, X):
123 |         """
124 |         Check to make sure that the input matrix and its mask of missing
125 |         values are valid. Returns X and missing mask.
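
        Raises a ValueError if the input is not a 2-D array or if every
        entry is missing, and warns if no entries are missing at all.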
126 |         """
127 |         X = check_array(X, force_all_finite=False)
128 |         if X.dtype != "f" and X.dtype != "d":
129 |             X = X.astype(float)
130 | 
131 |         self._check_input(X)
132 |         missing_mask = np.isnan(X)
133 |         self._check_missing_value_mask(missing_mask)
134 |         return X, missing_mask
135 | 
136 |     def clip(self, X):
137 |         """
138 |         Clip values to fall within the user-specified min/max constraints
139 |         """
140 |         X = np.asarray(X)
141 |         if self.min_value is not None:
142 |             X[X < self.min_value] = self.min_value
143 |         if self.max_value is not None:
144 |             X[X > self.max_value] = self.max_value
145 |         return X
146 | 
147 |     def project_result(self, X):
148 |         """
149 |         First undo normalization and then clip to the user-specified min/max
150 |         range.
151 |         """
152 |         X = np.asarray(X)
153 |         if self.normalizer is not None:
154 |             X = self.normalizer.inverse_transform(X)
155 |         return self.clip(X)
156 | 
157 |     def solve(self, X, missing_mask):
158 |         """
159 |         Given an initialized matrix X and a mask of where its missing values
160 |         had been, return a completion of X.
161 |         """
162 |         raise ValueError("%s.solve not yet implemented!" % (
163 |             self.__class__.__name__,))
164 | 
165 |     def fit_transform(self, X, y=None):
166 |         """
167 |         Fit the imputer and then transform input `X`
168 | 
169 |         Note: all imputers should have a `fit_transform` method,
170 |         but only some (like IterativeImputer in sklearn) also support inductive
171 |         mode using `fit` or `fit_transform` on `X_train` and then `transform`
172 |         on new `X_test`.
173 |         """
174 |         X_original, missing_mask = self.prepare_input_data(X)
175 |         observed_mask = ~missing_mask
176 |         X = X_original.copy()
177 |         if self.normalizer is not None:
178 |             X = self.normalizer.fit_transform(X)
179 |         X_filled = self.fill(X, missing_mask, inplace=True)
180 |         if not isinstance(X_filled, np.ndarray):
181 |             raise TypeError(
182 |                 "Expected %s.fill() to return NumPy array but got %s" % (
183 |                     self.__class__.__name__,
184 |                     type(X_filled)))
185 | 
186 |         X_result = self.solve(X_filled, missing_mask)
187 |         if not isinstance(X_result, np.ndarray):
188 |             raise TypeError(
189 |                 "Expected %s.solve() to return NumPy array but got %s" % (
190 |                     self.__class__.__name__,
191 |                     type(X_result)))
192 | 
193 |         X_result = self.project_result(X=X_result)
194 |         X_result[observed_mask] = X_original[observed_mask]
195 |         return X_result
196 | 
197 |     def fit(self, X, y=None):
198 |         """
199 |         Fit the imputer on input `X`.
200 | 
201 |         Note: all imputers should have a `fit_transform` method,
202 |         but only some (like IterativeImputer in sklearn) also support inductive
203 |         mode using `fit` or `fit_transform` on `X_train` and then `transform`
204 |         on new `X_test`.
205 |         """
206 |         raise ValueError(
207 |             "%s.fit not implemented! This imputation algorithm likely "
208 |             "doesn't support inductive mode. Only fit_transform is "
209 |             "supported at this time." % (
210 |                 self.__class__.__name__,))
211 | 
212 |     def transform(self, X, y=None):
213 |         """
214 |         Transform input `X`.
215 | 
216 |         Note: all imputers should have a `fit_transform` method,
217 |         but only some (like IterativeImputer in sklearn) also support inductive
218 |         mode using `fit` or `fit_transform` on `X_train` and then `transform`
219 |         on new `X_test`.
220 |         """
221 |         raise ValueError(
222 |             "%s.transform not implemented! This imputation algorithm likely "
223 |             "doesn't support inductive mode. Only %s.fit_transform is "
224 |             "supported at this time." % (
225 |                 self.__class__.__name__, self.__class__.__name__))
226 | 
--------------------------------------------------------------------------------
/fancyimpute/dictionary_helpers.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | #     http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | """
14 | Helper functions for incomplete matrices represented using dictionaries.
15 | """
16 | 
17 | from collections import defaultdict
18 | 
19 | import numpy as np
20 | from scipy.sparse import dok_matrix
21 | 
22 | 
23 | def dense_nan_matrix(shape, dtype):
24 |     return np.ones(shape, dtype=dtype) * np.nan
25 | 
26 | 
27 | def collect_nested_keys(nested_dict):
28 |     outer_key_list = list(sorted(nested_dict.keys()))
29 |     inner_key_set = set()
30 |     for k in outer_key_list:
31 |         inner_dict = nested_dict[k]
32 |         inner_key_set = inner_key_set.union(inner_dict.keys())
33 |     inner_key_list = list(sorted(inner_key_set))
34 |     return outer_key_list, inner_key_list
35 | 
36 | 
37 | def nested_key_indices(nested_dict):
38 |     """
39 |     Assign an ordering to the outer and inner keys used in a dictionary that
40 |     maps to dictionaries.
41 |     """
42 |     outer_keys, inner_keys = collect_nested_keys(nested_dict)
43 |     outer_key_indices = {k: i for (i, k) in enumerate(outer_keys)}
44 |     inner_key_indices = {k: i for (i, k) in enumerate(inner_keys)}
45 |     return outer_key_indices, inner_key_indices
46 | 
47 | 
48 | def flattened_nested_key_indices(nested_dict):
49 |     """
50 |     Combine the outer and inner keys of nested dictionaries into a single
51 |     ordering.
52 |     """
53 |     outer_keys, inner_keys = collect_nested_keys(nested_dict)
54 |     combined_keys = list(sorted(set(outer_keys + inner_keys)))
55 |     return {k: i for (i, k) in enumerate(combined_keys)}
56 | 
57 | 
58 | def index_dict_to_sorted_list(key_indices):
59 |     sorted_list = [None] * len(key_indices)
60 |     for (key, index) in key_indices.items():
61 |         sorted_list[index] = key
62 |     return sorted_list
63 | 
64 | 
65 | def array_from_nested_dictionary(
66 |         nested_dict,
67 |         array_fn,
68 |         dtype="float32",
69 |         square_result=False):
70 |     """
71 |     Parameters
72 |     ----------
73 |     nested_dict : dict
74 |         Dictionary which contains dictionaries
75 | 
76 |     array_fn : function
77 |         Takes shape and dtype as arguments, returns an array to fill in.
78 | 
79 |     dtype : dtype
80 |         NumPy dtype of result array
81 | 
82 |     square_result : bool
83 |         Combine keys from outer and inner dictionaries.
84 | 
85 |     Returns array and sorted lists of the outer and inner keys.
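
    For example, {"a": {"x": 1.0}, "b": {"y": 2.0}} with dense_nan_matrix
    as array_fn yields the 2x2 array [[1.0, nan], [nan, 2.0]] along with
    outer keys ["a", "b"] and inner keys ["x", "y"].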
86 |     """
87 |     if square_result:
88 |         outer_key_indices = inner_key_indices = flattened_nested_key_indices(
89 |             nested_dict)
90 |     else:
91 |         outer_key_indices, inner_key_indices = nested_key_indices(
92 |             nested_dict)
93 | 
94 |     n_rows = len(outer_key_indices)
95 |     n_cols = len(inner_key_indices)
96 |     shape = (n_rows, n_cols)
97 |     result = array_fn(shape, dtype)
98 |     for outer_key, sub_dictionary in nested_dict.items():
99 |         i = outer_key_indices[outer_key]
100 |         for inner_key, value in sub_dictionary.items():
101 |             j = inner_key_indices[inner_key]
102 |             result[i, j] = value
103 |     outer_key_list = index_dict_to_sorted_list(outer_key_indices)
104 |     inner_key_list = index_dict_to_sorted_list(inner_key_indices)
105 |     return result, outer_key_list, inner_key_list
106 | 
107 | 
108 | def sparse_dok_matrix_from_nested_dictionary(
109 |         nested_dict,
110 |         dtype="float32",
111 |         square_result=False):
112 |     return array_from_nested_dictionary(
113 |         nested_dict,
114 |         array_fn=dok_matrix,
115 |         dtype=dtype,
116 |         square_result=square_result)
117 | 
118 | 
119 | def dense_matrix_from_nested_dictionary(
120 |         nested_dict,
121 |         dtype="float32",
122 |         square_result=False):
123 |     return array_from_nested_dictionary(
124 |         nested_dict,
125 |         array_fn=dense_nan_matrix,
126 |         dtype=dtype,
127 |         square_result=square_result)
128 | 
129 | 
130 | def matrix_to_pair_dictionary(
131 |         X, row_keys=None, column_keys=None, filter_fn=None):
132 |     """
133 |     X : numpy.ndarray
134 | 
135 |     row_keys : dict
136 |         Dictionary mapping row indices to row names. If omitted then each
137 |         row index is used as its own key.
138 | 
139 |     column_keys : dict
140 |         If omitted and the matrix is square, reuse the row dictionary;
141 |         otherwise each column index is used as its own key.
142 | 
143 |     filter_fn : function
144 |         If given then only add elements for which this function returns True.
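
    Returns a flat dictionary mapping (row_key, column_key) pairs to the
    corresponding matrix entries.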
145 |     """
146 |     n_rows, n_cols = X.shape
147 | 
148 |     if row_keys is None:
149 |         row_keys = {i: i for i in range(n_rows)}
150 | 
151 |     if column_keys is None:
152 |         if n_rows == n_cols:
153 |             column_keys = row_keys
154 |         else:
155 |             column_keys = {j: j for j in range(n_cols)}
156 | 
157 |     if len(row_keys) != n_rows:
158 |         raise ValueError("Need %d row keys but got %d" % (
159 |             n_rows,
160 |             len(row_keys)))
161 | 
162 |     if len(column_keys) != n_cols:
163 |         raise ValueError("Need %d column keys but got %d" % (
164 |             n_cols,
165 |             len(column_keys)))
166 | 
167 |     result_dict = {}
168 |     for i, X_i in enumerate(X):
169 |         row_key = row_keys[i]
170 |         for j, X_ij in enumerate(X_i):
171 |             if filter_fn and not filter_fn(X_ij):
172 |                 continue
173 |             column_key = column_keys[j]
174 |             key_pair = (row_key, column_key)
175 |             result_dict[key_pair] = X_ij
176 |     return result_dict
177 | 
178 | 
179 | def curry_pair_dictionary(key_pair_dict):
180 |     """
181 |     Transform dictionary from pairs of keys to dict -> dict -> float
182 |     """
183 |     result = defaultdict(dict)
184 |     for (a, b), value in key_pair_dict.items():
185 |         result[a][b] = value
186 |     return result
187 | 
188 | 
189 | def uncurry_nested_dictionary(curried_dict):
190 |     """
191 |     Transform dictionary from (key_a -> key_b -> float) to
192 |     (key_a, key_b) -> float
193 |     """
194 |     result = {}
195 |     for a, a_dict in curried_dict.items():
196 |         for b, value in a_dict.items():
197 |             result[(a, b)] = value
198 |     return result
199 | 
200 | 
201 | def matrix_to_nested_dictionary(
202 |         X,
203 |         row_keys=None,
204 |         column_keys=None,
205 |         filter_fn=None):
206 |     pair_dict = matrix_to_pair_dictionary(
207 |         X,
208 |         row_keys=row_keys,
209 |         column_keys=column_keys,
210 |         filter_fn=filter_fn)
211 |     return curry_pair_dictionary(pair_dict)
212 | 
213 | 
214 | def pair_dict_key_sets(pair_dict):
215 |     row_keys = set()
216 |     column_keys = set()
217 |     for (row_key, column_key) in pair_dict.keys():
218 |         row_keys.add(row_key)
219 |         column_keys.add(column_key)
220 |     return row_keys, column_keys
221 | 
222 | 
223 | def array_from_pair_dictionary(
224 |         pair_dict,
225 |         array_fn,
226 |         dtype="float32",
227 |         square_result=False):
228 |     """
229 |     Convert a dictionary whose keys are pairs (k1, k2) into a sparse
230 |     or incomplete array.
231 | 
232 |     Parameters
233 |     ----------
234 |     pair_dict : dict
235 |         Dictionary from pairs of keys to values.
236 | 
237 |     array_fn : function
238 |         Takes shape and dtype as arguments, returns an array to fill in.
239 | 
240 |     dtype : dtype
241 |         NumPy dtype of result array
242 | 
243 |     square_result : bool
244 |         Combine keys from rows and columns
245 | 
246 |     Returns array and sorted lists of the row and column keys.
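
    Entries without a corresponding key pair keep whatever value array_fn
    used to initialize the result (NaN for dense_nan_matrix, implicit zeros
    for a sparse dok_matrix).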
247 |     """
248 |     row_key_set, column_key_set = pair_dict_key_sets(pair_dict)
249 | 
250 |     if square_result:
251 |         combined_key_set = row_key_set.union(column_key_set)
252 |         row_key_list = column_key_list = list(sorted(combined_key_set))
253 |         row_key_indices = column_key_indices = {
254 |             k: i for (i, k) in enumerate(row_key_list)
255 |         }
256 |     else:
257 |         row_key_list = list(sorted(row_key_set))
258 |         column_key_list = list(sorted(column_key_set))
259 |         row_key_indices = {k: i for (i, k) in enumerate(row_key_list)}
260 |         column_key_indices = {k: i for (i, k) in enumerate(column_key_list)}
261 | 
262 |     n_rows = len(row_key_indices)
263 |     n_cols = len(column_key_indices)
264 |     shape = (n_rows, n_cols)
265 |     result = array_fn(shape, dtype)
266 |     for (row_key, column_key), value in pair_dict.items():
267 |         i = row_key_indices[row_key]
268 |         j = column_key_indices[column_key]
269 |         result[i, j] = value
270 |     return result, row_key_list, column_key_list
271 | 
272 | 
273 | def sparse_dok_matrix_from_pair_dictionary(
274 |         pair_dict,
275 |         dtype="float32",
276 |         square_result=False):
277 |     return array_from_pair_dictionary(
278 |         pair_dict,
279 |         array_fn=dok_matrix,
280 |         dtype=dtype,
281 |         square_result=square_result)
282 | 
283 | 
284 | def dense_matrix_from_pair_dictionary(
285 |         pair_dict,
286 |         dtype="float32",
287 |         square_result=False):
288 |     return array_from_pair_dictionary(
289 |         pair_dict,
290 |         array_fn=dense_nan_matrix,
291 |         dtype=dtype,
292 |         square_result=square_result)
293 | 
294 | 
295 | def transpose_nested_dictionary(nested_dict):
296 |     """
297 |     Given a nested dictionary from k1 -> k2 -> value,
298 |     transpose its outer and inner keys so it maps
299 |     k2 -> k1 -> value.
300 |     """
301 |     result = defaultdict(dict)
302 |     for k1, d in nested_dict.items():
303 |         for k2, v in d.items():
304 |             result[k2][k1] = v
305 |     return result
306 | 
307 | 
308 | def reverse_lookup_from_nested_dict(values_dict):
309 |     """
310 |     Create reverse-lookup dictionary mapping each row key to a list of pairs:
311 |     [(column key, value), ...]
312 | 
313 |     Parameters
314 |     ----------
315 |     values_dict : dict
316 |         Nested dictionary mapping
317 |         column_key -> row_key -> value.
318 | 
319 |     Returns
320 |     -------
321 |     Dictionary mapping row_key -> [(column key, value), ...]
322 |     """
323 |     reverse_lookup = defaultdict(list)
324 |     for column_key, column_dict in values_dict.items():
325 |         for row_key, value in column_dict.items():
326 |             entry = (column_key, value)
327 |             reverse_lookup[row_key].append(entry)
328 |     return reverse_lookup
329 | 
--------------------------------------------------------------------------------
/experiments/complete_faces.py:
--------------------------------------------------------------------------------
1 | from os import mkdir
2 | from os.path import exists, join
3 | from collections import defaultdict
4 | 
5 | import pylab
6 | from sklearn.datasets import fetch_lfw_people
7 | from sklearn.impute import IterativeImputer
8 | import numpy as np
9 | 
10 | from fancyimpute import (
11 |     SimpleFill,
12 |     IterativeSVD,
13 |     SoftImpute,
14 |     BiScaler,
15 |     KNN
16 | )
17 | 
18 | from fancyimpute.common import masked_mae, masked_mse
19 | 
20 | 
21 | def remove_pixels(
22 |         full_images,
23 |         missing_square_size=32,
24 |         random_seed=0):
25 |     np.random.seed(random_seed)
26 |     incomplete_faces = []
27 |     n_faces = len(full_images)
28 |     height, width = full_images[0].shape[:2]
29 |     for i in range(n_faces):
30 |         image = full_images[i].copy()
31 |         start_x = np.random.randint(
32 |             low=0,
33 |             high=height - missing_square_size + 1)
34 |         start_y = np.random.randint(
35 |             low=0,
36 |             high=width - missing_square_size + 1)
37 |         image[
38 |             start_x: start_x + missing_square_size,
39 |             start_y: start_y + missing_square_size] = np.nan
40 |         incomplete_faces.append(image)
41 |     return np.array(incomplete_faces, dtype=np.float32)
42 | 
43 | 
44 | def rescale_pixel_values(images, order="C"):
45 |     """
46 |     Rescale the range of values in images to be between [0, 1]
47 |     """
48 |     images = np.asarray(images, order=order).astype("float32")
49 |     images -= images.min()
50 |     images /= images.max()
51 |     return images
52 | 
53 | 
54 | def color_balance(images):
55 |     images = images.astype("float32")
56 |     red = images[:, :, :, 0]
57 |     green = images[:, :, :, 1]
58 |     blue = images[:, :, :, 2]
59 |     combined = (red + green + blue)
60 |     total_color = combined.sum()
61 |     overall_fraction_red = red.sum() / total_color
62 |     overall_fraction_green = green.sum() / total_color
63 |     overall_fraction_blue = blue.sum() / total_color
64 | 
65 |     for i in range(images.shape[0]):
66 |         image = images[i]
67 |         image_total = combined[i].sum()
68 |         red_scale = overall_fraction_red / (red[i].sum() / image_total)
69 |         green_scale = overall_fraction_green / (green[i].sum() / image_total)
70 |         blue_scale = overall_fraction_blue / (blue[i].sum() / image_total)
71 |         image[:, :, 0] *= red_scale
72 |         image[:, :, 1] *= green_scale
73 |         image[:, :, 2] *= blue_scale
74 |         image[image < 0] = 0
75 |         image[image > 255] = 255
76 |     return images
77 | 
78 | 
79 | class ResultsTable(object):
80 | 
81 |     def __init__(
82 |             self,
83 |             images_dict,
84 |             percent_missing=0.25,
85 |             saved_image_stride=25,
86 |             dirname="face_images",
87 |             scale_rows=False,
88 |             center_rows=False):
89 |         self.images_dict = images_dict
90 |         self.labels = list(sorted(images_dict.keys()))
91 |         self.images_array = np.array(
92 |             [images_dict[k] for k in self.labels]).astype("float32")
93 |         self.image_shape = self.images_array[0].shape
94 |         self.height, self.width = self.image_shape[:2]
95 |         self.color = (len(self.image_shape) == 3) and (self.image_shape[2] == 3)
96 |         if self.color:
97 |             self.images_array = color_balance(self.images_array)
98 |         self.n_pixels = self.width * self.height
99 |         self.n_features = self.n_pixels * (3 if self.color else 1)
100 |         self.n_images = len(self.images_array)
101 |         print("[ResultsTable] # images = %d, color=%s, # features = %d, shape = %s" % (
102 |             self.n_images, self.color, self.n_features, self.image_shape))
103 | 
104 |         self.flattened_array_shape = (self.n_images, self.n_features)
105 | 
106 |         self.flattened_images = self.images_array.reshape(self.flattened_array_shape)
107 | 
108 |         n_missing_pixels = int(self.n_pixels * percent_missing)
109 | 
110 |         missing_square_size = int(np.sqrt(n_missing_pixels))
111 |         print("[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" % (
112 |             n_missing_pixels, missing_square_size))
113 |         self.incomplete_images = remove_pixels(
114 |             self.images_array,
115 |             missing_square_size=missing_square_size)
116 |         print("[ResultsTable] Incomplete images shape = %s" % (
117 |             self.incomplete_images.shape,))
118 |         self.flattened_incomplete_images = self.incomplete_images.reshape(
119 |             self.flattened_array_shape)
120 |         self.missing_mask = np.isnan(self.flattened_incomplete_images)
121 |         self.normalizer = BiScaler(
122 |             scale_rows=scale_rows,
123 |             center_rows=center_rows,
124 |             min_value=self.images_array.min(),
125 |             max_value=self.images_array.max())
126 |         self.incomplete_normalized = self.normalizer.fit_transform(
127 |             self.flattened_incomplete_images)
128 | 
129 |         self.saved_image_indices = list(
130 |             range(0, self.n_images, saved_image_stride))
131 |         self.saved_images = defaultdict(dict)
132 |         self.dirname = dirname
133 |         self.mse_dict = {}
134 |         self.mae_dict = {}
135 | 
136 |         self.save_images(self.images_array, "original", flattened=False)
137 |         self.save_images(self.incomplete_images, "incomplete", flattened=False)
138 | 
139 |     def ensure_dir(self, dirname):
140 |         if not exists(dirname):
141 |             print("Creating directory: %s" % dirname)
142 |             mkdir(dirname)
143 | 
144 |     def save_images(self, images, base_filename, flattened=True):
145 |         self.ensure_dir(self.dirname)
146 |         for i in self.saved_image_indices:
147 |             label = self.labels[i].lower().replace(" ", "_")
148 |             image = images[i, :].copy()
149 |             if flattened:
150 |                 image = image.reshape(self.image_shape)
151 |             image[np.isnan(image)] = 0
152 |             figure = pylab.gcf()
153 |             axes = pylab.gca()
154 |             extra_kwargs = {}
155 |             if not self.color:  # grayscale needs an explicit colormap; RGB ignores cmap
156 |                 extra_kwargs["cmap"] = "gray"
157 |             assert image.min() >= 0, "Image can't contain negative numbers"
158 |             if image.max() <= 1:
159 |                 image *= 255
160 |             image[image > 255] = 255
161 |             axes.imshow(image.astype("uint8"), **extra_kwargs)
162 |             axes.get_xaxis().set_visible(False)
163 |             axes.get_yaxis().set_visible(False)
164 |             filename = base_filename + ".png"
165 |             subdir = join(self.dirname, label)
166 |             self.ensure_dir(subdir)
167 |             path = join(subdir, filename)
168 |             figure.savefig(
169 |                 path,
170 |                 bbox_inches='tight')
171 |             self.saved_images[i][base_filename] = path
172 | 
173 |     def add_entry(self, solver, name):
174 |         print("Running %s" % name)
175 |         completed_normalized = solver.fit_transform(self.incomplete_normalized)
176 |         completed = self.normalizer.inverse_transform(completed_normalized)
177 | 
178 |         mae = masked_mae(
179 |             X_true=self.flattened_images,
180 |             X_pred=completed,
181 |             mask=self.missing_mask)
182 |         mse = masked_mse(
183 |             X_true=self.flattened_images,
184 |             X_pred=completed,
185 |             mask=self.missing_mask)
186 |         print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae))
print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae)) 187 | self.mse_dict[name] = mse 188 | self.mae_dict[name] = mae 189 | self.save_images(completed, base_filename=name) 190 | 191 | def sorted_errors(self): 192 | """ 193 | Generator for (rank, name, MSE, MAE) sorted by increasing MAE 194 | """ 195 | for i, (name, mae) in enumerate( 196 | sorted(self.mae_dict.items(), key=lambda x: x[1])): 197 | yield(i + 1, name, self.mse_dict[name], self.mae_dict[name],) 198 | 199 | def print_sorted_errors(self): 200 | for (rank, name, mse, mae) in self.sorted_errors(): 201 | print("%d) %s: MSE=%0.4f MAE=%0.4f" % ( 202 | rank, 203 | name, 204 | mse, 205 | mae)) 206 | 207 | def save_html_table(self, filename="results_table.html"): 208 | html = """ 209 |
| 211 | | Rank | 212 |Name | 213 |Mean Squared Error | 214 |Mean Absolute Error | 215 | 216 | """ 217 | for (rank, name, mse, mae) in self.sorted_errors(): 218 | html += """ 219 |
|---|---|---|---|
| %d | 221 |%s | 222 |%0.4f | 223 |%0.4f | 224 |