├── papers └── readit.md ├── ycimpute ├── nn │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── gainnets.cpython-37.pyc │ │ └── autoencoder.cpython-37.pyc │ ├── autoencoder.py │ └── gainnets.py ├── tree │ ├── __init__.py │ └── tree.py ├── unsupervised │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-37.pyc │ └── knn │ │ ├── __pycache__ │ │ ├── common.cpython-37.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── reference.cpython-37.pyc │ │ ├── optimistic.cpython-37.pyc │ │ ├── argpartition.cpython-37.pyc │ │ ├── normalized_distance.cpython-37.pyc │ │ └── few_observed_entries.cpython-37.pyc │ │ ├── __init__.py │ │ ├── common.py │ │ ├── reference.py │ │ ├── few_observed_entries.py │ │ ├── argpartition.py │ │ ├── normalized_distance.py │ │ └── optimistic.py ├── utils │ ├── __init__.py │ ├── evaluator │ │ ├── __init__.py │ │ └── evaluate_em.py │ ├── shower │ │ ├── test_show.py │ │ ├── __init__.py │ │ └── show.py │ ├── __pycache__ │ │ ├── tools.cpython-37.pyc │ │ ├── __init__.cpython-37.pyc │ │ └── normalizer.cpython-37.pyc │ ├── evaluate.py │ ├── normalizer.py │ └── tools.py ├── esemble │ ├── __init__.py │ └── random_forest.py ├── datasets │ ├── iris.hdf5 │ ├── wine.hdf5 │ ├── boston.hdf5 │ ├── load_data.py │ └── dpath.py ├── __pycache__ │ └── __init__.cpython-37.pyc ├── imputer │ ├── __pycache__ │ │ ├── gain.cpython-37.pyc │ │ ├── mice.cpython-37.pyc │ │ ├── mida.cpython-37.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── knnimput.cpython-37.pyc │ │ ├── iterforest.cpython-37.pyc │ │ └── expectation_maximization.cpython-37.pyc │ ├── __init__.py │ ├── expectation_maximization.py │ ├── knnimput.py │ ├── mida.py │ ├── gain.py │ ├── mice.py │ └── iterforest.py ├── __init__.py ├── doc_zh_cn.ipynb └── doc.ipynb ├── test_data └── readme.md ├── img ├── readme.md ├── 1.png ├── 2.png ├── 3.png ├── WINE.svg └── IRIS.svg ├── requirements.txt ├── test ├── metrics.py ├── test_em.py ├── test_gain.py ├── test_knn.py ├── test_mida.py ├── test_mice.py ├── test_missforest.py └── generate_data.py ├── setup.py ├── README.md ├── doc_eng.md └── LICENSE /papers/readit.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /ycimpute/nn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test_data/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /ycimpute/tree/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ycimpute/unsupervised/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ycimpute/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /ycimpute/utils/evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ycimpute/utils/shower/test_show.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /img/readme.md: -------------------------------------------------------------------------------- 1 | ......................... 2 | -------------------------------------------------------------------------------- /ycimpute/esemble/__init__.py: -------------------------------------------------------------------------------- 1 | from ..tree import tree 2 | 3 | __all__=["tree"] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.17.1 2 | numpy 3 | pandas 4 | torch>=1.1.0 5 | -------------------------------------------------------------------------------- /img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/img/1.png -------------------------------------------------------------------------------- /img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/img/2.png -------------------------------------------------------------------------------- /img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/img/3.png -------------------------------------------------------------------------------- /ycimpute/datasets/iris.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/datasets/iris.hdf5 -------------------------------------------------------------------------------- /ycimpute/datasets/wine.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/datasets/wine.hdf5 -------------------------------------------------------------------------------- /ycimpute/datasets/boston.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/datasets/boston.hdf5 -------------------------------------------------------------------------------- /ycimpute/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/nn/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/nn/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/nn/__pycache__/gainnets.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/nn/__pycache__/gainnets.cpython-37.pyc -------------------------------------------------------------------------------- 
/ycimpute/utils/__pycache__/tools.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/utils/__pycache__/tools.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/imputer/__pycache__/gain.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/imputer/__pycache__/gain.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/imputer/__pycache__/mice.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/imputer/__pycache__/mice.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/imputer/__pycache__/mida.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/imputer/__pycache__/mida.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/imputer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/imputer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/imputer/__pycache__/knnimput.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/imputer/__pycache__/knnimput.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/nn/__pycache__/autoencoder.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/nn/__pycache__/autoencoder.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/utils/__pycache__/normalizer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/utils/__pycache__/normalizer.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/imputer/__pycache__/iterforest.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/imputer/__pycache__/iterforest.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/unsupervised/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/unsupervised/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/__pycache__/common.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/unsupervised/knn/__pycache__/common.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/unsupervised/knn/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/__pycache__/reference.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/unsupervised/knn/__pycache__/reference.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/__pycache__/optimistic.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/unsupervised/knn/__pycache__/optimistic.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/__pycache__/argpartition.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/unsupervised/knn/__pycache__/argpartition.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/imputer/__pycache__/expectation_maximization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/imputer/__pycache__/expectation_maximization.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/__pycache__/normalized_distance.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/unsupervised/knn/__pycache__/normalized_distance.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/__pycache__/few_observed_entries.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/ycimpute/unsupervised/knn/__pycache__/few_observed_entries.cpython-37.pyc -------------------------------------------------------------------------------- /ycimpute/utils/shower/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | from ...imputer.mice import MICE 3 | from ...imputer.knnimput import KNN 4 | from ...imputer.iterforest import IterImput 5 | from ...imputer.simple import SimpleFill 6 | 7 | __all__=["MICE", 8 | "KNN", 9 | "IterImput", 10 | "SimpleFill"] 11 | """ 
-------------------------------------------------------------------------------- /ycimpute/imputer/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | #from ..unsupervised.expectation_maximization import EM 3 | from .mice import MICE 4 | from .iterforest import MissForest 5 | from .expectation_maximization import EM 6 | from .knnimput import KNN 7 | from .mida import MIDA 8 | from .gain import GAIN 9 | #from .simple import SimpleFill 10 | 11 | __all__=['MICE', 12 | 'MissForest', 13 | 'EM', 14 | 'KNN', 15 | 'MIDA', 16 | 'GAIN'] -------------------------------------------------------------------------------- /test/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | from sklearn.metrics import accuracy_score 5 | 6 | 7 | def get_missing_index(mask_all): 8 | return np.where(mask_all==True) 9 | 10 | 11 | def accuracy(original, filled): 12 | score = accuracy_score(original, filled) 13 | return score 14 | 15 | def RMSE(original, filled): 16 | from sklearn.metrics import mean_squared_error 17 | score = np.sqrt(mean_squared_error(original, filled)) 18 | return score 19 | -------------------------------------------------------------------------------- /ycimpute/utils/evaluate.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | from sklearn.metrics import accuracy_score 5 | 6 | def get_missing_index(mask_all): 7 | return np.where(mask_all==True) 8 | 9 | 10 | def accuracy(original, filled): 11 | score = accuracy_score(original, filled) 12 | return score 13 | 14 | def RMSE(original, filled): 15 | from sklearn.metrics import mean_squared_error 16 | score = np.sqrt(mean_squared_error(original, filled)) 17 | return score 18 | -------------------------------------------------------------------------------- /test/test_em.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from ycimpute.imputer import EM 5 | from .metrics import RMSE 6 | from .generate_data import missing_data,missing_mask,complete_data 7 | from ycimpute.utils.normalizer import min_max_scale 8 | 9 | def test_em(): 10 | X_filled = EM().complete(missing_data) 11 | complete_data_, _, _ = min_max_scale(complete_data) 12 | X_filled, _, _ = min_max_scale(X_filled) 13 | 14 | score = RMSE(complete_data_[missing_mask], 15 | X_filled[missing_mask]) 16 | print(score) 17 | -------------------------------------------------------------------------------- /test/test_gain.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from ycimpute.imputer import GAIN 5 | from .metrics import RMSE 6 | from .generate_data import missing_mask,missing_data,complete_data 7 | 8 | from ycimpute.utils.normalizer import min_max_scale 9 | def test_gain(): 10 | X_filled = GAIN().complete(missing_data) 11 | complete_data_, _, _ = min_max_scale(complete_data) 12 | X_filled, _, _ = min_max_scale(X_filled) 13 | 14 | score = RMSE(complete_data_[missing_mask], 15 | X_filled[missing_mask]) 16 | print(score) 17 | -------------------------------------------------------------------------------- /test/test_knn.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from ycimpute.imputer import KNN 5 | from .metrics import RMSE 6 | from 
.generate_data import missing_mask,missing_data,complete_data 7 | from ycimpute.utils.normalizer import min_max_scale 8 | 9 | def test_knn(): 10 | X_filled = KNN().complete(missing_data) 11 | complete_data_,_ ,_ = min_max_scale(complete_data) 12 | X_filled , _ ,_ = min_max_scale(X_filled) 13 | 14 | score = RMSE(complete_data_[missing_mask], 15 | X_filled[missing_mask]) 16 | print(score) 17 | -------------------------------------------------------------------------------- /test/test_mida.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from ycimpute.imputer import MIDA 5 | from .metrics import RMSE 6 | from .generate_data import missing_mask,missing_data,complete_data 7 | 8 | from ycimpute.utils.normalizer import min_max_scale 9 | def test_mida(): 10 | X_filled = MIDA().complete(missing_data) 11 | complete_data_, _, _ = min_max_scale(complete_data) 12 | X_filled, _, _ = min_max_scale(X_filled) 13 | 14 | score = RMSE(complete_data_[missing_mask], 15 | X_filled[missing_mask]) 16 | print(score) 17 | -------------------------------------------------------------------------------- /test/test_mice.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from ycimpute.imputer import MICE 5 | from .metrics import RMSE 6 | from .generate_data import missing_mask,missing_data,complete_data 7 | 8 | from ycimpute.utils.normalizer import min_max_scale 9 | 10 | def test_mice(): 11 | X_filled = MICE().complete(missing_data) 12 | complete_data_, _, _ = min_max_scale(complete_data) 13 | X_filled, _, _ = min_max_scale(X_filled) 14 | 15 | score = RMSE(complete_data_[missing_mask], 16 | X_filled[missing_mask]) 17 | print(score) 18 | 19 | -------------------------------------------------------------------------------- /test/test_missforest.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from ycimpute.imputer import MissForest 5 | from .metrics import RMSE 6 | from .generate_data import missing_data,missing_mask,complete_data 7 | from ycimpute.utils.normalizer import min_max_scale 8 | 9 | def test_missforest(): 10 | X_filled = MissForest().complete(missing_data) 11 | complete_data_, _, _ = min_max_scale(complete_data) 12 | X_filled, _, _ = min_max_scale(X_filled) 13 | 14 | score = RMSE(complete_data_[missing_mask], 15 | X_filled[missing_mask]) 16 | print(score) 17 | 18 | -------------------------------------------------------------------------------- /ycimpute/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright, the CVXPY authors 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | from ycimpute.imputer import * 18 | 19 | -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalized_distance import ( 2 | all_pairs_normalized_distances, 3 | all_pairs_normalized_distances_reference 4 | ) 5 | from .reference import knn_impute_reference 6 | from .optimistic import knn_impute_optimistic 7 | from .common import knn_initialize 8 | from .few_observed_entries import knn_impute_few_observed 9 | from .argpartition import knn_impute_with_argpartition 10 | 11 | __version__ = "0.1.0" 12 | 13 | __all__ = [ 14 | "all_pairs_normalized_distances", 15 | "all_pairs_normalized_distances_reference", 16 | "knn_initialize", 17 | "knn_impute_reference", 18 | "knn_impute_optimistic", 19 | "knn_impute_few_observed", 20 | "knn_impute_with_argpartition", 21 | ] -------------------------------------------------------------------------------- /ycimpute/doc_zh_cn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 3", 16 | "language": "python", 17 | "name": "python3" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.6.3" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 2 34 | } 35 | -------------------------------------------------------------------------------- /ycimpute/utils/evaluator/evaluate_em.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | 4 | from ycimpute.utils import evaluate 5 | from ycimpute.imputer import EM 6 | from ycimpute.datasets import load_data 7 | 8 | class Evaluate(object): 9 | def __init__(self): 10 | pass 11 | 12 | def evaluate(self, X_mis,X_full): 13 | missing_index = evaluate.get_missing_index(np.isnan(X_mis)) 14 | original_arr = X_full[missing_index] 15 | em_X_filled = EM().complete(copy.copy(X_mis)) 16 | em_filled_arr = em_X_filled[missing_index] 17 | rmse_em_score = evaluate.RMSE(original_arr, em_filled_arr) 18 | return rmse_em_score 19 | 20 | if __name__ == '__main__': 21 | boston_mis, boston_full = load_data.load_boston() 22 | iris_mis, iris_ful = load_data.load_iris() 23 | 24 | boston_score = Evaluate().evaluate(boston_mis, boston_full) 25 | iris_score = Evaluate().evaluate(iris_mis, iris_ful) 26 | print(boston_score) 27 | print(iris_score) -------------------------------------------------------------------------------- /ycimpute/utils/normalizer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | 5 | def min_max_scale(x): 6 | cols = x.shape[1] 7 | min_record = [] 8 | max_record = [] 9 | 10 | for col in range(cols): 11 | min_val= np.min(x[:,col]) 12 | max_val = np.max(x[:,col]) 13 | x[:,col] = (x[:,col] - min_val)/(max_val - min_val) 14 | min_record.append(min_val) 15 | max_record.append(max_val) 16 | 17 | return x, min_record,max_record 18 | 19 | def zero_score_scale(x): 20 | cols = x.shape[1] 21 | for col in range(cols): 22 
| x[:,col] = (x[:,col]-np.mean(x[:,col]))/(np.std(x[:,col])) 23 | 24 | return x 25 | 26 | def min_max_recover(X, min_vec, max_vec): 27 | cols = X.shape[1] 28 | for col in range(cols): 29 | X[:,col] = X[:,col]*(max_vec[col]-min_vec[col])+min_vec[col] 30 | return X 31 | 32 | 33 | NORMALIZERS = {'min_max':min_max_scale, 34 | 'zero_score':zero_score_scale} 35 | 36 | RECOVER = {'min_max':min_max_recover} -------------------------------------------------------------------------------- /ycimpute/nn/autoencoder.py: -------------------------------------------------------------------------------- 1 | 2 | import torch.nn as nn 3 | 4 | class Autoencoder(nn.Module): 5 | def __init__(self, dim,theta): 6 | super(Autoencoder, self).__init__() 7 | self.dim = dim 8 | 9 | self.drop_out = nn.Dropout(p=0.1) 10 | 11 | self.encoder = nn.Sequential( 12 | nn.Linear(dim + theta * 0, dim + theta * 1), 13 | nn.Tanh(), 14 | nn.Linear(dim + theta * 1, dim + theta * 2), 15 | nn.Tanh(), 16 | nn.Linear(dim + theta * 2, dim + theta * 3) 17 | ) 18 | 19 | self.decoder = nn.Sequential( 20 | nn.Linear(dim + theta * 3, dim + theta * 2), 21 | nn.Tanh(), 22 | nn.Linear(dim + theta * 2, dim + theta * 1), 23 | nn.Tanh(), 24 | nn.Linear(dim + theta * 1, dim + theta * 0) 25 | ) 26 | 27 | def forward(self, x): 28 | x = x.view(-1, self.dim) 29 | x_missed = self.drop_out(x) 30 | 31 | z = self.encoder(x_missed) 32 | out = self.decoder(z) 33 | 34 | out = out.view(-1, self.dim) 35 | 36 | return out -------------------------------------------------------------------------------- /ycimpute/doc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Welcome to ycimpute!" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "source": [ 16 | "## ycimpute Overview\n", 17 | "### yc impute is a high-level missing value imputation methods collection API. 
It's written in Python and capable of running on [scikit-learn](http://scikit-learn.org/stable/)\n", 18 | "### It was developed with a focus on solving the common problem of " 19 | ] 20 | } 21 | ], 22 | "metadata": { 23 | "kernelspec": { 24 | "display_name": "Python 3", 25 | "language": "python", 26 | "name": "python3" 27 | }, 28 | "language_info": { 29 | "codemirror_mode": { 30 | "name": "ipython", 31 | "version": 3 32 | }, 33 | "file_extension": ".py", 34 | "mimetype": "text/x-python", 35 | "name": "python", 36 | "nbconvert_exporter": "python", 37 | "pygments_lexer": "ipython3", 38 | "version": "3.6.3" 39 | } 40 | }, 41 | "nbformat": 4, 42 | "nbformat_minor": 2 43 | } 44 | -------------------------------------------------------------------------------- /ycimpute/datasets/load_data.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | from os.path import dirname 3 | 4 | import numpy as np 5 | 6 | def load_iris(): 7 | abs_path = dirname(__file__).__add__('/iris.hdf5') 8 | try: 9 | file = h5py.File(abs_path,'r') 10 | missing_x = file['missing'] 11 | original_x = file['full'] 12 | return np.asarray(missing_x), np.asarray(original_x) 13 | except: 14 | file.close() 15 | raise IOError("can't load data") 16 | 17 | def load_boston(): 18 | abs_path = dirname(__file__).__add__('/boston.hdf5') 19 | try: 20 | file = h5py.File(abs_path,'r') 21 | missing_x = file['missing'] 22 | original_x = file['full'] 23 | return np.asarray(missing_x), np.asarray(original_x) 24 | except: 25 | file.close() 26 | raise IOError("can't load data") 27 | 28 | 29 | 30 | def load_wine(): 31 | abs_path = dirname(__file__).__add__('/wine.hdf5') 32 | try: 33 | file = h5py.File(abs_path,'r') 34 | missing_x = file['missing'] 35 | original_x = file['full'] 36 | return np.asarray(missing_x), np.asarray(original_x) 37 | except: 38 | file.close() 39 | raise IOError("can't load data") -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/common.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from __future__ import absolute_import, print_function, division 4 | 5 | import numpy as np 6 | 7 | from .normalized_distance import all_pairs_normalized_distances 8 | 9 | 10 | def knn_initialize( 11 | X, 12 | missing_mask, 13 | min_dist=1e-6, 14 | max_dist_multiplier=1e6): 15 | """ 16 | Fill X with NaN values if necessary, construct the n_samples x n_samples 17 | distance matrix and set the self-distance of each row to infinity. 18 | 19 | Returns contents of X laid out in row-major, the distance matrix, 20 | and an "effective infinity" which is larger than any entry of the 21 | distance matrix.
22 | """ 23 | X_row_major = X.copy("C") 24 | if missing_mask.sum() != np.isnan(X_row_major).sum(): 25 | # if the missing values have already been zero-filled we need 26 | # to put NaN's back in the data matrix for the distances function 27 | X_row_major[missing_mask] = np.nan 28 | D = all_pairs_normalized_distances(X_row_major) 29 | D_finite_flat = D[np.isfinite(D)] 30 | if len(D_finite_flat) > 0: 31 | max_dist = max_dist_multiplier * max(1, D_finite_flat.max()) 32 | else: 33 | max_dist = max_dist_multiplier 34 | # set diagonal of distance matrix to a large value since we don't want 35 | # points considering themselves as neighbors 36 | np.fill_diagonal(D, max_dist) 37 | D[D < min_dist] = min_dist # prevents 0s 38 | D[D > max_dist] = max_dist # prevents infinities 39 | return X_row_major, D, max_dist 40 | -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/reference.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import absolute_import, print_function, division 3 | 4 | import numpy as np 5 | from six.moves import range 6 | 7 | from .common import knn_initialize 8 | 9 | def knn_impute_reference( 10 | X, 11 | missing_mask, 12 | k, 13 | verbose=False, 14 | print_interval=100): 15 | """ 16 | Reference implementation of kNN imputation logic. 17 | """ 18 | n_rows, n_cols = X.shape 19 | X_result, D, effective_infinity = \ 20 | knn_initialize(X, missing_mask) 21 | 22 | for i in range(n_rows): 23 | for j in np.where(missing_mask[i, :])[0]: 24 | distances = D[i, :].copy() 25 | 26 | # any rows that don't have the value we're currently trying 27 | # to impute are set to infinite distances 28 | distances[missing_mask[:, j]] = effective_infinity 29 | neighbor_indices = np.argsort(distances) 30 | neighbor_distances = distances[neighbor_indices] 31 | 32 | # get rid of any infinite distance neighbors in the top k 33 | valid_distances = neighbor_distances < effective_infinity 34 | neighbor_distances = neighbor_distances[valid_distances][:k] 35 | neighbor_indices = neighbor_indices[valid_distances][:k] 36 | 37 | weights = 1.0 / neighbor_distances 38 | weight_sum = weights.sum() 39 | 40 | if weight_sum > 0: 41 | column = X[:, j] 42 | values = column[neighbor_indices] 43 | X_result[i, j] = np.dot(values, weights) / weight_sum 44 | return X_result 45 | -------------------------------------------------------------------------------- /ycimpute/datasets/dpath.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import h5py 4 | 5 | 6 | 7 | def make_missing(npdata): 8 | import random 9 | import numpy as np 10 | rows, cols = npdata.shape 11 | random_cols = range(cols) 12 | for col in random_cols: 13 | random_rows = random.sample(range(rows - 1), int(0.1 * rows)) 14 | npdata[random_rows, col] = np.nan 15 | return npdata 16 | 17 | 18 | def create_data(data): 19 | import copy 20 | full_data = copy.copy(data) 21 | missing_data = make_missing(data) 22 | 23 | return missing_data, full_data 24 | 25 | 26 | def boston(): 27 | from sklearn.datasets import load_boston 28 | boston = load_boston() 29 | data = boston.data 30 | missing_data, full_data = create_data(data) 31 | h5_file = h5py.File('boston.hdf5','w') 32 | h5_file['missing'] = missing_data 33 | h5_file['full'] = full_data 34 | h5_file.close() 35 | 36 | 37 | def diabetes(): 38 | """ 39 | Pima Indians Diabetes Dataset 40 | :return: 41 | """ 42 | from sklearn.datasets import load_diabetes 43 |
load_diabetes = load_diabetes() 44 | data = load_diabetes.data 45 | missing_data, full_data = create_data(data) 46 | h5_file = h5py.File('diabetes.hdf5', 'w') 47 | h5_file['missing'] = missing_data 48 | h5_file['full'] = full_data 49 | h5_file.close() 50 | 51 | 52 | def iris(): 53 | from sklearn.datasets import load_iris 54 | data = load_iris().data 55 | missing_data, full_data = create_data(data) 56 | h5_file = h5py.File('iris.hdf5', 'w') 57 | h5_file['missing'] = missing_data 58 | h5_file['full'] = full_data 59 | h5_file.close() 60 | 61 | def wine(): 62 | from sklearn.datasets import load_wine 63 | data = load_wine().data 64 | missing_data, full_data = create_data(data) 65 | h5_file = h5py.File('wine.hdf5', 'w') 66 | h5_file['missing'] = missing_data 67 | h5_file['full'] = full_data 68 | h5_file.close() 69 | 70 | if __name__=="__main__": 71 | #boston() 72 | #diabetes() 73 | #iris() 74 | wine() -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os 14 | import logging 15 | 16 | from setuptools import setup, find_packages 17 | 18 | package_name = 'ycimpute' 19 | 20 | readme_dir = os.path.dirname(__file__) 21 | readme_filename = os.path.join(readme_dir, 'README.md') 22 | 23 | try: 24 | with open(readme_filename, 'r') as f: 25 | readme = f.read() 26 | except: 27 | logging.warning("Failed to load %s" % readme_filename) 28 | readme = "" 29 | 30 | try: 31 | import pypandoc 32 | readme = pypandoc.convert(readme, to='rst', format='md') 33 | except: 34 | logging.warning("Conversion of long_description from MD to RST failed") 35 | pass 36 | 37 | if __name__ == '__main__': 38 | setup( 39 | name=package_name, 40 | version="0.2", 41 | description="Matrix completion and feature imputation algorithms", 42 | author="zhouyc", 43 | author_email="yuanchenzhouhcmy@gmail.com", 44 | url="https://github.com/OpenIDEA-YunanUniversity/ycimpute", 45 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 46 | classifiers=[ 47 | 'Development Status :: 3 - Alpha', 48 | 'Environment :: Console', 49 | 'Operating System :: OS Independent', 50 | 'Intended Audience :: Science/Research', 51 | 'License :: OSI Approved :: Apache Software License', 52 | 'Programming Language :: Python', 53 | 'Topic :: Missing Value Imputation', 54 | ], 55 | install_requires=[ 56 | 'six', 57 | 'numpy>=1.10', 58 | 'scipy', 59 | 'scikit-learn>=0.17.1', 60 | 'torch>=1.1.0', 61 | ], 62 | long_description=readme, 63 | packages=find_packages(), 64 | ) 65 | -------------------------------------------------------------------------------- /ycimpute/nn/gainnets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class NetD(torch.nn.Module): 4 | def __init__(self, feature_dim): 5 | """ 6 | 7 | :param feature_dim: 8 | """ 9 | super(NetD, self).__init__() 10 | self.fc1 = 
torch.nn.Linear(feature_dim * 2, 256) 11 | self.fc2 = torch.nn.Linear(256, 128) 12 | self.fc3 = torch.nn.Linear(128, feature_dim) 13 | self.relu = torch.nn.ReLU() 14 | self.sigmoid = torch.nn.Sigmoid() 15 | self.init_weight() 16 | 17 | def init_weight(self): 18 | layers = [self.fc1, self.fc2, self.fc3] 19 | [torch.nn.init.xavier_normal_(layer.weight) for layer in layers] 20 | 21 | def forward(self, x, m, g, h): 22 | """ 23 | see equation (4) in the paper 24 | 25 | :param x: original data 26 | :param m: missing mask 27 | :param g: generated data by Generator 28 | :param h: hint, see paper 29 | :return: a probability matrix indicating which entries are missing 30 | """ 31 | # weights are initialized once in __init__; re-initializing them here on every forward pass would undo all training 32 | inp = m * x + (1 - m) * g 33 | inp = torch.cat((inp, h), dim=1) 34 | out = self.relu(self.fc1(inp)) 35 | out = self.relu(self.fc2(out)) 36 | out = self.sigmoid(self.fc3(out)) 37 | 38 | return out 39 | 40 | 41 | class NetG(torch.nn.Module): 42 | def __init__(self,feature_dim): 43 | """ 44 | 45 | :param feature_dim: 46 | """ 47 | super(NetG, self).__init__() 48 | self.fc1 = torch.nn.Linear(feature_dim * 2, 256) 49 | self.fc2 = torch.nn.Linear(256, 128) 50 | self.fc3 = torch.nn.Linear(128, feature_dim) 51 | self.relu = torch.nn.ReLU() 52 | self.sigmoid = torch.nn.Sigmoid() 53 | self.init_weight() 54 | 55 | def init_weight(self): 56 | layers = [self.fc1, self.fc2, self.fc3] 57 | [torch.nn.init.xavier_normal_(layer.weight) for layer in layers] 58 | 59 | def forward(self, x, z, m): 60 | """ 61 | 62 | see equations (2) and (3) in the paper 63 | 64 | :param x: missing data 65 | :param z: noise 66 | :param m: missing mask, used to replace the missing part with noise 67 | :return: generated data, same size as the original data 68 | """ 69 | # weights are initialized once in __init__; see the note in NetD.forward 70 | inp = m * x + (1 - m) * z 71 | inp = torch.cat((inp, m), dim=1) 72 | out = self.relu(self.fc1(inp)) 73 | out = self.relu(self.fc2(out)) 74 | out = self.sigmoid(self.fc3(out)) 75 | 76 | return out -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/few_observed_entries.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from __future__ import absolute_import, print_function, division 4 | import time 5 | 6 | import numpy as np 7 | from six.moves import range 8 | 9 | from .common import knn_initialize 10 | 11 | def knn_impute_few_observed( 12 | X, missing_mask, k, verbose=False, print_interval=100): 13 | """ 14 | Seems to be the fastest kNN implementation. Pre-sorts each row's neighbors 15 | and then filters these sorted indices using each column's mask of 16 | observed values. 17 | 18 | Important detail: If k observed values are not available then uses fewer 19 | than k neighboring rows.
20 | 21 | Parameters 22 | ---------- 23 | X : np.ndarray 24 | Matrix to fill of shape (n_samples, n_features) 25 | 26 | missing_mask : np.ndarray 27 | Boolean array of same shape as X 28 | 29 | k : int 30 | 31 | verbose : bool 32 | """ 33 | start_t = time.time() 34 | n_rows, n_cols = X.shape 35 | # put the missing mask in column major order since it's accessed 36 | # one column at a time 37 | missing_mask_column_major = np.asarray(missing_mask, order="F") 38 | observed_mask_column_major = ~missing_mask_column_major 39 | X_column_major = X.copy(order="F") 40 | X_row_major, D, effective_infinity = \ 41 | knn_initialize(X, missing_mask) 42 | # get rid of infinities, replace them with a very large number 43 | D_sorted = np.argsort(D, axis=1) 44 | inv_D = 1.0 / D 45 | D_valid_mask = D < effective_infinity 46 | valid_distances_per_row = D_valid_mask.sum(axis=1) 47 | 48 | # trim the number of other rows we consider to exclude those 49 | # with infinite distances 50 | D_sorted = [ 51 | D_sorted[i, :count] 52 | for i, count in enumerate(valid_distances_per_row) 53 | ] 54 | 55 | dot = np.dot 56 | for i in range(n_rows): 57 | missing_row = missing_mask[i, :] 58 | missing_indices = np.where(missing_row)[0] 59 | row_weights = inv_D[i, :] 60 | if verbose and i % print_interval == 0: 61 | print( 62 | "Imputing row %d/%d with %d missing, elapsed time: %0.3f" % ( 63 | i + 1, 64 | n_rows, 65 | len(missing_indices), 66 | time.time() - start_t)) 67 | candidate_neighbor_indices = D_sorted[i] 68 | 69 | for j in missing_indices: 70 | observed = observed_mask_column_major[:, j] 71 | sorted_observed = observed[candidate_neighbor_indices] 72 | observed_neighbor_indices = candidate_neighbor_indices[sorted_observed] 73 | k_nearest_indices = observed_neighbor_indices[:k] 74 | weights = row_weights[k_nearest_indices] 75 | weight_sum = weights.sum() 76 | if weight_sum > 0: 77 | column = X_column_major[:, j] 78 | values = column[k_nearest_indices] 79 | X_row_major[i, j] = dot(values, weights) / weight_sum 80 | return X_row_major 81 | -------------------------------------------------------------------------------- /ycimpute/imputer/expectation_maximization.py: -------------------------------------------------------------------------------- 1 | 2 | from ..utils.tools import Solver 3 | 4 | import numpy as np 5 | import copy 6 | 7 | class EM(Solver): 8 | """ 9 | this algorithm only requires learning the Gaussian distribution parameters 'mu' and 'sigma' 10 | """ 11 | def __init__(self, 12 | max_iter=100, 13 | theta=1e-5, 14 | normalizer='min_max'): 15 | Solver.__init__(self, 16 | normalizer=normalizer) 17 | 18 | self.max_iter = max_iter 19 | self.theta = theta 20 | 21 | def _init_parameters(self, X): 22 | rows, cols = X.shape 23 | mu_init = np.nanmean(X, axis=0) 24 | sigma_init = np.zeros((cols, cols)) 25 | for i in range(cols): 26 | for j in range(i, cols): 27 | vec_col = X[:, [i, j]] 28 | vec_col = vec_col[~np.any(np.isnan(vec_col), axis=1), :].T 29 | if len(vec_col) > 0: 30 | cov = np.cov(vec_col) 31 | cov = cov[0, 1] 32 | sigma_init[i, j] = cov 33 | sigma_init[j, i] = cov 34 | 35 | else: 36 | sigma_init[i, j] = 1.0 37 | sigma_init[j, i] = 1.0 38 | 39 | return mu_init, sigma_init 40 | 41 | def _e_step(self, mu,sigma, X): 42 | samples,_ = X.shape 43 | for sample in range(samples): 44 | if np.any(np.isnan(X[sample,:])): 45 | loc_nan = np.isnan(X[sample,:]) 46 | new_mu = np.dot(sigma[loc_nan, :][:, ~loc_nan], 47 | np.dot(np.linalg.inv(sigma[~loc_nan, :][:, ~loc_nan]), 48 | (X[sample, ~loc_nan] - mu[~loc_nan])[:,np.newaxis])) 49 |
nan_count = np.sum(loc_nan) 50 | X[sample, loc_nan] = mu[loc_nan] + new_mu.reshape(1,nan_count) 51 | 52 | return X 53 | 54 | def _m_step(self,X): 55 | rows, cols = X.shape 56 | mu = np.mean(X, axis=0) 57 | sigma = np.cov(X.T) 58 | tmp_theta = -0.5 * rows * (cols * np.log(2 * np.pi) + 59 | np.log(np.linalg.det(sigma))) 60 | 61 | return mu, sigma,tmp_theta 62 | 63 | 64 | 65 | def solve(self, X, missing_mask): 66 | mu, sigma = self._init_parameters(X) 67 | complete_X,updated_X = None, None 68 | rows,_ = X.shape 69 | theta = -np.inf 70 | for iter in range(self.max_iter): 71 | updated_X = self._e_step(mu=mu, sigma=sigma, X=copy.copy(X)) 72 | mu, sigma, tmp_theta = self._m_step(updated_X) 73 | for i in range(rows): 74 | tmp_theta -= 0.5 * np.dot((updated_X[i, :] - mu), 75 | np.dot(np.linalg.inv(sigma), (updated_X[i, :] - mu)[:, np.newaxis])) 76 | if abs(tmp_theta-theta)<self.theta: 77 | break 78 | else: 79 | theta = tmp_theta 80 | complete_X = updated_X 81 | return complete_X -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/argpartition.py: -------------------------------------------------------------------------------- ...= effective_infinity: 73 | # if there aren't k rows with the feature of interest then 74 | # we need to filter out indices of points at infinite distance 75 | neighbor_indices = array([ 76 | neighbor_index 77 | for neighbor_index in neighbor_indices 78 | if d_copy[neighbor_index] < effective_infinity 79 | ]) 80 | n_current_neighbors = len(neighbor_indices) 81 | 82 | if n_current_neighbors > 0: 83 | neighbor_weights = inv_d[neighbor_indices] 84 | X_row_major[i, j] = ( 85 | dot(X[:, j][neighbor_indices], neighbor_weights) / 86 | neighbor_weights.sum() 87 | ) 88 | return X_row_major 89 | -------------------------------------------------------------------------------- /test/generate_data.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | import os 5 | 6 | 7 | def shulle_data(data): 8 | seed = 2323 # 1314(second val) #123456(first val) 9 | 10 | np.random.seed(seed) 11 | np.random.shuffle(data) 12 | 13 | return data 14 | 15 | def missing(m, n, rate): 16 | p_miss_vec = rate * np.ones((n, 1)) 17 | Missing = np.zeros((m, n)) 18 | 19 | for i in range(n): 20 | A = np.random.uniform(0., 1., size=[m, ]) 21 | B = A > p_miss_vec[i] 22 | Missing[:, i] = 1.
* B 23 | 24 | return Missing 25 | 26 | def sample_Z(m, n): 27 | return np.random.uniform(0., 0.01, size = [m, n]) 28 | 29 | 30 | def make_dataset(data_path, missing_rate, train_ratio=0.8, is_label_numerical=False): 31 | data = np.loadtxt(data_path, delimiter=',') 32 | 33 | data = shulle_data(data) 34 | 35 | label = data[:, 0] 36 | data = data[:, 1:] 37 | 38 | data_dim = data.shape[1] 39 | min_val = np.zeros(data_dim) 40 | max_val = np.zeros(data_dim) 41 | min_label = None 42 | max_label = None 43 | 44 | for i in range(data_dim): 45 | min_val[i] = np.min(data[:, i]) 46 | max_val[i] = np.max(data[:, i]) 47 | if max_val[i] == 0: 48 | max_val[i] = 0.1 49 | 50 | if is_label_numerical: 51 | min_label = np.min(label) 52 | max_label = np.max(label) 53 | 54 | label = (label - min_label) / (max_label - min_label) 55 | 56 | missing_mat = missing(data.shape[0], data.shape[1], 57 | missing_rate) 58 | 59 | train = data[:int(train_ratio * data.shape[0])] 60 | test = data[int(train_ratio * data.shape[0]):] 61 | 62 | train_label = label[:int(train_ratio * data.shape[0])] 63 | test_label = label[int(train_ratio * data.shape[0]):] 64 | 65 | train_missing = missing_mat[:int(train_ratio * data.shape[0])] 66 | test_missing = missing_mat[int(train_ratio * data.shape[0]):] 67 | 68 | train_noise = sample_Z(train.shape[0], train.shape[1]) 69 | test_noise = sample_Z(test.shape[0], test.shape[1]) 70 | 71 | info = {'train': train, 72 | 'test': test, 73 | 'train_missing': train_missing, 74 | 'test_missing': test_missing, 75 | 'train_noise': train_noise, 76 | 'test_noise': test_noise, 77 | 'min_val': min_val, 78 | 'max_val': max_val, 79 | 'train_label': train_label, 80 | 'test_label': test_label, 81 | 'max_label': max_label, 82 | 'min_label': min_label, 83 | 'missing_rate': missing_rate, 84 | 'train_rate': train_ratio 85 | } 86 | 87 | return info 88 | 89 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 90 | data_path = os.path.join(dir_path,'test_data/wave.csv') 91 | info_dict = make_dataset(data_path=data_path, 92 | missing_rate=0.2, 93 | is_label_numerical=False) 94 | 95 | 96 | missing_mask = info_dict['train_missing'][2000:2600,:] 97 | complete_data = info_dict['train'][2000:2600,:] 98 | 99 | missing_mask[:300] = True 100 | missing_data = complete_data.copy() 101 | missing_mask = missing_mask.astype(bool) 102 | missing_mask = ~missing_mask 103 | missing_data[missing_mask]=np.nan -------------------------------------------------------------------------------- /ycimpute/imputer/mida.py: -------------------------------------------------------------------------------- 1 | from ..utils.tools import Solver 2 | from ..nn.autoencoder import Autoencoder 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.utils.data 8 | 9 | class MIDA(Solver): 10 | def __init__( 11 | self, 12 | theta=5, 13 | epochs=300, 14 | use_cuda=False, 15 | batch_size=64, 16 | early_stop=1e-06, 17 | normalizer='min_max', 18 | verbose=True): 19 | 20 | Solver.__init__( 21 | self, 22 | normalizer=normalizer) 23 | 24 | self.theta = theta 25 | self.epochs = epochs 26 | self.use_cuda = use_cuda 27 | self.batch_size = batch_size 28 | self.verbose = verbose 29 | self.early_stop = early_stop 30 | 31 | self.device = torch.device("cuda:0" if self.use_cuda else "cpu") 32 | 33 | def training(self, training_data): 34 | n_features = training_data.shape[1] 35 | training_data = torch.from_numpy(training_data).float() 36 | 37 | train_loader = torch.utils.data.DataLoader(dataset=training_data, 38 | 
batch_size=self.batch_size, 39 | shuffle=True) 40 | 41 | model = Autoencoder(dim=n_features, 42 | theta=self.theta).to(self.device) 43 | loss = nn.MSELoss() 44 | optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0001,momentum=0.8) 45 | 46 | cost_list = [] 47 | early_stop = False 48 | total_batch = len(training_data) // self.batch_size 49 | 50 | for epoch in range(self.epochs): 51 | for i, batch_data in enumerate(train_loader): 52 | batch_data = batch_data.to(self.device) 53 | reconst_data = model(batch_data) 54 | cost = loss(reconst_data, batch_data) 55 | 56 | optimizer.zero_grad() 57 | cost.backward() 58 | optimizer.step() 59 | if self.verbose: 60 | if (i + 1) % (total_batch // 2) == 0: 61 | print('Epoch [%d/%d], lter [%d/%d], Loss: %.6f' % 62 | (epoch + 1, self.epochs, i + 1, total_batch, cost.item())) 63 | 64 | # early stopping rule 1 : MSE < 1e-06 65 | if cost.item() < 1e-06: 66 | early_stop = True 67 | break 68 | 69 | cost_list.append(cost.item()) 70 | 71 | if early_stop: 72 | break 73 | return model 74 | 75 | 76 | def solve(self, X, missing_mask): 77 | complete_rows_index, missing_rows_index = self.detect_complete_part(missing_mask) 78 | if len(complete_rows_index)==0: 79 | raise ValueError('Cant find a completely part for training...') 80 | missing_data = X[missing_rows_index] 81 | training_data = X[complete_rows_index] 82 | 83 | model = self.training(training_data.copy()) 84 | model.eval() 85 | 86 | missing_data = torch.from_numpy(missing_data).float() 87 | filled_data = model(missing_data.to(self.device)) 88 | filled_data = filled_data.cpu().detach().numpy() 89 | tmp_mask = missing_mask[missing_rows_index] 90 | missing_data = missing_data.cpu().numpy() 91 | filled_data = missing_data * (1 - tmp_mask) + filled_data * (tmp_mask) 92 | 93 | X[missing_rows_index] = filled_data 94 | X[complete_rows_index] = training_data 95 | 96 | return X -------------------------------------------------------------------------------- /ycimpute/utils/shower/show.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | import copy 5 | import h5py 6 | ################################################## 7 | 8 | from ...imputer.mice import MICE 9 | from ...imputer.knnimput import KNN 10 | from ...imputer.iterforest import IterImput 11 | from ...imputer.simple import SimpleFill 12 | from ...imputer import EM 13 | 14 | from ...utils.tools import Solver 15 | from .. import evaluate 16 | from .. 
import config 17 | 18 | solver = Solver() 19 | 20 | def analysiser(missing_X, original_X): 21 | missing_X = np.asarray(missing_X) 22 | original_X = np.asarray(original_X) 23 | 24 | mask_all = solver.masker(missing_X)[config.all] 25 | missing_index = evaluate.get_missing_index(mask_all) 26 | original_arr = original_X[missing_index] 27 | 28 | ################################################## 29 | 30 | mice_X_filled = MICE().complete(copy.copy(missing_X)) 31 | mice_filled_arr = mice_X_filled[missing_index] 32 | rmse_mice_score = evaluate.RMSE(original_arr, mice_filled_arr) 33 | 34 | ######################################################### 35 | iterforest_X_filled = IterImput().complete(copy.copy(missing_X)) 36 | iterforest_filled_arr = iterforest_X_filled[missing_index] 37 | rmse_iterforest_score = evaluate.RMSE(original_arr, iterforest_filled_arr) 38 | 39 | 40 | ############################################################ 41 | knn_X_filled = KNN(k=3).complete(copy.copy(missing_X)) 42 | knn_filled_arr = knn_X_filled[missing_index] 43 | rmse_knn_score = evaluate.RMSE(original_arr, knn_filled_arr) 44 | 45 | ###################################################### 46 | mean_X_filled = SimpleFill(fill_method='mean').complete(copy.copy(missing_X)) 47 | mean_filled_arr = mean_X_filled[missing_index] 48 | rmse_mean_score = evaluate.RMSE(original_arr, mean_filled_arr) 49 | ################################################################# 50 | zero_X_filled = SimpleFill(fill_method='zero').complete(copy.copy(missing_X)) 51 | zero_filled_arr = zero_X_filled[missing_index] 52 | rmse_zero_score = evaluate.RMSE(original_arr, zero_filled_arr) 53 | 54 | ################################################ 55 | median_X_filled = SimpleFill(fill_method='median').complete(copy.copy(missing_X)) 56 | median_filled_arr = median_X_filled[missing_index] 57 | rmse_median_score = evaluate.RMSE(original_arr, median_filled_arr) 58 | ########################################################################## 59 | min_X_filled = SimpleFill(fill_method='min').complete(copy.copy(missing_X)) 60 | min_filled_arr = min_X_filled[missing_index] 61 | rmse_min_score = evaluate.RMSE(original_arr, min_filled_arr) 62 | 63 | ####################################################### 64 | em_X_filled = EM().complete(copy.copy(missing_X)) 65 | em_filled_arr = em_X_filled[missing_index] 66 | rmse_em_score = evaluate.RMSE(original_arr,em_filled_arr) 67 | ################################################ 68 | 69 | return {'rmse_mice_score':rmse_mice_score, 70 | 'rmse_iterforest_score':rmse_iterforest_score, 71 | 'rmse_knn_score':rmse_knn_score, 72 | 'rmse_mean_score':rmse_mean_score, 73 | 'rmse_zero_score':rmse_zero_score, 74 | 'rmse_median_score':rmse_median_score, 75 | 'rmse_min_score':rmse_min_score, 76 | 'rmse_em_score': rmse_em_score 77 | } 78 | 79 | 80 | def example(): 81 | from ...datasets import load_data 82 | boston_mis, boston_full = load_data.load_boston() 83 | print(analysiser(boston_mis, boston_full)) -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/normalized_distance.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from __future__ import absolute_import, print_function, division 4 | 5 | from six.moves import range 6 | import numpy as np 7 | 8 | def all_pairs_normalized_distances(X): 9 | """ 10 | We can't really compute distances over incomplete data since 11 | rows are missing different numbers of entries. 
12 | The next best thing is the mean squared difference between two vectors 13 | (a normalized distance), which gets computed only over the columns that 14 | two vectors have in common. If two vectors have no features in common 15 | then their distance is infinity. 16 | 17 | Parameters 18 | ---------- 19 | X : np.ndarray 20 | Data matrix of shape (n_samples, n_features) with missing entries 21 | marked using np.nan 22 | 23 | Returns a (n_samples, n_samples) matrix of pairwise normalized distances. 24 | """ 25 | n_rows, n_cols = X.shape 26 | 27 | # matrix of mean squared difference between samples 28 | D = np.ones((n_rows, n_rows), dtype="float32", order="C") * np.inf 29 | 30 | # we can cheaply determine the number of columns that two rows share 31 | # by taking the dot product between their finite masks 32 | observed_elements = np.isfinite(X).astype(int) 33 | n_shared_features_for_pairs_of_rows = np.dot( 34 | observed_elements, 35 | observed_elements.T) 36 | no_overlapping_features_rows = n_shared_features_for_pairs_of_rows == 0 37 | number_incomparable_rows = no_overlapping_features_rows.sum(axis=1) 38 | row_overlaps_every_other_row = (number_incomparable_rows == 0) 39 | row_overlaps_no_other_rows = number_incomparable_rows == n_rows 40 | valid_rows_mask = ~row_overlaps_no_other_rows 41 | valid_row_indices = np.where(valid_rows_mask)[0] 42 | 43 | # preallocate all the arrays that we would otherwise create in the 44 | # following loop and pass them as "out" parameters to NumPy ufuncs 45 | diffs = np.zeros_like(X) 46 | missing_differences = np.zeros_like(diffs, dtype=bool) 47 | valid_rows = np.zeros(n_rows, dtype=bool) 48 | ssd = np.zeros(n_rows, dtype=X.dtype) 49 | 50 | for i in valid_row_indices: 51 | x = X[i, :] 52 | np.subtract(X, x.reshape((1, n_cols)), out=diffs) 53 | np.isnan(diffs, out=missing_differences) 54 | 55 | # zero out all NaN's 56 | diffs[missing_differences] = 0 57 | 58 | # square each difference 59 | diffs **= 2 60 | 61 | observed_counts_per_row = n_shared_features_for_pairs_of_rows[i] 62 | 63 | if row_overlaps_every_other_row[i]: 64 | # add up all the non-missing squared differences 65 | diffs.sum(axis=1, out=D[i, :]) 66 | D[i, :] /= observed_counts_per_row 67 | else: 68 | np.logical_not(no_overlapping_features_rows[i], out=valid_rows) 69 | 70 | # add up all the non-missing squared differences 71 | diffs.sum(axis=1, out=ssd) 72 | ssd[valid_rows] /= observed_counts_per_row[valid_rows] 73 | D[i, valid_rows] = ssd[valid_rows] 74 | return D 75 | 76 | 77 | def all_pairs_normalized_distances_reference(X): 78 | """ 79 | Reference implementation of normalized all-pairs distance, used 80 | for testing the more efficient implementation above for equivalence. 81 | """ 82 | n_samples, n_cols = X.shape 83 | # matrix of mean squared difference between samples 84 | D = np.ones((n_samples, n_samples), dtype="float32") * np.inf 85 | for i in range(n_samples): 86 | diffs = X - X[i, :].reshape((1, n_cols)) 87 | missing_diffs = np.isnan(diffs) 88 | missing_counts_per_row = missing_diffs.sum(axis=1) 89 | valid_rows = missing_counts_per_row < n_cols 90 | D[i, valid_rows] = np.nanmean( 91 | diffs[valid_rows, :] ** 2, 92 | axis=1) 93 | return D 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # ycimpute 3 | 4 | 【Notice!】 I've been very busy at work since I graduated from college, so this project will no longer be maintained.
I apologize for any inconvenience caused, and thank you for your support. 5 | 6 | # Updated 7 | - PyPI package updated 8 | - added a GAN-based algorithm 9 | 10 | ![AppVeyor](https://img.shields.io/appveyor/ci/gruntjs/grunt.svg) 11 | ![Hex.pm](https://img.shields.io/hexpm/l/plug.svg) 12 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/Django.svg) 13 | 14 | ## [Chinese documentation]( https://hcmy.gitbooks.io/ycimpute/content/)     [Documentation](https://hcmy.gitbooks.io/ycimpute-api/content/) 15 | # What is ycimpute? 16 | ycimpute is a high-level API library for imputing missing values. It is written in Python and integrates imputation methods based on machine learning and statistics. Some modules require [scikit-learn](http://scikit-learn.org/stable/) support. 17 | 18 | I wrote this library because I often encounter missing values while doing data mining. Most scenarios can be handled with the same set of imputation approaches, so I decided to collect them into one function library that is convenient to call. 19 | 20 | ## The methods implemented so far: 21 | 22 | For the details of each algorithm, please look up the API below: 23 | 24 | - simple imputation methods (mean, zero, maximum, minimum, etc.) 25 | - based on Random Forest (MissForest) 26 | - Multiple Imputation (MICE) 27 | - based on Expectation Maximization (EM) 28 | - based on KNN 29 | - based on denoising autoencoders: [MIDA: Multiple Imputation using Denoising Autoencoders](https://arxiv.org/abs/1705.02737) 30 | - based on generative adversarial nets: [GAIN: Missing Data Imputation using Generative Adversarial Nets](https://arxiv.org/abs/1806.02920) 31 | 32 | ### Suggestion: data loss mechanisms vary across scenarios, so the engineer has to choose an appropriate filling method based on the business. 33 | ## Missing values can be of three general types: 34 | 35 | >+ Missing Completely At Random (MCAR): 36 | When missing data are MCAR, the presence/absence of data is completely independent of observable variables and parameters of interest. In this case, the analysis performed on the data is unbiased. In practice, it is highly unlikely. 37 | >+ Missing At Random (MAR): 38 | When missing data is not random but can be fully explained by a variable for which there is complete information. An example is that males are less likely to fill in a depression survey, but this has nothing to do with their level of depression after accounting for maleness. This kind of missing data can induce a bias in your analysis, especially if it unbalances your data because of many missing values in a certain category. 39 | >+ Missing Not At Random (MNAR): 40 | When the missing values are neither MCAR nor MAR. In the previous example, that would be the case if people tended not to answer the survey depending on their depression level. 41 | Let's check out the performance of each imputation method on various data sets: 42 | 43 | ### the data sets include: [IRIS dataset](https://github.com/OpenIDEA-YunanUniversity/ycimpute/tree/master/ycimpute/datasets) [WINE dataset](https://github.com/OpenIDEA-YunanUniversity/ycimpute/tree/master/ycimpute/datasets) [Boston dataset](https://github.com/OpenIDEA-YunanUniversity/ycimpute/tree/master/ycimpute/datasets). 44 | 45 | ## These are complete data sets; I ran the experiments and evaluated the models after randomly deleting entries. About 10% of the data is missing, and each feature contains a different degree of data loss.
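A minimal sketch of how such a benchmark mask can be produced (illustrative only; `X_full` is a placeholder for any complete data matrix, not a name from this repo):

```python
import numpy as np

rng = np.random.RandomState(0)
X_full = rng.rand(506, 13)             # placeholder for a complete data set
X_missing = X_full.copy()
drop = rng.rand(*X_full.shape) < 0.1   # knock out roughly 10% of all entries
X_missing[drop] = np.nan               # missing entries are marked with np.nan
```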
46 | 47 | ## All of the data are continuous; the evaluation function I used was RMSE (root mean square error). The red line represents the average of all errors. (Note: the data has not been normalized, so the RMSE values look high.) 48 | ![WINE data set](https://github.com/HCMY/ycimpute/blob/master/img/WINE.svg) 49 | ![IRIS data set](https://github.com/HCMY/ycimpute/blob/master/img/IRIS.svg) 50 | ![Boston housing data set](https://github.com/HCMY/ycimpute/blob/master/img/BOSTON.svg) 51 | 52 | 53 | -------------------------------------------------------------------------------- /ycimpute/imputer/gain.py: -------------------------------------------------------------------------------- 1 | from ..utils.tools import generate_noise 2 | from ..utils.tools import Solver 3 | from ..nn.gainnets import NetD, NetG 4 | 5 | import torch 6 | import numpy as np 7 | import torch.nn as nn 8 | import torch.utils.data 9 | from torch.utils.data import Dataset, DataLoader 10 | 11 | 12 | def generate_hint(n_rows, n_cols, missing_rate): 13 | """ 14 | @n_rows: number of rows of the hint matrix to generate 15 | @n_cols: number of columns of the hint matrix to generate 16 | @missing_rate: probability of zeroing out each entry of the hint matrix 17 | """ 18 | 19 | random_data = np.random.uniform(0., 1., size=[n_rows, n_cols]) 20 | tmp = random_data > missing_rate 21 | missing_mat = 1. * tmp 22 | 23 | return missing_mat 24 | 25 | 26 | class SimpleDataLoader(Dataset): 27 | """Minimal Dataset pairing each training row with its missing-value mask.""" 28 | 29 | def __init__(self, specify_data, mask): 30 | """ 31 | @specify_data: data rows; @mask: mask of the same shape 32 | """ 33 | self.specify_data = specify_data 34 | self.mask = mask 35 | 36 | def __len__(self): 37 | return len(self.specify_data) 38 | 39 | def __getitem__(self, idx): 40 | data = self.specify_data[idx] 41 | mask = self.mask[idx] 42 | 43 | return data, mask 44 | 45 | class GAIN(Solver): 46 | def __init__(self, 47 | normalizer='min_max', 48 | epochs=10, 49 | use_cuda=False, 50 | batch_size=64, 51 | verbose=True, 52 | alpha=0.2, 53 | lr=0.0001, 54 | hint_rate=0.2): 55 | Solver.__init__(self, 56 | normalizer=normalizer) 57 | 58 | self.epochs = epochs 59 | self.lr = lr 60 | self.alpha = alpha 61 | self.use_cuda = use_cuda 62 | self.batch_size = batch_size 63 | self.verbose = verbose 64 | self.hint_rate = hint_rate 65 | self.device = torch.device("cuda:0" if self.use_cuda else "cpu") 66 | 67 | 68 | def training(self, training_data, train_mask): 69 | # ycimpute masks are True where a value is missing; GAIN expects 1 where observed 70 | train_mask = ~train_mask 71 | train_mask = train_mask.astype(int) 72 | 73 | _, n_cols = training_data.shape 74 | netD = NetD(feature_dim=n_cols).to(self.device) 75 | netG = NetG(feature_dim=n_cols).to(self.device) 76 | optimD = torch.optim.RMSprop(netD.parameters(), lr=self.lr) 77 | optimG = torch.optim.RMSprop(netG.parameters(), lr=self.lr) 78 | 79 | train_dset = SimpleDataLoader(training_data, train_mask) 80 | train_loader = DataLoader(train_dset, 81 | batch_size=self.batch_size, 82 | num_workers=1) 83 | bce_loss = torch.nn.BCEWithLogitsLoss(reduction="mean") 84 | mse_loss = torch.nn.MSELoss(reduction="mean") 85 | 86 | for epoch in range(self.epochs): 87 | for idx, (x, mask) in enumerate(train_loader): 88 | noise = generate_noise(x.shape[0], x.shape[1]) 89 | hint = generate_hint(x.shape[0], x.shape[1], self.hint_rate) 90 | 91 | x = torch.tensor(x).float().to(self.device) 92 | noise = torch.tensor(noise).float().to(self.device) 93 | mask = torch.tensor(mask).float().to(self.device) 94 | hint = torch.tensor(hint).float().to(self.device) 95 | 96 | hint = mask * hint + 0.5 * (1 - hint) 97 | 98 | # train D 99 | optimD.zero_grad() 100 | G_sample = netG(x, noise, mask) 101 | D_prob = netD(x, mask, G_sample, hint)
102 | D_loss = bce_loss(D_prob, mask) 103 | D_loss.backward() 104 | optimD.step() 105 | # train G 106 | optimG.zero_grad() 107 | G_sample = netG(x, noise, mask) 108 | 109 | D_prob = netD(x, mask, G_sample, hint) 110 | 111 | # do not detach D_prob here: the adversarial gradient has to flow back 112 | # into G; per the GAIN paper, G maximizes log D on the imputed entries 113 | G_loss = -((1 - mask) * (torch.sigmoid(D_prob) + 1e-8).log()).sum() / ((1 - mask).sum() + 0.001) 114 | G_mse_loss = mse_loss(mask * x, mask * G_sample) / (mask.sum() + 0.0001) 115 | G_loss = G_loss + self.alpha * G_mse_loss 116 | 117 | G_loss.backward() 118 | optimG.step() 119 | 120 | G_mse_train = mse_loss(mask * x, mask * G_sample) / mask.sum() 121 | if self.verbose: 122 | if epoch % 2 == 0: 123 | print('Iter:{}\tD_loss: {:.4f}\tG_loss: {:.4f}\tTrain MSE:{:.4f}'. \ 124 | format(epoch, D_loss, G_loss, np.sqrt(G_mse_train.data.cpu().numpy()))) 125 | 126 | return netG 127 | 128 | 129 | def solve(self, X, missing_mask): 130 | complete_rows_index, missing_rows_index = self.detect_complete_part(missing_mask) 131 | if len(complete_rows_index) == 0: 132 | raise ValueError("Can't find any completely observed rows for training...") 133 | model = self.training(training_data=X.copy(), train_mask=missing_mask.copy()) 134 | model.eval() 135 | 136 | missing_mask = ~missing_mask 137 | missing_mask = missing_mask.astype(int) 138 | 139 | noise = generate_noise(X.shape[0], X.shape[1]) 140 | noise = torch.tensor(noise).float().to(self.device) 141 | X = torch.tensor(X).float().to(self.device) 142 | mask = torch.tensor(missing_mask).float().to(self.device) 143 | 144 | filled_data = model(X, noise, mask) 145 | filled_data = filled_data.cpu().detach().numpy() 146 | 147 | X = X.cpu().detach().numpy() 148 | X[missing_rows_index] = filled_data[missing_rows_index] 149 | 150 | return X 151 | 152 | -------------------------------------------------------------------------------- /ycimpute/unsupervised/knn/optimistic.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from __future__ import absolute_import, print_function, division 4 | import time 5 | 6 | from six.moves import range 7 | import numpy as np 8 | 9 | from .common import knn_initialize 10 | 11 | def knn_impute_optimistic( 12 | X, 13 | missing_mask, 14 | k, 15 | verbose=False, 16 | print_interval=100): 17 | """ 18 | Fill in the given incomplete matrix using k-nearest neighbor imputation. 19 | 20 | This version assumes that most of the time the same neighbors will be 21 | used, so it first performs the weighted average of a row's k nearest 22 | neighbors and checks afterward whether it was valid (due to possible 23 | missing values). 24 | 25 | Has been observed to be a lot faster on an image matrix with 1/4 of 26 | its entries missing, 1000 rows and ~9000 columns. 27 | Parameters 28 | ---------- 29 | X : np.ndarray 30 | Matrix to fill of shape (n_samples, n_features) 31 | 32 | missing_mask : np.ndarray 33 | Boolean array of same shape as X 34 | 35 | k : int 36 | 37 | verbose : bool 38 | 39 | Modifies X by replacing its missing values with weighted averages of 40 | similar rows. Returns the modified X.
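Example (an illustrative sketch, not taken from this repo's tests;
any float matrix with np.nan marking the gaps will do):

    import numpy as np
    X = np.array([[1.0, np.nan, 3.0],
                  [2.0, 4.0, 2.9],
                  [1.5, 3.5, 3.1]])
    X_filled = knn_impute_optimistic(X, np.isnan(X), k=2)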
41 | """ 42 | start_t = time.time() 43 | n_rows, n_cols = X.shape 44 | X_row_major, D, _ = knn_initialize(X, missing_mask) 45 | D_sorted_indices = np.argsort(D, axis=1) 46 | X_column_major = X_row_major.copy(order="F") 47 | 48 | dot = np.dot 49 | 50 | # preallocate array to prevent repeated creation in the following loops 51 | neighbor_weights = np.ones(k, dtype=X.dtype) 52 | 53 | missing_mask_column_major = np.asarray(missing_mask, order="F") 54 | observed_mask_column_major = ~missing_mask_column_major 55 | 56 | for i in range(n_rows): 57 | missing_columns = np.where(missing_mask[i])[0] 58 | if verbose and i % print_interval == 0: 59 | print( 60 | "Imputing row %d/%d with %d missing, elapsed time: %0.3f" % ( 61 | i + 1, 62 | n_rows, 63 | len(missing_columns), 64 | time.time() - start_t)) 65 | n_missing_columns = len(missing_columns) 66 | if n_missing_columns == 0: 67 | continue 68 | 69 | row_distances = D[i, :] 70 | neighbor_indices = D_sorted_indices[i, :] 71 | X_missing_columns = X_column_major[:, missing_columns] 72 | 73 | # precompute these for the fast path where the k nearest neighbors 74 | # are not missing the feature value we're currently trying to impute 75 | k_nearest_indices = neighbor_indices[:k] 76 | np.divide(1.0, row_distances[k_nearest_indices], out=neighbor_weights) 77 | # optimistically impute all the columns from the k nearest neighbors 78 | # we'll have to back-track for some of the columns for which 79 | # one of the neighbors did not have a value 80 | X_knn = X_missing_columns[k_nearest_indices, :] 81 | weighted_average_of_neighboring_rows = dot( 82 | X_knn.T, 83 | neighbor_weights) 84 | sum_weights = neighbor_weights.sum() 85 | weighted_average_of_neighboring_rows /= sum_weights 86 | imputed_values = weighted_average_of_neighboring_rows 87 | 88 | observed_mask_missing_columns = observed_mask_column_major[:, missing_columns] 89 | observed_mask_missing_columns_sorted = observed_mask_missing_columns[ 90 | neighbor_indices, :] 91 | 92 | # We can determine the maximum number of other rows that must be 93 | # inspected across all features missing for this row by 94 | # looking at the column-wise running sums of the observed feature 95 | # matrix. 
96 | observed_cumulative_sum = observed_mask_missing_columns_sorted.cumsum(axis=0) 97 | sufficient_rows = (observed_cumulative_sum == k) 98 | n_rows_needed = sufficient_rows.argmax(axis=0) + 1 99 | max_rows_needed = n_rows_needed.max() 100 | 101 | if max_rows_needed == k: 102 | # if we never needed more than k rows then we're done after the 103 | # optimistic averaging above, so go on to the next sample 104 | X[i, missing_columns] = imputed_values 105 | continue 106 | 107 | # truncate all the sorted arrays to only include the necessary 108 | # number of rows (should significantly speed up the "slow" path) 109 | necessary_indices = neighbor_indices[:max_rows_needed] 110 | d_sorted = row_distances[necessary_indices] 111 | X_missing_columns_sorted = X_missing_columns[necessary_indices, :] 112 | observed_mask_missing_columns_sorted = observed_mask_missing_columns_sorted[ 113 | :max_rows_needed, :] 114 | 115 | for missing_column_idx in range(n_missing_columns): 116 | # since all the arrays we're looking into have already been 117 | # sliced out at the missing features, we need to address these 118 | # features from 0..n_missing using missing_idx rather than j 119 | if n_rows_needed[missing_column_idx] == k: 120 | assert np.isfinite(imputed_values[missing_column_idx]), \ 121 | "Expected finite imputed value #%d (column #%d for row %d)" % ( 122 | missing_column_idx, 123 | missing_columns[missing_column_idx], 124 | i) 125 | continue 126 | row_mask = observed_mask_missing_columns_sorted[:, missing_column_idx] 127 | sorted_column_values = X_missing_columns_sorted[:, missing_column_idx] 128 | neighbor_distances = d_sorted[row_mask][:k] 129 | 130 | # may not have enough values in a column for all k neighbors 131 | k_or_less = len(neighbor_distances) 132 | usable_weights = neighbor_weights[:k_or_less] 133 | np.divide( 134 | 1.0, 135 | neighbor_distances, out=usable_weights) 136 | neighbor_values = sorted_column_values[row_mask][:k_or_less] 137 | 138 | imputed_values[missing_column_idx] = ( 139 | dot(neighbor_values, usable_weights) / usable_weights.sum()) 140 | 141 | X[i, missing_columns] = imputed_values 142 | return X 143 | -------------------------------------------------------------------------------- /doc_eng.md: -------------------------------------------------------------------------------- 1 | 2 | ### Welcome to ycimpute! 3 | ![AppVeyor](https://img.shields.io/appveyor/ci/gruntjs/grunt.svg) 4 | ![Hex.pm](https://img.shields.io/hexpm/l/plug.svg) 5 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/Django.svg) 6 | 7 | ## ycimpute Overview 8 | #### ycimpute is a high-level API library for imputing missing values. It is written in Python and integrates imputation methods based on machine learning and statistics. Some modules require scikit-learn support. 9 | ##### I wrote this library because I often encounter missing values while doing data mining. Most scenarios can be handled with the same set of imputation approaches, so I decided to collect them into one function library that is convenient to call. 10 | ### Suggestion: data loss mechanisms vary across scenarios, so the engineer has to choose an appropriate filling method based on the business.
11 | 12 | ## Performance of various models 13 | 14 | ![UCI WINE data set](https://github.com/HCMY/ycimpute/blob/master/img/WINE.svg) 15 | ![IRIS data set](https://github.com/HCMY/ycimpute/blob/master/img/IRIS.svg) 16 | ![BOSTON housing data set](https://github.com/HCMY/ycimpute/blob/master/img/BOSTON.svg) 17 | 18 | # Install 19 | 20 | ### via pip 21 | 22 | pip install ycimpute 23 | 24 | ### via source 25 | 26 | 27 | ``` 28 | git clone https://github.com/HCMY/ycimpute.git 29 | cd ycimpute 30 | python setup.py install 31 | ``` 32 | 33 | ## API Reference 34 | 35 | ## Selecting supervised methods 36 | 37 | ### 1. Based on Random Forest 38 | 39 | Theory behind this method: [MissForest—non-parametric missing value imputation for mixed-type data](https://academic.oup.com/bioinformatics/article/28/1/112/219101) 40 | 41 | ### usage: 42 | 43 | #### Before running the example, you need to download the data files and copy them into the datasets directory of your installed package 44 | ( ``` your python path / site-packages / ycimpute / datasets / ``` ) 45 | #### Linux users can download the data with wget; the files are saved to the current working directory: 46 | ``` 47 | wget https://github.com/HCMY/ycimpute/raw/master/test_data/boston.hdf5 48 | wget https://github.com/HCMY/ycimpute/raw/master/test_data/iris.hdf5 49 | wget https://github.com/HCMY/ycimpute/raw/master/test_data/wine.hdf5 50 | ``` 51 | 52 | 53 | ```python 54 | import numpy as np 55 | from ycimpute.datasets.load_data import load_boston 56 | from ycimpute.imputer.iterforest import IterImput 57 | X_missing, X_original = load_boston()  # load the Boston housing data 58 | 59 | print(X_missing.shape) 60 | print("X missing\n\n", np.argwhere(np.isnan(X_missing))) 61 | X_filled = IterImput().complete(X_missing) 62 | print("X filled\n\n", np.argwhere(np.isnan(X_filled))) 63 | ``` 64 | 65 | (506, 13) 66 | X missing 67 | 68 | [[ 1 2] 69 | [ 1 4] 70 | [ 1 8] 71 | ..., 72 | [502 12] 73 | [504 3] 74 | [504 7]] 75 | X filled 76 | 77 | [] 78 | 79 | 80 | ### parameters: 81 | ### TODO 82 | 83 | ## Filling based on MICE 84 | Theory behind this method: [Multiple Imputation by Chained Equations](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3074241/) 85 | 86 | #### usage: 87 | 88 | 89 | ```python 90 | from ycimpute.imputer.mice import MICE 91 | print("X missing\n\n", np.argwhere(np.isnan(X_missing))) 92 | X_filled = MICE().complete(X_missing) 93 | print("X filled\n\n", np.argwhere(np.isnan(X_filled))) 94 | ``` 95 | 96 | X missing 97 | 98 | [[ 1 2] 99 | [ 1 4] 100 | [ 1 8] 101 | ..., 102 | [502 12] 103 | [504 3] 104 | [504 7]] 105 | X filled 106 | 107 | [] 108 | 109 | 110 | ### parameters: 111 | ### TODO 112 | 113 | ## Selecting unsupervised methods 114 | 115 | ### Based on KNN 116 | 117 | #### usage 118 | 119 | 120 | ```python 121 | from ycimpute.imputer.knnimput import KNN 122 | print("X missing\n\n", np.argwhere(np.isnan(X_missing))) 123 | X_filled = KNN(k=4).complete(X_missing) 124 | print("X filled\n\n", np.argwhere(np.isnan(X_filled))) 125 | ``` 126 | 127 | X missing 128 | 129 | [[ 1 2] 130 | [ 1 4] 131 | [ 1 8] 132 | ..., 133 | [502 12] 134 | [504 3] 135 | [504 7]] 136 | Imputing row 1/506 with 0 missing, elapsed time: 0.094 137 | Imputing row 101/506 with 2 missing, elapsed time: 0.096 138 | Imputing row 201/506 with 2 missing, elapsed time: 0.098 139 | Imputing row 301/506 with 3 missing, elapsed time: 0.100 140 | Imputing row 401/506 with 1 missing, elapsed time: 0.102 141 | Imputing row 501/506 with 1 missing, elapsed time: 0.104 142 | X filled 143 | 144 | [] 145 | 146 | 147 | ### parameters 148 |
parameter | function | value 149 | - | :-: | -: 150 | k | --- | int 151 | 152 | ## Visualizing the fill quality of each method, measured by RMSE 153 | 154 | 155 | ```python 156 | from ycimpute.utils.tools import Solver 157 | from ycimpute.utils import evaluate 158 | from ycimpute.datasets.load_data import load_boston 159 | solver = Solver() 160 | X_missing, X_original = load_boston() 161 | from ycimpute.imputer.mice import MICE 162 | 163 | X_filled = MICE().complete(X_missing) 164 | mask_all = solver.masker(X_missing)['all'] 165 | missing_index = evaluate.get_missing_index(mask_all) 166 | original_arr = X_original[missing_index] 167 | mice_filled_arr = X_filled[missing_index] 168 | rmse_mice_score = evaluate.RMSE(original_arr, mice_filled_arr) 169 | print(rmse_mice_score) 170 | ``` 171 | 172 | 29.1028614966 173 | 174 | 175 | ### You can look over the effect of all methods in one shot: 176 | Note: all models use default parameters, which should be tuned :) 177 | 178 | 179 | ```python 180 | from ycimpute.utils.test_evaluate import show 181 | result = show.analysiser(X_missing, X_original) 182 | import pandas as pd 183 | result = pd.DataFrame.from_dict(result, orient='index') 184 | print(result) 185 | ``` 186 | 187 | Imputing row 1/506 with 0 missing, elapsed time: 0.050 188 | Imputing row 101/506 with 2 missing, elapsed time: 0.052 189 | Imputing row 201/506 with 2 missing, elapsed time: 0.054 190 | Imputing row 301/506 with 3 missing, elapsed time: 0.056 191 | Imputing row 401/506 with 1 missing, elapsed time: 0.058 192 | Imputing row 501/506 with 1 missing, elapsed time: 0.060 193 | 0 194 | rmse_mice_score 28.971895 195 | rmse_iterforest_score 23.639840 196 | rmse_knn_score 40.944330 197 | rmse_mean_score 52.154860 198 | rmse_zero_score 159.534384 199 | rmse_median_score 57.616702 200 | rmse_min_score 127.874980 201 |
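### Other imputers follow the same interface
The EM- and GAN-based imputers expose the same `complete` method as the examples above. A minimal sketch (import paths follow this repository's source tree; not an output-verified example like the ones above):

```python
from ycimpute.imputer.expectation_maximization import EM
from ycimpute.imputer.gain import GAIN

X_filled_em = EM().complete(X_missing)
X_filled_gain = GAIN(epochs=10).complete(X_missing)
```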
-------------------------------------------------------------------------------- /ycimpute/imputer/mice.py: -------------------------------------------------------------------------------- 1 | 2 | from time import time 3 | import numpy as np 4 | from sklearn.linear_model import LinearRegression 5 | 6 | from ..utils.tools import Solver 7 | 8 | class MICE(Solver): 9 | """ 10 | Basic implementation of the MICE package from R. 11 | This version assumes all of the columns are ordinal 12 | and, by default, uses linear regression. 13 | 14 | Parameters 15 | ---------- 16 | visit_sequence : str 17 | Possible values: "monotone" (default), "roman", "arabic", 18 | "revmonotone". 19 | 20 | n_imputations : int 21 | Defaults to 100 22 | 23 | n_burn_in : int 24 | Defaults to 10 25 | 26 | impute_type : str 27 | "pmm" (default) is predictive mean matching. 28 | "col" means fill in with samples from the posterior predictive 29 | distribution (not implemented yet). 30 | 31 | n_pmm_neighbors : int 32 | Number of nearest neighbors for PMM, defaults to 5. 33 | 34 | model : predictor function 35 | A model that has fit and predict methods. 36 | Defaults to LinearRegression() from scikit-learn. 37 | 38 | n_nearest_columns : int 39 | Number of other columns to use to estimate the current column. 40 | Useful when the number of columns is huge. 41 | Default is to use all columns. 42 | 43 | init_fill_method : str 44 | Valid values: {"mean", "median", or "random"} 45 | (the latter meaning fill with random samples from the observed 46 | values of a column) 47 | 48 | min_value : float 49 | Minimum possible imputed value 50 | 51 | max_value : float 52 | Maximum possible imputed value 53 | 54 | verbose : boolean 55 | """ 56 | def __init__(self, 57 | visit_sequence='monotone', 58 | n_imputations=100, 59 | n_burn_in=10, 60 | n_pmm_neighbors=5, 61 | impute_type='pmm', 62 | model=LinearRegression(), 63 | n_nearest_columns=np.infty, 64 | init_fill_method="mean", 65 | min_value=None, 66 | max_value=None, 67 | verbose=False, 68 | normalizer='min_max'): 69 | 70 | Solver.__init__(self, 71 | normalizer=normalizer) 72 | 73 | self.visit_sequence = visit_sequence 74 | self.n_burn_in = n_burn_in 75 | self.n_pmm_neighbors = n_pmm_neighbors 76 | self.impute_type = impute_type 77 | self.model = model 78 | self.n_nearest_columns = n_nearest_columns 79 | self.verbose = verbose 80 | self.fill_method = init_fill_method 81 | self.min_value = min_value 82 | self.max_value = max_value 83 | self.n_imputations = n_imputations 84 | 85 | def _imputation_round(self, X_filled, visit_indices, missing_mask): 86 | for col in visit_indices: 87 | x_obs, y_obs, x_mis = self.split(X_filled, col, missing_mask) 88 | model = self.model 89 | model.fit(x_obs, y_obs) 90 | 91 | if self.impute_type == 'pmm': 92 | col_preds_missing = model.predict(x_mis) 93 | col_preds_observed = model.predict(x_obs) 94 | D = np.abs(col_preds_missing[:, np.newaxis] - col_preds_observed) 95 | k = np.minimum(self.n_pmm_neighbors, len(col_preds_observed) - 1) 96 | k_nearest_indices = np.argpartition(D, k, 1)[:, :k] 97 | imputed_indices = np.array([ 98 | np.random.choice(neighbor_index) 99 | for neighbor_index in k_nearest_indices]) 100 | imputed_values = y_obs[imputed_indices] 101 | elif self.impute_type == 'col': 102 | # sampling from the posterior predictive distribution is not implemented yet 103 | raise NotImplementedError("impute_type='col' is not implemented") 104 | 105 | X_filled[missing_mask[:, col], col] = imputed_values 106 | return X_filled 107 | 108 | def clip(self, X, **kwargs): 109 | """ 110 | Clip values to fall within any global or column-wise min/max constraints 111 | """ 112 | if self.min_value is not None: 113 | X[X < self.min_value] = self.min_value 114 | if self.max_value is not None: 115 | X[X > self.max_value] = self.max_value 116 | return X 117 | 118 | def get_visit_indices(self, missing_mask): 119 | """ 120 | Decide in what order we will update the columns, i.e. how to sort them. 121 | As an homage to the MICE package, we have 4 options for how to order 122 | the updates.
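The options (mirroring the branches below): "roman" visits columns
left to right, "arabic" right to left, "monotone" puts the columns
with the most missing values first, and "revmonotone" the columns
with the fewest missing values first.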
123 | """ 124 | n_rows, n_cols = missing_mask.shape 125 | if self.visit_sequence == 'roman': 126 | return np.arange(n_cols) 127 | elif self.visit_sequence == 'arabic': 128 | return np.arange(n_cols - 1, -1, -1)  # same as np.arange(n_cols)[::-1] 129 | elif self.visit_sequence == 'monotone': 130 | return np.argsort(missing_mask.sum(0))[::-1] 131 | elif self.visit_sequence == 'revmonotone': 132 | return np.argsort(missing_mask.sum(0)) 133 | else: 134 | raise ValueError("Invalid choice for visit order: %s" % self.visit_sequence) 135 | 136 | 137 | def solve(self, X, missing_mask): 138 | if self.verbose: 139 | print("[MICE] Completing matrix with shape %s" % (X.shape,)) 140 | start_t = time() 141 | 142 | X_filled = np.array(X.copy()) 143 | # honor the requested visit_sequence when ordering the column updates 144 | visit_idx = self.get_visit_indices(missing_mask) 145 | total_rounds = self.n_burn_in + self.n_imputations 146 | 147 | results_list = [] 148 | 149 | for m in range(total_rounds): 150 | if self.verbose: 151 | print( 152 | "[MICE] Starting imputation round %d/%d, elapsed time %0.3f" % ( 153 | m + 1, 154 | total_rounds, 155 | time() - start_t)) 156 | X_filled = self._imputation_round(X_filled, visit_idx, missing_mask) 157 | 158 | if m >= self.n_burn_in: 159 | results_list.append(X_filled[missing_mask]) 160 | 161 | imputed_arrays = np.asarray(results_list) 162 | 163 | # average the imputed values for each missing entry across rounds 164 | average_imputed_values = imputed_arrays.mean(axis=0) 165 | 166 | X[missing_mask] = average_imputed_values 167 | 168 | return X 169 | -------------------------------------------------------------------------------- /ycimpute/esemble/random_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..tree.tree import ClassifyTree, RegressionTree 3 | from abc import ABCMeta 4 | from scipy.stats import mode 5 | 6 | 7 | 8 | class RandomForest(metaclass=ABCMeta): 9 | """ 10 | Attributes 11 | ---------- 12 | num_trees : the number of trees to be made in the forest 13 | max_depth : the maximum depth that each tree is allowed to grow 14 | min_size : the minimum number of data observations needed in each split 15 | sample_percentage : size of data to be sampled per tree 16 | 17 | Note 18 | ---- 19 | This class is not to be instantiated.
It is simply a base class for the 20 | classification and regression forest classes. 21 | """ 22 | 23 | def __init__(self, 24 | num_trees, 25 | seed, 26 | max_depth, 27 | min_size, 28 | sample_percentage): 29 | """ 30 | Initializes the random forest 31 | 32 | Parameters 33 | ---------- 34 | num_trees : the number of trees to be made in the forest 35 | seed : the seed from which the random sample choices will be made 36 | max_depth : the maximum depth that each tree is allowed to grow 37 | min_size : the minimum number of data observations needed in each split 38 | sample_percentage : size of data to be sampled per tree 39 | """ 40 | 41 | self.num_trees = num_trees 42 | self.max_depth = max_depth 43 | self.min_size = min_size 44 | self.sample_percentage = sample_percentage 45 | np.random.seed(seed) 46 | 47 | def fit(self, X, y): 48 | """ 49 | Grows a forest of decision trees based off the num_trees 50 | attribute 51 | 52 | Parameters 53 | ---------- 54 | X : N x D matrix of real or ordinal values 55 | y : size N vector consisting of either real values or labels for the corresponding 56 | index in X 57 | """ 58 | 59 | data = np.column_stack((X, y)) 60 | self.forest = np.empty(shape=self.num_trees, dtype='object') 61 | sample_size = int(X.shape[0] * self.sample_percentage) 62 | 63 | for i in range(self.num_trees): 64 | sample = data[np.random.choice(data.shape[0], sample_size, replace=True)] 65 | 66 | # split the bootstrap sample (not the full data) into features and target 67 | sampled_X = sample[:, :sample.shape[1] - 1] 68 | sampled_y = sample[:, sample.shape[1] - 1] 69 | 70 | if isinstance(self, RegressionForest): 71 | tree = RegressionTree( 72 | max_depth=self.max_depth, 73 | min_size=self.min_size, 74 | in_forest=True) 75 | else: 76 | tree = ClassifyTree( 77 | max_depth=self.max_depth, 78 | min_size=self.min_size, 79 | in_forest=True) 80 | 81 | tree.fit(sampled_X, sampled_y) 82 | self.forest[i] = tree 83 | 84 | def predict(self, X): 85 | """ 86 | Predicts the output (y) of a given matrix X 87 | 88 | Parameters 89 | ---------- 90 | X : numerical or ordinal matrix of values corresponding to some output 91 | 92 | Returns 93 | ------- 94 | The predicted values corresponding to the inputs 95 | """ 96 | 97 | votes = np.zeros(shape=(self.num_trees, X.shape[0])) 98 | for i, tree in enumerate(self.forest): 99 | votes[i] = tree.predict(X) 100 | 101 | if isinstance(self, RegressionForest): 102 | predictions = votes.mean(axis=0) 103 | else: 104 | predictions = np.squeeze(mode(votes, axis=0)[0]) 105 | 106 | return predictions 107 | 108 | 109 | class RegressionForest(RandomForest): 110 | """ 111 | Attributes 112 | ---------- 113 | num_trees : the number of trees to be made in the forest 114 | max_depth : the maximum depth that each tree is allowed to grow 115 | min_size : the minimum number of data observations needed in each split 116 | sample_percentage : size of data to be sampled per tree 117 | """ 118 | 119 | def __init__(self, 120 | num_trees=10, 121 | seed=0, 122 | max_depth=None, 123 | min_size=1, 124 | sample_percentage=1): 125 | """ 126 | Initializes Regression Forest 127 | 128 | Parameters 129 | ---------- 130 | num_trees : the number of trees to be made in the forest 131 | seed : the seed from which the random sample choices will be made 132 | max_depth : the maximum depth that each tree is allowed to grow 133 | min_size : the minimum number of data observations needed in each split 134 | sample_percentage : size of data to be sampled per tree
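Example (an illustrative sketch; the data names are placeholders,
not fixtures from this repo):

    forest = RegressionForest(num_trees=20, max_depth=5)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)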
135 | """ 136 | 137 | self.num_trees = num_trees 138 | self.max_depth = max_depth 139 | self.min_size = min_size 140 | self.sample_percentage = sample_percentage 141 | super(RegressionForest, self).__init__( 142 | num_trees=num_trees, 143 | seed=seed, 144 | max_depth=max_depth, 145 | min_size=min_size, 146 | sample_percentage=sample_percentage 147 | ) 148 | 149 | 150 | class ClassificationForest(RandomForest): 151 | """ 152 | Attributes 153 | ---------- 154 | num_trees : the number of trees to be made in the forest 155 | max_depth : the maximum depth that each tree is allowed to grow 156 | min_size : the minimum number of data observations needed in each split 157 | sample_percentage : size of data to be sampled per tree 158 | """ 159 | 160 | def __init__(self, 161 | num_trees=10, 162 | seed=0, 163 | max_depth=None, 164 | min_size=1, 165 | sample_percentage=1): 166 | """ 167 | Initializes Classification Forest 168 | 169 | Parameters 170 | ---------- 171 | num_trees : the number of trees to be made in the forest 172 | seed : the seed from which the random sample choices will be made 173 | max_depth : the maximum depth that each tree is allowed to grow 174 | min_size : the minimum number of data observations needed in each split 175 | sample_percentage : size of data to be sampled per tree 176 | """ 177 | 178 | self.num_trees = num_trees 179 | self.max_depth = max_depth 180 | self.min_size = min_size 181 | self.sample_percentage = sample_percentage 182 | super(ClassificationForest, self).__init__( 183 | num_trees=num_trees, 184 | seed=seed, 185 | max_depth=max_depth, 186 | min_size=min_size, 187 | sample_percentage=sample_percentage 188 | ) 189 | -------------------------------------------------------------------------------- /ycimpute/imputer/iterforest.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestRegressor 2 | from sklearn.ensemble import RandomForestClassifier 3 | from sklearn.utils import check_array 4 | import numpy as np 5 | 6 | from ..utils.tools import Solver 7 | 8 | class MissForest(Solver): 9 | def __init__( 10 | self, 11 | n_estimators=300, 12 | max_depth=None, 13 | min_samples_split=2, 14 | min_samples_leaf=1, 15 | max_features='auto', 16 | max_samples=None, 17 | normalizer='min_max'): 18 | """ 19 | Parameters 20 | ---------- 21 | n_estimators: integer, optional (default=300) 22 | max_depth: integer or None, optional (default=None) 23 | The maximum depth of the tree. 24 | If None, then nodes are expanded until all leaves are pure 25 | or until all leaves contain less than min_samples_split samples. 26 | min_samples_split: int, float, optional (default=2) 27 | The minimum number of samples required to split an internal node 28 | min_samples_leaf: int, float, optional (default=1) 29 | The minimum number of samples required to be at a leaf node. 30 | A split point at any depth will only be considered if it leaves 31 | at least min_samples_leaf training samples in each of the left and right branches. 32 | This may have the effect of smoothing the model, especially in regression. 33 | max_features: int, float, string or None, optional (default="auto") 34 | The number of features to consider when looking for the best split. 35 | If int, then consider max_features features at each split.
36 | If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split. 37 | If "auto", then max_features=n_features. 38 | If "sqrt", then max_features=sqrt(n_features). 39 | If "log2", then max_features=log2(n_features). 40 | If None, then max_features=n_features. 41 | max_samples: int or float, default=None 42 | If bootstrap is True, the number of samples to draw from X to train each base estimator. 43 | If None (default), then draw X.shape[0] samples. 44 | If int, then draw max_samples samples. 45 | If float, then draw max_samples * X.shape[0] samples. Thus, max_samples should be in the interval (0, 1). 46 | """ 47 | self.coltype_dict = None 48 | self.mask_memo_dict = None 49 | self.sorted_col = None 50 | self.stop = False 51 | self.rf_reg = RandomForestRegressor(n_estimators=n_estimators, 52 | max_depth=max_depth, 53 | min_samples_leaf=min_samples_leaf, 54 | max_features=max_features, 55 | min_samples_split=min_samples_split) 56 | self.rf_cla = RandomForestClassifier(n_estimators=n_estimators, 57 | max_depth=max_depth, 58 | min_samples_leaf=min_samples_leaf, 59 | max_features=max_features, 60 | min_samples_split=min_samples_split) 61 | self.imp_continuous_index = None 62 | self.imp_categorical_index = None 63 | self.normalizer = normalizer 64 | 65 | Solver.__init__(self, 66 | normalizer=normalizer) 67 | 68 | def solve(self, X, missing_mask): 69 | X = check_array(X, force_all_finite=False) 70 | self.sorted_col = self.sort_col(missing_mask) 71 | self.coltype_dict = self._judge_type(X) 72 | 73 | self.imp_continuous_index, self.imp_categorical_index = \ 74 | self.get_type_index(missing_mask, self.coltype_dict) 75 | 76 | differ_categorical = float('inf') 77 | differ_continuous = float('inf') 78 | 79 | init_fill = X 80 | # imputed values from the previous round, in column-visiting order; 81 | # starting from zeros so the first round never triggers the stop rule 82 | x_old_imp = np.zeros(int(missing_mask.sum())) 83 | 84 | while self.stop is False: 85 | 86 | differ_categorical_old = differ_categorical 87 | differ_continuous_old = differ_continuous 88 | 89 | x_new_imp = [] 90 | 91 | for col in self.sorted_col: 92 | tmp = [] 93 | if self.coltype_dict[col] == 'categorical': 94 | model = self.rf_cla 95 | else: 96 | model = self.rf_reg 97 | 98 | x_obs, y_obs, x_mis = self.split(init_fill, col, missing_mask) 99 | model.fit(x_obs, y_obs) 100 | y_mis = model.predict(x_mis) 101 | for ele in y_mis: 102 | tmp.append(ele) 103 | x_new_imp.append(ele) 104 | init_fill[:, col][missing_mask[:, col]] = tmp 105 | x_new_imp = np.asarray(x_new_imp) 106 | 107 | differ_continuous, differ_categorical = self._lose_func(x_new_imp, x_old_imp) 108 | x_old_imp = x_new_imp  # keep this round's imputations for the next comparison 109 | if differ_continuous >= differ_continuous_old and differ_categorical >= differ_categorical_old: 110 | self.stop = True 111 | return init_fill 112 | 113 | def _lose_func(self, imp_new, imp_old): 114 | """ 115 | Evaluation method; the mathematical concept is described at 'https://www.stu-zhouyc.com/iterForest/metrics' 116 | :param imp_new: the freshly imputed values for the originally missing entries 117 | :param imp_old: the imputed values from the previous round; 118 | the two rounds are compared to decide convergence.
119 | :return: 120 | """ 121 | 122 | continuous_imp_new = imp_new[self.imp_continuous_index] 123 | continuous_imp_old = imp_old[self.imp_continuous_index] 124 | categorical_imp_new = imp_new[self.imp_categorical_index] 125 | categorical_imp_old = imp_old[self.imp_categorical_index] 126 | 127 | try: 128 | continuous_div = continuous_imp_new - continuous_imp_old 129 | continuous_div = continuous_div.dot(continuous_div) 130 | continuous_sum = continuous_imp_new.dot(continuous_imp_new) 131 | 132 | categorical_count = np.sum(categorical_imp_new == categorical_imp_old) 133 | categorical_var_len = len(categorical_imp_new) 134 | 135 | except Exception: 136 | categorical_var_len = 0.01 137 | categorical_count = 0 138 | 139 | continuous_div = 0 140 | continuous_sum = 0.001 141 | 142 | if categorical_var_len == 0: 143 | categorical_differ = 0 144 | else: 145 | categorical_differ = categorical_count / categorical_var_len 146 | 147 | if continuous_sum == 0: 148 | continuous_differ = 0 149 | else: 150 | continuous_differ = continuous_div / continuous_sum 151 | return continuous_differ, categorical_differ -------------------------------------------------------------------------------- /ycimpute/tree/tree.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import numpy.linalg as la 4 | import scipy.stats as stats 5 | from abc import ABCMeta 6 | 7 | class DecisionTree(metaclass=ABCMeta): 8 | """ 9 | A CART-style decision tree. 10 | """ 11 | def __init__(self, 12 | lose_func=None, 13 | max_depth=None, 14 | min_sample_split=5, 15 | min_cost=None, 16 | is_forest=False 17 | ): 18 | self.max_depth = max_depth 19 | self.min_sample_split = min_sample_split 20 | self.min_cost = min_cost 21 | self.is_forest = is_forest 22 | self.lose_func = lose_func 23 | self.num_samples = None 24 | 25 | if isinstance(self, RegressionTree): 26 | self.lose_func = self._mse 27 | elif isinstance(self, ClassifyTree): 28 | self.lose_func = self._gini_index 29 | 30 | def _mse(self, y): 31 | """ 32 | MSE (mean squared error) 33 | :param y: ndarray, a vector-like array 34 | :return: the MSE value of y, float 35 | """ 36 | if y.size == 0: 37 | return 0 38 | c_m = np.mean(y) 39 | diff = np.abs(c_m - y) 40 | mse = np.square(diff).sum() 41 | return mse 42 | 43 | def _gini_index(self, pure_y): 44 | """ 45 | Gini index 46 | :param pure_y: ndarray, vector-like 47 | :return: float 48 | """ 49 | labels = np.unique(pure_y) 50 | dist = np.empty(labels.shape) 51 | # compare against the actual label values, not their positions 52 | for idx, label in enumerate(labels): 53 | dist[idx] = np.sum(pure_y == label) / pure_y.shape[0] 54 | sub_feature_gini = 1.0 - np.sum(np.square(dist)) 55 | return abs(pure_y.shape[0] / self.num_samples) * sub_feature_gini 56 | 57 | def _entropy(self): 58 | """ 59 | A CART tree does not need entropy; it is only required by ID3 or C4.5. 60 | :return: None 61 | """ 62 | pass 63 | 64 | def cost_reduction(self, data_left, data_right): 65 | y_total = np.hstack((data_left[1], data_right[1])) 66 | total_norm = la.norm(y_total) 67 | left_norm = la.norm(data_left[1]) 68 | right_norm = la.norm(data_right[1]) 69 | 70 | total_cost = self.lose_func(y_total) 71 | normalized_left = (left_norm / total_norm) * self.lose_func(data_left[1]) 72 | normalized_right = (right_norm / total_norm) * self.lose_func(data_right[1]) 73 | 74 | return total_cost - (normalized_left + normalized_right) 75 | 76 | def choose_best_feature(self, X, y, node): 77 | split_threshold = None 78 | split_feature = None 79 | min_gini_index = None 80 | 81 | real_features = range(X.shape[1]) 82 |
self.num_samples = X.shape[0] 83 | if self.is_forest: 84 | if isinstance(self, RegressionTree): 85 | features = np.random.choice(real_features, size=int(X.shape[1] / 3)) 86 | else: 87 | features = np.random.choice(real_features, size=int(np.sqrt(X.shape[1]))) 88 | else: 89 | features = real_features 90 | 91 | for feature in features: 92 | for sub_feature in np.unique(X[:, feature]): 93 | left = y[X[:, feature] == sub_feature] 94 | right = y[X[:, feature] != sub_feature] 95 | gini_index = self.lose_func(left) + self.lose_func(right) 96 | if min_gini_index is None or gini_index < min_gini_index: 97 | min_gini_index = gini_index 98 | split_feature = feature 99 | split_threshold = sub_feature 100 | ... 109 | ... node.depth > self.max_depth: 110 | return True 111 | if not isinstance(self, ClassifyTree) and \ 112 | self.cost_reduction(left_data, right_data) < self.min_cost: ... -------------------------------------------------------------------------------- /img/WINE.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/img/WINE.svg -------------------------------------------------------------------------------- /img/IRIS.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIDEA-YunanUniversity/ycimpute/HEAD/img/IRIS.svg
--------------------------------------------------------------------------------