├── README.md └── skmice.py /README.md: -------------------------------------------------------------------------------- 1 | # Scikit-mice 2 | 3 | Scikit-mice runs the MICE imputation algorithm. Based on the following paper. 4 | 5 | 6 | ### Documentation: 7 | The MiceImputer class is similar to the sklearn Imputer class. 8 | 9 | MiceImputer has the same instantiation parameters as Imputer. 10 | 11 | The MiceImputer.transform() function takes three arguments. 12 | 13 | | Param | Type | Description | 14 | | --------------------- | ------------ | ------------------------------------------------ | 15 | | `X` | `matrix` | Numpy matrix or python matrix of data. | 16 | | `model_class` | `class` | Scikit-learn model class. | 17 | | `iterations` | `int` | Number of iterations to run. | 18 | 19 | 20 | MiceImputer.transform() returns a tuple containing the imputed values and a matrix of model performance for each iteration and column. 21 | ``` 22 | (imputed_x, model_specs_matrix) 23 | ``` 24 | 25 | ### Example: 26 | 27 | ``` 28 | from sklearn.linear_model import LinearRegression 29 | from skmice import MiceImputer 30 | import numpy as np 31 | imputer = MiceImputer() 32 | X = [[1, 2], [np.nan, 3], [7, 6]] 33 | 34 | X, specs = imputer.transform(X, LinearRegression, 10) 35 | 36 | print(specs) 37 | 38 | ``` 39 | 40 | The result is a MICE-imputed matrix produced by running 10 iterations with a simple LinearRegression model. 
# /skmice.py
"""MICE-style imputation built on scikit-learn estimators."""

import numpy as np
from sklearn.linear_model import LinearRegression

# scikit-learn moved/removed the helpers this module was written against;
# try the current location first and fall back to the legacy one.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    SimpleImputer = None

try:
    from sklearn.preprocessing import Imputer
except ImportError:  # removed in scikit-learn 0.22
    Imputer = None


def _is_nan_marker(value):
    """Return True when *value* denotes NaN: the string "NaN" or a float NaN."""
    try:
        return bool(value == "NaN" or np.isnan(value))
    except TypeError:
        # np.isnan rejects non-numeric sentinels (e.g. other strings, None).
        return False


class MiceImputer(object):
    """Impute missing values by repeatedly regressing each column on the others.

    The constructor takes the same parameters as the legacy scikit-learn
    ``Imputer``; they configure the simple imputer used to seed the missing
    cells before the iterative model-based passes run.
    """

    def __init__(self, missing_values="NaN", strategy="mean", axis=0, verbose=0, copy=True):
        """Store the configuration and build the seeding imputer.

        Parameters
        ----------
        missing_values : "NaN", float or int
            Placeholder that marks a missing cell.
        strategy : str
            Seeding strategy forwarded to the scikit-learn imputer.
        axis : int
            0 seeds along columns, 1 along rows.
        verbose : int
            Forwarded to the legacy ``Imputer``; unused when ``SimpleImputer``
            is available, because the constructor call does not pass it.
        copy : bool
            Forwarded to the scikit-learn imputer.

        Raises
        ------
        ValueError
            If ``axis`` is not 0 or 1 (checked here when ``SimpleImputer`` is
            used; the legacy ``Imputer`` performs its own validation).
        """
        self.missing_values = missing_values
        self.strategy = strategy
        self.axis = axis
        self.verbose = verbose
        self.copy = copy
        if SimpleImputer is not None:
            if axis not in (0, 1):
                raise ValueError("axis must be 0 or 1, got {0!r}".format(axis))
            # SimpleImputer does not understand the legacy "NaN" string.
            marker = np.nan if _is_nan_marker(missing_values) else missing_values
            self.imp = SimpleImputer(missing_values=marker, strategy=strategy, copy=copy)
        else:
            self.imp = Imputer(
                missing_values=missing_values,
                strategy=strategy,
                axis=axis,
                verbose=verbose,
                copy=copy,
            )

    def _seed_values(self, X):
        """Return X with every missing cell filled by the simple imputer."""
        by_rows = (
            SimpleImputer is not None
            and isinstance(self.imp, SimpleImputer)
            and self.axis == 1
        )
        if by_rows:
            # SimpleImputer only works column-wise; transposing reproduces the
            # legacy axis=1 ("impute along rows") behaviour.
            transposed = np.asarray(X).T
            self.imp.fit(transposed)
            return self.imp.transform(transposed).T
        self.imp.fit(X)
        return self.imp.transform(X)

    @staticmethod
    def _get_mask(X, value_to_mask):
        """Return a boolean array marking the cells of X equal to the sentinel."""
        X = np.asanyarray(X)
        if _is_nan_marker(value_to_mask):
            return np.isnan(X)
        return X == value_to_mask

    def _process(self, X, column, model_class, mask=None):
        """Fit a model for one column and overwrite its missing cells in place.

        Parameters
        ----------
        X : 2-D float ndarray
            Current (already seeded) data; modified in place.
        column : int
            Index of the column to predict from all other columns.
        model_class : class
            Zero-argument callable returning an estimator with
            ``fit``/``score``/``predict``.
        mask : 2-D boolean array, optional
            True where the ORIGINAL data was missing. When omitted it is
            recomputed from ``X``, which only works if X is still unseeded.

        Returns
        -------
        tuple
            ``(model, score)``. ``model`` is None when the column cannot be
            modelled (no observed rows or no predictor columns); ``score`` is
            NaN when no held-out score could be computed.
        """
        data = np.asarray(X)
        if mask is None:
            mask = self._get_mask(data, self.missing_values)
        missing = np.asarray(mask, dtype=bool)[:, column]
        observed = ~missing

        predictors = np.delete(data, column, axis=1)
        n_observed = int(observed.sum())
        if n_observed == 0 or predictors.shape[1] == 0:
            # Nothing to learn from; keep the seeded values.
            return (None, np.nan)

        # Train only on rows where the target was actually observed.
        X_known = predictors[observed]
        y_known = data[observed, column]

        model = model_class()
        score = np.nan
        if n_observed < 2:
            # Too few rows to hold any out for scoring.
            model.fit(X_known, y_known)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X_known, y_known, test_size=0.33, random_state=42
            )
            model.fit(X_train, y_train)
            score = float(model.score(X_test, y_test))

        if missing.any():
            # Replace only the originally-missing cells of this column.
            predicted = model.predict(predictors[missing])
            data[missing, column] = np.ravel(predicted)

        return (model, score)

    def transform(self, X, model_class=LinearRegression, iterations=10):
        """Impute the missing values of X.

        Parameters
        ----------
        X : array-like
            Anything ``numpy.matrix`` accepts; it is copied, never modified.
        model_class : class
            Estimator class instantiated once per column per iteration.
        iterations : int
            Number of passes over all columns.

        Returns
        -------
        tuple
            ``(imputed_x, specs)`` where ``imputed_x`` is a ``numpy.matrix``
            of floats and ``specs`` is an ``(iterations, n_columns)`` float
            array of held-out model scores (NaN where none was computed).

        Raises
        ------
        ValueError
            If the seeding imputer returns a different shape than the input,
            which would make the missing-value mask unusable.
        """
        # np.matrix keeps the original input handling (lists, arrays, strings)
        # and copies, so the caller's data is never touched.
        data = np.asarray(np.matrix(X), dtype=float)
        mask = np.asarray(self._get_mask(data, self.missing_values), dtype=bool)
        seeded = np.array(self._seed_values(data), dtype=float)
        if seeded.shape != data.shape:
            raise ValueError(
                "Seeding changed the data shape from {0} to {1}; columns (or "
                "rows, for axis=1) that contain only missing values cannot be "
                "imputed.".format(data.shape, seeded.shape)
            )

        n_columns = data.shape[1]
        specs = np.full((iterations, n_columns), np.nan)

        for i in range(iterations):
            for c in range(n_columns):
                _, specs[i, c] = self._process(seeded, c, model_class, mask)

        # Return X matrix with imputed values
        return (np.asmatrix(seeded), specs)