# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.base import clone


class MetaCost(object):

    """A procedure for making error-based classifiers cost-sensitive.

    Implements MetaCost (P. Domingos, KDD '99): fit an ensemble of models on
    bootstrap resamples, use their (averaged) class-probability estimates to
    relabel each training example with the class of minimum expected cost,
    then refit a single model of type L on the relabeled data.

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.linear_model import LogisticRegression
    >>> import pandas as pd
    >>> import numpy as np
    >>> S = pd.DataFrame(load_iris().data)
    >>> S['target'] = load_iris().target
    >>> LR = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    >>> C = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0]])
    >>> model = MetaCost(S, LR, C).fit('target', 3)
    >>> model.predict_proba(load_iris().data[[2]])
    >>> model.score(S[[0, 1, 2, 3]].values, S['target'])

    .. note:: The form of the cost matrix C must be as follows:
        +---------------+----------+----------+----------+
        | actual class  |          |          |          |
        +               |          |          |          |
        |   +           | y(x)=j_1 | y(x)=j_2 | y(x)=j_3 |
        |       +       |          |          |          |
        |           +   |          |          |          |
        |predicted class|          |          |          |
        +---------------+----------+----------+----------+
        |   h(x)=j_1    |    0     |    a     |    b     |
        |   h(x)=j_2    |    c     |    0     |    d     |
        |   h(x)=j_3    |    e     |    f     |    0     |
        +---------------+----------+----------+----------+
        | C = np.array([[0, a, b],[c, 0 , d],[e, f, 0]]) |
        +------------------------------------------------+
    """

    def __init__(self, S, L, C, m=50, n=1, p=True, q=True):
        """
        :param S: The training set (a pandas DataFrame; class labels must be
                  integers 0..num_class-1, used as indices into C)
        :param L: A classification learning algorithm (scikit-learn estimator)
        :param C: A cost matrix; C[h, y] is the cost of predicting h when the
                  true class is y
        :param m: The number of resamples to generate
        :param n: The number of examples in each resample, as a multiple of
                  len(S) (n=1 means bootstrap samples of the original size)
        :param p: Is True iff L produces class probabilities
        :param q: Is True iff all resamples are to be used for each example
                  (False: only models whose resample excluded the example)
        :raises ValueError: if S is not a pandas DataFrame
        """
        if not isinstance(S, pd.DataFrame):
            raise ValueError('S must be a DataFrame object')
        # Work on a copy so we never mutate the caller's DataFrame: the
        # original implementation re-indexed S in place as a side effect.
        S = S.copy()
        # A clean 0..len(S)-1 index is required by the `i not in v.index`
        # out-of-bag test in fit().
        S.index = range(len(S))
        self.S = S
        self.L = L
        self.C = np.asarray(C)
        self.m = m
        self.n = int(len(S) * n)
        self.p = p
        self.q = q

    def fit(self, flag, num_class):
        """Relabel the training set by expected cost and refit.

        :param flag: The name of the column holding classification labels
        :param num_class: The number of classes
        :return: a fitted classifier (a clone of L trained on the
                 cost-minimizing relabeled data)
        """
        col = [c for c in self.S.columns if c != flag]
        S_ = {}   # resample index -> bootstrap DataFrame
        M = []    # fitted model per resample

        for i in range(self.m):
            # Let S_[i] be a resample of S with self.n examples.
            S_[i] = self.S.sample(n=self.n, replace=True)

            X = S_[i][col].values
            y = S_[i][flag].values

            # Let M[i] = model produced by applying L to S_[i].
            model = clone(self.L)
            M.append(model.fit(X, y))

        label = []
        S_array = self.S[col].values
        for i in range(len(self.S)):
            if not self.q:
                # Use only models whose resample did NOT contain example i
                # (out-of-bag estimate).
                M_ = [M[k] for k, v in S_.items() if i not in v.index]
                if not M_:
                    # Every resample happened to contain example i; fall back
                    # to the full ensemble rather than averaging nothing.
                    M_ = M
            else:
                M_ = M

            if self.p:
                P_j = [model.predict_proba(S_array[[i]]) for model in M_]
            else:
                # Hard votes: one-hot vector per model. The vector must be
                # allocated INSIDE the loop — the original hoisted it out, so
                # every appended entry aliased one accumulating list.
                P_j = []
                for model in M_:
                    vector = [0] * num_class
                    # predict returns a length-1 array; extract the scalar
                    # (indexing a list with an ndarray raises on modern numpy).
                    vector[int(model.predict(S_array[[i]])[0])] = 1
                    P_j.append(vector)

            # Calculate P(j|x): average the per-model distributions.
            P = np.array(np.mean(P_j, 0)).T

            # Relabel with the class of minimum expected cost.
            label.append(np.argmin(self.C.dot(P)))

        # Model produced by applying L to S with relabeled y.
        X_train = self.S[col].values
        y_train = np.array(label)
        model_new = clone(self.L)
        model_new.fit(X_train, y_train)

        return model_new


# --- README.md ---
# ## MetaCost
# This method was first proposed by P. Domingos, and I coded it with Python.
5 |
6 |