# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.base import clone


class MetaCost(object):

    """A procedure for making error-based classifiers cost-sensitive.

    Implements MetaCost (P. Domingos, KDD '99): fit an ensemble of models on
    bootstrap resamples, use their (averaged) class-probability estimates to
    relabel each training example with the class of minimum expected cost,
    then refit a single model of type L on the relabeled data.

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.linear_model import LogisticRegression
    >>> import pandas as pd
    >>> import numpy as np
    >>> S = pd.DataFrame(load_iris().data)
    >>> S['target'] = load_iris().target
    >>> LR = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    >>> C = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0]])
    >>> model = MetaCost(S, LR, C).fit('target', 3)
    >>> model.predict_proba(load_iris().data[[2]])
    >>> model.score(S[[0, 1, 2, 3]].values, S['target'])

    .. note:: The form of the cost matrix C must be as follows:
        +---------------+----------+----------+----------+
        | actual class  |          |          |          |
        +               |          |          |          |
        |   +           | y(x)=j_1 | y(x)=j_2 | y(x)=j_3 |
        |       +       |          |          |          |
        |           +   |          |          |          |
        |predicted class|          |          |          |
        +---------------+----------+----------+----------+
        |   h(x)=j_1    |    0     |    a     |    b     |
        |   h(x)=j_2    |    c     |    0     |    d     |
        |   h(x)=j_3    |    e     |    f     |    0     |
        +---------------+----------+----------+----------+
        | C = np.array([[0, a, b],[c, 0 , d],[e, f, 0]]) |
        +------------------------------------------------+
    """

    def __init__(self, S, L, C, m=50, n=1, p=True, q=True):
        """
        :param S: The training set (a pandas DataFrame; class labels must be
                  integers 0..num_class-1, used as indices into C)
        :param L: A classification learning algorithm (scikit-learn estimator)
        :param C: A cost matrix; C[h, y] is the cost of predicting h when the
                  true class is y
        :param m: The number of resamples to generate
        :param n: The number of examples in each resample, as a multiple of
                  len(S) (n=1 means bootstrap samples of the original size)
        :param p: Is True iff L produces class probabilities
        :param q: Is True iff all resamples are to be used for each example
                  (False: only models whose resample excluded the example)
        :raises ValueError: if S is not a pandas DataFrame
        """
        if not isinstance(S, pd.DataFrame):
            raise ValueError('S must be a DataFrame object')
        # Work on a copy so we never mutate the caller's DataFrame: the
        # original implementation re-indexed S in place as a side effect.
        S = S.copy()
        # A clean 0..len(S)-1 index is required by the `i not in v.index`
        # out-of-bag test in fit().
        S.index = range(len(S))
        self.S = S
        self.L = L
        self.C = np.asarray(C)
        self.m = m
        self.n = int(len(S) * n)
        self.p = p
        self.q = q

    def fit(self, flag, num_class):
        """Relabel the training set by expected cost and refit.

        :param flag: The name of the column holding classification labels
        :param num_class: The number of classes
        :return: a fitted classifier (a clone of L trained on the
                 cost-minimizing relabeled data)
        """
        col = [c for c in self.S.columns if c != flag]
        S_ = {}   # resample index -> bootstrap DataFrame
        M = []    # fitted model per resample

        for i in range(self.m):
            # Let S_[i] be a resample of S with self.n examples.
            S_[i] = self.S.sample(n=self.n, replace=True)

            X = S_[i][col].values
            y = S_[i][flag].values

            # Let M[i] = model produced by applying L to S_[i].
            model = clone(self.L)
            M.append(model.fit(X, y))

        label = []
        S_array = self.S[col].values
        for i in range(len(self.S)):
            if not self.q:
                # Use only models whose resample did NOT contain example i
                # (out-of-bag estimate).
                M_ = [M[k] for k, v in S_.items() if i not in v.index]
                if not M_:
                    # Every resample happened to contain example i; fall back
                    # to the full ensemble rather than averaging nothing.
                    M_ = M
            else:
                M_ = M

            if self.p:
                P_j = [model.predict_proba(S_array[[i]]) for model in M_]
            else:
                # Hard votes: one-hot vector per model. The vector must be
                # allocated INSIDE the loop — the original hoisted it out, so
                # every appended entry aliased one accumulating list.
                P_j = []
                for model in M_:
                    vector = [0] * num_class
                    # predict returns a length-1 array; extract the scalar
                    # (indexing a list with an ndarray raises on modern numpy).
                    vector[int(model.predict(S_array[[i]])[0])] = 1
                    P_j.append(vector)

            # Calculate P(j|x): average the per-model distributions.
            P = np.array(np.mean(P_j, 0)).T

            # Relabel with the class of minimum expected cost.
            label.append(np.argmin(self.C.dot(P)))

        # Model produced by applying L to S with relabeled y.
        X_train = self.S[col].values
        y_train = np.array(label)
        model_new = clone(self.L)
        model_new.fit(X_train, y_train)

        return model_new


# --- README.md ---
# ## MetaCost
# This method was first proposed by P. Domingos, and I coded it with Python.
5 |
6 |