├── .gitignore ├── README.md ├── __init__.py ├── default.py ├── dm.py ├── feature.py ├── many2many.py ├── one2many.py ├── one2one.py └── ple.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ple 2 | 基于sklearn,强化Pipeline和FeatureUnion两个类。对FeatureUnion类,使其支持部分数据处理;对两者,增加特征转换行为记录的功能。 3 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from ple import FeatureUnionExt, PipelineExt 2 | 3 | __all__ = ['FeatureUnionExt', 'PipelineExt'] 4 | -------------------------------------------------------------------------------- /default.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np 3 | from feature import Feature 4 | 5 | def doWithDefault(model, featureList): 6 | leaves = np.array([]) 7 | 8 | n_features = len(featureList) 9 | 10 | for i in range(n_features): 11 | feature = featureList[i] 12 | newFeature = Feature(feature.name) 13 | feature.transform(model.__class__.__name__, newFeature) 14 | leaves = np.append(leaves, newFeature) 15 | 16 | return leaves 17 | 18 | 19 | def main(): 20 | pass 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /dm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_iris 3 | from sklearn.preprocessing import Imputer 4 | from sklearn.preprocessing import OneHotEncoder 5 | from sklearn.preprocessing import FunctionTransformer 6 | from sklearn.preprocessing import Binarizer 7 | from sklearn.preprocessing import MinMaxScaler 8 | from sklearn.feature_selection import SelectKBest 9 | from sklearn.feature_selection import chi2 10 | from sklearn.decomposition import PCA 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.pipeline import Pipeline, FeatureUnion 13 | from ple import PipelineExt, FeatureUnionExt, initRoot, draw 14 | 15 | def datamining(iris, featureList): 16 | step1 = ('Imputer', Imputer()) 17 | step2_1 = ('OneHotEncoder', OneHotEncoder(sparse=False)) 18 | step2_2 = ('ToLog', FunctionTransformer(np.log1p)) 19 | step2_3 = ('ToBinary', Binarizer()) 20 | step2 = ('FeatureUnionExt', FeatureUnionExt(transformer_list=[step2_1, step2_2, step2_3], idx_list=[[0], [1, 2, 3], [4]])) 21 | step3 = ('MinMaxScaler', MinMaxScaler()) 22 | step4 = ('SelectKBest', SelectKBest(chi2, k=3)) 23 | step5 = ('PCA', PCA(n_components=2)) 24 | step6 = ('LogisticRegression', LogisticRegression(penalty='l2')) 25 | pipeline = PipelineExt(steps=[step1, step2, step3, step4, step5, step6]) 26 | pipeline.fit(iris.data, iris.target) 27 | leaves = pipeline.getFeatureList(featureList) 28 | for i in range(len(leaves)): 29 | print leaves[i], pipeline.steps[-1][-1].coef_[i] 30 | 31 | def main(): 32 | iris = load_iris() 33 | iris.data = np.hstack((np.random.choice([0, 1, 2], size=iris.data.shape[0]+1).reshape(-1,1), np.vstack((iris.data, np.full(4, np.nan).reshape(1,-1))))) 34 | iris.target = np.hstack((iris.target, np.array([np.median(iris.target)]))) 35 | root = initRoot(['color', 'Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']) 36 | featureList = np.array([transform.feature for transform in root.transformList]) 37 | 38 | datamining(iris, featureList) 39 | 40 | root.printTree() 41 | draw(root) 42 | 43 | if __name__ == '__main__': 44 | main() 45 | 46 | -------------------------------------------------------------------------------- /feature.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Transform(object): 4 | def __init__(self, label, feature): 5 | super(Transform, self).__init__() 6 | self.label = label 7 | self.feature = feature 8 | 9 | class Feature(object): 10 | def __init__(self, name): 11 | super(Feature, self).__init__() 12 | self.name = name 13 | self.label = '%s[%d]' % (self.name, id(self)) 14 | self.transformList = np.array([]) 15 | 16 | def transform(self, label, feature): 17 | self.transformList = np.append(self.transformList, Transform(label, feature)) 18 | 19 | def printTree(self): 20 | print self.label 21 | for transform in self.transformList: 22 | feature = transform.feature 23 | print '--%s-->' % transform.label, 24 | feature.printTree() 25 | 26 | def __str__(self): 27 | return self.label 28 | -------------------------------------------------------------------------------- /many2many.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np 3 | from sklearn.decomposition import PCA 4 | from feature import Feature 5 | 6 | def doWithPCA(model, featureList): 7 | leaves = np.array([]) 8 | 9 | n_features = len(featureList) 10 | 11 | for i in range(model.n_components_): 12 | newFeature = Feature(model.__class__.__name__) 13 | leaves = np.append(leaves, newFeature) 14 | 15 | for i in range(n_features): 16 | feature = featureList[i] 17 | for j in range(model.n_components_): 18 | newFeature = leaves[j] 19 | feature.transform(model.__class__.__name__, newFeature) 20 | 21 | return leaves 22 | 23 | def main(): 24 | X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) 25 | 26 | root = Feature('root') 27 | featureList = np.array([]) 28 | for i in range(len(X[0])): 29 | feature = Feature('feature_%d' % i) 30 | root.transform('init', feature) 31 | featureList = np.append(featureList, feature) 32 | 33 | model = PCA(n_components=1) 34 | model.fit(X) 35 | doWithPCA(model, featureList) 36 | root.printTree() 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /one2many.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np 3 | from sklearn.preprocessing import OneHotEncoder 4 | from feature import Feature 5 | 6 | def doWithOneHotEncoder(model, featureList): 7 | assert(isinstance(model, OneHotEncoder)) 8 | assert(hasattr(model, 'feature_indices_')) 9 | 10 | leaves = np.array([]) 11 | 12 | n_features = len(featureList) 13 | 14 | if model.categorical_features == 'all': 15 | mask_features = np.ones(n_features) 16 | else: 17 | mask_features = np.zeros(n_features) 18 | mask_features[self.categorical_features] = 1 19 | 20 | n_qualitativeFeatures = len(model.feature_indices_) - 1 21 | if model.n_values == 'auto': 22 | n_activeFeatures = len(model.active_features_) 23 | j = k = 0 24 | for i in range(n_features): 25 | feature = featureList[i] 26 | if mask_features[i]: 27 | if model.n_values == 'auto': 28 | while k < n_activeFeatures and model.active_features_[k] < model.feature_indices_[j+1]: 29 | newFeature = Feature(feature.name) 30 | feature.transform('%s[%d]' % (model.__class__.__name__, model.active_features_[k] - model.feature_indices_[j]), newFeature) 31 | leaves = np.append(leaves, newFeature) 32 | k += 1 33 | else: 34 | for k in range(model.feature_indices_[j]+1, model.feature_indices_[j+1]): 35 | newFeature = Feature(feature.name) 36 | feature.transform('%s[%d]' % (model.__class__.__name__, k - model.feature_indices_[j]), newFeature) 37 | leaves = np.append(leaves, newFeature) 38 | j += 1 39 | else: 40 | newFeature = Feature(feature.name) 41 | feature.transform('%s[r]' % model.__class__.__name__, newFeature) 42 | leaves = append(leaves, newFeatures) 43 | 44 | return leaves 45 | 46 | def main(): 47 | X = [[1, 2], [2, 3]] 48 | 49 | root = Feature('root') 50 | featureList = np.array([]) 51 | for i in range(len(X[0])): 52 | feature = Feature('feature_%d' % i) 53 | root.transform('init', feature) 54 | featureList = np.append(featureList, feature) 55 | 56 | model = OneHotEncoder(n_values=[5,8], sparse=True) 57 | model.fit(X) 58 | doWithOneHotEncoder(model, featureList) 59 | root.printTree() 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /one2one.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np 3 | from sklearn.feature_selection.base import SelectorMixin 4 | from feature import Feature 5 | 6 | def doWithSelector(model, featureList): 7 | assert(isinstance(model, SelectorMixin)) 8 | 9 | leaves = np.array([]) 10 | 11 | n_features = len(featureList) 12 | 13 | mask_features = model.get_support() 14 | 15 | for i in range(n_features): 16 | feature = featureList[i] 17 | if mask_features[i]: 18 | newFeature = Feature(feature.name) 19 | feature.transform(model.__class__.__name__, newFeature) 20 | leaves = np.append(leaves, newFeature) 21 | else: 22 | newFeature = Feature('Abandomed') 23 | feature.transform(model.__class__.__name__, newFeature) 24 | 25 | return leaves 26 | 27 | def main(): 28 | from sklearn.feature_selection import VarianceThreshold 29 | X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]] 30 | 31 | root = Feature('root') 32 | featureList = np.array([]) 33 | for i in range(len(X[0])): 34 | feature = Feature('feature_%d' % i) 35 | root.transform('init', feature) 36 | featureList = np.append(featureList, feature) 37 | 38 | model = VarianceThreshold() 39 | model.fit(X) 40 | doWithSelector(model, featureList) 41 | root.printTree() 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /ple.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_selection.base import SelectorMixin 2 | from sklearn.preprocessing import OneHotEncoder 3 | from sklearn.decomposition import PCA 4 | from sklearn.pipeline import Pipeline, FeatureUnion, _fit_one_transformer, _fit_transform_one, _transform_one 5 | from sklearn.externals.joblib import Parallel, delayed 6 | from scipy import sparse 7 | import numpy as np 8 | from matplotlib import pyplot as plt 9 | from default import doWithDefault 10 | from one2one import doWithSelector 11 | from one2many import doWithOneHotEncoder 12 | from many2many import doWithPCA 13 | from feature import Feature 14 | 15 | class PipelineExt(Pipeline): 16 | def _pre_get_featues(self, featureList): 17 | leaves = featureList 18 | for name, transform in self.steps[:-1]: 19 | leaves = _doWithModel(transform, leaves) 20 | return leaves 21 | 22 | def getFeatureList(self, featureList): 23 | leaves = self._pre_get_featues(featureList) 24 | model = self.steps[-1][-1] 25 | if hasattr(model, 'fit_transform') or hasattr(model, 'transform'): 26 | leaves = _doWithModel(model, leaves) 27 | return leaves 28 | 29 | class FeatureUnionExt(FeatureUnion): 30 | def __init__(self, transformer_list, idx_list, n_jobs=1, transformer_weights=None): 31 | self.idx_list = idx_list 32 | FeatureUnion.__init__(self, transformer_list=map(lambda trans:(trans[0], trans[1]), transformer_list), n_jobs=n_jobs, transformer_weights=transformer_weights) 33 | 34 | def fit(self, X, y=None): 35 | transformer_idx_list = map(lambda trans, idx:(trans[0], trans[1], idx), self.transformer_list, self.idx_list) 36 | transformers = Parallel(n_jobs=self.n_jobs)( 37 | delayed(_fit_one_transformer)(trans, X[:,idx], y) 38 | for name, trans, idx in transformer_idx_list) 39 | self._update_transformer_list(transformers) 40 | return self 41 | 42 | def fit_transform(self, X, y=None, **fit_params): 43 | transformer_idx_list = map(lambda trans, idx:(trans[0], trans[1], idx), self.transformer_list, self.idx_list) 44 | result = Parallel(n_jobs=self.n_jobs)( 45 | delayed(_fit_transform_one)(trans, name, X[:,idx], y, 46 | self.transformer_weights, **fit_params) 47 | for name, trans, idx in transformer_idx_list) 48 | 49 | Xs, transformers = zip(*result) 50 | self._update_transformer_list(transformers) 51 | if any(sparse.issparse(f) for f in Xs): 52 | Xs = sparse.hstack(Xs).tocsr() 53 | else: 54 | Xs = np.hstack(Xs) 55 | return Xs 56 | 57 | def transform(self, X): 58 | transformer_idx_list = map(lambda trans, idx:(trans[0], trans[1], idx), self.transformer_list, self.idx_list) 59 | Xs = Parallel(n_jobs=self.n_jobs)( 60 | delayed(_transform_one)(trans, name, X[:,idx], self.transformer_weights) 61 | for name, trans, idx in transformer_idx_list) 62 | if any(sparse.issparse(f) for f in Xs): 63 | Xs = sparse.hstack(Xs).tocsr() 64 | else: 65 | Xs = np.hstack(Xs) 66 | return Xs 67 | 68 | def getFeatureList(self, featureList): 69 | transformer_idx_list = map(lambda trans, idx:(trans[0], trans[1], idx), self.transformer_list, self.idx_list) 70 | leaves = np.array(Parallel(n_jobs=self.n_jobs)( 71 | delayed(_doWithModel)(trans, featureList[idx]) 72 | for name, trans, idx in transformer_idx_list)) 73 | leaves = np.hstack(leaves) 74 | return leaves 75 | 76 | def _doWithModel(model, featureList): 77 | if isinstance(model, SelectorMixin): 78 | return doWithSelector(model, featureList) 79 | elif isinstance(model, OneHotEncoder): 80 | return doWithOneHotEncoder(model, featureList) 81 | elif isinstance(model, PCA): 82 | return doWithPCA(model, featureList) 83 | elif isinstance(model, FeatureUnionExt) or isinstance(model, PipelineExt): 84 | return model.getFeatureList(featureList) 85 | else: 86 | return doWithDefault(model, featureList) 87 | 88 | def initRoot(featureNameList): 89 | root = Feature('root') 90 | for featureName in featureNameList: 91 | newFeature = Feature(featureName) 92 | root.transform('init', newFeature) 93 | return root 94 | 95 | def _draw(G, root, nodeLabelDict, edgeLabelDict): 96 | nodeLabelDict[root.label] = root.name 97 | for transform in root.transformList: 98 | G.add_edge(root.label, transform.feature.label) 99 | edgeLabelDict[(root.label, transform.feature.label)] = transform.label 100 | _draw(G, transform.feature, nodeLabelDict, edgeLabelDict) 101 | 102 | def _isCyclic(root, walked): 103 | if root in walked: 104 | return True 105 | else: 106 | walked.add(root) 107 | for transform in root.transformList: 108 | ret = _isCyclic(transform.feature, walked) 109 | if ret: 110 | return True 111 | walked.remove(root) 112 | return False 113 | 114 | def fall_layout(root, x_space=1, y_space=1): 115 | layout = {} 116 | if _isCyclic(root, set()): 117 | raise Exception('Graph is cyclic') 118 | 119 | queue = [None, root] 120 | nodeDict = {} 121 | levelDict = {} 122 | level = 0 123 | while len(queue) > 0: 124 | head = queue.pop() 125 | if head is None: 126 | if len(queue) > 0: 127 | level += 1 128 | queue.insert(0, None) 129 | else: 130 | if head in nodeDict: 131 | levelDict[nodeDict[head]].remove(head) 132 | nodeDict[head] = level 133 | levelDict[level] = levelDict.get(level, []) + [head] 134 | for transform in head.transformList: 135 | queue.insert(0, transform.feature) 136 | 137 | for level in levelDict.keys(): 138 | nodeList = levelDict[level] 139 | n_nodes = len(nodeList) 140 | offset = - n_nodes / 2 141 | for i in range(n_nodes): 142 | layout[nodeList[i].label] = (level * x_space, (i + offset) * y_space) 143 | 144 | return layout 145 | 146 | def draw(root): 147 | import networkx as nx 148 | G = nx.DiGraph() 149 | nodeLabelDict = {} 150 | edgeLabelDict = {} 151 | 152 | _draw(G, root, nodeLabelDict, edgeLabelDict) 153 | # pos=nx.spring_layout(G, iterations=150) 154 | pos = fall_layout(root) 155 | 156 | nx.draw_networkx_nodes(G,pos,node_size=100, node_color="white") 157 | nx.draw_networkx_edges(G,pos, width=1,alpha=0.5,edge_color='black') 158 | nx.draw_networkx_labels(G,pos,labels=nodeLabelDict, font_size=10,font_family='sans-serif') 159 | nx.draw_networkx_edge_labels(G, pos, edgeLabelDict) 160 | 161 | plt.show() 162 | --------------------------------------------------------------------------------