├── pk ├── __init__.py ├── main │ ├── __init__.py │ ├── src │ │ ├── __init__.py │ │ └── main.py │ └── ui │ │ ├── __init__.py │ │ ├── main_gui.ui │ │ └── main_gui.py ├── utils │ ├── __init__.py │ ├── prygress.py │ ├── imaging.py │ ├── metrics.py │ ├── preprocess.py │ ├── regression_utils.py │ ├── classification_utils.py │ ├── clustering.py │ └── loading.py ├── tests │ ├── Wine.xls │ ├── __init__.py │ ├── blank.csv │ ├── iris.csv │ ├── test_imports.py │ ├── test_models.py │ ├── test_cl_gui.py │ ├── test_preprocessing.py │ ├── test_classification.py │ ├── ratings_best.arff │ ├── test_regression.py │ ├── iris2.csv │ ├── faithful.csv │ ├── test_loading.py │ ├── credit-g.arff │ └── correct_array.pkl ├── controller.py └── models.py ├── setup.cfg ├── Makefile ├── install.sh ├── .gitignore ├── README.md └── cl_gui.py /pk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pk/main/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pk/main/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pk/main/ui/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pk/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pk/tests/Wine.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bsuhagia/Pykit-Learn/HEAD/pk/tests/Wine.xls 
-------------------------------------------------------------------------------- /pk/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | sys.path.append(os.path.abspath(os.path.dirname(__file__))) -------------------------------------------------------------------------------- /pk/tests/blank.csv: -------------------------------------------------------------------------------- 1 | "F1","F2","F3","F4","Class" 2 | 1,2,3,4,good 3 | 2,3,4,5,good 4 | 1,?,?,3,bad 5 | 2,3,4,5,good 6 | ?,?,?,?,bad 7 | ?,0,s,1,bad 8 | 1,2,3,4,good 9 | -------------------------------------------------------------------------------- /pk/tests/iris.csv: -------------------------------------------------------------------------------- 1 | "Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | 5.8,4,1.2,0.2,"setosa" 3 | 5.9,3,4.2,1.5,"versicolor" 4 | 6.5,3.2,5.1,2,"virginica" 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | verbosity = 2 3 | detailed-errors = 1 4 | with-doctest = 1 5 | doctest-tests = 1 6 | doctest-extension = rst 7 | doctest-fixtures = _fixture 8 | #doctest-options = +ELLIPSIS,+NORMALIZE_WHITESPACE 9 | where = pk/tests/ -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | @nosetests -a !slow 3 | 4 | test-all: 5 | @nosetests 6 | 7 | clean: 8 | @find . -name *.pyc -type f -delete 9 | @find . 
#!/bin/sh
# Installs pip (if missing) and the Python packages this project needs, then
# appends the current directory to PYTHONPATH in ~/.bash_profile.
# Run it from the project root: the directory it is run from is what gets
# added to PYTHONPATH.

# Install pip if not already installed.
# `command -v` is the POSIX lookup; `type -p` is a bash-only option and is not
# guaranteed to work under /bin/sh.
if ! command -v pip >/dev/null 2>&1; then
    # ez_setup.py bootstraps setuptools, which provides easy_install.
    # Only the interpreter needs root, not the download itself.
    curl https://bootstrap.pypa.io/ez_setup.py -o - | sudo python
    sudo easy_install pip
fi

# Python module dependencies.
# scikit-learn is the real distribution name behind `import sklearn`.
for pkg in scikit-learn numpy matplotlib pandas scipy seaborn Pillow nose; do
    sudo pip install "$pkg"
done

PROJ_DIR=$(pwd)
# Write the export statement itself into the profile. The previous form wrapped
# `export` in backticks, which executed it in a subshell and appended only an
# empty line. The single-quoted format keeps $PYTHONPATH literal so it is
# expanded at login time rather than frozen to its value at install time.
printf 'export PYTHONPATH="$PYTHONPATH:%s"\n' "$PROJ_DIR" >> ~/.bash_profile
def progress(function=None, stream=sys.stdout, char='.', pause=0.2):
    """Show a progress bar on ``stream`` while a function runs.

    Usable both as a bare decorator (``@progress``) and with options
    (``@progress(stream=..., char=..., pause=...)``).

    While the wrapped function executes, a background thread writes ``char``
    to ``stream`` roughly every ``pause`` seconds. The thread is stopped and
    joined before the wrapped call returns or raises, so nothing is written
    to ``stream`` after the call has finished.

    :param function: the callable to wrap; ``None`` when options are given.
    :param stream: object with ``write`` and ``flush`` methods.
    :param char: text written on every tick.
    :param pause: seconds to wait between ticks.
    :returns: the wrapped function, or a decorator when ``function`` is None.
    """
    if function is None:
        return lambda func: progress(func, stream, char, pause)

    @wraps(function)
    def wrap_function(*args, **kwargs):
        finished = threading.Event()

        def progress_bar():
            while not finished.is_set():
                stream.write(char)
                stream.flush()
                # Event.wait returns as soon as the event is set, so the
                # worker stops promptly instead of sleeping out a full pause.
                finished.wait(pause)
            stream.flush()

        worker = threading.Thread(target=progress_bar)
        # Daemon thread: never keeps the interpreter alive if the join below
        # is interrupted.
        worker.daemon = True
        worker.start()
        try:
            return function(*args, **kwargs)
        finally:
            finished.set()
            # Wait for the worker so it cannot write after we return.
            worker.join()

    return wrap_function
class FileOpener(object):
    """Loads a dataset file by dispatching on the file's extension."""

    def __init__(self, filename):
        self.filename = filename

    @staticmethod
    def load_file(filename):
        """Load ``filename`` with the loader that matches its extension.

        ``.csv`` files go to ``load_csv``, ``.arff`` files to ``load_arff``
        and ``.xls``/``.xlsx`` files to ``load_excel``. The extension is
        compared case-insensitively, so ``DATA.CSV`` is accepted too.

        :param filename: path of the file to load.
        :returns: whatever the selected loader returns.
        :raises ValueError: if the extension is missing or not supported.
        """
        # Local import: the module header is not touched by this change.
        import os

        # splitext copes with names that have no dot at all, unlike slicing
        # from rfind('.'), which returns -1 in that case.
        extension = os.path.splitext(filename)[1].lower()
        if extension == '.csv':
            return load_csv(filename)
        if extension == '.arff':
            return load_arff(filename)
        if extension in ('.xls', '.xlsx'):
            return load_excel(filename)
        # Fail loudly here instead of returning None, which only surfaces
        # later as an unrelated unpacking error in the caller.
        raise ValueError("Unsupported file type %r for file %r"
                         % (extension, filename))
@attr('slow')
def test_visualize_iris():
    """
    Loading iris.csv and running 'visualize' writes the three plot images
    into the _temp/ directory.
    """
    setup()
    for command in ('load pk/tests/iris.csv', 'visualize --suppress'):
        cl_gui.process(command)

    generated = os.listdir('_temp/')
    expected_plots = ('plot_andrews.png',
                      'plot_frequency.png',
                      'plot_radial.png')
    for plot_name in expected_plots:
        assert_true(plot_name in generated)
    td()
class ViewGenerator(object):
    """Creates the small PyQt windows and dialogs shown to the user."""

    def open_file_dialog(self, app, filter):
        """
        Ask the user to choose a file and return the chosen path as a str.

        A temporary QWidget is used as the dialog's parent. Once the dialog
        closes, that widget is destroyed and all windows of ``app`` are
        closed. ``filter`` is handed to the dialog unchanged.
        """
        parent_widget = QtGui.QWidget()
        chosen = QFileDialog.getOpenFileName(
            parent=parent_widget, caption="Open File", filter=filter)
        parent_widget.destroy()
        app.closeAllWindows()
        return str(chosen)

    def get_preprocess_options(self, app):
        """
        Show a window with the preprocessing options for the dataset and
        run ``app``'s event loop.

        Nothing is returned, and the Ok/Cancel buttons are not connected to
        any slot here.
        """
        class PreprocessFrame(QtGui.QWidget):
            """Window holding the option checkboxes, a text field and buttons."""

            def __init__(self):
                QtGui.QWidget.__init__(self)
                column = QVBoxLayout(self)
                self.setWindowTitle("Preprocessing")
                self.cbox1 = QCheckBox("Normalize")
                self.cbox2 = QCheckBox("Standardize")
                self.cbox3 = QCheckBox("Remove examples containing:")

                # Text field pre-filled with the default marker "?".
                marker_input = QLineEdit(self.cbox3)
                marker_input.setText("?")
                button_box = QDialogButtonBox(self)
                button_box.setStandardButtons(
                    QDialogButtonBox.Ok | QDialogButtonBox.Cancel)

                # Stack the widgets top to bottom in this order.
                for widget in (self.cbox1, self.cbox2, self.cbox3,
                               marker_input):
                    column.addWidget(widget)
                column.addWidget(button_box, alignment=Qt.AlignCenter)
                self.setLayout(column)

        window = PreprocessFrame()
        window.show()
        app.exec_()
def segment_image(im_file, n_segments=5, alg='ac'):
    """Segment an image into regions by clustering its pixel values.

    The image is read with ``imread``, reduced to a single channel and
    flattened into one single-value sample per pixel before clustering.

    :param im_file: image file passed to ``imread``.
    :param n_segments: number of clusters requested when ``alg`` is 'ac';
        not used by 'dbscan'.
    :param alg: 'ac' for connectivity-constrained agglomerative clustering
        or 'dbscan' for DBSCAN with ``eps=1``.
    :returns: array of cluster labels with the same 2-D shape as the
        single-channel image.
    :raises ValueError: if ``alg`` is not 'ac' or 'dbscan'.
    """
    # Reject unknown algorithms up front. Previously they fell through both
    # branches and crashed later on the undefined ``st``/``label`` locals.
    if alg not in ('ac', 'dbscan'):
        raise ValueError("Unknown segmentation algorithm %r; "
                         "expected 'ac' or 'dbscan'" % (alg,))

    img = imread(im_file)
    # Keep only the first channel of multi-channel images. An image that is
    # already 2-D is used as-is instead of failing on the 3-index slice.
    # NOTE(review): assumes imread yields a 2-D array for grayscale files.
    if img.ndim == 3:
        img = img[:, :, 0]
    X = np.reshape(img, (-1, 1))

    if alg == 'ac':
        # Define the structure A of the data. Pixels connected to their neighbors.
        connectivity = grid_to_graph(*img.shape)

        # Compute clustering
        print("Compute structured hierarchical clustering...")
        st = time.time()
        model = AgglomerativeClustering(n_clusters=n_segments,
                                        linkage='complete',
                                        connectivity=connectivity).fit(X)
        label = np.reshape(model.labels_, img.shape)
    else:
        print("Compute DBScan clustering...")
        st = time.time()
        dbs = DBSCAN(eps=1).fit(X)
        label = np.reshape(dbs.labels_, img.shape)

    print("Elapsed time: ", time.time() - st)
    print("Number of pixels: ", label.size)
    print("Number of clusters: ", np.unique(label).size)

    return label
def test_remove_incomplete_examples():
    """
    Rows of blank.csv that contain the '?' marker are dropped from both the
    features and the labels.
    """
    features, labels, _ = load_csv(__DIR_NAME + 'blank.csv')
    assert len(features) == len(labels)
    features, labels = pe.remove_incomplete_examples(features, labels, '?')

    # Only the four fully populated rows should remain.
    complete_rows = [[1, 2, 3, 4],
                     [2, 3, 4, 5],
                     [2, 3, 4, 5],
                     [1, 2, 3, 4]]
    expected_features = np.array(complete_rows).astype('str')
    expected_labels = np.array(['good'] * 4)
    assert_array_equal(features, expected_features)
    assert_array_equal(labels, expected_labels)
class BaseModel(object):
    """
    A base model class to hold information.

    Keeps a list of observers and notifies each of them through its
    ``update(event, model)`` method when ``changed`` is called.
    """
    def __init__(self):
        self.observers = []
        self.data = None

    def add_observer(self, observer):
        """
        Register an observer.

        The observer must provide an ``update(event, model)`` method.
        """
        self.observers.append(observer)

    def changed(self, event):
        """
        Notify observers of changes.

        Every registered observer is called with ``event`` and this model.
        """
        for obs in self.observers:
            obs.update(event, self)


class Algorithm(BaseModel):
    """
    This class wraps the machine learning algorithm around an object.

    Attributes:
        clf       -- the wrapped estimator.
        clf_name  -- class name of the estimator passed to the constructor.
        fitted    -- True once a call to ``_fit`` has completed successfully.
    """
    def __init__(self, clf=None):
        super(Algorithm, self).__init__()
        # A default argument is evaluated once, when the function is defined,
        # so a ``BaseEstimator()`` default would be one object shared by
        # every Algorithm. Build a fresh estimator per instance instead.
        if clf is None:
            clf = BaseEstimator()
        self.clf = clf
        self.clf_name = type(clf).__name__
        self.fitted = False

    def __repr__(self):
        return str(vars(self))

    @property
    def params(self):
        """
        Gets the classifier parameters (the estimator's ``__dict__``).
        """
        return self.clf.__dict__

    def _fit(self, *args, **kwargs):
        """
        Runs the algorithm with the passed-in parameters.

        Returns whatever the estimator's ``fit`` returns. ``fitted`` is only
        set once ``fit`` has returned, so a failed fit does not leave the
        wrapper claiming to be trained.
        """
        result = self.clf.fit(*args, **kwargs)
        self.fitted = True
        return result

    def predict(self, X):
        """
        Predicts with the wrapped estimator.

        Raises an Exception if the algorithm has not been fitted yet.
        """
        if not self.fitted:
            raise Exception("Can't predict with untrained classifier!")
        return self.clf.predict(X)


class SupervisedAlgorithm(Algorithm):
    """
    Wrapper class for supervised learning algorithms.
    """
    def __init__(self, clf):
        super(SupervisedAlgorithm, self).__init__(clf)

    def fit(self, X, y):
        """
        Fits the estimator on features ``X`` and targets ``y`` and keeps
        the object returned by the estimator's ``fit`` as ``clf``.
        """
        self.clf = self._fit(X, y)


class UnsupervisedAlgorithm(Algorithm):
    """
    Class for unsupervised algorithms (eg. clustering, PCA, etc.)
    """
    def __init__(self, clf):
        super(UnsupervisedAlgorithm, self).__init__(clf)

    def fit(self, X):
        """
        Fits the estimator on ``X`` and keeps the object returned by the
        estimator's ``fit`` as ``clf``.
        """
        self.clf = self._fit(X)
def get_train_accuracy(clf, X, y):
    """
    Return ``clf.score(X, y)`` as a percentage rounded to 5 decimal places.
    """
    score = clf.score(X, y)
    percentage = score * 100
    return round(percentage, 5)
clf.predict(X_test) 71 | # return round(adjusted_mutual_info_score(true_y, pred_y), 4) 72 | # 73 | # def get_homogeneity_score(clf, X_test, true_y): 74 | # pred_y = clf.predict(X_test) 75 | # return round(homogeneity_score(true_y, pred_y), 4) 76 | # 77 | # def get_vscore(clf, X_test, true_y): 78 | # pred_y = clf.predict(X_test) 79 | # return round(v_measure_score(true_y, pred_y), 4) 80 | # 81 | # def get_silhouette_score(clf, X): 82 | # # pred_y = clf.predict(X_test) 83 | # return round(silhouette_score(X, clf.means_, metric='euclidean')) 84 | # 85 | # def benchmark(X, y, training_func, *args, **kwargs): 86 | # clf = training_func(X, y, *args, **kwargs) 87 | # get_train_accuracy(clf, X, y) 88 | # get_test_accuracy(clf, X, y) 89 | -------------------------------------------------------------------------------- /pk/tests/test_classification.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Bhavesh' 2 | 3 | """ 4 | from pk.utils.metrics import * 5 | from pk.utils.classification_utils import * 6 | from prettytable import PrettyTable 7 | from pk.utils.loading import * 8 | import warnings 9 | 10 | def runall_classification(X, y): 11 | warnings.filterwarnings('ignore') 12 | T = PrettyTable(["Method", "Train Accuracy (%)", "Test Accuracy (%)", "Cross Validation Accuracy (%)"]) 13 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3,random_state=0) 14 | 15 | dts = train_decision_tree(X_train, y_train) 16 | dts_train_acc = get_train_accuracy(dts, X_train, y_train) 17 | dts_test_acc = get_test_accuracy(dts, X_test, y_test) 18 | _, dts_cv_acc = get_cv_accuracy(dts, X, y) 19 | T.add_row((["Decision Tree", dts_train_acc, dts_test_acc, dts_cv_acc])) 20 | 21 | knn = train_knn(X_train, y_train) 22 | knn_train_acc = get_train_accuracy(knn, X_train, y_train) 23 | knn_test_acc = get_test_accuracy(knn, X_test, y_test) 24 | _, knn_cv_acc = get_cv_accuracy(dts, X, y) 25 | T.add_row((["Nearest 
Neighbor", knn_train_acc, knn_test_acc, knn_cv_acc])) 26 | 27 | svm = train_svm(X_train, y_train) 28 | svm_train_acc = get_train_accuracy(svm, X_train, y_train) 29 | svm_test_acc = get_test_accuracy(svm, X_test, y_test) 30 | _, svm_cv_acc = get_cv_accuracy(svm, X, y) 31 | T.add_row((["Support Vector Machine", svm_train_acc, svm_test_acc, svm_cv_acc])) 32 | 33 | nb = train_naive_bayes(X_train, y_train) 34 | nb_train_acc = get_train_accuracy(nb, X_train, y_train) 35 | nb_test_acc = get_test_accuracy(nb, X_test, y_test) 36 | _, nb_cv_acc = get_cv_accuracy(nb, X, y) 37 | T.add_row((["Naive Bayes", nb_train_acc, nb_test_acc, nb_cv_acc])) 38 | 39 | ada = train_adaboost(X_train, y_train, base_estimator=dts) 40 | ada_train_acc = get_train_accuracy(ada, X_train, y_train) 41 | ada_test_acc = get_test_accuracy(ada, X_test, y_test) 42 | _, ada_cv_acc = get_cv_accuracy(ada, X, y) 43 | T.add_row((["AdaBoost", ada_train_acc, ada_test_acc, ada_cv_acc])) 44 | 45 | lda = train_lda(X_train, y_train) 46 | lda_train_acc = get_train_accuracy(lda, X_train, y_train) 47 | lda_test_acc = get_test_accuracy(lda, X_test, y_test) 48 | _, lda_cv_acc = get_cv_accuracy(lda, X, y) 49 | T.add_row((["Linear Discriminant Analysis", lda_train_acc, lda_test_acc, lda_cv_acc])) 50 | 51 | qda = train_qda(X_train, y_train) 52 | qda_train_acc = get_train_accuracy(qda, X_train, y_train) 53 | qda_test_acc = get_test_accuracy(qda, X_test, y_test) 54 | _, qda_cv_acc = get_cv_accuracy(qda, X, y) 55 | T.add_row((["Quadratic Discriminant Analysis", qda_train_acc, qda_test_acc, qda_cv_acc])) 56 | 57 | bag = train_bagging(X_train, y_train, base_estimator=dts) 58 | bag_train_acc = get_train_accuracy(bag, X_train, y_train) 59 | bag_test_acc = get_test_accuracy(bag, X_test, y_test) 60 | _, bag_cv_acc = get_cv_accuracy(bag, X, y) 61 | T.add_row((["Bagging", bag_train_acc, bag_test_acc, bag_cv_acc])) 62 | 63 | rf = train_randomForest(X_train, y_train) 64 | rf_train_acc = get_train_accuracy(rf, X_train, y_train) 65 | 
rf_test_acc = get_test_accuracy(rf, X_test, y_test) 66 | _, rf_cv_acc = get_cv_accuracy(rf, X, y) 67 | T.add_row((["Random Forest", rf_train_acc, rf_test_acc, rf_cv_acc])) 68 | 69 | sgd = train_stochaticGradientDescent(X_train, y_train) 70 | sgd_train_acc = get_train_accuracy(sgd, X_train, y_train) 71 | sgd_test_acc = get_test_accuracy(sgd, X_test, y_test) 72 | _, sgd_cv_acc = get_cv_accuracy(sgd, X, y) 73 | T.add_row((["Stochastic Gradient Descent", sgd_train_acc, sgd_test_acc, sgd_cv_acc])) 74 | print T 75 | 76 | X_data, y_data, dataset = load_csv('iris2.csv') 77 | runall_classification(X_data,y_data) 78 | """ -------------------------------------------------------------------------------- /pk/tests/ratings_best.arff: -------------------------------------------------------------------------------- 1 | @relation user-funness-rating 2 | 3 | @attribute probBuildJump numeric 4 | @attribute probBuildCannons numeric 5 | @attribute probBuildHillStraight numeric 6 | @attribute probBuildTubes numeric 7 | @attribute probBuildStraight numeric 8 | @attribute difficulty numeric 9 | @attribute blocksCoins numeric 10 | @attribute blocksEmpty numeric 11 | @attribute blocksPower numeric 12 | @attribute enemies numeric 13 | @attribute fun numeric 14 | 15 | @data 16 | 0.216954,0.193906,0.056133,0.080410,0.452598,0,5,14,1,0,0.000000 17 | 0.066983,0.000000,0.333621,0.423212,0.176184,0,2,3,0,1,0 18 | 0.200653,0.173610,0.625737,0.000000,0.000000,1,3,7,3,7,1.000000 19 | 0.118236,0.104627,0.114326,0.351013,0.311797,2,3,14,0,4,1.00000 20 | 0.187419,0.404103,0.268916,0.000000,0.139563,2,3,11,2,16,1.000000 21 | 0.484383,0.000000,0.000000,0.000000,0.515617,1,10,20,5,3,0.000000 22 | 0.241743,0.107256,0.100075,0.429201,0.121725,0,5,6,0,2,0.000000 23 | 0.000000,0.000000,0.058207,0.346389,0.595404,2,6,29,4,6,0.000000 24 | 0.371011,0.017317,0.451252,0.000000,0.160420,4,0,3,0,47,1.000000 25 | 0.235802,0.002202,0.723037,0.000000,0.038960,1,1,2,0,11,1.000000 26 | 
0.379509,0.015927,0.368553,0.169505,0.066507,1,1,7,0,10,0.000000 27 | 0.087263,0.000000,0.462896,0.109292,0.340549,0,7,17,1,0,0.000000 28 | 0.705555,0.021981,0.046962,0.196528,0.028975,0,0,0,0,1,0.000000 29 | 0.268377,0.000000,0.000000,0.731623,0.000000,1,0,0,0,6,0.000000 30 | 0.091454,0.209781,0.205219,0.011092,0.482454,3,6,25,3,18,1.000000 31 | 0.047697,0.218361,0.063613,0.279565,0.390765,2,9,17,1,4,0.000000 32 | 0.000000,0.021094,0.214678,0.619140,0.145088,1,2,7,1,11,1.000000 33 | 0.633017,0.055214,0.311769,0.000000,0.000000,1,0,0,0,2,0.000000 34 | 0.000000,0.262451,0.000000,0.491089,0.246460,4,4,2,1,12,1.000000 35 | 0.040965,0.643791,0.061195,0.000000,0.254049,1,12,39,2,0,1.000000 36 | 0.105690,0.478350,0.156265,0.000000,0.259695,1,9,16,0,7,1.000000 37 | 0.367145,0.108050,0.289100,0.000000,0.235705,0,1,5,0,0,0.000000 38 | 0.216320,0.136071,0.167827,0.087172,0.392610,0,10,27,4,0,0.000000 39 | 0.177538,0.308546,0.163225,0.227862,0.122830,2,1,3,0,6,0.000000 40 | 0.234807,0.070752,0.297166,0.397275,0.000000,4,2,4,0,54,1.000000 41 | 0.239356,0.181646,0.000000,0.398061,0.180937,0,6,13,2,2,1.000000 42 | 0.640998,0.183456,0.000000,0.175546,0.000000,1,0,0,0,3,0.000000 43 | 0.083503,0.087156,0.159052,0.464611,0.205679,2,0,4,0,12,1.000000 44 | 0.000000,0.316622,0.000000,0.409672,0.273706,2,4,14,1,6,1.000000 45 | 0.030001,0.135286,0.519963,0.314750,0.000000,2,1,6,2,16,1.000000 46 | 0.206727,0.595252,0.141556,0.056465,0.000000,0,3,7,0,0,0.000000 47 | 0.000000,0.233946,0.000000,0.596895,0.169159,1,1,6,3,11,0.000000 48 | 0.023783,0.424868,0.203230,0.038263,0.309856,1,8,25,4,5,1.000000 49 | 0.000000,0.715850,0.000000,0.283413,0.000737,2,0,0,0,4,0.000000 50 | 0.000000,0.000000,0.627885,0.372115,0.000000,1,3,7,0,11,0.000000 51 | 0.189209,0.159652,0.166200,0.327410,0.157529,3,3,1,0,33,1.000000 52 | 0.129294,0.000000,0.000000,0.622321,0.248385,1,4,14,0,7,0.000000 53 | 0.172677,0.353326,0.246494,0.227223,0.000280,2,2,4,0,20,1.000000 54 | 
0.000000,0.001455,0.172851,0.303352,0.522343,2,10,16,2,9,0.000000 55 | 0.367742,0.000000,0.207829,0.275175,0.149254,0,1,5,0,0,0.000000 56 | 0.000000,0.569791,0.030189,0.215236,0.184784,4,1,5,0,4,0.000000 57 | 0.044086,0.000000,0.242088,0.618698,0.095129,1,3,3,0,10,1.000000 58 | 0.062823,0.012348,0.104718,0.126612,0.693498,0,18,39,5,0,1.000000 59 | 0.302281,0.451045,0.000000,0.191085,0.055588,3,0,0,0,3,1.000000 60 | 0.000000,0.000000,0.000000,0.000000,1.000000,4,16,48,4,38,1.000000 61 | 0.000000,0.000000,0.330264,0.416459,0.253277,2,5,9,1,20,1.000000 62 | 0.130570,0.079232,0.704769,0.048698,0.036732,1,2,4,0,15,0.000000 63 | 0.211736,0.171886,0.217981,0.304112,0.094285,2,0,5,1,19,1.000000 64 | 0.000000,0.482511,0.179350,0.181636,0.156503,1,1,2,0,12,0.000000 65 | 0.169489,0.127587,0.362061,0.127720,0.213142,1,5,15,1,6,0.000000 66 | 0.063654,0.483960,0.082888,0.271711,0.097787,2,3,9,0,16,1.000000 67 | 0.187733,0.450175,0.293570,0.000000,0.068522,2,2,1,0,17,1.000000 68 | 0.072522,0.672800,0.080936,0.160421,0.013321,4,0,0,0,22,1.000000 69 | 0.158278,0.063573,0.428389,0.349759,0.000000,2,0,2,1,19,0.000000 70 | 0.000000,0.419720,0.247382,0.310430,0.022467,2,2,10,0,21,1.000000 71 | 0.169931,0.143725,0.246039,0.440305,0.000000,3,0,4,0,50,0.000000 72 | -------------------------------------------------------------------------------- /pk/tests/test_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | __author__ = 'Bhavesh' 3 | 4 | 5 | from pk.utils.loading import * 6 | from pk.utils.regression_utils import * 7 | from pk.utils.metrics import * 8 | from prettytable import PrettyTable 9 | def runall_regression(X, y): 10 | T = PrettyTable(["Regression Method", "Train Accuracy (%)", "Test Accuracy (%)", "Variance score", "Mean Squared Error", "Mean Abs Error", "Median Abs Error", "R2 score"]) 11 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3,random_state=0) 12 | 13 | # leastsquare 14 | 
ls = train_leastSquareModel(X_train, y_train) 15 | ls_train_acc = get_train_accuracy(ls, X_train, y_train) 16 | ls_test_acc = get_test_accuracy(ls, X_test, y_test) 17 | ls_var = get_variance_score(ls, X_test, y_test) 18 | ls_mse = get_mean_squared_error(ls, X_test, y_test) 19 | ls_mean = get_mean_abs_error(ls, X_test, y_test) 20 | ls_med = get_median_abs_error(ls, X_test, y_test) 21 | ls_r2 = get_r2_score(ls, X_test, y_test) 22 | T.add_row((["Least Square Linear", ls_train_acc, ls_test_acc, ls_var, ls_mse, ls_mean, ls_med, ls_r2])) 23 | 24 | # polynomial model with degree 3 25 | poly = train_polynomialRegressionModel(X_train, y_train, degree=3) 26 | poly_train_acc = get_train_accuracy(poly, X_train, y_train) 27 | poly_test_acc = get_test_accuracy(poly, X_test, y_test) 28 | poly_var = get_variance_score(poly, X_test, y_test) 29 | poly_mse = get_mean_squared_error(poly, X_test, y_test) 30 | poly_mean = get_mean_abs_error(poly, X_test, y_test) 31 | poly_med = get_median_abs_error(poly, X_test, y_test) 32 | poly_r2 = get_r2_score(poly, X_test, y_test) 33 | T.add_row((["Polynomial (degree = 3)", poly_train_acc, poly_test_acc, poly_var, poly_mse, poly_mean, poly_med, poly_r2])) 34 | 35 | # logistic regression 36 | log = train_logisticRegressionModel(X_train, y_train) 37 | log_var = get_variance_score(log, X_test, y_test) 38 | log_mse = get_mean_squared_error(log, X_test, y_test) 39 | log_mean = get_mean_abs_error(log, X_test, y_test) 40 | log_med = get_median_abs_error(log, X_test, y_test) 41 | log_r2 = get_r2_score(log, X_test, y_test) 42 | T.add_row((["Logistic", "NA", "NA", log_var, log_mse, log_mean, log_med, log_r2])) 43 | 44 | # RANSAN 45 | ransac = train_RANSACRegressionModel(X_train, y_train) 46 | ransac_train_acc = get_train_accuracy(ransac, X_train, y_train) 47 | ransac_test_acc = get_test_accuracy(ransac, X_test, y_test) 48 | ransac_var = get_variance_score(ransac, X_test, y_test) 49 | ransac_mse = get_mean_squared_error(ransac, X_test, y_test) 50 | 
ransac_mean = get_mean_abs_error(ransac, X_test, y_test) 51 | ransac_med = get_median_abs_error(ransac, X_test, y_test) 52 | ransac_r2 = get_r2_score(ransac, X_test, y_test) 53 | T.add_row((["RANSAC", ransac_train_acc, ransac_test_acc, ransac_var, ransac_mse, ransac_mean, ransac_med, ransac_r2])) 54 | 55 | # Bayes 56 | bayes = train_BayesianRegressionModel(X_train, y_train) 57 | bayes_train_acc = get_train_accuracy(bayes, X_train, y_train) 58 | bayes_test_acc = get_test_accuracy(bayes, X_test, y_test) 59 | bayes_var = get_variance_score(bayes, X_test, y_test) 60 | bayes_mse = get_mean_squared_error(bayes, X_test, y_test) 61 | bayes_mean = get_mean_abs_error(bayes, X_test, y_test) 62 | bayes_med = get_median_abs_error(bayes, X_test, y_test) 63 | bayes_r2 = get_r2_score(bayes, X_test, y_test) 64 | T.add_row((["Bayesian", bayes_train_acc, bayes_test_acc, bayes_var, bayes_mse, bayes_mean, bayes_med, bayes_r2])) 65 | 66 | # Kernel ridge 67 | kr = train_kernelRidgeModel(X_train, y_train) 68 | kr_train_acc = get_train_accuracy(kr, X_train, y_train) 69 | kr_test_acc = get_test_accuracy(kr, X_test, y_test) 70 | kr_var = get_variance_score(kr, X_test, y_test) 71 | kr_mse = get_mean_squared_error(kr, X_test, y_test) 72 | kr_mean = get_mean_abs_error(kr, X_test, y_test) 73 | kr_med = get_median_abs_error(kr, X_test, y_test) 74 | kr_r2 = get_r2_score(kr, X_test, y_test) 75 | T.add_row((["Kernel Ridge", kr_train_acc, kr_test_acc, kr_var, kr_mse, kr_mean, kr_med, kr_r2])) 76 | print T 77 | 78 | # dataset for regression 79 | X, y, _ = load_csv('concrete.csv') 80 | runall_regression(X, y) 81 | 82 | """ 83 | -------------------------------------------------------------------------------- /pk/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.preprocessing import scale 3 | from sklearn.preprocessing import normalize 4 | from sklearn.preprocessing import LabelEncoder 5 | from 
class PreprocessingEngine(object):
    """
    This class provides functions for preprocessing the feature array.

    The methods keep no state on the instance; each one wraps a numpy or
    scikit-learn routine and returns the transformed data.
    """

    def standardize(self, X, axis=0, with_mean=True, with_std=True, copy=True):
        """
        Standardize a dataset along any axis.

        Args:
            X: numpy feature array of size (n_examples, n_features)
            axis: axis to compute mean and stds along.
            with_mean: if True, center data before scaling
            with_std: if True, scale to unit variance
            copy: if False, do inplace normalization and avoid copying array

        Returns:
            changed_X: mean-shifted X with unit variance
        """
        # Pass the options by keyword so the call does not depend on the
        # positional order of scale()'s optional parameters.
        return scale(X, axis=axis, with_mean=with_mean, with_std=with_std,
                     copy=copy)

    def normalize_data(self, X, norm='l2', axis=1, copy=True):
        """
        Scale input vectors to unit norm.

        Args:
            X: numpy feature array with shape (n_samples, n_features)
            norm: the norm to use
            axis: axis along which to normalize
            copy: if False, do inplace row normalization

        Returns:
            A normalized numpy array.
        """
        # Keywords for the same reason as in standardize().
        return normalize(X, norm=norm, axis=axis, copy=copy)

    def binarize(self, y):
        """
        Binarize class labels to support 1 vs. all classification.

        Args:
            y: target - numpy array (1, n_examples)

        Returns:
            A binarized target array
        """
        return LabelBinarizer().fit_transform(y)

    def encode_labels(self, X):
        """
        Converts categorical feature columns to integer codes.

        A column is treated as categorical when the value in its first row
        is not a number according to is_number(); such a column is replaced
        by the output of LabelEncoder. All other columns are kept as-is.

        Arguments:
            X: 2-D numpy feature array

        Returns:
            Feature array with categorical columns replaced with numbers.
        """
        # Build the columns in index order instead of relying on the
        # iteration order of a dict, so column positions are preserved.
        columns = []
        for i in range(X.shape[1]):
            column = X[:, i]
            if not is_number(X[0, i]):
                column = LabelEncoder().fit_transform(column)
            columns.append(column)

        # columns is (n_features, n_examples); transpose back.
        return np.array(columns).T

    def convert_to_float_array(self, arr):
        """
        Converts a numpy array to float data type.
        """
        return arr.astype(float)

    def remove_incomplete_examples(self, X, y, missing_char="?"):
        """
        Removes examples with missing/incomplete features.

        Args:
            X: 2-D numpy feature array
            y: numpy target array with one entry per row of X
            missing_char: Placeholder for intended value

        Returns:
            Numpy feature array X with bad examples removed
            target classes with bad examples removed
        """
        # A row is dropped when any of its cells equals the placeholder.
        keep = ~(X == missing_char).any(axis=1)
        return X[keep, :], y[keep]

    def impute_missing_values(self, X, missing_values='NaN', strategy='mean',
                              axis=0, verbose=0, copy=True):
        """
        Replaces missing values in the feature array with inferred values.

        Args:
            X: numpy feature array
            missing_values: placeholder for missing value
            strategy: default - 'mean', replace missing_values using strategy
                      along the axis
            axis: axis along which to impute
            verbose: verbosity of imputer
            copy: if True, create a copy of X

        Returns:
            Numpy feature array with the missing values filled in by the
            imputer.
        """
        imp = Imputer(missing_values=missing_values, strategy=strategy,
                      axis=axis, verbose=verbose, copy=copy)
        return imp.fit_transform(X)
45 | 5.0,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3.0,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | 5.3,3.7,1.5,0.2,setosa 51 | 5.0,3.3,1.4,0.2,setosa 52 | 7.0,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,3.1,4.9,1.5,versicolor 55 | 5.5,2.3,4.0,1.3,versicolor 56 | 6.5,2.8,4.6,1.5,versicolor 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 4.9,2.4,3.3,1.0,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5.0,2.0,3.5,1.0,versicolor 63 | 5.9,3.0,4.2,1.5,versicolor 64 | 6.0,2.2,4.0,1.0,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3.0,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1.0,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4.0,1.3,versicolor 74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3.0,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3.0,5.0,1.7,versicolor 80 | 6.0,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1.0,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1.0,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6.0,2.7,5.1,1.6,versicolor 86 | 5.4,3.0,4.5,1.5,versicolor 87 | 6.0,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3.0,4.1,1.3,versicolor 91 | 5.5,2.5,4.0,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,3.0,4.6,1.4,versicolor 94 | 5.8,2.6,4.0,1.2,versicolor 95 | 5.0,2.3,3.3,1.0,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3.0,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3.0,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6.0,2.5,virginica 103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3.0,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3.0,5.8,2.2,virginica 107 | 7.6,3.0,6.6,2.1,virginica 108 | 
__author__ = 'Bhavesh'

from sklearn.linear_model import LinearRegression, LogisticRegression, RANSACRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from numpy import inf


def train_leastSquareModel(X, y, fit_intercept=True, normalize=False,
                           copy_X=True, n_jobs=1):
    """
    Train a regression model using Least Square method

    All keyword arguments are forwarded unchanged to LinearRegression.

    Returns:
        The LinearRegression estimator fitted on (X, y)
    """
    estimator = LinearRegression(fit_intercept=fit_intercept,
                                 normalize=normalize,
                                 copy_X=copy_X,
                                 n_jobs=n_jobs)
    return estimator.fit(X, y)


def train_kernelRidgeModel(X, y, alpha=1, kernel='linear',gamma=None, degree=3,
                           coef0=1, kernel_params=None):
    """
    Train a kernel ridge regression model

    All keyword arguments are forwarded unchanged to KernelRidge.

    Returns:
        The KernelRidge estimator fitted on (X, y)
    """
    estimator = KernelRidge(alpha=alpha,
                            kernel=kernel,
                            gamma=gamma,
                            degree=degree,
                            coef0=coef0,
                            kernel_params=kernel_params)
    return estimator.fit(X, y)


def train_logisticRegressionModel(X, y, penalty='l2', dual=False, tol=0.0001,
                                  C=1.0, fit_intercept=True, intercept_scaling=1,
                                  class_weight=None, random_state=None,
                                  solver='liblinear', max_iter=100,
                                  multi_class='ovr', verbose=False):
    """
    Train a logistic regression model

    All keyword arguments are forwarded unchanged to LogisticRegression.

    Returns:
        The LogisticRegression estimator fitted on (X, y)
    """
    estimator = LogisticRegression(penalty=penalty,
                                   dual=dual,
                                   tol=tol,
                                   C=C,
                                   fit_intercept=fit_intercept,
                                   intercept_scaling=intercept_scaling,
                                   class_weight=class_weight,
                                   random_state=random_state,
                                   solver=solver,
                                   max_iter=max_iter,
                                   multi_class=multi_class,
                                   verbose=verbose)
    return estimator.fit(X, y)


def train_polynomialRegressionModel(X, y, degree=2, interaction_only=False,
                                    include_bias=True):
    """
    Train a polynomial model using Linear Regression Pipeline with degrees

    The pipeline expands X with PolynomialFeatures and fits a
    LinearRegression on the expanded features.

    Args:
        degree: degree of the polynomial expansion
        interaction_only: forwarded to PolynomialFeatures
        include_bias: forwarded to PolynomialFeatures. The linear step is
            built with fit_intercept=False, so with include_bias=False the
            model has no constant term.

    Returns:
        The Pipeline fitted on (X, y)
    """
    # interaction_only / include_bias used to be accepted but dropped;
    # forward them so the arguments actually take effect. The defaults
    # equal PolynomialFeatures' own defaults, so default calls are unchanged.
    expansion = PolynomialFeatures(degree=degree,
                                   interaction_only=interaction_only,
                                   include_bias=include_bias)
    pipeline = Pipeline([('poly', expansion),
                         ('linear', LinearRegression(fit_intercept=False))])
    return pipeline.fit(X, y)


def train_BayesianRegressionModel(X, y,n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False):
    """
    Train a Bayesian regression model

    All keyword arguments are forwarded unchanged to BayesianRidge.

    Returns:
        The BayesianRidge estimator fitted on (X, y)
    """
    estimator = BayesianRidge(n_iter=n_iter,
                              tol=tol,
                              alpha_1=alpha_1,
                              alpha_2=alpha_2,
                              lambda_1=lambda_1,
                              lambda_2=lambda_2,
                              compute_score=compute_score,
                              fit_intercept=fit_intercept,
                              normalize=normalize,
                              copy_X=copy_X,
                              verbose=verbose)
    return estimator.fit(X, y)


def train_RANSACRegressionModel(X, y, base_estimator=None, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, stop_n_inliers=inf, stop_score=inf, stop_probability=0.99, residual_metric=None, random_state=None):
    """
    Train a RANSAC regression model

    All keyword arguments are forwarded unchanged to RANSACRegressor.

    Returns:
        The RANSACRegressor estimator fitted on (X, y)
    """
    estimator = RANSACRegressor(base_estimator=base_estimator,
                                min_samples=min_samples,
                                residual_threshold=residual_threshold,
                                is_data_valid=is_data_valid,
                                is_model_valid=is_model_valid,
                                max_trials=max_trials,
                                stop_n_inliers=stop_n_inliers,
                                stop_score=stop_score,
                                stop_probability=stop_probability,
                                residual_metric=residual_metric,
                                random_state=random_state)
    return estimator.fit(X, y)
"37",1.867,48 39 | "38",4.833,80 40 | "39",1.833,59 41 | "40",4.783,90 42 | "41",4.35,80 43 | "42",1.883,58 44 | "43",4.567,84 45 | "44",1.75,58 46 | "45",4.533,73 47 | "46",3.317,83 48 | "47",3.833,64 49 | "48",2.1,53 50 | "49",4.633,82 51 | "50",2,59 52 | "51",4.8,75 53 | "52",4.716,90 54 | "53",1.833,54 55 | "54",4.833,80 56 | "55",1.733,54 57 | "56",4.883,83 58 | "57",3.717,71 59 | "58",1.667,64 60 | "59",4.567,77 61 | "60",4.317,81 62 | "61",2.233,59 63 | "62",4.5,84 64 | "63",1.75,48 65 | "64",4.8,82 66 | "65",1.817,60 67 | "66",4.4,92 68 | "67",4.167,78 69 | "68",4.7,78 70 | "69",2.067,65 71 | "70",4.7,73 72 | "71",4.033,82 73 | "72",1.967,56 74 | "73",4.5,79 75 | "74",4,71 76 | "75",1.983,62 77 | "76",5.067,76 78 | "77",2.017,60 79 | "78",4.567,78 80 | "79",3.883,76 81 | "80",3.6,83 82 | "81",4.133,75 83 | "82",4.333,82 84 | "83",4.1,70 85 | "84",2.633,65 86 | "85",4.067,73 87 | "86",4.933,88 88 | "87",3.95,76 89 | "88",4.517,80 90 | "89",2.167,48 91 | "90",4,86 92 | "91",2.2,60 93 | "92",4.333,90 94 | "93",1.867,50 95 | "94",4.817,78 96 | "95",1.833,63 97 | "96",4.3,72 98 | "97",4.667,84 99 | "98",3.75,75 100 | "99",1.867,51 101 | "100",4.9,82 102 | "101",2.483,62 103 | "102",4.367,88 104 | "103",2.1,49 105 | "104",4.5,83 106 | "105",4.05,81 107 | "106",1.867,47 108 | "107",4.7,84 109 | "108",1.783,52 110 | "109",4.85,86 111 | "110",3.683,81 112 | "111",4.733,75 113 | "112",2.3,59 114 | "113",4.9,89 115 | "114",4.417,79 116 | "115",1.7,59 117 | "116",4.633,81 118 | "117",2.317,50 119 | "118",4.6,85 120 | "119",1.817,59 121 | "120",4.417,87 122 | "121",2.617,53 123 | "122",4.067,69 124 | "123",4.25,77 125 | "124",1.967,56 126 | "125",4.6,88 127 | "126",3.767,81 128 | "127",1.917,45 129 | "128",4.5,82 130 | "129",2.267,55 131 | "130",4.65,90 132 | "131",1.867,45 133 | "132",4.167,83 134 | "133",2.8,56 135 | "134",4.333,89 136 | "135",1.833,46 137 | "136",4.383,82 138 | "137",1.883,51 139 | "138",4.933,86 140 | "139",2.033,53 141 | "140",3.733,79 142 | 
"141",4.233,81 143 | "142",2.233,60 144 | "143",4.533,82 145 | "144",4.817,77 146 | "145",4.333,76 147 | "146",1.983,59 148 | "147",4.633,80 149 | "148",2.017,49 150 | "149",5.1,96 151 | "150",1.8,53 152 | "151",5.033,77 153 | "152",4,77 154 | "153",2.4,65 155 | "154",4.6,81 156 | "155",3.567,71 157 | "156",4,70 158 | "157",4.5,81 159 | "158",4.083,93 160 | "159",1.8,53 161 | "160",3.967,89 162 | "161",2.2,45 163 | "162",4.15,86 164 | "163",2,58 165 | "164",3.833,78 166 | "165",3.5,66 167 | "166",4.583,76 168 | "167",2.367,63 169 | "168",5,88 170 | "169",1.933,52 171 | "170",4.617,93 172 | "171",1.917,49 173 | "172",2.083,57 174 | "173",4.583,77 175 | "174",3.333,68 176 | "175",4.167,81 177 | "176",4.333,81 178 | "177",4.5,73 179 | "178",2.417,50 180 | "179",4,85 181 | "180",4.167,74 182 | "181",1.883,55 183 | "182",4.583,77 184 | "183",4.25,83 185 | "184",3.767,83 186 | "185",2.033,51 187 | "186",4.433,78 188 | "187",4.083,84 189 | "188",1.833,46 190 | "189",4.417,83 191 | "190",2.183,55 192 | "191",4.8,81 193 | "192",1.833,57 194 | "193",4.8,76 195 | "194",4.1,84 196 | "195",3.966,77 197 | "196",4.233,81 198 | "197",3.5,87 199 | "198",4.366,77 200 | "199",2.25,51 201 | "200",4.667,78 202 | "201",2.1,60 203 | "202",4.35,82 204 | "203",4.133,91 205 | "204",1.867,53 206 | "205",4.6,78 207 | "206",1.783,46 208 | "207",4.367,77 209 | "208",3.85,84 210 | "209",1.933,49 211 | "210",4.5,83 212 | "211",2.383,71 213 | "212",4.7,80 214 | "213",1.867,49 215 | "214",3.833,75 216 | "215",3.417,64 217 | "216",4.233,76 218 | "217",2.4,53 219 | "218",4.8,94 220 | "219",2,55 221 | "220",4.15,76 222 | "221",1.867,50 223 | "222",4.267,82 224 | "223",1.75,54 225 | "224",4.483,75 226 | "225",4,78 227 | "226",4.117,79 228 | "227",4.083,78 229 | "228",4.267,78 230 | "229",3.917,70 231 | "230",4.55,79 232 | "231",4.083,70 233 | "232",2.417,54 234 | "233",4.183,86 235 | "234",2.217,50 236 | "235",4.45,90 237 | "236",1.883,54 238 | "237",1.85,54 239 | "238",4.283,77 240 | "239",3.95,79 241 
| "240",2.333,64 242 | "241",4.15,75 243 | "242",2.35,47 244 | "243",4.933,86 245 | "244",2.9,63 246 | "245",4.583,85 247 | "246",3.833,82 248 | "247",2.083,57 249 | "248",4.367,82 250 | "249",2.133,67 251 | "250",4.35,74 252 | "251",2.2,54 253 | "252",4.45,83 254 | "253",3.567,73 255 | "254",4.5,73 256 | "255",4.15,88 257 | "256",3.817,80 258 | "257",3.917,71 259 | "258",4.45,83 260 | "259",2,56 261 | "260",4.283,79 262 | "261",4.767,78 263 | "262",4.533,84 264 | "263",1.85,58 265 | "264",4.25,83 266 | "265",1.983,43 267 | "266",2.25,60 268 | "267",4.75,75 269 | "268",4.117,81 270 | "269",2.15,46 271 | "270",4.417,90 272 | "271",1.817,46 273 | "272",4.467,74 274 | -------------------------------------------------------------------------------- /pk/tests/test_loading.py: -------------------------------------------------------------------------------- 1 | # Author: Sean Dai 2 | import cPickle 3 | import logging 4 | import os 5 | 6 | from numpy.testing import assert_array_equal 7 | from nose.tools import assert_true 8 | from nose.plugins.attrib import attr 9 | from numpy.testing import assert_array_almost_equal 10 | from pandas.util.testing import assert_frame_equal 11 | from pk.utils.loading import * 12 | 13 | 14 | __DIR_NAME = os.path.abspath(os.path.dirname(__file__)) + '/' 15 | 16 | 17 | def test_load_arff(): 18 | X, y, _ = load_arff(__DIR_NAME + "ratings_best.arff") 19 | X2, y2 = cPickle.load(open(__DIR_NAME + 'correct_array.pkl', 'r')) 20 | assert_true((X == X2).all()) 21 | assert_true((y == y2).all()) 22 | 23 | 24 | def test_load_arff_categorical(): 25 | X, y, _ = load_arff(__DIR_NAME + "credit-g.arff") 26 | print X, y 27 | logging.info((X, y)) 28 | 29 | 30 | def test_vectorize(): 31 | X = np.array([['a', 1], ['b', 2], ['a', 1]]) 32 | y = np.array(['0', '1', '0']) 33 | features = ['f1', 'f2', 'class'] 34 | X2, y2 = vectorize_categorical_data(X, y, features) 35 | exp_vec_X = np.array([[1., 0., 1.], 36 | [0., 1., 2.], 37 | [1., 0., 1.]]) 38 | 
assert_array_equal(exp_vec_X, X2) 39 | 40 | 41 | def test_vectorize_numeric(): 42 | X = np.array([[0, 1, 3, 4], [2, 1, 1, 1], [4, 55, 2, 1]]) 43 | y = np.array([0, 1, 0, 1]) 44 | features = ['num1', 'num2', 'num3', 'num4', 'class'] 45 | X2, y2 = vectorize_categorical_data(X, y, features) 46 | exp_vec_X = np.array([[0., 1., 3., 4.], 47 | [2., 1., 1., 1.], 48 | [4., 55., 2., 1.]]) 49 | exp_vec_y = np.array([0, 1, 0, 1]) 50 | assert_array_equal(X2, exp_vec_X) 51 | assert_array_equal(y2, exp_vec_y) 52 | 53 | def test_vectorize_bool_numeric(): 54 | X = np.array([[0, 1, 3, True], [2, 1, 1, False], [4, 55, 2, True]]) 55 | y = np.array([0, 1, 0, 1]) 56 | features = ['num1', 'num2', 'num3', 'num4', 'class'] 57 | X2, y2 = vectorize_categorical_data(X, y, features) 58 | exp_vec_X = np.array([[0., 1., 3., 1.], 59 | [2., 1., 1., 0.], 60 | [4., 55., 2., 1.]]) 61 | exp_vec_y = np.array([0, 1, 0, 1]) 62 | assert_array_equal(X2, exp_vec_X) 63 | assert_array_equal(y2, exp_vec_y) 64 | 65 | def test_vectorize_bool_only(): 66 | X = np.array([[False, True], [False, False], [True, True]]) 67 | y = np.array([0, 1, 0]) 68 | features = ['bool1', 'bool2', 'class'] 69 | X2, y2 = vectorize_categorical_data(X, y, features) 70 | exp_vec_X = np.array([[0, 1], 71 | [0, 0], 72 | [1, 1]]) 73 | exp_vec_y = np.array([0, 1, 0]) 74 | assert_array_equal(X2, exp_vec_X) 75 | assert_array_equal(y2, exp_vec_y) 76 | 77 | def test_load_categorical_no_vectorize(): 78 | X, y, _ = load_arff(__DIR_NAME + "credit-g.arff", vectorize_data=False) 79 | correct_list = ["'<0'", '6.0', "'critical/other existing credit'", 'radio/tv', '1169.0', 80 | "'no known savings'", "'>=7'", '4.0', "'male single'", 'none', '4.0', 81 | "'real estate'", '67.0', 'none' ,'own' ,'2.0', 'skilled', '1.0', 'yes', 'yes'] 82 | assert_array_equal(X[0], correct_list) 83 | 84 | def test_load_csv(): 85 | filename = __DIR_NAME + 'iris.csv' 86 | X, y, _ = load_csv(filename) 87 | expX = [[5.8,4,1.2,0.2], 88 | [5.9,3,4.2,1.5], 89 | [6.5,3.2,5.1,2]] 90 | 
expY = ['setosa', 'versicolor', 'virginica'] 91 | assert_array_equal(X, expX) 92 | assert_array_equal(y, expY) 93 | 94 | def test_load_excel(): 95 | filename = __DIR_NAME + 'Wine.xls' 96 | X, y, _ = load_excel(filename) 97 | expX = np.array([[ 1.42300000e+01, 1.71000000e+00, 2.43000000e+00, 98 | 1.56000000e+01, 1.27000000e+02, 2.80000000e+00, 99 | 3.06000000e+00, 2.80000000e-01, 2.29000000e+00, 100 | 5.64000000e+00, 1.04000000e+00, 3.92000000e+00, 101 | 1.06500000e+03], 102 | [ 1.23700000e+01, 9.40000000e-01, 1.36000000e+00, 103 | 1.06000000e+01, 8.80000000e+01, 1.98000000e+00, 104 | 5.70000000e-01, 2.80000000e-01, 4.20000000e-01, 105 | 1.95000000e+00, 1.05000000e+00, 1.82000000e+00, 106 | 5.20000000e+02], 107 | [ 1.28600000e+01, 1.35000000e+00, 2.32000000e+00, 108 | 1.80000000e+01, 1.22000000e+02, 1.51000000e+00, 109 | 1.25000000e+00, 2.10000000e-01, 9.40000000e-01, 110 | 4.10000000e+00, 7.60000000e-01, 1.29000000e+00, 111 | 6.30000000e+02]]) 112 | expY = ['A', 'B', 'C'] 113 | assert_array_almost_equal(X, expX) 114 | assert_array_equal(y, expY) 115 | 116 | def test_generate_random(): 117 | np.random.seed(42) 118 | X, y, df = generate_random_points(5) 119 | expX = np.array([[-0.92998481, 9.78172086], 120 | [ 4.88184111, 0.05988944], 121 | [-2.97867201, 9.55684617], 122 | [-8.60454502, -7.44239712], 123 | [ 4.17646114, 1.50743993]]) 124 | expY = np.array([0, 1, 0, 2, 1]) 125 | exp_df = pd.DataFrame(np.hstack((expX,expY[:, np.newaxis]))) 126 | 127 | assert_array_almost_equal(X, expX) 128 | assert_array_equal(y, expY) 129 | assert_frame_equal(df, exp_df) 130 | 131 | @attr('slow') 132 | def test_mldata(): 133 | dl = DatasetIO() 134 | X, y, df = dl.load_from_mldata('iris') 135 | 136 | # test_load_arff() 137 | # test_load_arff_categorical() 138 | # test_vectorize() 139 | # test_vectorize_numeric() 140 | # test_load_categorical_no_vectorize() 141 | # test_load_excel() 142 | # test_generate_random() 
__author__ = 'Bhavesh'

from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.linear_model import SGDClassifier

from prygress import *


def train_decision_tree(X, y, criterion='gini', splitter='best', max_depth=None,
                        min_samples_split=2, min_samples_leaf=1,
                        max_features=None, random_state=None,
                        max_leaf_nodes=None):
    """
    Builds a decision tree model.

    All keyword arguments are forwarded unchanged to DecisionTreeClassifier.

    Returns:
        clf: Fitted Decision tree classifier object
    """
    clf = DecisionTreeClassifier(criterion=criterion,
                                 splitter=splitter,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 max_features=max_features,
                                 random_state=random_state,
                                 max_leaf_nodes=max_leaf_nodes)
    clf = clf.fit(X, y)
    print('Decision Tree done!')
    return clf


def train_svm(X, y, C=1.0, kernel='linear', degree=3, gamma=0.0, coef0=0.0,
              shrinking=True, probability=False, tol=0.001, cache_size=200,
              class_weight=None, verbose=False, max_iter=-1, random_state=None):
    """
    Builds a support vector machine model

    All keyword arguments are forwarded unchanged to svm.SVC.

    Returns:
        clf: Fitted SVM classifier object
    """
    clf = svm.SVC(C=C,
                  kernel=kernel,
                  degree=degree,
                  gamma=gamma,
                  # Was `coef0=gamma`, which silently ignored the caller's
                  # coef0 and reused gamma instead.
                  coef0=coef0,
                  shrinking=shrinking,
                  probability=probability,
                  tol=tol,
                  cache_size=cache_size,
                  class_weight=class_weight,
                  verbose=verbose,
                  max_iter=max_iter,
                  random_state=random_state)
    clf = clf.fit(X, y)
    print('SVM completed!')
    return clf


def train_knn(X, y, n_neighbors=5, weights='uniform', algorithm='auto',
              leaf_size=30, p=2, metric='minkowski', metric_params=None):
    """
    Builds a k-nearest neighbor model

    All keyword arguments are forwarded unchanged to KNeighborsClassifier.

    Returns:
        clf: Fitted nearest neighbor model
    """
    clf = KNeighborsClassifier(n_neighbors=n_neighbors,
                               weights=weights,
                               algorithm=algorithm,
                               leaf_size=leaf_size,
                               p=p,
                               metric=metric,
                               metric_params=metric_params)
    clf = clf.fit(X, y)
    print('KNN completed!')
    return clf


def train_naive_bayes(X, y, distribution='Gaussian'):
    """
    Builds a naive bayes classification model

    Args:
        distribution: 'Gaussian' selects GaussianNB, 'Multinomial' selects
            MultinomialNB, any other value selects BernoulliNB.

    Returns:
        clf: Fitted naive bayes model
    """
    # The check used to test only the misspelling 'Guassian', so the
    # default 'Gaussian' fell through to BernoulliNB. The misspelling is
    # still accepted for callers that relied on it.
    if distribution in ('Gaussian', 'Guassian'):
        clf = GaussianNB()
    elif distribution == 'Multinomial':
        clf = MultinomialNB()
    else:
        clf = BernoulliNB()
    clf = clf.fit(X, y)
    print('Naive Bayes completed!')
    return clf


def train_adaboost(X, y, base_estimator=DecisionTreeClassifier, n_estimators=50, learning_rate=1.0,
                   algorithm='SAMME.R', random_state=None):
    """
    Builds a Boost classifier with decision tree as base estimator

    Args:
        base_estimator: estimator instance to boost. An estimator class
            (such as the default) is also accepted and is instantiated
            with its default parameters.

    Returns:
        clf: Fitted ada boost model
    """
    # AdaBoostClassifier expects an estimator instance; the default of
    # this function is the class itself, so instantiate it here.
    if isinstance(base_estimator, type):
        base_estimator = base_estimator()
    clf = AdaBoostClassifier(base_estimator=base_estimator,
                             n_estimators=n_estimators,
                             learning_rate=learning_rate,
                             algorithm=algorithm,
                             random_state=random_state)
    clf = clf.fit(X, y)
    print('AdaBoost completed!')
    return clf


def train_lda(X, y, solver='svd', shrinkage=None, priors=None, n_components=None,
              store_covariance=False, tol=0.0001):
    """
    Builds a linear discriminant analysis model

    All keyword arguments are forwarded unchanged to LDA.

    Returns:
        clf: Fitted LDA model
    """
    clf = LDA(solver=solver,
              shrinkage=shrinkage,
              priors=priors,
              n_components=n_components,
              store_covariance=store_covariance,
              tol=tol)
    clf = clf.fit(X, y)
    print('Linear Discriminant Analysis completed!')
    return clf


def train_qda(X, y, priors=None, reg_param=0.0):
    """
    Builds a quadratic discriminant analysis model

    All keyword arguments are forwarded unchanged to QDA.

    Returns:
        clf: Fitted QDA model
    """
    clf = QDA(priors=priors,
              reg_param=reg_param)
    clf = clf.fit(X, y)
    print('Quadratic Discriminant Analysis completed!')
    return clf


def train_bagging(X, y, base_estimator=None, n_estimators=10, max_samples=1.0,
                  max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, n_jobs=1,
                  random_state=None, verbose=0):
    """
    Builds a Bagging model based on decision tree

    All keyword arguments are forwarded unchanged to BaggingClassifier.

    Returns:
        clf: Fitted Bagging classifier
    """
    clf = BaggingClassifier(base_estimator=base_estimator,
                            n_estimators=n_estimators,
                            max_samples=max_samples,
                            max_features=max_features,
                            bootstrap=bootstrap,
                            bootstrap_features=bootstrap_features,
                            oob_score=oob_score,
                            n_jobs=n_jobs,
                            random_state=random_state,
                            verbose=verbose)
    return clf.fit(X, y)


def train_randomForest(X, y, n_estimators=10, criterion='gini', max_depth=None,
                       min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                       max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False,
                       n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None):
    """
    Builds a random forest classifier

    All keyword arguments are forwarded unchanged to RandomForestClassifier.

    Returns: Fitted random forest model
    """
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 criterion=criterion,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 min_weight_fraction_leaf=min_weight_fraction_leaf,
                                 max_features=max_features,
                                 max_leaf_nodes=max_leaf_nodes,
                                 bootstrap=bootstrap,
                                 oob_score=oob_score,
                                 n_jobs=n_jobs,
                                 random_state=random_state,
                                 verbose=verbose,
                                 warm_start=warm_start,
                                 class_weight=class_weight)
    return clf.fit(X, y)


def train_stochaticGradientDescent(X, y, loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                                   fit_intercept=True, n_iter=5, shuffle=True, verbose=0,
                                   epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal',
                                   eta0=0.0, power_t=0.5, class_weight=None, warm_start=False,
                                   average=False):
    """
    Builds a linear classifier trained with stochastic gradient descent

    All keyword arguments are forwarded unchanged to SGDClassifier.

    Returns:
        clf: Fitted SGD classifier
    """
    clf = SGDClassifier(loss=loss,
                        penalty=penalty,
                        alpha=alpha,
                        l1_ratio=l1_ratio,
                        fit_intercept=fit_intercept,
                        n_iter=n_iter,
                        shuffle=shuffle,
                        verbose=verbose,
                        epsilon=epsilon,
                        n_jobs=n_jobs,
                        random_state=random_state,
                        learning_rate=learning_rate,
                        eta0=eta0,
                        power_t=power_t,
                        class_weight=class_weight,
                        warm_start=warm_start,
                        average=average)
    return clf.fit(X, y)
[numpy](https://drive.google.com/open?id=1TYYzeYfbz6GQZPwTKHsBl528ujDL7akzCeEhxbaLMls) 32 | 5. [scipy](https://drive.google.com/open?id=1TYYzeYfbz6GQZPwTKHsBl528ujDL7akzCeEhxbaLMls) 33 | 6. [matplotlib](http://matplotlib.org/users/installing.html) 34 | 7. [seaborn](http://stanford.edu/~mwaskom/software/seaborn/installing.html) 35 | 8. [PIL](http://www.pythonware.com/products/pil/) 36 | 9. [pandas](http://pandas.pydata.org/pandas-docs/stable/install.html) 37 | 10. [nose](https://nose.readthedocs.org/en/latest/) 38 | 11. [PyQt](http://pyqt.sourceforge.net/Docs/PyQt4/installation.html) 39 | 40 | Alternatively, you can enter the makefile command 41 | ``` 42 | make install 43 | ``` 44 | to automatically install dependencies 3-10. 45 | 46 | 47 | ## User Manual 48 | To start the application, enter the command at the root directory of the project: 49 | 50 | ``` 51 | python cl_gui.py 52 | ``` 53 | A list of commands for the Pykit-Learn application is provided in the table below: 54 | 55 | Commands | Example | Description 56 | ---------------------------- | ------------------------------------- | ---------------- 57 | **load** [file] | `load ~/Downloads/data.csv` | Loads the dataset at the path specified by [file]. No quotes "" around the filename! 58 | **load_random** | `load_random` | Load a randomly generated dataset with 3 classes. 59 | **load_file_gui** | | Opens a file dialog for selecting the desired file. 60 | **plot_2d** | | Plot a 2-D distribution of the dataset. 61 | **plot_andrews** | | Plot an Andrews curve of the dataset. 62 | **plot_frequency** | | View the frequency of each class label. 63 | **plot_feature_matrix** | | Generate a matrix plot of feature-feature relationships. 64 | **plot_scatter_matrix** | | Matrix plot with KDEs along the diagonal. 65 | **plot_radial** | | Plot a radial chart of the dataset. 66 | **preprocess** [flags] | `preprocess -std` | Preprocesses a dataset. Flags: **-std** Standardize to mean 0 and variance 1. 
**-norm** Normalize each feature to range [0,1] 67 | **run** -A [alg] -test_ratio [0-1] -cv [int] | `run -A dt -test_ratio .3 -cv 5` | Runs the ML alg on the loaded dataset. **alg** = dt (Decision Tree). Can specify the test-train ratio. **-cv** enables k-fold cross validation. 68 | **visualize** --suppress | | Plots all possible visualizations for input data. **--suppress** disables all plotting output. 69 | **help** | | Provides a help screen of available commands. 70 | **quit** | | Quits the command line GUI. 71 | 72 | 73 | ## Examples 74 | ### Supervised Learning with the Iris Dataset 75 | **Step 1: Loading the file** 76 | ``` 77 | >> load pk/tests/iris2.csv 78 | Feature Array: 79 | [[ 5.1 3.5 1.4 0.2] 80 | [ 4.9 3. 1.4 0.2] 81 | [ 4.7 3.2 1.3 0.2] 82 | ... 83 | [ 6.2 3.4 5.4 2.3] 84 | [ 5.9 3. 5.1 1.8]] 85 | Target classifications: 86 | ['setosa' 'setosa' 'setosa' ... 87 | 'versicolor' 'versicolor' ... 88 | 'virginica'] 89 | ``` 90 | **Step 2: Visualizing the input in 2-D** 91 | ``` 92 | >> plot_2d 93 | Creating visualization(s). 94 | Viewing generated plots... 95 | ``` 96 | ![1] (http://i.imgur.com/94F1iXg.png) 97 | 98 | **Step 3: Preprocessing the input dataset** 99 | ``` 100 | >> preprocess -h 101 | usage: cl_gui.py [-h] [-std] [-norm] 102 | 103 | optional arguments: 104 | -h, --help show this help message and exit 105 | -std Standardize the feature array. 106 | -norm Normalize the values of each feature. 107 | 0 108 | >> preprocess -std 109 | Standardizing feature array... 110 | [[ -9.00681170e-01 1.03205722e+00 -1.34127240e+00 -1.31297673e+00] 111 | [ -1.14301691e+00 -1.24957601e-01 -1.34127240e+00 -1.31297673e+00] 112 | [ -1.38535265e+00 3.37848329e-01 -1.39813811e+00 -1.31297673e+00] 113 | [ -1.50652052e+00 1.06445364e-01 -1.28440670e+00 -1.31297673e+00] 114 | ... 
115 | [ 6.86617933e-02 -1.24957601e-01 7.62758643e-01 7.90590793e-01]] 116 | ``` 117 | 118 | **Step 4: Fitting a decision tree learner on Iris** 119 | ``` 120 | >> run -A dt -test_ratio .3 121 | Running decision tree algorithm on dataset... 122 | Decision Tree done! 123 | Train accuracy: 100.000000 124 | Test accuracy: 96.000000% 125 | Confusion Matrix is: 126 | [[16 0 0] 127 | [ 0 18 1] 128 | [ 0 1 14]] 129 | ``` 130 | ![2] (http://i.imgur.com/jXRDZhV.png) 131 | 132 | ## Testing 133 | The unit-testing framework used in this project is the **nose** Python module. Running the unit tests yourself 134 | is as simple as entering the following command in the root directory of the project: 135 | ``` 136 | make test 137 | ``` 138 | To run all the unit tests (this might take some time), type 139 | ``` 140 | make test-all 141 | ``` 142 | 143 | ## Todo List 144 | - [x] MVC Components 145 | - [x] Model Classes 146 | - [x] Algorithm 147 | - [x] SupervisedAlgorithm 148 | - [x] UnsupervisedAlgorithm 149 | - [x] RegressionAlgorithm 150 | - [x] ExecutionReport 151 | - [x] Controller Classes 152 | - [x] AlgorithmEngine 153 | - [x] DatasetIO 154 | - [x] PreprocessingEngine 155 | - [x] Visualizer 156 | - [x] View Classes 157 | - [x] BaseView 158 | - [x] Demos 159 | - [x] Image segmentation demo 160 | - [x] Command-line GUI 161 | - [x] Loading 162 | - [x] File formats 163 | - [x] .arff 164 | - [x] .csv 165 | - [x] .xls/.xlsx 166 | - [x] Generate random Gaussian data w/ labels 167 | - [x] Download dataset from mldata.org 168 | - [x] Preprocessing data 169 | - [x] Standardization 170 | - [x] Normalization of training examples 171 | - [x] Feature Binarization 172 | - [x] Remove examples with '?' 
missing values 173 | - [x] Imputation of missing values 174 | - [x] Numerical encoding of categorical features 175 | - [x] Supervised Learning 176 | - [x] Linear & Quadratic Discriminant Analysis 177 | - [x] SVMs 178 | - [x] Stochastic Gradient Descent 179 | - [x] kNN 180 | - [x] Decision Trees 181 | - [x] Ensemble Methods 182 | - [x] Bagging 183 | - [x] Randomized Trees 184 | - [x] AdaBoost 185 | - [x] Multiclass and Multilabel Algorithms 186 | - [x] Feature Selection 187 | - [x] Variance thresholding 188 | - [x] Univariate feature selection 189 | - [x] Generalized Linear Models 190 | - [x] Least Squares 191 | - [x] RANSAC 192 | - [x] Bayesian 193 | - [x] Logistic 194 | - [x] Polynomial 195 | - [x] Kernel Ridge Regression 196 | - [x] Unsupervised Learning 197 | - [x] Gaussian Mixture Models 198 | - [x] GMM 199 | - [x] DPGMM 200 | - [x] Manifold Learning 201 | - [x] Clustering 202 | - [x] K-means 203 | - [x] Spectral clustering 204 | - [x] Hierarchical clustering 205 | - [x] DBSCAN 206 | - [x] Decomposing signals into components 207 | - [x] PCA 208 | - [x] ICA 209 | - [x] Factor Analysis 210 | - [x] Covariance Estimation 211 | - [x] Novelty and Outlier Detection 212 | - [x] Restricted Boltzmann Machines 213 | - [x] Model Selection and Evaluation 214 | - [x] Cross Validation 215 | - [x] Grid Search 216 | - [x] Prediction Metrics 217 | - [x] Classification Metrics 218 | - [x] ROC 219 | - [x] Accuracy Score 220 | - [x] Confusion Matrix 221 | - [x] Regression Metrics 222 | - [x] MAE, MSE, R2 223 | - [x] Clustering Metrics 224 | - [x] Adjusted Rand index 225 | - [x] Homogeneity (similarity of items within cluster) 226 | - [x] Completeness (same class items all go in one cluster) 227 | - [x] Validation Curves 228 | - [x] Dataset Transformations 229 | - [x] Pipelining 230 | - [x] Feature Extraction 231 | - [x] Dictionary Vectorization 232 | - [x] Kernel Approximation 233 | - [x] Visualizations 234 | - [x] Plotting features (2d, frequency chart, radial plot, etc.) 
235 | 236 | 237 | 238 | -------------------------------------------------------------------------------- /pk/utils/clustering.py: -------------------------------------------------------------------------------- 1 | """This module provides clustering utility functions. 2 | Author: Bhavesh 3 | """ 4 | from sklearn import cluster 5 | from sklearn.mixture import GMM, DPGMM 6 | 7 | def train_gmm(X, n_components=3, covariance_type='diag', random_state=None, 8 | thresh=None, tol=0.001, min_covar=0.001, n_iter=100, n_init=1, 9 | params='wmc', init_params='wmc'): 10 | """Variational Inference for the Infinite Gaussian Mixture Model. 11 | 12 | DPGMM stands for Dirichlet Process Gaussian Mixture Model, and it 13 | is an infinite mixture model with the Dirichlet Process as a prior 14 | distribution on the number of clusters. In practice the 15 | approximate inference algorithm uses a truncated distribution with 16 | a fixed maximum number of components, but almost always the number 17 | of components actually used depends on the data. 18 | 19 | Stick-breaking Representation of a Gaussian mixture model 20 | probability distribution. This class allows for easy and efficient 21 | inference of an approximate posterior distribution over the 22 | parameters of a Gaussian mixture model with a variable number of 23 | components (smaller than the truncation parameter n_components). 24 | 25 | Initialization is with normally-distributed means and identity 26 | covariance, for proper convergence. 27 | 28 | Parameters 29 | ---------- 30 | n_components: int, optional 31 | Number of mixture components. Defaults to 1. 32 | 33 | covariance_type: string, optional 34 | String describing the type of covariance parameters to 35 | use. Must be one of 'spherical', 'tied', 'diag', 'full'. 36 | Defaults to 'diag'. 37 | 38 | alpha: float, optional 39 | Real number representing the concentration parameter of 40 | the dirichlet process. 
Intuitively, the Dirichlet Process 41 | is as likely to start a new cluster for a point as it is 42 | to add that point to a cluster with alpha elements. A 43 | higher alpha means more clusters, as the expected number 44 | of clusters is ``alpha*log(N)``. Defaults to 1. 45 | 46 | thresh : float, optional 47 | Convergence threshold. 48 | n_iter : int, optional 49 | Maximum number of iterations to perform before convergence. 50 | params : string, optional 51 | Controls which parameters are updated in the training 52 | process. Can contain any combination of 'w' for weights, 53 | 'm' for means, and 'c' for covars. Defaults to 'wmc'. 54 | init_params : string, optional 55 | Controls which parameters are updated in the initialization 56 | process. Can contain any combination of 'w' for weights, 57 | 'm' for means, and 'c' for covars. Defaults to 'wmc'. 58 | 59 | Attributes 60 | ---------- 61 | covariance_type : string 62 | String describing the type of covariance parameters used by 63 | the DP-GMM. Must be one of 'spherical', 'tied', 'diag', 'full'. 64 | 65 | n_components : int 66 | Number of mixture components. 67 | 68 | `weights_` : array, shape (`n_components`,) 69 | Mixing weights for each mixture component. 70 | 71 | `means_` : array, shape (`n_components`, `n_features`) 72 | Mean parameters for each mixture component. 73 | 74 | `precs_` : array 75 | Precision (inverse covariance) parameters for each mixture 76 | component. The shape depends on `covariance_type`:: 77 | 78 | (`n_components`, 'n_features') if 'spherical', 79 | (`n_features`, `n_features`) if 'tied', 80 | (`n_components`, `n_features`) if 'diag', 81 | (`n_components`, `n_features`, `n_features`) if 'full' 82 | 83 | `converged_` : bool 84 | True when convergence was reached in fit(), False otherwise. 
def train_kmeans(X, n_clusters=3, init='k-means++', n_init=10,
                 max_iter=300, tol=0.0001, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True, n_jobs=1):
    """
    This function trains a simple kmeans clustering model.

    Every keyword argument is forwarded to ``sklearn.cluster.KMeans`` under
    the same name.

    :param X: data to cluster, passed unchanged to ``fit``
    :param n_clusters: forwarded to KMeans
    :param init: forwarded to KMeans (initialisation strategy)
    :param n_init: forwarded to KMeans (number of initialisation runs)
    :param max_iter: forwarded to KMeans
    :param tol: forwarded to KMeans
    :param precompute_distances: forwarded to KMeans
    :param verbose: forwarded to KMeans
    :param random_state: forwarded to KMeans
    :param copy_x: forwarded to KMeans
    :param n_jobs: forwarded to KMeans
    :return: trained kmeans model for clustering
    """
    model = cluster.KMeans(n_clusters=n_clusters,
                           init=init,
                           # Previously ``n_init=init``: the init strategy
                           # string was passed as the restart count and the
                           # caller's n_init was silently dropped.
                           n_init=n_init,
                           max_iter=max_iter,
                           tol=tol,
                           precompute_distances=precompute_distances,
                           verbose=verbose,
                           random_state=random_state,
                           copy_x=copy_x,
                           n_jobs=n_jobs)
    model = model.fit(X)
    return model
def train_dbscan(X, eps=0.5, min_samples=5, metric='euclidean',
                 algorithm='auto', leaf_size=30, p=None, random_state=None):
    """
    Fit a DBSCAN (density based spatial clustering) model on X.

    Each keyword argument is handed to ``cluster.DBSCAN`` under the same name.

    :param X: data to cluster, passed unchanged to ``fit``
    :param eps: forwarded to DBSCAN
    :param min_samples: forwarded to DBSCAN
    :param metric: forwarded to DBSCAN
    :param algorithm: forwarded to DBSCAN
    :param leaf_size: forwarded to DBSCAN
    :param p: forwarded to DBSCAN
    :param random_state: forwarded to DBSCAN
    :return: the value returned by ``fit`` (the trained DBSCAN model)
    """
    options = {
        'eps': eps,
        'min_samples': min_samples,
        'metric': metric,
        'algorithm': algorithm,
        'leaf_size': leaf_size,
        'p': p,
        'random_state': random_state,
    }
    return cluster.DBSCAN(**options).fit(X)
def is_numeric_type(array):
    """
    Checks if the array's datatype is a number data type.

    Args:
        array: numpy array

    Returns:
        True if array.dtype is a signed integer, unsigned integer, float or
        complex type. Every other dtype (bool, string, object, ...) yields
        False, i.e. it is treated as non-numeric.
    """
    # dtype.kind is a one-character code: 'i' signed int, 'u' unsigned int,
    # 'f' float, 'c' complex. This replaces a lookup in np.sctypes, which was
    # removed in NumPy 2.0 and did not list every platform alias.
    return array.dtype.kind in ('i', 'u', 'f', 'c')
def is_number(s):
    """ True if s can be converted to a number type.

    The check is ``float(s)``: anything float() accepts (numeric strings,
    ints, floats, and strings such as 'nan' or 'inf') counts as a number.
    Values float() rejects by type, such as None or a list, return False
    instead of raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: a string that is not numeric; TypeError: a value that is
        # neither a string nor a number (e.g. None for a missing cell).
        return False
    return True
def load_csv(filename, vectorize_data=False):
    """
    Loads csv dataset files.

    The last column of the file is taken as the class label; all preceding
    columns form the feature array.

    Args:
        filename: str
        vectorize_data: when True, X and y are passed through
            vectorize_categorical_data before being returned

    Returns:
        X : a (num_examples, num_features) numpy array of examples X
        y : the class labels y of size (1, num_examples)
        dataset: DataFrame object for dataset file

        If the file cannot be opened, 'File does not exist' is printed and
        None is returned instead of the tuple.
    """
    try:
        dataset = pd.read_csv(filename, sep=',')
    except (IOError, OSError):
        # On Python 2 a missing file raises IOError, which is not an OSError
        # there; catch both so the best-effort message is actually reached.
        print('File does not exist')
        return None

    # .iloc is strictly positional; the former .ix lookup is deprecated/removed
    # in pandas and was label-based when column labels are integers.
    last_col = dataset.iloc[:, -1]
    y = np.array(last_col.tolist()).T
    column_names = dataset.dtypes.index
    X = np.array(dataset[column_names[:-1]])

    if is_numeric_type(X):
        X = X.astype(float)
    if is_numeric_type(y):
        y = y.astype(float)

    # Change categorical attributes to 1-hot numerical encoding
    if vectorize_data:
        X, y = vectorize_categorical_data(X, y, column_names)
    return X, y, dataset
def generate_random_points(n_samples=100, n_features=2, centers=3):
    """
    Build a random dataset of Gaussian blobs via ``make_blobs``.

    Args:
        n_samples: samples to have
        n_features: num of features per sample
        centers: number of clusters (or classes)

    Returns:
        X: feature array produced by make_blobs
        y: target array that corresponds each example to a cluster
        data_frame: DataFrame built from X and y by stack_to_data_frame
    """
    blob_args = {
        'n_samples': n_samples,
        'n_features': n_features,
        'centers': centers,
    }
    points, labels = make_blobs(**blob_args)
    frame = stack_to_data_frame(points, labels)
    return points, labels, frame
class DatasetIO(object):
    """
    This class performs loading and saving of dataset files.
    """
    def load_file(self, filename):
        """
        Loads a dataset file, choosing the loader from the file extension.

        Args:
            filename: path ending in .csv, .arff, .xls or .xlsx
                (the extension is matched case-insensitively)

        Returns:
            Whatever the selected loader (load_csv, load_arff or load_excel)
            returns.

        Raises:
            IOError: if the extension is not one of the supported ones.
        """
        # Everything from the last '.' onwards; lower-cased so that names like
        # 'DATA.CSV' are accepted as well.
        extension = filename[filename.rfind('.'):].lower()
        if extension == '.csv':
            return load_csv(filename)
        elif extension == '.arff':
            return load_arff(filename)
        elif extension in ('.xls', '.xlsx'):
            return load_excel(filename)
        else:
            raise IOError('{} is not a valid filename!'.format(filename))

    def load_from_mldata(self, dataname):
        """
        Loads a dataset from the mldata.org repository.

        Args:
            dataname: Name of the dataset on mldata.org (str)
                Eg. "regression-datasets stock", "leukemia"

        Returns:
            X : a (num_examples, num_features) numpy array of examples X
            y : the class labels y of size (1, num_examples)
            data_frame: Pandas DataFrame object

        Raises:
            Exception: if fetch_mldata fails for any reason; the original
                error text is carried as the second argument.
        """
        import tempfile
        import shutil

        # Create a temporary directory to store the downloaded dataset.
        test_data_home = tempfile.mkdtemp()
        try:
            # Fetch the dataset from ml data
            try:
                dataset = fetch_mldata(dataname, data_home=test_data_home,
                                       transpose_data=True)
            except Exception as e:
                # e.message only exists on Python 2; fall back to str(e).
                raise Exception("No connection to mldata.org server!",
                                getattr(e, 'message', str(e)))
            X, y = dataset.data, dataset.target
        finally:
            # Remove the temporary directory on success and on every failure.
            shutil.rmtree(test_data_home, ignore_errors=True)
        data_frame = stack_to_data_frame(X, y)
        return X, y, data_frame

    def pickle_files(self, files_to_save, save_dir):
        """
        Pickles a list of objects into files inside save_dir.

        Args:
            files_to_save: List of tuples in form (obj, filename_to_save)
            save_dir: Directory to save the files (str)
        """
        for obj, filename in files_to_save:
            with open(os.path.join(save_dir, filename), 'wb') as f:
                cPickle.dump(obj, f)
Several 24 | % attributes that are ordered categorical (such as attribute 17) have 25 | % been coded as integer. This was the form used by StatLog. 26 | % 27 | % 28 | % 6. Number of Attributes german: 20 (7 numerical, 13 categorical) 29 | % Number of Attributes german.numer: 24 (24 numerical) 30 | % 31 | % 32 | % 7. Attribute description for german 33 | % 34 | % Attribute 1: (qualitative) 35 | % Status of existing checking account 36 | % A11 : ... < 0 DM 37 | % A12 : 0 <= ... < 200 DM 38 | % A13 : ... >= 200 DM / 39 | % salary assignments for at least 1 year 40 | % A14 : no checking account 41 | % 42 | % Attribute 2: (numerical) 43 | % Duration in month 44 | % 45 | % Attribute 3: (qualitative) 46 | % Credit history 47 | % A30 : no credits taken/ 48 | % all credits paid back duly 49 | % A31 : all credits at this bank paid back duly 50 | % A32 : existing credits paid back duly till now 51 | % A33 : delay in paying off in the past 52 | % A34 : critical account/ 53 | % other credits existing (not at this bank) 54 | % 55 | % Attribute 4: (qualitative) 56 | % Purpose 57 | % A40 : car (new) 58 | % A41 : car (used) 59 | % A42 : furniture/equipment 60 | % A43 : radio/television 61 | % A44 : domestic appliances 62 | % A45 : repairs 63 | % A46 : education 64 | % A47 : (vacation - does not exist?) 65 | % A48 : retraining 66 | % A49 : business 67 | % A410 : others 68 | % 69 | % Attribute 5: (numerical) 70 | % Credit amount 71 | % 72 | % Attibute 6: (qualitative) 73 | % Savings account/bonds 74 | % A61 : ... < 100 DM 75 | % A62 : 100 <= ... < 500 DM 76 | % A63 : 500 <= ... < 1000 DM 77 | % A64 : .. >= 1000 DM 78 | % A65 : unknown/ no savings account 79 | % 80 | % Attribute 7: (qualitative) 81 | % Present employment since 82 | % A71 : unemployed 83 | % A72 : ... < 1 year 84 | % A73 : 1 <= ... < 4 years 85 | % A74 : 4 <= ... < 7 years 86 | % A75 : .. 
>= 7 years 87 | % 88 | % Attribute 8: (numerical) 89 | % Installment rate in percentage of disposable income 90 | % 91 | % Attribute 9: (qualitative) 92 | % Personal status and sex 93 | % A91 : male : divorced/separated 94 | % A92 : female : divorced/separated/married 95 | % A93 : male : single 96 | % A94 : male : married/widowed 97 | % A95 : female : single 98 | % 99 | % Attribute 10: (qualitative) 100 | % Other debtors / guarantors 101 | % A101 : none 102 | % A102 : co-applicant 103 | % A103 : guarantor 104 | % 105 | % Attribute 11: (numerical) 106 | % Present residence since 107 | % 108 | % Attribute 12: (qualitative) 109 | % Property 110 | % A121 : real estate 111 | % A122 : if not A121 : building society savings agreement/ 112 | % life insurance 113 | % A123 : if not A121/A122 : car or other, not in attribute 6 114 | % A124 : unknown / no property 115 | % 116 | % Attribute 13: (numerical) 117 | % Age in years 118 | % 119 | % Attribute 14: (qualitative) 120 | % Other installment plans 121 | % A141 : bank 122 | % A142 : stores 123 | % A143 : none 124 | % 125 | % Attribute 15: (qualitative) 126 | % Housing 127 | % A151 : rent 128 | % A152 : own 129 | % A153 : for free 130 | % 131 | % Attribute 16: (numerical) 132 | % Number of existing credits at this bank 133 | % 134 | % Attribute 17: (qualitative) 135 | % Job 136 | % A171 : unemployed/ unskilled - non-resident 137 | % A172 : unskilled - resident 138 | % A173 : skilled employee / official 139 | % A174 : management/ self-employed/ 140 | % highly qualified employee/ officer 141 | % 142 | % Attribute 18: (numerical) 143 | % Number of people being liable to provide maintenance for 144 | % 145 | % Attribute 19: (qualitative) 146 | % Telephone 147 | % A191 : none 148 | % A192 : yes, registered under the customers name 149 | % 150 | % Attribute 20: (qualitative) 151 | % foreign worker 152 | % A201 : yes 153 | % A202 : no 154 | % 155 | % 156 | % 157 | % 8. 
Cost Matrix 158 | % 159 | % This dataset requires use of a cost matrix (see below) 160 | % 161 | % 162 | % 1 2 163 | % ---------------------------- 164 | % 1 0 1 165 | % ----------------------- 166 | % 2 5 0 167 | % 168 | % (1 = Good, 2 = Bad) 169 | % 170 | % the rows represent the actual classification and the columns 171 | % the predicted classification. 172 | % 173 | % It is worse to class a customer as good when they are bad (5), 174 | % than it is to class a customer as bad when they are good (1). 175 | % 176 | % 177 | % 178 | % 179 | % 180 | % Relabeled values in attribute checking_status 181 | % From: A11 To: '<0' 182 | % From: A12 To: '0<=X<200' 183 | % From: A13 To: '>=200' 184 | % From: A14 To: 'no checking' 185 | % 186 | % 187 | % Relabeled values in attribute credit_history 188 | % From: A30 To: 'no credits/all paid' 189 | % From: A31 To: 'all paid' 190 | % From: A32 To: 'existing paid' 191 | % From: A33 To: 'delayed previously' 192 | % From: A34 To: 'critical/other existing credit' 193 | % 194 | % 195 | % Relabeled values in attribute purpose 196 | % From: A40 To: 'new car' 197 | % From: A41 To: 'used car' 198 | % From: A42 To: furniture/equipment 199 | % From: A43 To: radio/tv 200 | % From: A44 To: 'domestic appliance' 201 | % From: A45 To: repairs 202 | % From: A46 To: education 203 | % From: A47 To: vacation 204 | % From: A48 To: retraining 205 | % From: A49 To: business 206 | % From: A410 To: other 207 | % 208 | % 209 | % Relabeled values in attribute savings_status 210 | % From: A61 To: '<100' 211 | % From: A62 To: '100<=X<500' 212 | % From: A63 To: '500<=X<1000' 213 | % From: A64 To: '>=1000' 214 | % From: A65 To: 'no known savings' 215 | % 216 | % 217 | % Relabeled values in attribute employment 218 | % From: A71 To: unemployed 219 | % From: A72 To: '<1' 220 | % From: A73 To: '1<=X<4' 221 | % From: A74 To: '4<=X<7' 222 | % From: A75 To: '>=7' 223 | % 224 | % 225 | % Relabeled values in attribute personal_status 226 | % From: A91 To: 'male 
div/sep' 227 | % From: A92 To: 'female div/dep/mar' 228 | % From: A93 To: 'male single' 229 | % From: A94 To: 'male mar/wid' 230 | % From: A95 To: 'female single' 231 | % 232 | % 233 | % Relabeled values in attribute other_parties 234 | % From: A101 To: none 235 | % From: A102 To: 'co applicant' 236 | % From: A103 To: guarantor 237 | % 238 | % 239 | % Relabeled values in attribute property_magnitude 240 | % From: A121 To: 'real estate' 241 | % From: A122 To: 'life insurance' 242 | % From: A123 To: car 243 | % From: A124 To: 'no known property' 244 | % 245 | % 246 | % Relabeled values in attribute other_payment_plans 247 | % From: A141 To: bank 248 | % From: A142 To: stores 249 | % From: A143 To: none 250 | % 251 | % 252 | % Relabeled values in attribute housing 253 | % From: A151 To: rent 254 | % From: A152 To: own 255 | % From: A153 To: 'for free' 256 | % 257 | % 258 | % Relabeled values in attribute job 259 | % From: A171 To: 'unemp/unskilled non res' 260 | % From: A172 To: 'unskilled resident' 261 | % From: A173 To: skilled 262 | % From: A174 To: 'high qualif/self emp/mgmt' 263 | % 264 | % 265 | % Relabeled values in attribute own_telephone 266 | % From: A191 To: none 267 | % From: A192 To: yes 268 | % 269 | % 270 | % Relabeled values in attribute foreign_worker 271 | % From: A201 To: yes 272 | % From: A202 To: no 273 | % 274 | % 275 | % Relabeled values in attribute class 276 | % From: 1 To: good 277 | % From: 2 To: bad 278 | % 279 | @relation german_credit 280 | @attribute checking_status { '<0', '0<=X<200', '>=200', 'no checking'} 281 | @attribute duration real 282 | @attribute credit_history { 'no credits/all paid', 'all paid', 'existing paid', 'delayed previously', 'critical/other existing credit'} 283 | @attribute purpose { 'new car', 'used car', furniture/equipment, radio/tv, 'domestic appliance', repairs, education, vacation, retraining, business, other} 284 | @attribute credit_amount real 285 | @attribute savings_status { '<100', '100<=X<500', 
'500<=X<1000', '>=1000', 'no known savings'} 286 | @attribute employment { unemployed, '<1', '1<=X<4', '4<=X<7', '>=7'} 287 | @attribute installment_commitment real 288 | @attribute personal_status { 'male div/sep', 'female div/dep/mar', 'male single', 'male mar/wid', 'female single'} 289 | @attribute other_parties { none, 'co applicant', guarantor} 290 | @attribute residence_since real 291 | @attribute property_magnitude { 'real estate', 'life insurance', car, 'no known property'} 292 | @attribute age real 293 | @attribute other_payment_plans { bank, stores, none} 294 | @attribute housing { rent, own, 'for free'} 295 | @attribute existing_credits real 296 | @attribute job { 'unemp/unskilled non res', 'unskilled resident', skilled, 'high qualif/self emp/mgmt'} 297 | @attribute num_dependents real 298 | @attribute own_telephone { none, yes} 299 | @attribute foreign_worker { yes, no} 300 | @attribute class { good, bad} 301 | @data 302 | '<0',6,'critical/other existing credit',radio/tv,1169,'no known savings','>=7',4,'male single',none,4,'real estate',67,none,own,2,skilled,1,yes,yes,good 303 | '0<=X<200',48,'existing paid',radio/tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,2,'real estate',22,none,own,1,skilled,1,none,yes,bad 304 | 'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,3,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good 305 | '<0',42,'existing paid',furniture/equipment,7882,'<100','4<=X<7',2,'male single',guarantor,4,'life insurance',45,none,'for free',1,skilled,2,none,yes,good 306 | '<0',24,'delayed previously','new car',4870,'<100','1<=X<4',3,'male single',none,4,'no known property',53,none,'for free',2,skilled,2,none,yes,bad 307 | 'no checking',36,'existing paid',education,9055,'no known savings','1<=X<4',2,'male single',none,4,'no known property',35,none,'for free',1,'unskilled resident',2,yes,yes,good 308 | 'no checking',24,'existing 
paid',furniture/equipment,2835,'500<=X<1000','>=7',3,'male single',none,4,'life insurance',53,none,own,1,skilled,1,none,yes,good 309 | '0<=X<200',36,'existing paid','used car',6948,'<100','1<=X<4',2,'male single',none,2,car,35,none,rent,1,'high qualif/self emp/mgmt',1,yes,yes,good -------------------------------------------------------------------------------- /pk/main/ui/main_gui.ui: -------------------------------------------------------------------------------- 1 | 2 | 3 | main_tab 4 | 5 | 6 | 7 | 0 8 | 0 9 | 987 10 | 737 11 | 12 | 13 | 14 | Scikit GUI 15 | 16 | 17 | 0 18 | 19 | 20 | 21 | Upload 22 | 23 | 24 | 25 | 26 | 0 27 | 0 28 | 501 29 | 51 30 | 31 | 32 | 33 | 34 | 35 | 36 | Open File 37 | 38 | 39 | 40 | 41 | 42 | 43 | Open URL 44 | 45 | 46 | 47 | 48 | 49 | 50 | Generate 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 20 60 | 50 61 | 391 62 | 621 63 | 64 | 65 | 66 | 67 | 68 | 69 | Dataset Information 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 420 82 | 50 83 | 541 84 | 621 85 | 86 | 87 | 88 | 89 | 90 | 91 | Dataset Plotter 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 20 104 | 683 105 | 941 106 | 16 107 | 108 | 109 | 110 | 0 111 | 112 | 113 | 114 | 115 | 116 | true 117 | 118 | 119 | false 120 | 121 | 122 | Preprocess 123 | 124 | 125 | 126 | 127 | 0 128 | 0 129 | 981 130 | 51 131 | 132 | 133 | 134 | 135 | 136 | 137 | 0 138 | 139 | 140 | 141 | Normalize 142 | 143 | 144 | 145 | 146 | Standardize 147 | 148 | 149 | 150 | 151 | Binarize 152 | 153 | 154 | 155 | 156 | Impute 157 | 158 | 159 | 160 | 161 | Add Noise 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 20 172 | 60 173 | 81 174 | 17 175 | 176 | 177 | 178 | Attributes 179 | 180 | 181 | 182 | 183 | 184 | 10 185 | 80 186 | 421 187 | 251 188 | 189 | 190 | 191 | 192 | 193 | 194 | Attribute 1 195 | 196 | 197 | 198 | 199 | 200 | 201 | Attribute 2 202 | 203 | 204 | 205 | 206 | 207 | 208 | Attribute 3 209 | 210 | 211 | 212 | 213 | 214 | 215 | Attribute 4 216 | 217 | 218 | 
219 | 220 | 221 | 222 | Attribute 5 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 10 232 | 330 233 | 419 234 | 26 235 | 236 | 237 | 238 | Remove 239 | 240 | 241 | 242 | 243 | 244 | Regression 245 | 246 | 247 | 248 | 249 | 0 250 | 0 251 | 981 252 | 51 253 | 254 | 255 | 256 | 257 | 258 | 259 | 0 260 | 261 | 262 | 263 | Linear 264 | 265 | 266 | 267 | 268 | Polynomial 269 | 270 | 271 | 272 | 273 | Least Square 274 | 275 | 276 | 277 | 278 | Logistic 279 | 280 | 281 | 282 | 283 | Gradient Descent 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | Classify 294 | 295 | 296 | 297 | 298 | 0 299 | 0 300 | 981 301 | 51 302 | 303 | 304 | 305 | 306 | 307 | 308 | 0 309 | 310 | 311 | Qt::ElideNone 312 | 313 | 314 | 315 | Decision Tree 316 | 317 | 318 | 319 | 320 | Ensemble 321 | 322 | 323 | 324 | 325 | Neural Networks 326 | 327 | 328 | 329 | 330 | SVM 331 | 332 | 333 | 334 | 335 | Bayes Nets 336 | 337 | 338 | 339 | 340 | kNN 341 | 342 | 343 | 344 | 345 | Others 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | Cluster 356 | 357 | 358 | 359 | 360 | 0 361 | 0 362 | 981 363 | 51 364 | 365 | 366 | 367 | 368 | 369 | 370 | 0 371 | 372 | 373 | 374 | kMeans 375 | 376 | 377 | 378 | 379 | EM 380 | 381 | 382 | 383 | 384 | Affinity Propogation 385 | 386 | 387 | 388 | 389 | Spectral 390 | 391 | 392 | 393 | 394 | Agglomerative 395 | 396 | 397 | 398 | 399 | DBSCAN 400 | 401 | 402 | 403 | 404 | 405 | 406 | horizontalWidget_5 407 | clustermethod_tab 408 | 409 | 410 | 411 | Reduce 412 | 413 | 414 | 415 | 416 | 0 417 | 0 418 | 981 419 | 51 420 | 421 | 422 | 423 | 424 | 425 | 426 | 0 427 | 428 | 429 | 430 | PCA 431 | 432 | 433 | 434 | 435 | ICA 436 | 437 | 438 | 439 | 440 | Random Projection 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | Visualize 451 | 452 | 453 | 454 | 455 | Other 456 | 457 | 458 | 459 | 460 | 461 | 462 | -------------------------------------------------------------------------------- /pk/tests/correct_array.pkl: 
-------------------------------------------------------------------------------- 1 | (cnumpy.core.multiarray 2 | _reconstruct 3 | p1 4 | (cnumpy 5 | ndarray 6 | p2 7 | (I0 8 | tS'b' 9 | tRp3 10 | (I1 11 | (I56 12 | I10 13 | tcnumpy 14 | dtype 15 | p4 16 | (S'f8' 17 | I0 18 | I1 19 | tRp5 20 | (I3 21 | S'<' 22 | NNNI-1 23 | I-1 24 | I0 25 | tbI00 26 | S'\xd5A^\x0f&\xc5\xcb?;\xc4?l\xe9\xd1\xc8?\x82\xe6s\xeev\xbd\xac?\xd9wE\xf0\xbf\x95\xb4?_\n\x0f\x9a]\xf7\xdc?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00,@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\xdfQcB\xcc%\xb1?\x00\x00\x00\x00\x00\x00\x00\x00q\x90\x10\xe5\x0bZ\xd5?\x89\x95\xd1\xc8\xe7\x15\xdb?\x1c\x0b\n\x832\x8d\xc6?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\xa1\x11l\\\xff\xae\xc9?~\x18!<\xda8\xc6?x\xb5\xdc\x99\t\x06\xe4?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x1c@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x1c@\x1f\xb95\xe9\xb6D\xbe?\xceQG\xc7\xd5\xc8\xba?u\x1e\x15\xffwD\xbd?`"\xde:\xffv\xd6?\x1ak\x7fg{\xf4\xd3?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00,@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@8\x13\xd3\x85X\xfd\xc7?\x8e\xcaM\xd4\xd2\xdc\xd9?\xb1\xc0Wt\xeb5\xd1?\x00\x00\x00\x00\x00\x00\x00\x00\xa4\xa6]L3\xdd\xc1?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00&@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x000@lA\xef\x8d!\x00\xdf?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00J_\x089\xef\x7f\xe0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x08@\x0f\xba\x84Co\xf1\xce?\x1f\xbeL\x14!u\xbb?\xe6\xae%\xe4\
x83\x9e\xb9?\x91E\x9ax\x07x\xdb?\x9a\x08\x1b\x9e^)\xbf?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00O\xea\xcb\xd2N\xcd\xad?\xdbl\xac\xc4<+\xd6?\xee\n}\xb0\x8c\r\xe3?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00=@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x18@L3\xdd\xeb\xa4\xbe\xd7?\xb8\xc9\xa82\x8c\xbb\x91?\x92\xc2?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x1c@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00&@+\xfc\x19\xde\xacA\xe4?R\x0c\x90h\x02E\xac?!\x06\xba\xf6\x05\xf4\xd3?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x18\\sG\xff\xcb\xd0?\x00\x00\x00\x00\x00\x00\x00\x00\xd68\x9b\x8e\x00n\xdf?$\xd6\xe2S\x00\x8c\xcf?\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00(@\x0c\xb0\x8fN]\xf9\xa4?>\xb1N\x95\xef\x99\xe4?0\x81[w\xf3T\xaf?\x00\x00\x00\x00\x00\x00\x00\x00]7\xa5\xbcVB\xd0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00(@\x00\x00\x00\x00\x00\x80C@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00<\xa5\x83\xf5\x7f\x0e\xbb?\x93\xa9\x82QI\x9d\xde?T5A\xd4}\x00\xc4?\x00\x00\x00\x00\x00\x00\x00\x00t\xd2\xfb\xc6\xd7\x9e\xd0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00"@\x00\x00\x00\x00\x00\x000@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c@s\xf4\xf8\xbdM\x7f\xd7?a2U0*\xa9\xbb?\xaa\x82QI\x9d\x80\xd2?\x00\x00\x00\x00\x00\x00\x00\x00\x96x@\xd9\x94+\xce?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x8e@\xbc\xae_\xb0\xcb?\xd6\x8dwG\xc6j\xc1?\xa2a1\xeaZ{\xc5?N} y\xe7P\xb6?\xaaH\x85\xb1\x85 \xd9?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00;@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x009\x0c\xe6\xaf\x90\xb9\xc6?\x84\xf1\xd3\xb87\xbf\xd3?j\xdeq\x8a\x8e\xe4\xc4?\xc0%\x00\xff\x94*\xcd?\xe0\xb9\xf7p\xc9q\xbf?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18@\x80\x99\xef\xe0\'\x0e\xce?Uh \x96\xcd\x1c\xb2?\xb3\xeb\xde\x8a\xc4\x04\xd3?w-!\x1f\xf4l\xd9?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00K@\xe4\xf9\x0c\xa87\xa3\xce?\xe9\x7f\xb9\x16-@\xc7?\x00\x00\x00\x00\x00\x00\x00\x00\x1c\t4\xd8\xd4y\xd9?\xfbs\xd1\x90\xf1(\xc7?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00*@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00@"\xa5\xd9<\x0e\x83\xe4?\x0e\xa3 
x|{\xc7?\x00\x00\x00\x00\x00\x00\x00\x00i\xc8x\x94Jx\xc6?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08@\xe8-\x1e\xdes`\xb5?\xefq\xa6\t\xdbO\xb6?\xa1\x83.\xe1\xd0[\xc4?\xa6(\x97\xc6/\xbc\xdd?\x82\xab<\x81\xb0S\xca?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00(@\x00\x00\x00\x00\x00\x00\x00\x00Ul\xcc\xeb\x88C\xd4?\x00\x00\x00\x00\x00\x00\x00\x00\xf7\x8f\x85\xe8\x108\xda?\xb4\x03\xae+f\x84\xd1?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00,@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@\x90\xa1c\x07\x95\xb8\x9e?\xe7\xa6\xcd8\rQ\xc1?Z)\x04r\x89\xa3\xe0?\xbe\x9f\x1a/\xdd$\xd4?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x000@\xe4\x9f\x19\xc4\x07v\xca?m\x1d\x1c\xecM\x0c\xe3?\xcf\xbaF\xcb\x81\x1e\xc2?b\xbe\xbc\x00\xfb\xe8\xac?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x1c@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x90\xd7\x83I\xf1\xf1\xcd?\x00\x00\x00\x00\x00\x00\x00\x00`\xab\x04\x8b\xc3\x19\xe3?\xeezi\x8a\x00\xa7\xc5?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00&@ \x96\xcd\x1c\x92Z\x98?\xc0{G\x8d\t1\xdb?Ou\xc8\xcdp\x03\xca?\xa9/K;5\x97\xa3?B\n\x9eB\xae\xd4\xd3?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00 
@\x00\x00\x00\x00\x00\x009@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x00\x00c\xeeZB>\xe8\xe6?\x00\x00\x00\x00\x00\x00\x00\x00J\xb8\x90Gp#\xd2?r\xe0\xd5rg&H?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00H\xc4\x94H\xa2\x17\xe4?pw\xd6n\xbb\xd0\xd7?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x1c@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00&@A\xef\x8d!\x008\xc8?\xd3\xd8^\x0bzo\xc4?\x83/L\xa6\nF\xc5?\xf8\x88\x98\x12I\xf4\xd4?w\xf6\x95\x07\xe9)\xc4?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80@@\xc7\xd5\xc8\xae\xb4\x8c\xc0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\xa6\xd3\xba\r\xea\xe3?%\x92\xe8e\x14\xcb\xcf?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00,@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c@\xf9\xbc\xe2\xa9G\x1a\xc6?O\xe5\xb4\xa7\xe4\x9c\xd6?8\x82T\x8a\x1d\x8d\xcf?\xb2\x12\xf3\xac\xa4\x15\xcd?\xd2\xfb\xc6\xd7\x9eY2?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00\x00\x00I\x80\x9aZ\xb6\xd6W?\xcdZ\nH\xfb\x1f\xc6?\xbbF\xcb\x81\x1ej\xd3?\xc6l\xc9\xaa\x08\xb7\xe0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x000@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00"@\x84f\xd7\xbd\x15\x89\xd7?\x00\x00\x00\x00\x00\x00\x00\x00w\x87\x14\x03$\x9a\xca?\xb1Pk\x9aw\x9c\xd1? 
\nfL\xc1\x1a\xc3?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xad\xc3\xd1U\xba;\xe2?\xfa\xb7\xcb~\xdd\xe9\x9e?a\xffun\xda\x8c\xcb?\xeezi\x8a\x00\xa7\xc7?\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@\xf6Cl\xb0p\x92\xa6?\x00\x00\x00\x00\x00\x00\x00\x00\xa5\x84`U\xbd\xfc\xce?.8\x83\xbf_\xcc\xe3?\xff\xb3\xe6\xc7_Z\xb8?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x8b\xc5o\n+\x15\xb0?q9^\x81\xe8I\x89?zrM\x81\xcc\xce\xba?\xca\xfc\xa3o\xd24\xc0?\xd0\xec\xba\xb7"1\xe6?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x002@\x00\x00\x00\x00\x00\x80C@\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x00\x00t\xf0Lh\x92X\xd3?u\x8e\x01\xd9\xeb\xdd\xdc?\x00\x00\x00\x00\x00\x00\x00\x00\xda\xc9\xe0(yu\xc8?\xe4\x9f\x19\xc4\x07v\xac?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x000@\x00\x00\x00\x00\x00\x00H@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00C@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\xf4\xc2\x9d\x0b#\xd5?\xc6\xfdG\xa6C\xa7\xda?4\x0e\xf5\xbb\xb05\xd0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00"@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x004@-[\xeb\x8b\x84\xb6\xc0?\xb2\xf2\xcb`\x8cH\xb4?\n\x82\xc7\xb7w\x8d\xe6?\r\xdf\xc2\xba\xf1\xee\xa8?\xa2\xefne\x89\xce\xa2?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00
\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00.@\xbbc\xb1M*\x1a\xcb?\xe9\xf3QF\\\x00\xc6?\xc1\x1e\x13)\xcd\xe6\xcb?\xc1\x8d\x94-\x92v\xd3?3\xdc\x80\xcf\x0f#\xb8?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x003@\x00\x00\x00\x00\x00\x00\x00\x00\xd6t=\xd1u\xe1\xde?\x14\xd0D\xd8\xf0\xf4\xc6?[\\\xe33\xd9?\xc7?\xe6\xe9\\QJ\x08\xc4?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00(@0\x10\x04\xc8\xd0\xb1\xc5?<\x872T\xc5T\xc0?5\r\x8a\xe6\x01,\xd7?~\xc6\x85\x03!Y\xc0?R\xb7\xb3\xaf\xcd\xc9\x8bL\xc0\xc5?\x84\rO\xaf\x94e\xc2?\x8c0E\xb94~\xcf?Y\xfa\xd0\x05\xf5-\xdc?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00I@' 27 | tbg1 28 | (g2 29 | (I0 30 | tS'b' 31 | tRp6 32 | (I1 33 | (I56 34 | tg5 35 | I00 36 | 
S'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00' 37 | tbt. -------------------------------------------------------------------------------- /cl_gui.py: -------------------------------------------------------------------------------- 1 | """ This module contains an executable command-line version of the Pykit-Learn 2 | GUI. 
3 | Author: Sean Dai 4 | """ 5 | 6 | # Ignore any warnings issued by third-party modules 7 | import warnings 8 | warnings.filterwarnings("ignore") 9 | 10 | import cPickle 11 | import logging 12 | import multiprocessing 13 | import os 14 | import shutil 15 | import sys 16 | import traceback 17 | from argparse import ArgumentParser 18 | from collections import Counter 19 | from glob import glob 20 | from os.path import join 21 | 22 | from pandas.tools.plotting import radviz 23 | from pandas.tools.plotting import scatter_matrix 24 | from pandas.tools.plotting import andrews_curves 25 | from PyQt4 import QtGui 26 | from sklearn import cross_validation 27 | 28 | import matplotlib.pyplot as plt 29 | import seaborn as sns 30 | import wx 31 | from PIL import Image 32 | 33 | from pk.utils.loading import * 34 | from pk.utils.preprocess import * 35 | from pk.utils.prygress import progress 36 | from pk.utils.classification_utils import * 37 | from pk.utils.metrics import * 38 | from pk.controller import ViewGenerator 39 | 40 | app = QtGui.QApplication(sys.argv) 41 | 42 | class Status(object): 43 | DATASET_LOADED = False 44 | FILENAME = '' 45 | EXTENSION = None 46 | TEMP_DIR = '_temp/' 47 | USER_QUIT = 'user_quit' 48 | RADIAL_NAME = 'plot_radial.png' 49 | SCM_NAME = 'plot_scatter_matrix.png' 50 | FREQ_NAME = 'plot_frequency.png' 51 | FM_NAME = 'plot_feature_matrix.png' 52 | ANDREWS_NAME = 'plot_andrews.png' 53 | TD_NAME = 'plot_2d.png' 54 | FINISH_PLOTS = False 55 | PLOT_COMMANDS = {'plot_frequency', 'plot_feature_matrix', 'plot_radial', 56 | 'plot_andrews', 'plot_scatter_matrix', 'plot_2d'} 57 | ALL_COMMANDS = list(PLOT_COMMANDS) + ['load', 'load_file_gui', 'load_random', 'preprocess', 58 | 'run', 'visualize', 'help', 'quit', 59 | 'see_images'] 60 | 61 | 62 | class InvalidCommandException(Exception): 63 | def __init__(self, message, errors=None): 64 | super(InvalidCommandException, self).__init__(message) 65 | self.errors = errors 66 | 67 | def _load_file(filename): 68 | 
loader = DatasetIO() 69 | return loader.load_file(filename) 70 | 71 | def load_file(filename): 72 | """ 73 | Function to load a dataset file. 74 | """ 75 | X, y, df = _load_file(filename) 76 | 77 | loader = DatasetIO() 78 | loader.pickle_files([(X, 'load_X.pkl'), (y, 'load_y.pkl'), (df, 'df.pkl')], 79 | Status.TEMP_DIR) 80 | 81 | # Update appropriate status flags. 82 | Status.DATASET_LOADED = True 83 | Status.FILENAME = os.path.basename(filename) 84 | Status.EXTENSION = filename[filename.rfind('.')] 85 | 86 | print 'Feature Array:\n %s' % X 87 | print 'Target classifications:\n %s' % y 88 | 89 | def load_file_gui(): 90 | from pk.controller import ViewGenerator 91 | popup = ViewGenerator() 92 | filter = "CSV files (*.csv);;XLS files (*.xls);;ARFF files (*.arff)" 93 | filename = popup.open_file_dialog(app, filter) 94 | 95 | if filename == '': 96 | return 97 | load_file(filename) 98 | 99 | def load_random(): 100 | """ 101 | Generates a random dataset with 100 samples, 2 features, and 3 classes. 102 | """ 103 | X, y, df = generate_random_points() 104 | loader = DatasetIO() 105 | loader.pickle_files([(X, 'load_X.pkl'), (y, 'load_y.pkl'), (df, 'df.pkl')], 106 | Status.TEMP_DIR) 107 | 108 | # Update appropriate status flags. 109 | Status.DATASET_LOADED = True 110 | Status.FILENAME = 'RANDOM' 111 | 112 | print 'Feature Array:\n %s' % X 113 | print 'Target classifications:\n %s' % y 114 | 115 | def get_pickled_dataset(): 116 | """ 117 | Returns X, y, and data_frame pickled files. 
118 | """ 119 | f1 = open('_temp/load_X.pkl', 'r') 120 | f2 = open('_temp/load_y.pkl', 'r') 121 | f3 = open('_temp/df.pkl', 'r') 122 | 123 | X = cPickle.load(f1) 124 | y = cPickle.load(f2) 125 | data_frame = cPickle.load(f3) 126 | 127 | f1.close() 128 | f2.close() 129 | f3.close() 130 | return X, y, data_frame 131 | 132 | 133 | def update_feature_array(changed_X): 134 | with open('_temp/load_X.pkl', 'wb') as f: 135 | cPickle.dump(changed_X, f) 136 | with open('_temp/df.pkl', 'wb') as f: 137 | cPickle.dump(pd.DataFrame(changed_X), f) 138 | 139 | 140 | def visualize_dataset(command='', flags=(), plot_all=False, *args, **kwargs): 141 | """ 142 | Create and display visualizations to user. 143 | """ 144 | 145 | # Build parser for visualization 146 | parser = ArgumentParser() 147 | parser.add_argument('--suppress', action='store_true', dest='suppress', 148 | help='Disable viewing of any generated plot(s).') 149 | p_args = parser.parse_args(flags) 150 | 151 | if Status.DATASET_LOADED: 152 | print "Creating visualization(s)", 153 | make_visualizations(command, plot_all) 154 | print "" 155 | if not p_args.suppress: 156 | print "Viewing generated plots..." 
157 | view_saved_plots(command) 158 | else: 159 | raise InvalidCommandException("Can't visualize an unloaded dataset!") 160 | 161 | def view_saved_plots(plot_name=''): 162 | # View all plots by default 163 | if plot_name == '': 164 | plot_name = '*.png' 165 | files = glob(join(Status.TEMP_DIR, plot_name)) 166 | else: 167 | files = glob(join(Status.TEMP_DIR, plot_name + '.png')) 168 | 169 | for im_file in files: 170 | im = Image.open(im_file, 'r') 171 | im.show() 172 | 173 | def see_images(*args): 174 | if '_temp/*.png' in args: 175 | files = glob('_temp/*.png') 176 | else: 177 | files = args 178 | 179 | for im_file in files: 180 | im = Image.open(im_file, 'r') 181 | im.show() 182 | 183 | @progress(char='.', pause=0.5) 184 | def make_visualizations(command='', plot_all=False): 185 | """ 186 | Save the plots to _temp directory. 187 | """ 188 | X, y, data_frame = get_pickled_dataset() 189 | class_name = data_frame.dtypes.index[-1] 190 | 191 | if command == 'plot_frequency' or plot_all: 192 | plot_class_frequency_bar(y) 193 | if command == 'plot_feature_matrix': 194 | plot_feature_matrix(data_frame) 195 | if command == 'plot_radial' or plot_all: 196 | plot_radial(data_frame, class_name) 197 | if command == 'plot_andrews' or plot_all: 198 | plot_andrews(data_frame, class_name) 199 | if command == 'plot_scatter_matrix': 200 | plot_scatter_matrix(data_frame) 201 | if command == 'plot_2d' or plot_all: 202 | plot_2d_dist(X, y) 203 | 204 | def reset_plot_status(): 205 | Status.FINISH_PLOTS = False 206 | 207 | def plot_class_frequency_bar(target, bar_width=.35): 208 | plt.clf() 209 | # Get the frequency of each class label 210 | classes = np.unique(target) 211 | target_counts = Counter(target) 212 | 213 | # Plot the bar chart of class frequencies 214 | fig, ax = plt.subplots() 215 | ind = np.arange(len(classes)) 216 | ax.set_xticks(ind) 217 | ax.bar(ind, target_counts.values(), width=bar_width, align='center') 218 | ax.set_title(Status.FILENAME) 219 | ax.set_xlabel('Class') 
220 | ax.set_ylabel('Frequency') 221 | ax.set_xticklabels(target_counts.keys()) 222 | ax.set_title('Bar Chart of Class Label Frequencies') 223 | plt.savefig(join(Status.TEMP_DIR, Status.FREQ_NAME)) 224 | 225 | 226 | def plot_feature_matrix(data_frame): 227 | # Plot the matrix of feature-feature pairs 228 | plt.clf() 229 | g = sns.PairGrid(data_frame) 230 | g.map(plt.scatter) 231 | # plt.show(block=False) 232 | plt.title('Feature Matrix') 233 | plt.savefig(join(Status.TEMP_DIR, Status.FM_NAME)) 234 | 235 | def plot_radial(data_frame, class_name): 236 | plt.clf() 237 | radviz(data_frame, class_name) 238 | # plt.show(block=False) 239 | plt.title('Radial Plot') 240 | plt.savefig(join(Status.TEMP_DIR, Status.RADIAL_NAME)) 241 | 242 | def plot_andrews(data_frame, class_name): 243 | plt.clf() 244 | andrews_curves(data_frame, class_name) 245 | plt.title('Andrews Curve') 246 | # plt.show(block=False) 247 | plt.savefig(join(Status.TEMP_DIR, Status.ANDREWS_NAME)) 248 | 249 | def plot_scatter_matrix(data_frame): 250 | plt.clf() 251 | axes = scatter_matrix(data_frame, alpha=0.2, figsize=(10, 10), diagonal='kde') 252 | # plt.show(block=False) 253 | axes[0][0].set_title('Scatter Matrix with KDEs') 254 | plt.savefig(join(Status.TEMP_DIR, Status.SCM_NAME)) 255 | 256 | def plot_2d_dist(X, y): 257 | """ 258 | Plots the feature array points on a plane. 259 | 260 | If the n_dims > 2, only consider the first two features. 261 | """ 262 | plt.clf() 263 | from itertools import cycle 264 | colors = cycle('bgrcmyk') 265 | 266 | if len(X[0]) > 2: 267 | x_values = X[:, :2] 268 | else: 269 | x_values = X 270 | 271 | # Create a color-coded scatter plot by class label. 272 | for class_label, c in zip(np.unique(y), colors): 273 | xs = x_values[np.where(y == class_label)] 274 | plt.scatter(xs[:, 0], xs[:, 1], c=c, label=class_label) 275 | 276 | # Set plot labels and save. 
277 | plt.xlabel('x1') 278 | plt.ylabel('x2') 279 | plt.title('Distribution of Dataset ({})'.format(Status.FILENAME)) 280 | plt.legend(loc='best') 281 | plt.savefig(join(Status.TEMP_DIR, Status.TD_NAME)) 282 | 283 | def dispatch_preprocess(args): 284 | if not Status.DATASET_LOADED: 285 | raise InvalidCommandException("Can't preprocess an unloaded dataset!") 286 | 287 | parser = ArgumentParser() 288 | parser.add_argument('-std', dest='std', action='store_true', 289 | help='Standardize the feature array.') 290 | parser.add_argument('-norm', dest='norm', action='store_true', 291 | help="Normalize the values of each feature.") 292 | p_args = parser.parse_args(args) 293 | pe = PreprocessingEngine() 294 | 295 | if p_args.std: 296 | print "Standardizing feature array..." 297 | X, y, _ = get_pickled_dataset() 298 | new_X = pe.standardize(X) 299 | print new_X 300 | update_feature_array(new_X) 301 | 302 | if p_args.norm: 303 | print "Normalizing feature array..." 304 | X, y, _ = get_pickled_dataset() 305 | new_X = pe.normalize_data(X) 306 | print new_X 307 | update_feature_array(new_X) 308 | 309 | 310 | def dispatch_run(args): 311 | # Build parser for "run" flags 312 | parser = ArgumentParser() 313 | parser.add_argument('-A', dest='A', help='Select the ML algorithm to run.') 314 | parser.add_argument('-test_ratio', type=float, dest='test_ratio', 315 | help="Split data into training and test sets.") 316 | parser.add_argument('-cv', dest='cv', type=int, 317 | help='Run with cross-validation.') 318 | p_args = parser.parse_args(args) 319 | 320 | # Process the passed in arguments 321 | if p_args.A: 322 | # Run a decision tree algorithm on data 323 | if p_args.A.strip() == 'dt': 324 | print "Running decision tree algorithm on dataset..." 
325 | X, y, _ = get_pickled_dataset() 326 | X_train, y_train = X, y 327 | X_test, y_test = X, y 328 | 329 | # Split the original dataset to training & testing sets 330 | if p_args.test_ratio: 331 | X_train, X_test, y_train, y_test = cross_validation.train_test_split( 332 | X, y, test_size=p_args.test_ratio, 333 | random_state=0) 334 | # Train the Decision Tree classifier 335 | clf = train_decision_tree(X_train, y_train) 336 | print "Train accuracy: %f" % get_train_accuracy(clf, X_train, y_train) 337 | 338 | # Output metrics from train-test split 339 | if X_test is not None and y_test is not None: 340 | print "Test accuracy: %f%%" % get_test_accuracy(clf, X_test, y_test) 341 | 342 | # Get cross-validation score(s) 343 | if p_args.cv: 344 | print "" 345 | print "Cross Validation Scores:" 346 | scores, avg = get_cv_accuracy(clf, X_train, y_train, cv=p_args.cv) 347 | print 'Scores: ' + ', '.join(map(str, scores)) 348 | print 'Average accuracy: %f (+/- %f)' % (avg, scores.std() * 2) 349 | 350 | # Plot the confusion matrix 351 | cm = get_confusion_matrix(clf, X_test, y_test) 352 | plot_confusion_matrix(cm, y=np.unique(y)) 353 | 354 | def setup(): 355 | # Create temporary directory for storing serialized objects. 356 | if not os.path.exists("_temp/"): 357 | os.mkdir("_temp/") 358 | 359 | # Configure log file for the application. 360 | logging.basicConfig(level=logging.DEBUG, 361 | format='%(asctime)s %(levelname)s: %(message)s', 362 | filename='cl_gui.log') 363 | logging.info("Starting application...") 364 | 365 | # Code snippet for recalling previous commands with the 366 | # 'up' and 'down' arrow keys. 367 | import rlcompleter 368 | import atexit 369 | import readline 370 | 371 | hist_file = os.path.join(os.environ['HOME'], '.pythonhistory') 372 | try: 373 | readline.read_history_file(hist_file) 374 | except IOError: 375 | pass 376 | 377 | # Set a limit on the number of commands to remember. 378 | # High values will hog system memory! 
379 | readline.set_history_length(25) 380 | atexit.register(readline.write_history_file, hist_file) 381 | 382 | # Tab completion for GUI commands 383 | def completer(text, state): 384 | commands = Status.ALL_COMMANDS 385 | file_paths = [] 386 | for dirname, dirnames, filenames in os.walk('.'): 387 | if '.git' in dirnames: 388 | # don't go into any .git directories. 389 | dirnames.remove('.git') 390 | # Add path to subdirectories 391 | file_paths.extend([os.path.join(dirname, sub_dir) for sub_dir in dirnames]) 392 | # Add path to all filenames in subdirectories. 393 | file_paths.extend([os.path.join(dirname, filename) for filename in filenames]) 394 | # Remove './' header in file strings. 395 | file_paths = [file.strip('./') for file in file_paths] 396 | 397 | options = [i for i in commands if i.startswith(text)] 398 | options.extend([f for f in file_paths if f.startswith(text)]) 399 | 400 | try: 401 | return options[state] 402 | except IndexError: 403 | return None 404 | 405 | readline.set_completer(completer) 406 | 407 | # Bind tab completer to specific platforms 408 | if readline.__doc__ and 'libedit' in readline.__doc__: 409 | readline.parse_and_bind("bind -e") 410 | readline.parse_and_bind("bind '\t' rl_complete") 411 | else: 412 | readline.parse_and_bind("tab: complete") 413 | del hist_file, readline, rlcompleter 414 | 415 | 416 | def quit_gui(): 417 | shutil.rmtree(Status.TEMP_DIR) 418 | logging.info("Quitting application...") 419 | sys.exit(Status.USER_QUIT) 420 | 421 | 422 | def help_page(): 423 | output_page = """ 424 | Pykit-Learn Command Line GUI 425 | -------------------------------- 426 | Commands: 427 | The following commands are available: 428 | 429 | load [file] Loads the dataset at the path specified by [file]. 430 | No quotes "" around filename! 431 | load_random Load a randomly generated dataset with 3 classes. 432 | plot_2d Plot a 2-D distribution of the dataset. 433 | plot_andrews Plots an Andrews curve of the dataset. 
434 | 435 | plot_frequency View the frequency of each class label. 436 | plot_feature_matrix Generate a matrix plot of feature-feature 437 | relationships. 438 | plot_scatter_matrix Matrix plot with KDEs along the diagonal. 439 | plot_radial Plot a radial chart of the dataset. 440 | preprocess [flags] Preprocesses a dataset. Flags are 441 | -std Standardize to mean 0 and variance 1 442 | -norm Normalize each feature to range [0,1] 443 | Eg. "preprocess -std" 444 | see_images [files] View temporarily stored plots. 445 | Eg. "see_images _temp/plot_2d.png" 446 | run Runs the ML alg on the loaded dataset. 447 | -A [alg] REQUIRED flag! Options for [alg]: 448 | dt = (Decision Tree) 449 | -test_ratio [0-1] User can specify the test-train ratio. 450 | -cv [int] Enables k-fold cross validation. 451 | Example: "run -A dt -test_ratio .3 -cv 5" 452 | visualize Plots all possible visualizations for input data. 453 | --suppress Disable plotting output. 454 | help Provides a help screen of available commands. 455 | quit Quits the command line GUI. 
456 | """ 457 | return output_page 458 | 459 | 460 | def process(line): 461 | tokens = tuple(line.split(' ')) 462 | command, args = tokens[0], tokens[1:] 463 | 464 | # Select the appropriate function to call 465 | if command == 'load': 466 | load_file(*args) 467 | elif command == 'load_random': 468 | load_random() 469 | elif command == 'load_file_gui': 470 | load_file_gui() 471 | elif command == 'preprocess_gui': 472 | gen = ViewGenerator() 473 | gen.get_preprocess_options(app) 474 | elif command == 'preprocess': 475 | dispatch_preprocess(args) 476 | elif command in Status.PLOT_COMMANDS: 477 | visualize_dataset(command, args) 478 | elif command == 'visualize': 479 | visualize_dataset(flags=args, plot_all=True) 480 | elif command == 'see_images': 481 | see_images(*args) 482 | elif command == 'run': 483 | dispatch_run(args) 484 | elif command == 'help': 485 | print help_page() 486 | elif command == 'quit': 487 | quit_gui() 488 | elif command == '': 489 | return 490 | else: 491 | raise InvalidCommandException( 492 | "{} is not a recognized command.".format(command)) 493 | 494 | 495 | def main(): 496 | """ 497 | To run, type "python cl_gui.py". 498 | """ 499 | print "Welcome to the command-line version of Pykit-Learn!" 
500 | print "Type 'help' for a list of available commands" 501 | setup() 502 | 503 | while True: 504 | try: 505 | input_line = raw_input(">> ") 506 | process(input_line.strip()) 507 | except IOError as ioe: 508 | print ioe.message 509 | except InvalidCommandException as inv: 510 | print inv.message 511 | except AttributeError as ae: 512 | print ae.message 513 | except Exception: 514 | traceback.print_exc() 515 | except SystemExit as se: 516 | if str(se.message) == Status.USER_QUIT: 517 | return 518 | else: 519 | print se.message 520 | except KeyboardInterrupt: 521 | quit_gui() 522 | 523 | 524 | if __name__ == "__main__": 525 | main() 526 | -------------------------------------------------------------------------------- /pk/main/ui/main_gui.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Form implementation generated from reading ui file 'main_gui.ui' 4 | # 5 | # Created by: PyQt4 UI code generator 4.11.4 6 | # 7 | # WARNING! All changes made in this file will be lost! 

from PyQt4 import QtCore, QtGui

# Use QString.fromUtf8 when QtCore provides QString; otherwise fall back to
# returning the string unchanged.
try:
    _fromUtf8 = QtCore.QString.fromUtf8
except AttributeError:
    def _fromUtf8(s):
        return s

# Pass an explicit UTF-8 encoding to QApplication.translate when the
# UnicodeUTF8 constant exists; otherwise call translate without it.
try:
    _encoding = QtGui.QApplication.UnicodeUTF8
    def _translate(context, text, disambig):
        return QtGui.QApplication.translate(context, text, disambig, _encoding)
except AttributeError:
    def _translate(context, text, disambig):
        return QtGui.QApplication.translate(context, text, disambig)

class Ui_main_tab(object):
    """
    Generated UI definition for the main tabbed window.

    setupUi() creates the Upload, Preprocess, Regression, Classify,
    Cluster, Reduce, Visualize and Other tabs and stores every widget as
    an attribute of this object; retranslateUi() sets their visible text.
    """
    def setupUi(self, main_tab):
        """
        Build the widget tree and add each top-level tab to `main_tab`.

        main_tab -- the container the tabs are added to; it must provide
                    addTab/setTabText/setCurrentIndex (presumably a
                    QTabWidget -- confirm against the caller).
        """
        main_tab.setObjectName(_fromUtf8("main_tab"))
        main_tab.resize(987, 737)
        # --- Upload tab: three buttons, dataset info text, plot view and
        # a progress bar.
        self.upload_tab = QtGui.QWidget()
        self.upload_tab.setObjectName(_fromUtf8("upload_tab"))
        self.horizontalWidget = QtGui.QWidget(self.upload_tab)
        self.horizontalWidget.setGeometry(QtCore.QRect(0, 0, 501, 51))
        self.horizontalWidget.setObjectName(_fromUtf8("horizontalWidget"))
        self.horizontalLayout = QtGui.QHBoxLayout(self.horizontalWidget)
        self.horizontalLayout.setObjectName(_fromUtf8("horizontalLayout"))
        self.openfile_btn = QtGui.QPushButton(self.horizontalWidget)
        self.openfile_btn.setObjectName(_fromUtf8("openfile_btn"))
        self.horizontalLayout.addWidget(self.openfile_btn)
        self.openurl_btn = QtGui.QPushButton(self.horizontalWidget)
        self.openurl_btn.setObjectName(_fromUtf8("openurl_btn"))
        self.horizontalLayout.addWidget(self.openurl_btn)
        self.generate_btn = QtGui.QPushButton(self.horizontalWidget)
        self.generate_btn.setObjectName(_fromUtf8("generate_btn"))
        self.horizontalLayout.addWidget(self.generate_btn)
        self.verticalLayoutWidget = QtGui.QWidget(self.upload_tab)
        self.verticalLayoutWidget.setGeometry(QtCore.QRect(20, 50, 391, 621))
        self.verticalLayoutWidget.setObjectName(_fromUtf8("verticalLayoutWidget"))
        self.verticalLayout = QtGui.QVBoxLayout(self.verticalLayoutWidget)
        self.verticalLayout.setObjectName(_fromUtf8("verticalLayout"))
        self.datainfo_label = QtGui.QLabel(self.verticalLayoutWidget)
        self.datainfo_label.setObjectName(_fromUtf8("datainfo_label"))
        self.verticalLayout.addWidget(self.datainfo_label)
        self.datainfotext = QtGui.QTextBrowser(self.verticalLayoutWidget)
        self.datainfotext.setObjectName(_fromUtf8("datainfotext"))
        self.verticalLayout.addWidget(self.datainfotext)
        self.verticalLayoutWidget_2 = QtGui.QWidget(self.upload_tab)
        self.verticalLayoutWidget_2.setGeometry(QtCore.QRect(420, 50, 541, 621))
        self.verticalLayoutWidget_2.setObjectName(_fromUtf8("verticalLayoutWidget_2"))
        self.verticalLayout_2 = QtGui.QVBoxLayout(self.verticalLayoutWidget_2)
        self.verticalLayout_2.setObjectName(_fromUtf8("verticalLayout_2"))
        self.dataplotter_label = QtGui.QLabel(self.verticalLayoutWidget_2)
        self.dataplotter_label.setObjectName(_fromUtf8("dataplotter_label"))
        self.verticalLayout_2.addWidget(self.dataplotter_label)
        self.dataplottergraphics = QtGui.QGraphicsView(self.verticalLayoutWidget_2)
        self.dataplottergraphics.setObjectName(_fromUtf8("dataplottergraphics"))
        self.verticalLayout_2.addWidget(self.dataplottergraphics)
        self.progressBar = QtGui.QProgressBar(self.upload_tab)
        self.progressBar.setGeometry(QtCore.QRect(20, 683, 941, 16))
        self.progressBar.setProperty("value", 0)
        self.progressBar.setObjectName(_fromUtf8("progressBar"))
        main_tab.addTab(self.upload_tab, _fromUtf8(""))
        # --- Preprocess tab: a nested tab widget of preprocessing methods,
        # five attribute checkboxes and a remove button.
        self.preprocess_tab = QtGui.QWidget()
        self.preprocess_tab.setEnabled(True)
        self.preprocess_tab.setAutoFillBackground(False)
        self.preprocess_tab.setObjectName(_fromUtf8("preprocess_tab"))
        self.horizontalWidget_2 = QtGui.QWidget(self.preprocess_tab)
        self.horizontalWidget_2.setGeometry(QtCore.QRect(0, 0, 981, 51))
        self.horizontalWidget_2.setObjectName(_fromUtf8("horizontalWidget_2"))
        self.horizontalLayout_2 = QtGui.QHBoxLayout(self.horizontalWidget_2)
        self.horizontalLayout_2.setObjectName(_fromUtf8("horizontalLayout_2"))
        self.preprocess_tab_2 = QtGui.QTabWidget(self.horizontalWidget_2)
        self.preprocess_tab_2.setObjectName(_fromUtf8("preprocess_tab_2"))
        self.normalize_tab = QtGui.QWidget()
        self.normalize_tab.setObjectName(_fromUtf8("normalize_tab"))
        self.preprocess_tab_2.addTab(self.normalize_tab, _fromUtf8(""))
        self.standardize_tab = QtGui.QWidget()
        self.standardize_tab.setObjectName(_fromUtf8("standardize_tab"))
        self.preprocess_tab_2.addTab(self.standardize_tab, _fromUtf8(""))
        self.binarize_tab = QtGui.QWidget()
        self.binarize_tab.setObjectName(_fromUtf8("binarize_tab"))
        self.preprocess_tab_2.addTab(self.binarize_tab, _fromUtf8(""))
        self.impute_tab = QtGui.QWidget()
        self.impute_tab.setObjectName(_fromUtf8("impute_tab"))
        self.preprocess_tab_2.addTab(self.impute_tab, _fromUtf8(""))
        self.noise_tab = QtGui.QWidget()
        self.noise_tab.setObjectName(_fromUtf8("noise_tab"))
        self.preprocess_tab_2.addTab(self.noise_tab, _fromUtf8(""))
        self.horizontalLayout_2.addWidget(self.preprocess_tab_2)
        self.attr_label = QtGui.QLabel(self.preprocess_tab)
        self.attr_label.setGeometry(QtCore.QRect(20, 60, 81, 17))
        self.attr_label.setObjectName(_fromUtf8("attr_label"))
        self.verticalLayoutWidget_3 = QtGui.QWidget(self.preprocess_tab)
        self.verticalLayoutWidget_3.setGeometry(QtCore.QRect(10, 80, 421, 251))
        self.verticalLayoutWidget_3.setObjectName(_fromUtf8("verticalLayoutWidget_3"))
        self.verticalLayout_3 = QtGui.QVBoxLayout(self.verticalLayoutWidget_3)
        self.verticalLayout_3.setObjectName(_fromUtf8("verticalLayout_3"))
        self.attr_checkbox1 = QtGui.QCheckBox(self.verticalLayoutWidget_3)
        self.attr_checkbox1.setObjectName(_fromUtf8("attr_checkbox1"))
        self.verticalLayout_3.addWidget(self.attr_checkbox1)
        self.attr_checkbox2 = QtGui.QCheckBox(self.verticalLayoutWidget_3)
        self.attr_checkbox2.setObjectName(_fromUtf8("attr_checkbox2"))
        self.verticalLayout_3.addWidget(self.attr_checkbox2)
        self.attr_checkbox3 = QtGui.QCheckBox(self.verticalLayoutWidget_3)
        self.attr_checkbox3.setObjectName(_fromUtf8("attr_checkbox3"))
        self.verticalLayout_3.addWidget(self.attr_checkbox3)
        self.attr_checkbox4 = QtGui.QCheckBox(self.verticalLayoutWidget_3)
        self.attr_checkbox4.setObjectName(_fromUtf8("attr_checkbox4"))
        self.verticalLayout_3.addWidget(self.attr_checkbox4)
        self.attr_checkbox5 = QtGui.QCheckBox(self.verticalLayoutWidget_3)
        self.attr_checkbox5.setObjectName(_fromUtf8("attr_checkbox5"))
        self.verticalLayout_3.addWidget(self.attr_checkbox5)
        self.remove_attr_btn = QtGui.QPushButton(self.preprocess_tab)
        self.remove_attr_btn.setGeometry(QtCore.QRect(10, 330, 419, 26))
        self.remove_attr_btn.setObjectName(_fromUtf8("remove_attr_btn"))
        main_tab.addTab(self.preprocess_tab, _fromUtf8(""))
        # --- Regression tab: nested tab widget with one page per method.
        self.regression_tab = QtGui.QWidget()
        self.regression_tab.setObjectName(_fromUtf8("regression_tab"))
        self.horizontalWidget_4 = QtGui.QWidget(self.regression_tab)
        self.horizontalWidget_4.setGeometry(QtCore.QRect(0, 0, 981, 51))
        self.horizontalWidget_4.setObjectName(_fromUtf8("horizontalWidget_4"))
        self.horizontalLayout_6 = QtGui.QHBoxLayout(self.horizontalWidget_4)
        self.horizontalLayout_6.setObjectName(_fromUtf8("horizontalLayout_6"))
        self.regression_tab_2 = QtGui.QTabWidget(self.horizontalWidget_4)
        self.regression_tab_2.setObjectName(_fromUtf8("regression_tab_2"))
        self.linearreg_tab = QtGui.QWidget()
        self.linearreg_tab.setObjectName(_fromUtf8("linearreg_tab"))
        self.regression_tab_2.addTab(self.linearreg_tab, _fromUtf8(""))
        self.polyreg_tab = QtGui.QWidget()
        self.polyreg_tab.setObjectName(_fromUtf8("polyreg_tab"))
        self.regression_tab_2.addTab(self.polyreg_tab, _fromUtf8(""))
        self.leastsqreg_tab = QtGui.QWidget()
        self.leastsqreg_tab.setObjectName(_fromUtf8("leastsqreg_tab"))
        self.regression_tab_2.addTab(self.leastsqreg_tab, _fromUtf8(""))
        self.logisticreg_tab = QtGui.QWidget()
        self.logisticreg_tab.setObjectName(_fromUtf8("logisticreg_tab"))
        self.regression_tab_2.addTab(self.logisticreg_tab, _fromUtf8(""))
        self.gdreg_tab = QtGui.QWidget()
        self.gdreg_tab.setObjectName(_fromUtf8("gdreg_tab"))
        self.regression_tab_2.addTab(self.gdreg_tab, _fromUtf8(""))
        self.horizontalLayout_6.addWidget(self.regression_tab_2)
        main_tab.addTab(self.regression_tab, _fromUtf8(""))
        # --- Classify tab: nested tab widget with one page per classifier.
        self.classify_tab = QtGui.QWidget()
        self.classify_tab.setObjectName(_fromUtf8("classify_tab"))
        self.horizontalWidget_3 = QtGui.QWidget(self.classify_tab)
        self.horizontalWidget_3.setGeometry(QtCore.QRect(0, 0, 981, 51))
        self.horizontalWidget_3.setObjectName(_fromUtf8("horizontalWidget_3"))
        self.horizontalLayout_5 = QtGui.QHBoxLayout(self.horizontalWidget_3)
        self.horizontalLayout_5.setObjectName(_fromUtf8("horizontalLayout_5"))
        self.classifymethods_tab = QtGui.QTabWidget(self.horizontalWidget_3)
        self.classifymethods_tab.setElideMode(QtCore.Qt.ElideNone)
        self.classifymethods_tab.setObjectName(_fromUtf8("classifymethods_tab"))
        self.dt_tab = QtGui.QWidget()
        self.dt_tab.setObjectName(_fromUtf8("dt_tab"))
        self.classifymethods_tab.addTab(self.dt_tab, _fromUtf8(""))
        self.ensemble_tab = QtGui.QWidget()
        self.ensemble_tab.setObjectName(_fromUtf8("ensemble_tab"))
        self.classifymethods_tab.addTab(self.ensemble_tab, _fromUtf8(""))
        self.nn_tab = QtGui.QWidget()
        self.nn_tab.setObjectName(_fromUtf8("nn_tab"))
        self.classifymethods_tab.addTab(self.nn_tab, _fromUtf8(""))
        self.svm_tab = QtGui.QWidget()
        self.svm_tab.setObjectName(_fromUtf8("svm_tab"))
        self.classifymethods_tab.addTab(self.svm_tab, _fromUtf8(""))
        self.bn_tab = QtGui.QWidget()
        self.bn_tab.setObjectName(_fromUtf8("bn_tab"))
        self.classifymethods_tab.addTab(self.bn_tab, _fromUtf8(""))
        self.knn_tab = QtGui.QWidget()
        self.knn_tab.setObjectName(_fromUtf8("knn_tab"))
        self.classifymethods_tab.addTab(self.knn_tab, _fromUtf8(""))
        self.otherclassify_tab = QtGui.QWidget()
        self.otherclassify_tab.setObjectName(_fromUtf8("otherclassify_tab"))
        self.classifymethods_tab.addTab(self.otherclassify_tab, _fromUtf8(""))
        self.horizontalLayout_5.addWidget(self.classifymethods_tab)
        main_tab.addTab(self.classify_tab, _fromUtf8(""))
        # --- Cluster tab: nested tab widget with one page per algorithm.
        self.cluster_tab = QtGui.QWidget()
        self.cluster_tab.setObjectName(_fromUtf8("cluster_tab"))
        self.horizontalWidget_5 = QtGui.QWidget(self.cluster_tab)
        self.horizontalWidget_5.setGeometry(QtCore.QRect(0, 0, 981, 51))
        self.horizontalWidget_5.setObjectName(_fromUtf8("horizontalWidget_5"))
        self.horizontalLayout_7 = QtGui.QHBoxLayout(self.horizontalWidget_5)
        self.horizontalLayout_7.setObjectName(_fromUtf8("horizontalLayout_7"))
        self.clustermethod_tab = QtGui.QTabWidget(self.horizontalWidget_5)
        self.clustermethod_tab.setObjectName(_fromUtf8("clustermethod_tab"))
        self.kmeans_tab = QtGui.QWidget()
        self.kmeans_tab.setObjectName(_fromUtf8("kmeans_tab"))
        self.clustermethod_tab.addTab(self.kmeans_tab, _fromUtf8(""))
        self.em_tab = QtGui.QWidget()
        self.em_tab.setObjectName(_fromUtf8("em_tab"))
        self.clustermethod_tab.addTab(self.em_tab, _fromUtf8(""))
        self.propcluster_tab = QtGui.QWidget()
        self.propcluster_tab.setObjectName(_fromUtf8("propcluster_tab"))
        self.clustermethod_tab.addTab(self.propcluster_tab, _fromUtf8(""))
        self.spectralcluster_tab = QtGui.QWidget()
        self.spectralcluster_tab.setObjectName(_fromUtf8("spectralcluster_tab"))
        self.clustermethod_tab.addTab(self.spectralcluster_tab, _fromUtf8(""))
        self.aggcluster_tab = QtGui.QWidget()
        self.aggcluster_tab.setObjectName(_fromUtf8("aggcluster_tab"))
        self.clustermethod_tab.addTab(self.aggcluster_tab, _fromUtf8(""))
        self.dbscan_tab = QtGui.QWidget()
        self.dbscan_tab.setObjectName(_fromUtf8("dbscan_tab"))
        self.clustermethod_tab.addTab(self.dbscan_tab, _fromUtf8(""))
        self.horizontalLayout_7.addWidget(self.clustermethod_tab)
        self.horizontalWidget_5.raise_()
        self.clustermethod_tab.raise_()
        main_tab.addTab(self.cluster_tab, _fromUtf8(""))
        # --- Reduce tab: nested tab widget with one page per method.
        self.reduce_tab = QtGui.QWidget()
        self.reduce_tab.setObjectName(_fromUtf8("reduce_tab"))
        self.horizontalWidget_6 = QtGui.QWidget(self.reduce_tab)
        self.horizontalWidget_6.setGeometry(QtCore.QRect(0, 0, 981, 51))
        self.horizontalWidget_6.setObjectName(_fromUtf8("horizontalWidget_6"))
        self.horizontalLayout_8 = QtGui.QHBoxLayout(self.horizontalWidget_6)
        self.horizontalLayout_8.setObjectName(_fromUtf8("horizontalLayout_8"))
        self.reducemethods_tab = QtGui.QTabWidget(self.horizontalWidget_6)
        self.reducemethods_tab.setObjectName(_fromUtf8("reducemethods_tab"))
        self.pca_tab = QtGui.QWidget()
        self.pca_tab.setObjectName(_fromUtf8("pca_tab"))
        self.reducemethods_tab.addTab(self.pca_tab, _fromUtf8(""))
        self.ica_tab = QtGui.QWidget()
        self.ica_tab.setObjectName(_fromUtf8("ica_tab"))
        self.reducemethods_tab.addTab(self.ica_tab, _fromUtf8(""))
        self.rpa_tab = QtGui.QWidget()
        self.rpa_tab.setObjectName(_fromUtf8("rpa_tab"))
        self.reducemethods_tab.addTab(self.rpa_tab, _fromUtf8(""))
        self.horizontalLayout_8.addWidget(self.reducemethods_tab)
        main_tab.addTab(self.reduce_tab, _fromUtf8(""))
        # --- Visualize and Other tabs: empty pages.
        self.visualize_tab = QtGui.QWidget()
        self.visualize_tab.setObjectName(_fromUtf8("visualize_tab"))
        main_tab.addTab(self.visualize_tab, _fromUtf8(""))
        self.other_tab = QtGui.QWidget()
        self.other_tab.setObjectName(_fromUtf8("other_tab"))
        main_tab.addTab(self.other_tab, _fromUtf8(""))

        # Set the visible texts, select the first page of every tab widget
        # and auto-connect slots by object name.
        self.retranslateUi(main_tab)
        main_tab.setCurrentIndex(0)
        self.preprocess_tab_2.setCurrentIndex(0)
        self.regression_tab_2.setCurrentIndex(0)
        self.classifymethods_tab.setCurrentIndex(0)
        self.clustermethod_tab.setCurrentIndex(0)
        self.reducemethods_tab.setCurrentIndex(0)
        QtCore.QMetaObject.connectSlotsByName(main_tab)

    def retranslateUi(self, main_tab):
        """
        Set the window title and the text of every button, label, checkbox
        and tab, passing each string through _translate in the "main_tab"
        context.
        """
        main_tab.setWindowTitle(_translate("main_tab", "Scikit GUI", None))
        self.openfile_btn.setText(_translate("main_tab", "Open File", None))
        self.openurl_btn.setText(_translate("main_tab", "Open URL", None))
        self.generate_btn.setText(_translate("main_tab", "Generate", None))
        self.datainfo_label.setText(_translate("main_tab", "Dataset Information", None))
        self.dataplotter_label.setText(_translate("main_tab", "Dataset Plotter", None))
        main_tab.setTabText(main_tab.indexOf(self.upload_tab), _translate("main_tab", "Upload", None))
        self.preprocess_tab_2.setTabText(self.preprocess_tab_2.indexOf(self.normalize_tab), _translate("main_tab", "Normalize", None))
        self.preprocess_tab_2.setTabText(self.preprocess_tab_2.indexOf(self.standardize_tab), _translate("main_tab", "Standardize", None))
        self.preprocess_tab_2.setTabText(self.preprocess_tab_2.indexOf(self.binarize_tab), _translate("main_tab", "Binarize", None))
        self.preprocess_tab_2.setTabText(self.preprocess_tab_2.indexOf(self.impute_tab), _translate("main_tab", "Impute", None))
        self.preprocess_tab_2.setTabText(self.preprocess_tab_2.indexOf(self.noise_tab), _translate("main_tab", "Add Noise", None))
        self.attr_label.setText(_translate("main_tab", "Attributes", None))
        self.attr_checkbox1.setText(_translate("main_tab", "Attribute 1", None))
        self.attr_checkbox2.setText(_translate("main_tab", "Attribute 2", None))
        self.attr_checkbox3.setText(_translate("main_tab", "Attribute 3", None))
        self.attr_checkbox4.setText(_translate("main_tab", "Attribute 4", None))
        self.attr_checkbox5.setText(_translate("main_tab", "Attribute 5", None))
        self.remove_attr_btn.setText(_translate("main_tab", "Remove", None))
        main_tab.setTabText(main_tab.indexOf(self.preprocess_tab), _translate("main_tab", "Preprocess", None))
        self.regression_tab_2.setTabText(self.regression_tab_2.indexOf(self.linearreg_tab), _translate("main_tab", "Linear", None))
        self.regression_tab_2.setTabText(self.regression_tab_2.indexOf(self.polyreg_tab), _translate("main_tab", "Polynomial", None))
        self.regression_tab_2.setTabText(self.regression_tab_2.indexOf(self.leastsqreg_tab), _translate("main_tab", "Least Square", None))
        self.regression_tab_2.setTabText(self.regression_tab_2.indexOf(self.logisticreg_tab), _translate("main_tab", "Logistic", None))
        self.regression_tab_2.setTabText(self.regression_tab_2.indexOf(self.gdreg_tab), _translate("main_tab", "Gradient Descent", None))
        main_tab.setTabText(main_tab.indexOf(self.regression_tab), _translate("main_tab", "Regression", None))
        self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.dt_tab), _translate("main_tab", "Decision Tree", None))
        self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.ensemble_tab), _translate("main_tab", "Ensemble", None))
        self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.nn_tab), _translate("main_tab", "Neural Networks", None))
        self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.svm_tab), _translate("main_tab", "SVM", None))
        self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.bn_tab), _translate("main_tab", "Bayes Nets", None))
        self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.knn_tab), _translate("main_tab", "kNN", None))
        self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.otherclassify_tab), _translate("main_tab", "Others", None))
        main_tab.setTabText(main_tab.indexOf(self.classify_tab), _translate("main_tab", "Classify", None))
        self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.kmeans_tab), _translate("main_tab", "kMeans", None))
        self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.em_tab), _translate("main_tab", "EM", None))
        self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.propcluster_tab), _translate("main_tab", "Affinity Propogation", None))
        self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.spectralcluster_tab), _translate("main_tab", "Spectral", None))
        self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.aggcluster_tab), _translate("main_tab", "Agglomerative", None))
        self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.dbscan_tab), _translate("main_tab", "DBSCAN", None))
        main_tab.setTabText(main_tab.indexOf(self.cluster_tab), _translate("main_tab", "Cluster", None))
        self.reducemethods_tab.setTabText(self.reducemethods_tab.indexOf(self.pca_tab), _translate("main_tab", "PCA", None))
        self.reducemethods_tab.setTabText(self.reducemethods_tab.indexOf(self.ica_tab), _translate("main_tab", "ICA", None))
        self.reducemethods_tab.setTabText(self.reducemethods_tab.indexOf(self.rpa_tab), _translate("main_tab", "Random Projection", None))
        main_tab.setTabText(main_tab.indexOf(self.reduce_tab), _translate("main_tab", "Reduce", None))
        main_tab.setTabText(main_tab.indexOf(self.visualize_tab), _translate("main_tab", "Visualize", None))
        main_tab.setTabText(main_tab.indexOf(self.other_tab), _translate("main_tab", "Other", None))


--------------------------------------------------------------------------------