├── pk
    ├── __init__.py
    ├── main
    │   ├── __init__.py
    │   ├── src
    │   │   ├── __init__.py
    │   │   └── main.py
    │   └── ui
    │   │   ├── __init__.py
    │   │   ├── main_gui.ui
    │   │   └── main_gui.py
    ├── utils
    │   ├── __init__.py
    │   ├── prygress.py
    │   ├── imaging.py
    │   ├── metrics.py
    │   ├── preprocess.py
    │   ├── regression_utils.py
    │   ├── classification_utils.py
    │   ├── clustering.py
    │   └── loading.py
    ├── tests
    │   ├── Wine.xls
    │   ├── __init__.py
    │   ├── blank.csv
    │   ├── iris.csv
    │   ├── test_imports.py
    │   ├── test_models.py
    │   ├── test_cl_gui.py
    │   ├── test_preprocessing.py
    │   ├── test_classification.py
    │   ├── ratings_best.arff
    │   ├── test_regression.py
    │   ├── iris2.csv
    │   ├── faithful.csv
    │   ├── test_loading.py
    │   ├── credit-g.arff
    │   └── correct_array.pkl
    ├── controller.py
    └── models.py
├── setup.cfg
├── Makefile
├── install.sh
├── .gitignore
├── README.md
└── cl_gui.py


/pk/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pk/main/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pk/main/src/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pk/main/ui/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pk/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pk/tests/Wine.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bsuhagia/Pykit-Learn/HEAD/pk/tests/Wine.xls


--------------------------------------------------------------------------------
/pk/tests/__init__.py:
--------------------------------------------------------------------------------
import sys
import os

# Append the directory that contains this file to sys.path.
sys.path.append(os.path.abspath(os.path.dirname(__file__)))


--------------------------------------------------------------------------------
/pk/tests/blank.csv:
--------------------------------------------------------------------------------
1 | "F1","F2","F3","F4","Class"
2 | 1,2,3,4,good
3 | 2,3,4,5,good
4 | 1,?,?,3,bad
5 | 2,3,4,5,good
6 | ?,?,?,?,bad
7 | ?,0,s,1,bad
8 | 1,2,3,4,good
9 | 


--------------------------------------------------------------------------------
/pk/tests/iris.csv:
--------------------------------------------------------------------------------
1 | "Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species"
2 | 5.8,4,1.2,0.2,"setosa"
3 | 5.9,3,4.2,1.5,"versicolor"
4 | 6.5,3.2,5.1,2,"virginica"
5 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [nosetests]
2 | verbosity = 2
3 | detailed-errors = 1
4 | with-doctest = 1
5 | doctest-tests = 1
6 | doctest-extension = rst
7 | doctest-fixtures = _fixture
8 | #doctest-options = +ELLIPSIS,+NORMALIZE_WHITESPACE
9 | where = pk/tests/


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | test:
 2 | 	@nosetests -a !slow
 3 | 
 4 | test-all:
 5 | 	@nosetests
 6 | 
 7 | clean:
 8 | 	@find . -name *.pyc -type f -delete
 9 | 	@find . -name cl_gui.log -type f -delete
10 | 
11 | install:
12 | 	@chmod 755 ./install.sh
13 | 	@./install.sh
14 | 


--------------------------------------------------------------------------------
/pk/tests/test_imports.py:
--------------------------------------------------------------------------------
 1 | # Author: Sean Dai
 2 | 
 3 | def test_imports():
 4 |     """
 5 |     Required modules are installed.
 6 |     """
 7 |     import sklearn
 8 |     import matplotlib
 9 |     import seaborn
10 |     import numpy
11 |     import scipy
12 |     import pandas
13 |     import PyQt4
14 |     import PIL


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Install pip if not already installed
 4 | if ! pip_loc="$(type -p "pip")" || [ -z "$pip_loc" ]; then
 5 |     sudo curl https://bootstrap.pypa.io/ez_setup.py -o - | sudo python
 6 | fi
 7 | sudo easy_install pip
 8 | 
 9 | # Python module dependencies
10 | sudo pip install sklearn
11 | sudo pip install numpy
12 | sudo pip install matplotlib
13 | sudo pip install pandas
14 | sudo pip install scipy
15 | sudo pip install seaborn
16 | sudo pip install Pillow
17 | sudo pip install nose
18 | 
19 | PROJ_DIR=`pwd`
20 | echo `export PYTHONPATH=$PYTHONPATH:$PROJ_DIR` >> ~/.bash_profile
21 | 
22 | 


--------------------------------------------------------------------------------
/pk/tests/test_models.py:
--------------------------------------------------------------------------------
 1 | from pk.models import *
 2 | from sklearn.datasets import load_iris
 3 | from sklearn.tree import DecisionTreeClassifier
 4 | from sklearn.mixture import GMM
 5 | from nose.tools import raises
 6 | 
 7 | def test_alg_creation():
 8 |     alg = Algorithm(DecisionTreeClassifier())
 9 |     assert alg.clf_name == 'DecisionTreeClassifier'
10 | 
def test_fit_supervised_algorithm_with_dt():
    """Fitting flips ``fitted`` and fills in the estimator's ``tree_``."""
    dataset = load_iris()
    features = dataset.data
    targets = dataset.target

    wrapper = SupervisedAlgorithm(DecisionTreeClassifier())
    # State before training.
    assert wrapper.fitted == False
    assert wrapper.params['tree_'] is None

    wrapper.fit(features, targets)
    # State after training.
    assert wrapper.fitted == True
    assert wrapper.params['tree_'] is not None
22 | 
@raises(Exception)
def test_predict_untrained():
    """Calling predict before fit must raise."""
    untrained = UnsupervisedAlgorithm(GMM())
    sample = [0]
    untrained.predict(sample)
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------
/pk/utils/prygress.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import time
 3 | import threading
 4 | from functools import wraps
 5 | 
 6 | def progress(function=None, stream=sys.stdout, char='.', pause=0.2):
 7 |     """Shows a progress bar while a function runs."""
 8 |     if function is None:
 9 |         return lambda func: progress(func, stream, char, pause)
10 | 
11 |     @wraps(function)
12 |     def wrap_function(*args, **kwargs):
13 |         stop = False
14 | 
15 |         def progress_bar():
16 |             stream.write('')
17 |             while not stop:
18 |                 stream.write(char)
19 |                 stream.flush()
20 |                 time.sleep(pause)
21 |             stream.flush()
22 |         
23 |         try:
24 |             p = threading.Thread(target=progress_bar)
25 |             p.start()
26 |             return function(*args, **kwargs)
27 |         finally:
28 |             stop = True
29 | 
30 |     return wrap_function
31 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
58 | 
59 | # IntelliJ Files
60 | .idea/
61 | 
62 | # IPython Checkpoints
63 | pk/.ipynb_checkpoints/
64 | *.ipynb
65 | 
66 | # Runtime Folder
67 | temp/
68 | 
69 | # Logging files
70 | *.log
71 | 
72 | 
73 | 


--------------------------------------------------------------------------------
/pk/main/src/main.py:
--------------------------------------------------------------------------------
 1 | from PyQt4 import QtCore, QtGui
 2 | from PyQt4.QtCore import *
 3 | from PyQt4.QtGui import *
 4 | from pk.main.ui.main_gui import Ui_main_tab
 5 | import sys
 6 | from pk.utils.loading import *
 7 | 
 8 | class MainWindow(QtGui.QTabWidget, Ui_main_tab):
 9 |     def __init__(self, parent=None):
10 |         QtGui.QWidget.__init__(self, parent)
11 |         self.setupUi(self)
12 | # GUI functions go here
class FileOpener(object):
    """Loads a dataset file by dispatching on the file extension."""

    def __init__(self, filename):
        self.filename = filename

    @staticmethod
    def load_file(filename):
        """Load ``filename`` with the loader that matches its extension.

        The extension is compared case-insensitively.

        Parameters:
            filename: path of the file to load.

        Returns:
            Whatever ``load_csv``, ``load_arff`` or ``load_excel`` returns
            for the file.

        Raises:
            ValueError: if the extension is not .csv, .arff, .xls or .xlsx.
        """
        import os

        # splitext only looks at the final path component, so a dot in a
        # directory name is not mistaken for an extension.
        extension = os.path.splitext(filename)[1].lower()
        if extension == '.csv':
            return load_csv(filename)
        if extension == '.arff':
            return load_arff(filename)
        if extension in ('.xls', '.xlsx'):
            return load_excel(filename)
        # Fail loudly instead of handing None back to a caller that is
        # going to unpack the result.
        raise ValueError("Unsupported file type for %r: expected .csv, "
                         ".arff, .xls or .xlsx" % (filename,))
25 | 
26 | 
27 | 
def openfile():
    """Ask the user to pick a dataset file and load it.

    Returns:
        A ``(X, y)`` tuple taken from the loader's result, or None when the
        dialog is dismissed without a selection.
    """
    # The first positional argument of getOpenFileName is the parent
    # widget, so it is passed explicitly ahead of the caption and directory.
    filename = str(QFileDialog.getOpenFileName(None, 'Open File', '/'))
    if not filename:
        # Cancelled dialog: there is nothing to load.
        return None
    loaded = FileOpener.load_file(filename)
    # NOTE(review): load_csv appears to return three items elsewhere in the
    # project; indexing keeps this working for two or more -- confirm.
    return loaded[0], loaded[1]
33 | 
34 | 
35 | 
36 | # Scikit functions go here
37 | 
38 | # main function to run the program
def main():
    """Create the Qt application, show the main window and run the event loop."""
    application = QtGui.QApplication(sys.argv)
    window = MainWindow()
    # Wire the "open file" button to the loader before showing the window.
    window.openfile_btn.clicked.connect(openfile)
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)


# Script entry point.
if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------
/pk/tests/test_cl_gui.py:
--------------------------------------------------------------------------------
 1 | """This module tests the command line GUI.
 2 |     Author: Sean Dai
 3 | """
 4 | import cl_gui
 5 | from nose.plugins.attrib import attr
 6 | from nose.tools import nottest
 7 | from nose.tools import assert_raises
 8 | from nose.tools import assert_true
 9 | import os
10 | 
def setup():
    """Change to the directory three levels above this file and set up cl_gui."""
    relative_root = os.path.join(__file__, '../../..')
    project_root = os.path.abspath(relative_root)
    os.chdir(project_root)
    cl_gui.setup()
14 | 
def td():
    """Tear down: quitting the GUI is expected to raise SystemExit."""
    assert_raises(SystemExit, cl_gui.quit_gui)
18 | 
@nottest
def get_test_accuracy():
    """Empty placeholder, marked ``nottest``."""
    return None
22 | 
@attr('slow')
def test_visualize_iris():
    """After ``visualize`` the three plot images are present in ``_temp/``."""
    setup()
    for command in ('load pk/tests/iris.csv', 'visualize --suppress'):
        cl_gui.process(command)
    produced = os.listdir('_temp/')
    expected_plots = ('plot_andrews.png', 'plot_frequency.png',
                      'plot_radial.png')
    for plot_name in expected_plots:
        assert_true(plot_name in produced)
    td()
33 | 
@attr('slow')
def test_preprocess_flow():
    """Load, preprocess and plot; the radial plot must be in ``_temp/``."""
    setup()
    commands = (
        'load pk/tests/iris2.csv',
        'preprocess -std -norm',
        'plot_radial --suppress',
    )
    for command in commands:
        cl_gui.process(command)
    produced = os.listdir('_temp/')
    assert_true('plot_radial.png' in produced)
    td()
43 | 
@attr('slow')
def test_run_decision_tree():
    """The decision-tree ``run`` command completes on the iris2 data."""
    setup()
    for command in ('load pk/tests/iris2.csv',
                    'run -A dt -test_ratio .5 -cv 15'):
        cl_gui.process(command)
    td()
50 | 
@attr('slow')
def test_plot_2d():
    """``plot_2d`` on the random data set leaves its image in ``_temp/``."""
    setup()
    for command in ('load_random', 'plot_2d --suppress'):
        cl_gui.process(command)
    produced = os.listdir('_temp/')
    assert_true('plot_2d.png' in produced)
    td()


--------------------------------------------------------------------------------
/pk/controller.py:
--------------------------------------------------------------------------------
 1 | """This file contains classes and functions for controller objects.
 2 |     Author: Sean Dai
 3 | """
 4 | from PyQt4 import QtGui
 5 | from PyQt4.QtGui import QFileDialog
 6 | from PyQt4.QtGui import *
 7 | from PyQt4.QtCore import *
 8 | from PyQt4.QtGui import QDialogButtonBox
 9 | 
class ViewGenerator(object):
    """Opens the Qt dialogs and windows used to collect input from the user."""

    def open_file_dialog(self, app, filter):
        """
        Opens a file dialog for the user to select the desired file.

        app: object whose closeAllWindows() is called once the dialog returns.
        filter: passed through as the dialog's ``filter`` argument.

        Returns the value returned by the dialog, converted with str().
        """
        # Temporary widget that only serves as the dialog's parent.
        frame = QtGui.QWidget()
        path = QFileDialog.getOpenFileName(parent=frame, caption="Open File",
                                           filter=filter)
        frame.destroy()
        app.closeAllWindows()
        return str(path)

    def get_preprocess_options(self, app):
        """
        Opens a new window for selecting preprocessing options for the dataset.

        The window is shown and app.exec_() is then run. Nothing is returned;
        the state of the checkboxes is not read back here.
        """
        class PreprocessFrame(QtGui.QWidget):
            def __init__(self):
                QtGui.QWidget.__init__(self)
                layout = QVBoxLayout(self)
                self.setWindowTitle("Preprocessing")
                self.cbox1 = QCheckBox("Normalize")
                self.cbox2 = QCheckBox("Standardize")
                self.cbox3 = QCheckBox("Remove examples containing:")

                # Line edit pre-filled with "?"; it is created with cbox3 as
                # its parent and then added to the layout below.
                text_area = QLineEdit(self.cbox3)
                text_area.setText("?")
                dbx = QDialogButtonBox(self)
                dbx.setStandardButtons(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
                # The button box signals are not connected: the two connect
                # calls below are disabled.
                # self.connect(dbx, SIGNAL("accepted()"), dbx, SLOT("accept()"))
                # self.connect(dbx, SIGNAL("rejected()"), dbx, SLOT("reject()"))
                layout.addWidget(self.cbox1)
                layout.addWidget(self.cbox2)
                layout.addWidget(self.cbox3)
                layout.addWidget(text_area)
                layout.addWidget(dbx, alignment=Qt.AlignCenter)
                self.setLayout(layout)

        pf = PreprocessFrame()
        pf.show()
        app.exec_()
52 | 


--------------------------------------------------------------------------------
/pk/utils/imaging.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | from sklearn.feature_extraction.image import grid_to_graph
 4 | from sklearn.cluster import AgglomerativeClustering
 5 | import matplotlib.pyplot as plt
 6 | import numpy as np
 7 | from scipy.ndimage import imread
 8 | from sklearn.cluster import DBSCAN
 9 | 
10 | 
def segment_image(im_file, n_segments=5, alg='ac'):
    """Cluster the pixels of an image and return the per-pixel labels.

    For an image with more than two dimensions only the first channel is
    used; a two-dimensional image is used as it is.

    Parameters:
        im_file: image passed to ``imread``.
        n_segments: number of clusters for the 'ac' algorithm; not used
            by 'dbscan'.
        alg: 'ac' for agglomerative clustering with grid connectivity,
            'dbscan' for DBSCAN.

    Returns:
        Array of cluster labels with the same height and width as the
        (single-channel) image.

    Raises:
        ValueError: if ``alg`` is neither 'ac' nor 'dbscan'.
    """
    # Validate before reading the image so that a bad argument fails
    # immediately and with a clear message.
    if alg not in ('ac', 'dbscan'):
        raise ValueError("Unknown segmentation algorithm %r; expected 'ac' "
                         "or 'dbscan'" % (alg,))

    img = imread(im_file)
    if img.ndim > 2:
        img = img[:, :, 0]
    X = np.reshape(img, (-1, 1))

    if alg == 'ac':
        # Define the structure A of the data. Pixels connected to their neighbors.
        connectivity = grid_to_graph(*img.shape)

        # Compute clustering
        print("Compute structured hierarchical clustering...")
        st = time.time()
        clustering = AgglomerativeClustering(n_clusters=n_segments,
                linkage='complete', connectivity=connectivity).fit(X)
        label = np.reshape(clustering.labels_, img.shape)
    else:
        print("Compute DBScan clustering...")
        st = time.time()
        dbs = DBSCAN(eps=1).fit(X)
        label = np.reshape(dbs.labels_, img.shape)

    print("Elapsed time: ", time.time() - st)
    print("Number of pixels: ", label.size)
    print("Number of clusters: ", np.unique(label).size)

    return label
38 | 
def visualize_segments(label, type='mask', im_file=None):
    """Display a segmentation produced as a 2-D array of cluster labels.

    Parameters:
        label: 2-D array of cluster labels.
        type: 'mask' shows the label array itself; 'contour' draws one
            contour per cluster on top of the image read from ``im_file``.
        im_file: image to draw under the contours; when it is None in
            'contour' mode nothing is drawn.
    """
    if type == 'mask':
        plt.imshow(label, cmap=plt.cm.Paired)
    elif type == 'contour':
        if im_file is not None:
            img = imread(im_file)
            # Walk the labels that actually occur instead of assuming they
            # are 0..n-1: DBSCAN, for instance, labels noise as -1.
            cluster_ids = np.unique(label)
            n_clusters = cluster_ids.size
            plt.imshow(img, cmap=plt.cm.gray)
            for position, cluster_id in enumerate(cluster_ids):
                plt.contour(label == cluster_id, contours=1,
                            colors=[plt.cm.spectral(
                                position/float(n_clusters)),])
    plt.xticks(())
    plt.yticks(())
    plt.show()
54 | 
55 | # im_file = "/Users/sd/Downloads/sample_images/biking.jpg"
56 | # segment_labels = segment_image(im_file, n_segments=15, alg='ac')
57 | # visualize_segments(segment_labels, type='contour', im_file=im_file)
58 | # visualize_segments(segment_labels, type='mask', im_file=im_file)


--------------------------------------------------------------------------------
/pk/tests/test_preprocessing.py:
--------------------------------------------------------------------------------
 1 | # Author: Sean Dai
 2 | import os
 3 | 
 4 | from numpy.testing import assert_array_equal
 5 | from nose.tools import assert_true
 6 | from numpy.testing import assert_array_almost_equal
 7 | 
 8 | from pk.utils.preprocess import PreprocessingEngine
 9 | from sklearn.datasets import load_digits
10 | from sklearn.datasets import load_boston
11 | from sklearn.preprocessing import StandardScaler
12 | from sklearn.preprocessing import Normalizer
13 | 
14 | from pk.utils.loading import load_csv
15 | import numpy as np
16 | 
# Absolute path of the directory containing this file, with a trailing slash.
__DIR_NAME = os.path.abspath(os.path.dirname(__file__)) + '/'
# Engine instance shared by the tests in this module.
pe = PreprocessingEngine()
19 | 
def test_standardize():
    """``standardize`` agrees element-wise with ``StandardScaler`` on digits."""
    X = load_digits().data
    actual = pe.standardize(X)
    expected = StandardScaler().fit_transform(X)
    matches = (actual == expected)
    assert_true(matches.all())
24 | 
def test_normalize_data():
    """``normalize_data`` agrees element-wise with ``Normalizer`` on boston."""
    X = load_boston().data
    actual = pe.normalize_data(X)
    expected = Normalizer().fit_transform(X)
    matches = (actual == expected)
    assert_true(matches.all())
29 | 
def test_remove_incomplete_examples():
    """Only the four complete rows of blank.csv remain after filtering on '?'."""
    X, y, _ = load_csv(__DIR_NAME + 'blank.csv')
    assert len(X) == len(y)
    X, y = pe.remove_incomplete_examples(X, y, '?')

    complete_rows = [[1, 2, 3, 4],
                     [2, 3, 4, 5],
                     [2, 3, 4, 5],
                     [1, 2, 3, 4]]
    exp_X = np.array(complete_rows).astype('str')
    exp_y = np.array(['good'] * 4)
    assert_array_equal(X, exp_X)
    assert_array_equal(y, exp_y)
43 | 
def test_label_encoder():
    """``encode_labels`` then ``convert_to_float_array`` gives the expected codes."""
    raw = np.array([['a','b',1], ['a','a',11], ['b','b',13], ['c', 'c', 100]])
    expected = np.array([[0, 1, 1],
                         [0, 0, 11],
                         [1, 1, 13],
                         [2, 2, 100]])
    encoded = pe.encode_labels(raw)
    as_floats = pe.convert_to_float_array(encoded)
    assert_array_almost_equal(as_floats, expected)
52 | 
def test_binarize():
    """``binarize`` maps each of three labels to its own indicator column."""
    labels = ['a', 'b', 'c', 'a']
    row_a, row_b, row_c = [1, 0, 0], [0, 1, 0], [0, 0, 1]
    expected = [row_a, row_b, row_c, row_a]
    assert_array_almost_equal(pe.binarize(labels), expected)
60 | 
def test_inpute_missing_values():
    """'NaN' entries are filled in; the expected fill equals the column mean here."""
    raw = np.array([[1,2,'NaN'], [3,'NaN',5], [1,2,3]])
    encoded = pe.encode_labels(raw)
    imputed = pe.impute_missing_values(encoded, missing_values='NaN')
    expected = np.array([[1., 2., 4.],
                         [3., 2., 5.],
                         [1., 2., 3.]])
    assert_array_almost_equal(imputed, expected)


--------------------------------------------------------------------------------
/pk/models.py:
--------------------------------------------------------------------------------
 1 | """ This module contains the model objects for the GUI.
 2 |     Author: Sean Dai
 3 | """
 4 | from sklearn.base import BaseEstimator
 5 | from sklearn.base import clone
 6 | 
 7 | class BaseModel(object):
 8 |     """
 9 |     A base model class to hold information.
10 |     """
11 |     def __init__(self):
12 |         self.observers = []
13 |         self.data = None
14 | 
15 |     def add_observer(self, observer):
16 |         """
17 |         Register an observer.
18 |         """
19 |         self.observers.append(observer)
20 | 
21 |     def changed(self, event):
22 |         """
23 |         Notify observers of changes.
24 |         """
25 |         for obs in self.observers:
26 |             obs.update(event, self)
27 | 
class Algorithm(BaseModel):
    """
    This class wraps the machine learning algorithm around an object.

    Attributes:
        clf: the wrapped estimator.
        clf_name: class name of the wrapped estimator.
        fitted: True once a call to ``_fit`` has completed.
    """
    def __init__(self, clf=None):
        super(Algorithm, self).__init__()
        # Build the fallback estimator per instance: a default created in
        # the signature would be one object shared by every Algorithm.
        if clf is None:
            clf = BaseEstimator()
        self.clf = clf
        self.clf_name = type(clf).__name__
        self.fitted = False

    def __repr__(self):
        return str(vars(self))

    @property
    def params(self):
        """
        Gets the classifier parameters (the estimator's ``__dict__``).
        """
        return self.clf.__dict__

    def _fit(self, *args, **kwargs):
        """
        Runs the algorithm with the passed-in parameters.

        Returns whatever the estimator's ``fit`` returns.
        """
        result = self.clf.fit(*args, **kwargs)
        # Only flag the model as trained after fit() returned normally, so
        # a failed fit does not unlock predict().
        self.fitted = True
        return result

    def predict(self, X):
        """
        Predicts with the wrapped estimator.

        Raises Exception if the estimator has not been fitted yet.
        """
        if not self.fitted:
            raise Exception("Can't predict with untrained classifier!")
        return self.clf.predict(X)
60 | 
class SupervisedAlgorithm(Algorithm):
    """
    Algorithm wrapper whose ``fit`` takes both samples and targets.
    """
    def __init__(self, clf):
        super(SupervisedAlgorithm, self).__init__(clf)

    def fit(self, X, y):
        """Fit on (X, y) and keep the object returned by the fit as ``clf``."""
        fitted_estimator = self._fit(X, y)
        self.clf = fitted_estimator
70 | 
class UnsupervisedAlgorithm(Algorithm):
    """
    Algorithm wrapper whose ``fit`` takes samples only (eg. clustering, PCA).
    """
    def __init__(self, clf):
        super(UnsupervisedAlgorithm, self).__init__(clf)

    def fit(self, X):
        """Fit on X and keep the object returned by the fit as ``clf``."""
        fitted_estimator = self._fit(X)
        self.clf = fitted_estimator
80 | 
81 | 
82 | # from sklearn.tree import DecisionTreeClassifier
83 | # from sklearn.mixture import GMM
84 | # from pk.utils.loading import load_csv
85 | # X,y,_ = load_csv('tests/iris2.csv')
86 | # clf = GMM(n_components=3)
87 | # a = UnsupervisedAlgorithm(clf)
88 | # print a.params
89 | # print a.fitted
90 | # import time
91 | # s = time.time()
92 | # a.fit(X)
93 | # print "Took %f secs" % (time.time() - s)
94 | # print a.fitted
95 | # print a.params
96 | # print a
97 | # print a.clf.means_
98 | # print a.predict([[1,2,3,4]])
99 | 


--------------------------------------------------------------------------------
/pk/utils/metrics.py:
--------------------------------------------------------------------------------
 1 | # __author__ = 'Bhavesh'
 2 | #
 3 | from sklearn.metrics import confusion_matrix, explained_variance_score, mean_squared_error, mean_absolute_error, r2_score, adjusted_rand_score, adjusted_mutual_info_score, homogeneity_score, silhouette_score, v_measure_score
 4 | import matplotlib.pyplot as plt
 5 | import numpy as np
 6 | from sklearn import cross_validation
 7 | 
 8 | def get_confusion_matrix(clf, X, true_y):
 9 |     predicted_y = clf.predict(X)
10 |     matrix = confusion_matrix(true_y, predicted_y)
11 |     print 'Confusion Matrix is: \n%s' % matrix
12 |     return matrix
13 | 
14 | 
def plot_confusion_matrix(cm, y, title='Confusion matrix', cmap=plt.cm.Blues,
                          continuous_class=False):
    """Draw ``cm`` as an image, with the unique values of ``y`` as tick labels.

    Does nothing and returns None when ``continuous_class`` is true.
    The figure is shown without blocking.
    """
    if continuous_class:
        return None

    class_labels = np.unique(y)
    positions = np.arange(len(class_labels))

    plt.clf()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(positions, class_labels, rotation=45)
    plt.yticks(positions, class_labels)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show(block=False)
30 | 
31 | 
def get_train_accuracy(clf, X, y):
    """Return ``clf.score(X, y)`` as a percentage rounded to 5 decimals."""
    percentage = clf.score(X, y) * 100
    return round(percentage, 5)
34 | 
35 | 
def get_test_accuracy(clf, X, y):
    """Return ``clf.score(X, y)`` as a percentage rounded to 5 decimals."""
    fraction = clf.score(X, y)
    scaled = fraction * 100
    return round(scaled, 5)
38 | 
39 | 
def get_cv_accuracy(clf, X, y, cv=10):
    """Cross-validate ``clf`` on (X, y).

    Returns a pair: the scores from ``cross_val_score`` and their mean as a
    percentage rounded to 5 decimals.
    """
    fold_scores = cross_validation.cross_val_score(clf, X, y, cv=cv)
    mean_percentage = round(fold_scores.mean() * 100, 5)
    return fold_scores, mean_percentage
44 | 
45 | # def get_variance_score(clf, X_test, true_y):
46 | #     pred_y = clf.predict(X_test)
47 | #     return round(explained_variance_score(true_y, pred_y), 4)
48 | #
49 | # def get_mean_abs_error(clf, X_test, true_y):
50 | #     pred_y = clf.predict(X_test)
51 | #     return round(mean_absolute_error(true_y, pred_y), 4)
52 | #
53 | # def get_mean_squared_error(clf, X_test, true_y):
54 | #     pred_y = clf.predict(X_test)
55 | #     return round(mean_squared_error(true_y, pred_y), 4)
56 | #
57 | # def get_median_abs_error(clf, X_test, true_y):
58 | #     pred_y = clf.predict(X_test)
59 | #     return round(median_absolute_error(true_y, pred_y), 4)
60 | #
61 | # def get_r2_score(clf, X_test, true_y):
62 | #     pred_y = clf.predict(X_test)
63 | #     return round(r2_score(true_y, pred_y), 4)
64 | #
65 | # def get_adjusted_rand_index(clf, X_test, true_y):
66 | #     pred_y = clf.predict(X_test)
67 | #     return round(adjusted_rand_score(true_y, pred_y), 4)
68 | #
69 | # def get_adjusted_mutual_info(clf, X_test, true_y):
70 | #     pred_y = clf.predict(X_test)
71 | #     return round(adjusted_mutual_info_score(true_y, pred_y), 4)
72 | #
73 | # def get_homogeneity_score(clf, X_test, true_y):
74 | #     pred_y = clf.predict(X_test)
75 | #     return round(homogeneity_score(true_y, pred_y), 4)
76 | #
77 | # def get_vscore(clf, X_test, true_y):
78 | #     pred_y = clf.predict(X_test)
79 | #     return round(v_measure_score(true_y, pred_y), 4)
80 | #
81 | # def get_silhouette_score(clf, X):
82 | #     # pred_y = clf.predict(X_test)
83 | #     return round(silhouette_score(X, clf.means_, metric='euclidean'))
84 | #
85 | # def benchmark(X, y, training_func, *args, **kwargs):
86 | #     clf = training_func(X, y, *args, **kwargs)
87 | #     get_train_accuracy(clf, X, y)
88 | #     get_test_accuracy(clf, X, y)
89 | 


--------------------------------------------------------------------------------
/pk/tests/test_classification.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'Bhavesh'
 2 | 
 3 | """
 4 | from pk.utils.metrics import *
 5 | from pk.utils.classification_utils import *
 6 | from prettytable import PrettyTable
 7 | from pk.utils.loading import *
 8 | import warnings
 9 | 
10 | def runall_classification(X, y):
11 |     warnings.filterwarnings('ignore')
12 |     T = PrettyTable(["Method", "Train Accuracy (%)", "Test Accuracy (%)", "Cross Validation Accuracy (%)"])
13 |     X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3,random_state=0)
14 | 
15 |     dts = train_decision_tree(X_train, y_train)
16 |     dts_train_acc = get_train_accuracy(dts, X_train, y_train)
17 |     dts_test_acc = get_test_accuracy(dts, X_test, y_test)
18 |     _, dts_cv_acc = get_cv_accuracy(dts, X, y)
19 |     T.add_row((["Decision Tree", dts_train_acc, dts_test_acc, dts_cv_acc]))
20 | 
21 |     knn = train_knn(X_train, y_train)
22 |     knn_train_acc = get_train_accuracy(knn, X_train, y_train)
23 |     knn_test_acc = get_test_accuracy(knn, X_test, y_test)
24 |     _, knn_cv_acc = get_cv_accuracy(knn, X, y)
25 |     T.add_row((["Nearest Neighbor", knn_train_acc, knn_test_acc, knn_cv_acc]))
26 | 
27 |     svm = train_svm(X_train, y_train)
28 |     svm_train_acc = get_train_accuracy(svm, X_train, y_train)
29 |     svm_test_acc = get_test_accuracy(svm, X_test, y_test)
30 |     _, svm_cv_acc = get_cv_accuracy(svm, X, y)
31 |     T.add_row((["Support Vector Machine", svm_train_acc, svm_test_acc, svm_cv_acc]))
32 | 
33 |     nb = train_naive_bayes(X_train, y_train)
34 |     nb_train_acc = get_train_accuracy(nb, X_train, y_train)
35 |     nb_test_acc = get_test_accuracy(nb, X_test, y_test)
36 |     _, nb_cv_acc = get_cv_accuracy(nb, X, y)
37 |     T.add_row((["Naive Bayes", nb_train_acc, nb_test_acc, nb_cv_acc]))
38 | 
39 |     ada = train_adaboost(X_train, y_train, base_estimator=dts)
40 |     ada_train_acc = get_train_accuracy(ada, X_train, y_train)
41 |     ada_test_acc = get_test_accuracy(ada, X_test, y_test)
42 |     _, ada_cv_acc = get_cv_accuracy(ada, X, y)
43 |     T.add_row((["AdaBoost", ada_train_acc, ada_test_acc, ada_cv_acc]))
44 | 
45 |     lda = train_lda(X_train, y_train)
46 |     lda_train_acc = get_train_accuracy(lda, X_train, y_train)
47 |     lda_test_acc = get_test_accuracy(lda, X_test, y_test)
48 |     _, lda_cv_acc = get_cv_accuracy(lda, X, y)
49 |     T.add_row((["Linear Discriminant Analysis", lda_train_acc, lda_test_acc, lda_cv_acc]))
50 | 
51 |     qda = train_qda(X_train, y_train)
52 |     qda_train_acc = get_train_accuracy(qda, X_train, y_train)
53 |     qda_test_acc = get_test_accuracy(qda, X_test, y_test)
54 |     _, qda_cv_acc = get_cv_accuracy(qda, X, y)
55 |     T.add_row((["Quadratic Discriminant Analysis", qda_train_acc, qda_test_acc, qda_cv_acc]))
56 | 
57 |     bag = train_bagging(X_train, y_train, base_estimator=dts)
58 |     bag_train_acc = get_train_accuracy(bag, X_train, y_train)
59 |     bag_test_acc = get_test_accuracy(bag, X_test, y_test)
60 |     _, bag_cv_acc = get_cv_accuracy(bag, X, y)
61 |     T.add_row((["Bagging", bag_train_acc, bag_test_acc, bag_cv_acc]))
62 | 
63 |     rf = train_randomForest(X_train, y_train)
64 |     rf_train_acc = get_train_accuracy(rf, X_train, y_train)
65 |     rf_test_acc = get_test_accuracy(rf, X_test, y_test)
66 |     _, rf_cv_acc = get_cv_accuracy(rf, X, y)
67 |     T.add_row((["Random Forest", rf_train_acc, rf_test_acc, rf_cv_acc]))
68 | 
69 |     sgd = train_stochaticGradientDescent(X_train, y_train)
70 |     sgd_train_acc = get_train_accuracy(sgd, X_train, y_train)
71 |     sgd_test_acc = get_test_accuracy(sgd, X_test, y_test)
72 |     _, sgd_cv_acc = get_cv_accuracy(sgd, X, y)
73 |     T.add_row((["Stochastic Gradient Descent", sgd_train_acc, sgd_test_acc, sgd_cv_acc]))
74 |     print T
75 | 
76 | X_data, y_data, dataset = load_csv('iris2.csv')
77 | runall_classification(X_data,y_data)
78 | """


--------------------------------------------------------------------------------
/pk/tests/ratings_best.arff:
--------------------------------------------------------------------------------
 1 | @relation user-funness-rating
 2 | 
 3 | @attribute probBuildJump numeric
 4 | @attribute probBuildCannons numeric
 5 | @attribute probBuildHillStraight numeric
 6 | @attribute probBuildTubes numeric
 7 | @attribute probBuildStraight numeric
 8 | @attribute difficulty numeric
 9 | @attribute blocksCoins numeric
10 | @attribute blocksEmpty numeric
11 | @attribute blocksPower numeric
12 | @attribute enemies numeric
13 | @attribute fun numeric
14 | 
15 | @data
16 | 0.216954,0.193906,0.056133,0.080410,0.452598,0,5,14,1,0,0.000000
17 | 0.066983,0.000000,0.333621,0.423212,0.176184,0,2,3,0,1,0
18 | 0.200653,0.173610,0.625737,0.000000,0.000000,1,3,7,3,7,1.000000
19 | 0.118236,0.104627,0.114326,0.351013,0.311797,2,3,14,0,4,1.00000
20 | 0.187419,0.404103,0.268916,0.000000,0.139563,2,3,11,2,16,1.000000
21 | 0.484383,0.000000,0.000000,0.000000,0.515617,1,10,20,5,3,0.000000
22 | 0.241743,0.107256,0.100075,0.429201,0.121725,0,5,6,0,2,0.000000
23 | 0.000000,0.000000,0.058207,0.346389,0.595404,2,6,29,4,6,0.000000
24 | 0.371011,0.017317,0.451252,0.000000,0.160420,4,0,3,0,47,1.000000
25 | 0.235802,0.002202,0.723037,0.000000,0.038960,1,1,2,0,11,1.000000
26 | 0.379509,0.015927,0.368553,0.169505,0.066507,1,1,7,0,10,0.000000
27 | 0.087263,0.000000,0.462896,0.109292,0.340549,0,7,17,1,0,0.000000
28 | 0.705555,0.021981,0.046962,0.196528,0.028975,0,0,0,0,1,0.000000
29 | 0.268377,0.000000,0.000000,0.731623,0.000000,1,0,0,0,6,0.000000
30 | 0.091454,0.209781,0.205219,0.011092,0.482454,3,6,25,3,18,1.000000
31 | 0.047697,0.218361,0.063613,0.279565,0.390765,2,9,17,1,4,0.000000
32 | 0.000000,0.021094,0.214678,0.619140,0.145088,1,2,7,1,11,1.000000
33 | 0.633017,0.055214,0.311769,0.000000,0.000000,1,0,0,0,2,0.000000
34 | 0.000000,0.262451,0.000000,0.491089,0.246460,4,4,2,1,12,1.000000
35 | 0.040965,0.643791,0.061195,0.000000,0.254049,1,12,39,2,0,1.000000
36 | 0.105690,0.478350,0.156265,0.000000,0.259695,1,9,16,0,7,1.000000
37 | 0.367145,0.108050,0.289100,0.000000,0.235705,0,1,5,0,0,0.000000
38 | 0.216320,0.136071,0.167827,0.087172,0.392610,0,10,27,4,0,0.000000
39 | 0.177538,0.308546,0.163225,0.227862,0.122830,2,1,3,0,6,0.000000
40 | 0.234807,0.070752,0.297166,0.397275,0.000000,4,2,4,0,54,1.000000
41 | 0.239356,0.181646,0.000000,0.398061,0.180937,0,6,13,2,2,1.000000
42 | 0.640998,0.183456,0.000000,0.175546,0.000000,1,0,0,0,3,0.000000
43 | 0.083503,0.087156,0.159052,0.464611,0.205679,2,0,4,0,12,1.000000
44 | 0.000000,0.316622,0.000000,0.409672,0.273706,2,4,14,1,6,1.000000
45 | 0.030001,0.135286,0.519963,0.314750,0.000000,2,1,6,2,16,1.000000
46 | 0.206727,0.595252,0.141556,0.056465,0.000000,0,3,7,0,0,0.000000
47 | 0.000000,0.233946,0.000000,0.596895,0.169159,1,1,6,3,11,0.000000
48 | 0.023783,0.424868,0.203230,0.038263,0.309856,1,8,25,4,5,1.000000
49 | 0.000000,0.715850,0.000000,0.283413,0.000737,2,0,0,0,4,0.000000
50 | 0.000000,0.000000,0.627885,0.372115,0.000000,1,3,7,0,11,0.000000
51 | 0.189209,0.159652,0.166200,0.327410,0.157529,3,3,1,0,33,1.000000
52 | 0.129294,0.000000,0.000000,0.622321,0.248385,1,4,14,0,7,0.000000
53 | 0.172677,0.353326,0.246494,0.227223,0.000280,2,2,4,0,20,1.000000
54 | 0.000000,0.001455,0.172851,0.303352,0.522343,2,10,16,2,9,0.000000
55 | 0.367742,0.000000,0.207829,0.275175,0.149254,0,1,5,0,0,0.000000
56 | 0.000000,0.569791,0.030189,0.215236,0.184784,4,1,5,0,4,0.000000
57 | 0.044086,0.000000,0.242088,0.618698,0.095129,1,3,3,0,10,1.000000
58 | 0.062823,0.012348,0.104718,0.126612,0.693498,0,18,39,5,0,1.000000
59 | 0.302281,0.451045,0.000000,0.191085,0.055588,3,0,0,0,3,1.000000
60 | 0.000000,0.000000,0.000000,0.000000,1.000000,4,16,48,4,38,1.000000
61 | 0.000000,0.000000,0.330264,0.416459,0.253277,2,5,9,1,20,1.000000
62 | 0.130570,0.079232,0.704769,0.048698,0.036732,1,2,4,0,15,0.000000
63 | 0.211736,0.171886,0.217981,0.304112,0.094285,2,0,5,1,19,1.000000
64 | 0.000000,0.482511,0.179350,0.181636,0.156503,1,1,2,0,12,0.000000
65 | 0.169489,0.127587,0.362061,0.127720,0.213142,1,5,15,1,6,0.000000
66 | 0.063654,0.483960,0.082888,0.271711,0.097787,2,3,9,0,16,1.000000
67 | 0.187733,0.450175,0.293570,0.000000,0.068522,2,2,1,0,17,1.000000
68 | 0.072522,0.672800,0.080936,0.160421,0.013321,4,0,0,0,22,1.000000
69 | 0.158278,0.063573,0.428389,0.349759,0.000000,2,0,2,1,19,0.000000
70 | 0.000000,0.419720,0.247382,0.310430,0.022467,2,2,10,0,21,1.000000
71 | 0.169931,0.143725,0.246039,0.440305,0.000000,3,0,4,0,50,0.000000
72 | 


--------------------------------------------------------------------------------
/pk/tests/test_regression.py:
--------------------------------------------------------------------------------
 1 | """
 2 | __author__ = 'Bhavesh'
 3 | 
 4 | 
 5 | from pk.utils.loading import *
 6 | from pk.utils.regression_utils import *
 7 | from pk.utils.metrics import *
 8 | from prettytable import PrettyTable
 9 | def runall_regression(X, y):
10 |     T = PrettyTable(["Regression Method", "Train Accuracy (%)", "Test Accuracy (%)", "Variance score", "Mean Squared Error", "Mean Abs Error", "Median Abs Error", "R2 score"])
11 |     X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3,random_state=0)
12 | 
13 |     # leastsquare
14 |     ls = train_leastSquareModel(X_train, y_train)
15 |     ls_train_acc = get_train_accuracy(ls, X_train, y_train)
16 |     ls_test_acc = get_test_accuracy(ls, X_test, y_test)
17 |     ls_var = get_variance_score(ls, X_test, y_test)
18 |     ls_mse = get_mean_squared_error(ls, X_test, y_test)
19 |     ls_mean = get_mean_abs_error(ls, X_test, y_test)
20 |     ls_med = get_median_abs_error(ls, X_test, y_test)
21 |     ls_r2 = get_r2_score(ls, X_test, y_test)
22 |     T.add_row((["Least Square Linear", ls_train_acc, ls_test_acc, ls_var, ls_mse, ls_mean, ls_med, ls_r2]))
23 | 
24 |     # polynomial model with degree 3
25 |     poly = train_polynomialRegressionModel(X_train, y_train, degree=3)
26 |     poly_train_acc = get_train_accuracy(poly, X_train, y_train)
27 |     poly_test_acc = get_test_accuracy(poly, X_test, y_test)
28 |     poly_var = get_variance_score(poly, X_test, y_test)
29 |     poly_mse = get_mean_squared_error(poly, X_test, y_test)
30 |     poly_mean = get_mean_abs_error(poly, X_test, y_test)
31 |     poly_med = get_median_abs_error(poly, X_test, y_test)
32 |     poly_r2 = get_r2_score(poly, X_test, y_test)
33 |     T.add_row((["Polynomial (degree = 3)", poly_train_acc, poly_test_acc, poly_var, poly_mse, poly_mean, poly_med, poly_r2]))
34 | 
35 |     # logistic regression
36 |     log = train_logisticRegressionModel(X_train, y_train)
37 |     log_var = get_variance_score(log, X_test, y_test)
38 |     log_mse = get_mean_squared_error(log, X_test, y_test)
39 |     log_mean = get_mean_abs_error(log, X_test, y_test)
40 |     log_med = get_median_abs_error(log, X_test, y_test)
41 |     log_r2 = get_r2_score(log, X_test, y_test)
42 |     T.add_row((["Logistic", "NA", "NA", log_var, log_mse, log_mean, log_med, log_r2]))
43 | 
44 |     # RANSAC
45 |     ransac = train_RANSACRegressionModel(X_train, y_train)
46 |     ransac_train_acc = get_train_accuracy(ransac, X_train, y_train)
47 |     ransac_test_acc = get_test_accuracy(ransac, X_test, y_test)
48 |     ransac_var = get_variance_score(ransac, X_test, y_test)
49 |     ransac_mse = get_mean_squared_error(ransac, X_test, y_test)
50 |     ransac_mean = get_mean_abs_error(ransac, X_test, y_test)
51 |     ransac_med = get_median_abs_error(ransac, X_test, y_test)
52 |     ransac_r2 = get_r2_score(ransac, X_test, y_test)
53 |     T.add_row((["RANSAC", ransac_train_acc, ransac_test_acc, ransac_var, ransac_mse, ransac_mean, ransac_med, ransac_r2]))
54 | 
55 |     # Bayes
56 |     bayes = train_BayesianRegressionModel(X_train, y_train)
57 |     bayes_train_acc = get_train_accuracy(bayes, X_train, y_train)
58 |     bayes_test_acc = get_test_accuracy(bayes, X_test, y_test)
59 |     bayes_var = get_variance_score(bayes, X_test, y_test)
60 |     bayes_mse = get_mean_squared_error(bayes, X_test, y_test)
61 |     bayes_mean = get_mean_abs_error(bayes, X_test, y_test)
62 |     bayes_med = get_median_abs_error(bayes, X_test, y_test)
63 |     bayes_r2 = get_r2_score(bayes, X_test, y_test)
64 |     T.add_row((["Bayesian", bayes_train_acc, bayes_test_acc, bayes_var, bayes_mse, bayes_mean, bayes_med, bayes_r2]))
65 | 
66 |     # Kernel ridge
67 |     kr = train_kernelRidgeModel(X_train, y_train)
68 |     kr_train_acc = get_train_accuracy(kr, X_train, y_train)
69 |     kr_test_acc = get_test_accuracy(kr, X_test, y_test)
70 |     kr_var = get_variance_score(kr, X_test, y_test)
71 |     kr_mse = get_mean_squared_error(kr, X_test, y_test)
72 |     kr_mean = get_mean_abs_error(kr, X_test, y_test)
73 |     kr_med = get_median_abs_error(kr, X_test, y_test)
74 |     kr_r2 = get_r2_score(kr, X_test, y_test)
75 |     T.add_row((["Kernel Ridge", kr_train_acc, kr_test_acc, kr_var, kr_mse, kr_mean, kr_med, kr_r2]))
76 |     print T
77 | 
78 | # dataset for regression
79 | X, y, _ = load_csv('concrete.csv')
80 | runall_regression(X, y)
81 | 
82 | """
83 | 


--------------------------------------------------------------------------------
/pk/utils/preprocess.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from sklearn.preprocessing import scale
  3 | from sklearn.preprocessing import normalize
  4 | from sklearn.preprocessing import LabelEncoder
  5 | from sklearn.preprocessing import LabelBinarizer
  6 | from sklearn.preprocessing import Imputer
  7 | 
  8 | from pk.utils.loading import is_number
  9 | 
 10 | 
 11 | class PreprocessingEngine(object):
 12 |     """
 13 |     This class provides functions for preprocessing the feature array.
 14 |     """
 15 |     def standardize(self, X, axis=0, with_mean=True, with_std=True, copy=True):
 16 |         """
 17 |         Standardize a dataset along any axis.
 18 | 
 19 |         Args:
 20 |             X: numpy feature array of size (n_examples, n_features)
 21 |             axis: axis to compute mean and stds along.
 22 |             with_mean: if True, center data before scaling
 23 |             with_std: if True, scale to unit variance
 24 |             copy: if False, do inplace normalization and avoid copying array
 25 | 
 26 |         Returns:
 27 |             changed_X: mean-shifted X with unit variance
 28 |         """
 29 |         return scale(X, axis, with_mean, with_std, copy)
 30 | 
 31 |     def normalize_data(self, X, norm='l2', axis=1, copy=True):
 32 |         """
 33 |         Scale input vectors to unit norm.
 34 | 
 35 |         Args:
 36 |             X: numpy feature array with shape (n_samples, n_features)
 37 |             norm: the norm to use
 38 |             axis: axis along which to normalize
 39 |             copy: if False, do inplace row normalization
 40 | 
 41 |         Returns:
 42 |             A normalized numpy array.
 43 |         """
 44 |         return normalize(X, norm, axis, copy)
 45 | 
 46 |     def binarize(self, y):
 47 |         """
 48 |         Binarize class labels to support 1 vs. all classfication.
 49 | 
 50 |         Args:
 51 |             y: target - numpy array (1, n_examples)
 52 | 
 53 |         Returns:
 54 |             A binarized target array
 55 |         """
 56 |         return LabelBinarizer().fit_transform(y)
 57 | 
 58 |     def encode_labels(self, X):
 59 |         """
 60 |         Converts categorical feature columns to a numerical values 0 - num_features.
 61 | 
 62 |         Arguments:
 63 |             X: feature array
 64 | 
 65 |         Returns:
 66 |             Feature array with categorical columns replaced with numbers.
 67 |         """
 68 |         # Gets feature labels and stores them in a dict.
 69 |         feature_dict = { i:X[:, i] for i in xrange(len(X[0])) }
 70 |         for i in feature_dict:
 71 |             if not is_number(X[0, i]):
 72 |                 feature_dict[i] = LabelEncoder().fit_transform(feature_dict[i])
 73 | 
 74 |         return np.array(feature_dict.values()).T
 75 | 
 76 |     def convert_to_float_array(self, arr):
 77 |         """
 78 |         Converts a numpy array to float data type.
 79 |         """
 80 |         return arr.astype(float)
 81 | 
 82 |     def remove_incomplete_examples(self, X, y, missing_char="?"):
 83 |         """
 84 |         Removes examples with missing/incomplete features.
 85 | 
 86 |         Args:
 87 |             missing_char: Placeholder for intended value
 88 | 
 89 |         Returns:
 90 |             Numpy feature array X with bad examples removed
 91 |             target classes with bad examples removed
 92 |         """
 93 |         row_ind, _ = np.where(X == missing_char)
 94 |         row_ind = np.unique(row_ind)
 95 |         valid_rows = np.delete(np.arange(len(X)), row_ind)
 96 |         return X[valid_rows, :], y[valid_rows]
 97 | 
 98 |     def impute_missing_values(self, X, missing_values='NaN', strategy='mean',
 99 |                               axis=0, verbose=0, copy=True):
100 |         """
101 |         Replaces missing values in the feature array with inferred values.
102 | 
103 |         Args:
104 |             missing_values: placeholder for missing value
105 |             strategy: default - 'mean', replace missing_values using strategy
106 |                     along the axis
107 |             axis: axis along which to impute
108 |             verbose: verbosity of imputer
109 |             copy: if True, create a copy of X
110 | 
111 |         Returns:
112 |             Numpy feature array X with bad examples removed
113 |             target classes with bad examples removed
114 |         """
115 |         imp = Imputer(missing_values=missing_values, strategy=strategy, axis=axis,
116 |                       verbose=verbose, copy=copy)
117 |         return imp.fit_transform(X)


--------------------------------------------------------------------------------
/pk/tests/iris2.csv:
--------------------------------------------------------------------------------
  1 | Sepal Length,Sepal Width,Petal Length,Petal Width,Species
  2 | 5.1,3.5,1.4,0.2,setosa
  3 | 4.9,3.0,1.4,0.2,setosa
  4 | 4.7,3.2,1.3,0.2,setosa
  5 | 4.6,3.1,1.5,0.2,setosa
  6 | 5.0,3.6,1.4,0.2,setosa
  7 | 5.4,3.9,1.7,0.4,setosa
  8 | 4.6,3.4,1.4,0.3,setosa
  9 | 5.0,3.4,1.5,0.2,setosa
 10 | 4.4,2.9,1.4,0.2,setosa
 11 | 4.9,3.1,1.5,0.1,setosa
 12 | 5.4,3.7,1.5,0.2,setosa
 13 | 4.8,3.4,1.6,0.2,setosa
 14 | 4.8,3.0,1.4,0.1,setosa
 15 | 4.3,3.0,1.1,0.1,setosa
 16 | 5.8,4.0,1.2,0.2,setosa
 17 | 5.7,4.4,1.5,0.4,setosa
 18 | 5.4,3.9,1.3,0.4,setosa
 19 | 5.1,3.5,1.4,0.3,setosa
 20 | 5.7,3.8,1.7,0.3,setosa
 21 | 5.1,3.8,1.5,0.3,setosa
 22 | 5.4,3.4,1.7,0.2,setosa
 23 | 5.1,3.7,1.5,0.4,setosa
 24 | 4.6,3.6,1.0,0.2,setosa
 25 | 5.1,3.3,1.7,0.5,setosa
 26 | 4.8,3.4,1.9,0.2,setosa
 27 | 5.0,3.0,1.6,0.2,setosa
 28 | 5.0,3.4,1.6,0.4,setosa
 29 | 5.2,3.5,1.5,0.2,setosa
 30 | 5.2,3.4,1.4,0.2,setosa
 31 | 4.7,3.2,1.6,0.2,setosa
 32 | 4.8,3.1,1.6,0.2,setosa
 33 | 5.4,3.4,1.5,0.4,setosa
 34 | 5.2,4.1,1.5,0.1,setosa
 35 | 5.5,4.2,1.4,0.2,setosa
 36 | 4.9,3.1,1.5,0.1,setosa
 37 | 5.0,3.2,1.2,0.2,setosa
 38 | 5.5,3.5,1.3,0.2,setosa
 39 | 4.9,3.1,1.5,0.1,setosa
 40 | 4.4,3.0,1.3,0.2,setosa
 41 | 5.1,3.4,1.5,0.2,setosa
 42 | 5.0,3.5,1.3,0.3,setosa
 43 | 4.5,2.3,1.3,0.3,setosa
 44 | 4.4,3.2,1.3,0.2,setosa
 45 | 5.0,3.5,1.6,0.6,setosa
 46 | 5.1,3.8,1.9,0.4,setosa
 47 | 4.8,3.0,1.4,0.3,setosa
 48 | 5.1,3.8,1.6,0.2,setosa
 49 | 4.6,3.2,1.4,0.2,setosa
 50 | 5.3,3.7,1.5,0.2,setosa
 51 | 5.0,3.3,1.4,0.2,setosa
 52 | 7.0,3.2,4.7,1.4,versicolor
 53 | 6.4,3.2,4.5,1.5,versicolor
 54 | 6.9,3.1,4.9,1.5,versicolor
 55 | 5.5,2.3,4.0,1.3,versicolor
 56 | 6.5,2.8,4.6,1.5,versicolor
 57 | 5.7,2.8,4.5,1.3,versicolor
 58 | 6.3,3.3,4.7,1.6,versicolor
 59 | 4.9,2.4,3.3,1.0,versicolor
 60 | 6.6,2.9,4.6,1.3,versicolor
 61 | 5.2,2.7,3.9,1.4,versicolor
 62 | 5.0,2.0,3.5,1.0,versicolor
 63 | 5.9,3.0,4.2,1.5,versicolor
 64 | 6.0,2.2,4.0,1.0,versicolor
 65 | 6.1,2.9,4.7,1.4,versicolor
 66 | 5.6,2.9,3.6,1.3,versicolor
 67 | 6.7,3.1,4.4,1.4,versicolor
 68 | 5.6,3.0,4.5,1.5,versicolor
 69 | 5.8,2.7,4.1,1.0,versicolor
 70 | 6.2,2.2,4.5,1.5,versicolor
 71 | 5.6,2.5,3.9,1.1,versicolor
 72 | 5.9,3.2,4.8,1.8,versicolor
 73 | 6.1,2.8,4.0,1.3,versicolor
 74 | 6.3,2.5,4.9,1.5,versicolor
 75 | 6.1,2.8,4.7,1.2,versicolor
 76 | 6.4,2.9,4.3,1.3,versicolor
 77 | 6.6,3.0,4.4,1.4,versicolor
 78 | 6.8,2.8,4.8,1.4,versicolor
 79 | 6.7,3.0,5.0,1.7,versicolor
 80 | 6.0,2.9,4.5,1.5,versicolor
 81 | 5.7,2.6,3.5,1.0,versicolor
 82 | 5.5,2.4,3.8,1.1,versicolor
 83 | 5.5,2.4,3.7,1.0,versicolor
 84 | 5.8,2.7,3.9,1.2,versicolor
 85 | 6.0,2.7,5.1,1.6,versicolor
 86 | 5.4,3.0,4.5,1.5,versicolor
 87 | 6.0,3.4,4.5,1.6,versicolor
 88 | 6.7,3.1,4.7,1.5,versicolor
 89 | 6.3,2.3,4.4,1.3,versicolor
 90 | 5.6,3.0,4.1,1.3,versicolor
 91 | 5.5,2.5,4.0,1.3,versicolor
 92 | 5.5,2.6,4.4,1.2,versicolor
 93 | 6.1,3.0,4.6,1.4,versicolor
 94 | 5.8,2.6,4.0,1.2,versicolor
 95 | 5.0,2.3,3.3,1.0,versicolor
 96 | 5.6,2.7,4.2,1.3,versicolor
 97 | 5.7,3.0,4.2,1.2,versicolor
 98 | 5.7,2.9,4.2,1.3,versicolor
 99 | 6.2,2.9,4.3,1.3,versicolor
100 | 5.1,2.5,3.0,1.1,versicolor
101 | 5.7,2.8,4.1,1.3,versicolor
102 | 6.3,3.3,6.0,2.5,virginica
103 | 5.8,2.7,5.1,1.9,virginica
104 | 7.1,3.0,5.9,2.1,virginica
105 | 6.3,2.9,5.6,1.8,virginica
106 | 6.5,3.0,5.8,2.2,virginica
107 | 7.6,3.0,6.6,2.1,virginica
108 | 4.9,2.5,4.5,1.7,virginica
109 | 7.3,2.9,6.3,1.8,virginica
110 | 6.7,2.5,5.8,1.8,virginica
111 | 7.2,3.6,6.1,2.5,virginica
112 | 6.5,3.2,5.1,2.0,virginica
113 | 6.4,2.7,5.3,1.9,virginica
114 | 6.8,3.0,5.5,2.1,virginica
115 | 5.7,2.5,5.0,2.0,virginica
116 | 5.8,2.8,5.1,2.4,virginica
117 | 6.4,3.2,5.3,2.3,virginica
118 | 6.5,3.0,5.5,1.8,virginica
119 | 7.7,3.8,6.7,2.2,virginica
120 | 7.7,2.6,6.9,2.3,virginica
121 | 6.0,2.2,5.0,1.5,virginica
122 | 6.9,3.2,5.7,2.3,virginica
123 | 5.6,2.8,4.9,2.0,virginica
124 | 7.7,2.8,6.7,2.0,virginica
125 | 6.3,2.7,4.9,1.8,virginica
126 | 6.7,3.3,5.7,2.1,virginica
127 | 7.2,3.2,6.0,1.8,virginica
128 | 6.2,2.8,4.8,1.8,virginica
129 | 6.1,3.0,4.9,1.8,virginica
130 | 6.4,2.8,5.6,2.1,virginica
131 | 7.2,3.0,5.8,1.6,virginica
132 | 7.4,2.8,6.1,1.9,virginica
133 | 7.9,3.8,6.4,2.0,virginica
134 | 6.4,2.8,5.6,2.2,virginica
135 | 6.3,2.8,5.1,1.5,virginica
136 | 6.1,2.6,5.6,1.4,virginica
137 | 7.7,3.0,6.1,2.3,virginica
138 | 6.3,3.4,5.6,2.4,virginica
139 | 6.4,3.1,5.5,1.8,virginica
140 | 6.0,3.0,4.8,1.8,virginica
141 | 6.9,3.1,5.4,2.1,virginica
142 | 6.7,3.1,5.6,2.4,virginica
143 | 6.9,3.1,5.1,2.3,virginica
144 | 5.8,2.7,5.1,1.9,virginica
145 | 6.8,3.2,5.9,2.3,virginica
146 | 6.7,3.3,5.7,2.5,virginica
147 | 6.7,3.0,5.2,2.3,virginica
148 | 6.3,2.5,5.0,1.9,virginica
149 | 6.5,3.0,5.2,2.0,virginica
150 | 6.2,3.4,5.4,2.3,virginica
151 | 5.9,3.0,5.1,1.8,virginica
152 | 


--------------------------------------------------------------------------------
/pk/utils/regression_utils.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'Bhavesh'
  2 | 
  3 | from sklearn.linear_model import LinearRegression, LogisticRegression, RANSACRegressor, BayesianRidge
  4 | from sklearn.kernel_ridge import KernelRidge
  5 | from sklearn.preprocessing import PolynomialFeatures
  6 | from sklearn.pipeline import Pipeline
  7 | from numpy import inf
  8 | 
  9 | def train_leastSquareModel(X, y, fit_intercept=True, normalize=False,
 10 |                            copy_X=True, n_jobs=1):
 11 |     """
 12 |     Train a regression model using Least Square method
 13 |     """
 14 |     model = LinearRegression(fit_intercept=fit_intercept,
 15 |                                           normalize=normalize,
 16 |                                           copy_X=copy_X,
 17 |                                           n_jobs=n_jobs)
 18 |     model = model.fit(X, y)
 19 |     return model
 20 | 
 21 | def train_kernelRidgeModel(X, y, alpha=1, kernel='linear',gamma=None, degree=3,
 22 |                       coef0=1, kernel_params=None):
 23 |     """
 24 |     Train a kernel ridge regression model
 25 |     """
 26 |     model = KernelRidge(alpha=alpha,
 27 |                         kernel=kernel,
 28 |                         gamma=gamma,
 29 |                         degree=degree,
 30 |                         coef0=coef0,
 31 |                         kernel_params=kernel_params)
 32 |     model = model.fit(X, y)
 33 |     return model
 34 | 
 35 | def train_logisticRegressionModel(X, y, penalty='l2', dual=False, tol=0.0001,
 36 |                                   C=1.0, fit_intercept=True, intercept_scaling=1,
 37 |                                   class_weight=None, random_state=None,
 38 |                                   solver='liblinear', max_iter=100,
 39 |                                   multi_class='ovr', verbose=False):
 40 |     """
 41 |     Train a logistic regression model
 42 |     """
 43 |     model = LogisticRegression(penalty=penalty,
 44 |                                dual=dual,
 45 |                                tol=tol,
 46 |                                C=C,
 47 |                                fit_intercept=fit_intercept,
 48 |                                intercept_scaling=intercept_scaling,
 49 |                                class_weight=class_weight,
 50 |                                random_state=random_state,
 51 |                                solver=solver,
 52 |                                max_iter=max_iter,
 53 |                                multi_class=multi_class,
 54 |                                verbose=verbose)
 55 |     model = model.fit(X,y)
 56 |     return model
 57 | 
 58 | def train_polynomialRegressionModel(X, y, degree=2, interaction_only=False,
 59 |                                     include_bias=True):
 60 |     """
 61 |     Train a polynomial model using Linear Regression Pipeline with degrees
 62 |     """
 63 |     model = Pipeline([('poly', PolynomialFeatures(degree=degree)),
 64 |                       ('linear', LinearRegression(fit_intercept=False))])
 65 |     model = model.fit(X, y)
 66 |     return model
 67 | 
 68 | def train_BayesianRegressionModel(X, y,n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False):
 69 |     """
 70 |     Train a Bayesian regression model
 71 |     """
 72 |     model = BayesianRidge(n_iter=n_iter,
 73 |                           tol=tol,
 74 |                           alpha_1=alpha_1,
 75 |                           alpha_2=alpha_2,
 76 |                           lambda_1=lambda_1,
 77 |                           lambda_2=lambda_2,
 78 |                           compute_score=compute_score,
 79 |                           fit_intercept=fit_intercept,
 80 |                           normalize=normalize,
 81 |                           copy_X=copy_X,
 82 |                           verbose=verbose)
 83 |     model = model.fit(X,y)
 84 |     return model
 85 | 
 86 | def train_RANSACRegressionModel(X, y, base_estimator=None, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, stop_n_inliers=inf, stop_score=inf, stop_probability=0.99, residual_metric=None, random_state=None):
 87 |     """
 88 |     Train a RANSAC regression model
 89 |     """
 90 |     model = RANSACRegressor(base_estimator=base_estimator,
 91 |                             min_samples=min_samples,
 92 |                             residual_threshold=residual_threshold,
 93 |                             is_data_valid=is_data_valid,
 94 |                             is_model_valid=is_model_valid,
 95 |                             max_trials=max_trials,
 96 |                             stop_n_inliers=stop_n_inliers,
 97 |                             stop_score=stop_score,
 98 |                             stop_probability=stop_probability,
 99 |                             residual_metric=residual_metric,
100 |                             random_state=random_state)
101 |     model = model.fit(X, y)
102 |     return model
103 | 
104 | 
105 | 
106 | 
107 | 


--------------------------------------------------------------------------------
/pk/tests/faithful.csv:
--------------------------------------------------------------------------------
  1 | "","eruptions","waiting"
  2 | "1",3.6,79
  3 | "2",1.8,54
  4 | "3",3.333,74
  5 | "4",2.283,62
  6 | "5",4.533,85
  7 | "6",2.883,55
  8 | "7",4.7,88
  9 | "8",3.6,85
 10 | "9",1.95,51
 11 | "10",4.35,85
 12 | "11",1.833,54
 13 | "12",3.917,84
 14 | "13",4.2,78
 15 | "14",1.75,47
 16 | "15",4.7,83
 17 | "16",2.167,52
 18 | "17",1.75,62
 19 | "18",4.8,84
 20 | "19",1.6,52
 21 | "20",4.25,79
 22 | "21",1.8,51
 23 | "22",1.75,47
 24 | "23",3.45,78
 25 | "24",3.067,69
 26 | "25",4.533,74
 27 | "26",3.6,83
 28 | "27",1.967,55
 29 | "28",4.083,76
 30 | "29",3.85,78
 31 | "30",4.433,79
 32 | "31",4.3,73
 33 | "32",4.467,77
 34 | "33",3.367,66
 35 | "34",4.033,80
 36 | "35",3.833,74
 37 | "36",2.017,52
 38 | "37",1.867,48
 39 | "38",4.833,80
 40 | "39",1.833,59
 41 | "40",4.783,90
 42 | "41",4.35,80
 43 | "42",1.883,58
 44 | "43",4.567,84
 45 | "44",1.75,58
 46 | "45",4.533,73
 47 | "46",3.317,83
 48 | "47",3.833,64
 49 | "48",2.1,53
 50 | "49",4.633,82
 51 | "50",2,59
 52 | "51",4.8,75
 53 | "52",4.716,90
 54 | "53",1.833,54
 55 | "54",4.833,80
 56 | "55",1.733,54
 57 | "56",4.883,83
 58 | "57",3.717,71
 59 | "58",1.667,64
 60 | "59",4.567,77
 61 | "60",4.317,81
 62 | "61",2.233,59
 63 | "62",4.5,84
 64 | "63",1.75,48
 65 | "64",4.8,82
 66 | "65",1.817,60
 67 | "66",4.4,92
 68 | "67",4.167,78
 69 | "68",4.7,78
 70 | "69",2.067,65
 71 | "70",4.7,73
 72 | "71",4.033,82
 73 | "72",1.967,56
 74 | "73",4.5,79
 75 | "74",4,71
 76 | "75",1.983,62
 77 | "76",5.067,76
 78 | "77",2.017,60
 79 | "78",4.567,78
 80 | "79",3.883,76
 81 | "80",3.6,83
 82 | "81",4.133,75
 83 | "82",4.333,82
 84 | "83",4.1,70
 85 | "84",2.633,65
 86 | "85",4.067,73
 87 | "86",4.933,88
 88 | "87",3.95,76
 89 | "88",4.517,80
 90 | "89",2.167,48
 91 | "90",4,86
 92 | "91",2.2,60
 93 | "92",4.333,90
 94 | "93",1.867,50
 95 | "94",4.817,78
 96 | "95",1.833,63
 97 | "96",4.3,72
 98 | "97",4.667,84
 99 | "98",3.75,75
100 | "99",1.867,51
101 | "100",4.9,82
102 | "101",2.483,62
103 | "102",4.367,88
104 | "103",2.1,49
105 | "104",4.5,83
106 | "105",4.05,81
107 | "106",1.867,47
108 | "107",4.7,84
109 | "108",1.783,52
110 | "109",4.85,86
111 | "110",3.683,81
112 | "111",4.733,75
113 | "112",2.3,59
114 | "113",4.9,89
115 | "114",4.417,79
116 | "115",1.7,59
117 | "116",4.633,81
118 | "117",2.317,50
119 | "118",4.6,85
120 | "119",1.817,59
121 | "120",4.417,87
122 | "121",2.617,53
123 | "122",4.067,69
124 | "123",4.25,77
125 | "124",1.967,56
126 | "125",4.6,88
127 | "126",3.767,81
128 | "127",1.917,45
129 | "128",4.5,82
130 | "129",2.267,55
131 | "130",4.65,90
132 | "131",1.867,45
133 | "132",4.167,83
134 | "133",2.8,56
135 | "134",4.333,89
136 | "135",1.833,46
137 | "136",4.383,82
138 | "137",1.883,51
139 | "138",4.933,86
140 | "139",2.033,53
141 | "140",3.733,79
142 | "141",4.233,81
143 | "142",2.233,60
144 | "143",4.533,82
145 | "144",4.817,77
146 | "145",4.333,76
147 | "146",1.983,59
148 | "147",4.633,80
149 | "148",2.017,49
150 | "149",5.1,96
151 | "150",1.8,53
152 | "151",5.033,77
153 | "152",4,77
154 | "153",2.4,65
155 | "154",4.6,81
156 | "155",3.567,71
157 | "156",4,70
158 | "157",4.5,81
159 | "158",4.083,93
160 | "159",1.8,53
161 | "160",3.967,89
162 | "161",2.2,45
163 | "162",4.15,86
164 | "163",2,58
165 | "164",3.833,78
166 | "165",3.5,66
167 | "166",4.583,76
168 | "167",2.367,63
169 | "168",5,88
170 | "169",1.933,52
171 | "170",4.617,93
172 | "171",1.917,49
173 | "172",2.083,57
174 | "173",4.583,77
175 | "174",3.333,68
176 | "175",4.167,81
177 | "176",4.333,81
178 | "177",4.5,73
179 | "178",2.417,50
180 | "179",4,85
181 | "180",4.167,74
182 | "181",1.883,55
183 | "182",4.583,77
184 | "183",4.25,83
185 | "184",3.767,83
186 | "185",2.033,51
187 | "186",4.433,78
188 | "187",4.083,84
189 | "188",1.833,46
190 | "189",4.417,83
191 | "190",2.183,55
192 | "191",4.8,81
193 | "192",1.833,57
194 | "193",4.8,76
195 | "194",4.1,84
196 | "195",3.966,77
197 | "196",4.233,81
198 | "197",3.5,87
199 | "198",4.366,77
200 | "199",2.25,51
201 | "200",4.667,78
202 | "201",2.1,60
203 | "202",4.35,82
204 | "203",4.133,91
205 | "204",1.867,53
206 | "205",4.6,78
207 | "206",1.783,46
208 | "207",4.367,77
209 | "208",3.85,84
210 | "209",1.933,49
211 | "210",4.5,83
212 | "211",2.383,71
213 | "212",4.7,80
214 | "213",1.867,49
215 | "214",3.833,75
216 | "215",3.417,64
217 | "216",4.233,76
218 | "217",2.4,53
219 | "218",4.8,94
220 | "219",2,55
221 | "220",4.15,76
222 | "221",1.867,50
223 | "222",4.267,82
224 | "223",1.75,54
225 | "224",4.483,75
226 | "225",4,78
227 | "226",4.117,79
228 | "227",4.083,78
229 | "228",4.267,78
230 | "229",3.917,70
231 | "230",4.55,79
232 | "231",4.083,70
233 | "232",2.417,54
234 | "233",4.183,86
235 | "234",2.217,50
236 | "235",4.45,90
237 | "236",1.883,54
238 | "237",1.85,54
239 | "238",4.283,77
240 | "239",3.95,79
241 | "240",2.333,64
242 | "241",4.15,75
243 | "242",2.35,47
244 | "243",4.933,86
245 | "244",2.9,63
246 | "245",4.583,85
247 | "246",3.833,82
248 | "247",2.083,57
249 | "248",4.367,82
250 | "249",2.133,67
251 | "250",4.35,74
252 | "251",2.2,54
253 | "252",4.45,83
254 | "253",3.567,73
255 | "254",4.5,73
256 | "255",4.15,88
257 | "256",3.817,80
258 | "257",3.917,71
259 | "258",4.45,83
260 | "259",2,56
261 | "260",4.283,79
262 | "261",4.767,78
263 | "262",4.533,84
264 | "263",1.85,58
265 | "264",4.25,83
266 | "265",1.983,43
267 | "266",2.25,60
268 | "267",4.75,75
269 | "268",4.117,81
270 | "269",2.15,46
271 | "270",4.417,90
272 | "271",1.817,46
273 | "272",4.467,74
274 | 


--------------------------------------------------------------------------------
/pk/tests/test_loading.py:
--------------------------------------------------------------------------------
  1 | # Author: Sean Dai
  2 | import cPickle
  3 | import logging
  4 | import os
  5 | 
  6 | from numpy.testing import assert_array_equal
  7 | from nose.tools import assert_true
  8 | from nose.plugins.attrib import attr
  9 | from numpy.testing import assert_array_almost_equal
 10 | from pandas.util.testing import assert_frame_equal
 11 | from pk.utils.loading import *
 12 | 
 13 | 
# Absolute path of the directory containing this test module, with a trailing
# '/', so fixture files can be addressed as __DIR_NAME + '<file name>'.
__DIR_NAME = os.path.abspath(os.path.dirname(__file__)) + '/'
 15 | 
 16 | 
 17 | def test_load_arff():
 18 |     X, y, _ = load_arff(__DIR_NAME + "ratings_best.arff")
 19 |     X2, y2 = cPickle.load(open(__DIR_NAME + 'correct_array.pkl', 'r'))
 20 |     assert_true((X == X2).all())
 21 |     assert_true((y == y2).all())
 22 | 
 23 | 
 24 | def test_load_arff_categorical():
 25 |     X, y, _ = load_arff(__DIR_NAME + "credit-g.arff")
 26 |     print X, y
 27 |     logging.info((X, y))
 28 | 
 29 | 
 30 | def test_vectorize():
 31 |     X = np.array([['a', 1], ['b', 2], ['a', 1]])
 32 |     y = np.array(['0', '1', '0'])
 33 |     features = ['f1', 'f2', 'class']
 34 |     X2, y2 = vectorize_categorical_data(X, y, features)
 35 |     exp_vec_X = np.array([[1., 0., 1.],
 36 |                           [0., 1., 2.],
 37 |                           [1., 0., 1.]])
 38 |     assert_array_equal(exp_vec_X, X2)
 39 | 
 40 | 
 41 | def test_vectorize_numeric():
 42 |     X = np.array([[0, 1, 3, 4], [2, 1, 1, 1], [4, 55, 2, 1]])
 43 |     y = np.array([0, 1, 0, 1])
 44 |     features = ['num1', 'num2', 'num3', 'num4', 'class']
 45 |     X2, y2 = vectorize_categorical_data(X, y, features)
 46 |     exp_vec_X = np.array([[0., 1., 3., 4.],
 47 |                           [2., 1., 1., 1.],
 48 |                           [4., 55., 2., 1.]])
 49 |     exp_vec_y = np.array([0, 1, 0, 1])
 50 |     assert_array_equal(X2, exp_vec_X)
 51 |     assert_array_equal(y2, exp_vec_y)
 52 | 
 53 | def test_vectorize_bool_numeric():
 54 |     X = np.array([[0, 1, 3, True], [2, 1, 1, False], [4, 55, 2, True]])
 55 |     y = np.array([0, 1, 0, 1])
 56 |     features = ['num1', 'num2', 'num3', 'num4', 'class']
 57 |     X2, y2 = vectorize_categorical_data(X, y, features)
 58 |     exp_vec_X = np.array([[0., 1., 3., 1.],
 59 |                           [2., 1., 1., 0.],
 60 |                           [4., 55., 2., 1.]])
 61 |     exp_vec_y = np.array([0, 1, 0, 1])
 62 |     assert_array_equal(X2, exp_vec_X)
 63 |     assert_array_equal(y2, exp_vec_y)
 64 | 
 65 | def test_vectorize_bool_only():
 66 |     X = np.array([[False, True], [False, False], [True, True]])
 67 |     y = np.array([0, 1, 0])
 68 |     features = ['bool1', 'bool2', 'class']
 69 |     X2, y2 = vectorize_categorical_data(X, y, features)
 70 |     exp_vec_X = np.array([[0, 1],
 71 |                           [0, 0],
 72 |                           [1, 1]])
 73 |     exp_vec_y = np.array([0, 1, 0])
 74 |     assert_array_equal(X2, exp_vec_X)
 75 |     assert_array_equal(y2, exp_vec_y)
 76 | 
 77 | def test_load_categorical_no_vectorize():
 78 |     X, y, _ = load_arff(__DIR_NAME + "credit-g.arff", vectorize_data=False)
 79 |     correct_list = ["'<0'", '6.0', "'critical/other existing credit'", 'radio/tv', '1169.0',
 80 |                     "'no known savings'", "'>=7'", '4.0', "'male single'", 'none', '4.0',
 81 |                     "'real estate'", '67.0', 'none' ,'own' ,'2.0', 'skilled', '1.0', 'yes', 'yes']
 82 |     assert_array_equal(X[0], correct_list)
 83 | 
 84 | def test_load_csv():
 85 |     filename = __DIR_NAME + 'iris.csv'
 86 |     X, y, _ = load_csv(filename)
 87 |     expX = [[5.8,4,1.2,0.2],
 88 |             [5.9,3,4.2,1.5],
 89 |             [6.5,3.2,5.1,2]]
 90 |     expY = ['setosa', 'versicolor', 'virginica']
 91 |     assert_array_equal(X, expX)
 92 |     assert_array_equal(y, expY)
 93 | 
 94 | def test_load_excel():
 95 |     filename = __DIR_NAME + 'Wine.xls'
 96 |     X, y, _ = load_excel(filename)
 97 |     expX = np.array([[ 1.42300000e+01,   1.71000000e+00,   2.43000000e+00,
 98 |                        1.56000000e+01,   1.27000000e+02,   2.80000000e+00,
 99 |                        3.06000000e+00,   2.80000000e-01,   2.29000000e+00,
100 |                        5.64000000e+00,   1.04000000e+00,   3.92000000e+00,
101 |                        1.06500000e+03],
102 |                     [  1.23700000e+01,   9.40000000e-01,   1.36000000e+00,
103 |                        1.06000000e+01,   8.80000000e+01,   1.98000000e+00,
104 |                        5.70000000e-01,   2.80000000e-01,   4.20000000e-01,
105 |                        1.95000000e+00,   1.05000000e+00,   1.82000000e+00,
106 |                        5.20000000e+02],
107 |                     [  1.28600000e+01,   1.35000000e+00,   2.32000000e+00,
108 |                        1.80000000e+01,   1.22000000e+02,   1.51000000e+00,
109 |                        1.25000000e+00,   2.10000000e-01,   9.40000000e-01,
110 |                        4.10000000e+00,   7.60000000e-01,   1.29000000e+00,
111 |                        6.30000000e+02]])
112 |     expY = ['A', 'B', 'C']
113 |     assert_array_almost_equal(X, expX)
114 |     assert_array_equal(y, expY)
115 | 
def test_generate_random():
    """With the NumPy seed fixed at 42, generate_random_points(5) must
    reproduce the recorded points, labels and combined data frame."""
    want_points = np.array([[-0.92998481,  9.78172086],
                            [ 4.88184111,  0.05988944],
                            [-2.97867201,  9.55684617],
                            [-8.60454502, -7.44239712],
                            [ 4.17646114,  1.50743993]])
    want_labels = np.array([0, 1, 0, 2, 1])
    # Expected frame: the points with the labels appended as a last column.
    label_column = want_labels.reshape(-1, 1)
    want_frame = pd.DataFrame(np.hstack((want_points, label_column)))

    # Seed immediately before generating so the draw is reproducible.
    np.random.seed(42)
    points, labels, frame = generate_random_points(5)

    assert_array_almost_equal(points, want_points)
    assert_array_equal(labels, want_labels)
    assert_frame_equal(frame, want_frame)
130 | 
@attr('slow')
def test_mldata():
    """Smoke test (tagged 'slow'): load the 'iris' dataset through
    DatasetIO.load_from_mldata; only the 3-tuple shape of the result is
    exercised."""
    loader = DatasetIO()
    _features, _labels, _frame = loader.load_from_mldata('iris')
135 | 
136 | # test_load_arff()
137 | # test_load_arff_categorical()
138 | # test_vectorize()
139 | # test_vectorize_numeric()
140 | # test_load_categorical_no_vectorize()
141 | # test_load_excel()
142 | # test_generate_random()


--------------------------------------------------------------------------------
/pk/utils/classification_utils.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'Bhavesh'
  2 | 
  3 | from sklearn.tree import DecisionTreeClassifier
  4 | from sklearn import svm
  5 | from sklearn.neighbors import KNeighborsClassifier
  6 | from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
  7 | from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
  8 | from sklearn.lda import LDA
  9 | from sklearn.qda import QDA
 10 | from sklearn.linear_model import SGDClassifier
 11 | 
 12 | from prygress import *
 13 | 
 14 | def train_decision_tree(X, y, criterion='gini', splitter='best', max_depth=None,
 15 |                         min_samples_split=2, min_samples_leaf=1,
 16 |                         max_features=None, random_state=None,
 17 |                         max_leaf_nodes=None):
 18 |     """
 19 |     Builds a decision tree model.
 20 | 
 21 |     Returns:
 22 |      clf: Fitted Decision tree classifier object
 23 |     """
 24 |     clf = DecisionTreeClassifier(criterion=criterion,
 25 |                                  splitter=splitter,
 26 |                                  max_depth=max_depth,
 27 |                                  min_samples_split=min_samples_split,
 28 |                                  min_samples_leaf=min_samples_leaf,
 29 |                                  max_features=max_features,
 30 |                                  random_state=random_state,
 31 |                                  max_leaf_nodes=max_leaf_nodes)
 32 |     clf = clf.fit(X, y)
 33 |     print 'Decision Tree done!'
 34 |     return clf
 35 | 
 36 | def train_svm(X, y, C=1.0, kernel='linear', degree=3, gamma=0.0, coef0=0.0,
 37 |               shrinking=True, probability=False, tol=0.001, cache_size=200,
 38 |               class_weight=None, verbose=False, max_iter=-1, random_state=None):
 39 |     """
 40 |     Builds a support vector machine model
 41 | 
 42 |     Returns:
 43 |     clf: Fitted SVM classifier object
 44 |     """
 45 |     clf = svm.SVC(C=C,
 46 |                   kernel=kernel,
 47 |                   degree=degree,
 48 |                   gamma=gamma,
 49 |                   coef0=gamma,
 50 |                   shrinking=shrinking,
 51 |                   probability=probability,
 52 |                   tol=tol,
 53 |                   cache_size=cache_size,
 54 |                   class_weight=class_weight,
 55 |                   verbose=verbose,
 56 |                   max_iter=max_iter,
 57 |                   random_state=random_state)
 58 |     clf = clf.fit(X, y)
 59 |     print 'SVM completed!'
 60 |     return clf
 61 | 
 62 | def train_knn(X, y, n_neighbors=5, weights='uniform', algorithm='auto',
 63 |               leaf_size=30, p=2, metric='minkowski', metric_params=None):
 64 |     """
 65 |     Builds a k-nearest neighbor model
 66 | 
 67 |     Returns:
 68 |     clf: Fitted nearest neighbor model
 69 |     """
 70 |     clf = KNeighborsClassifier(n_neighbors=n_neighbors,
 71 |                                weights=weights,
 72 |                                algorithm=algorithm,
 73 |                                leaf_size=leaf_size,
 74 |                                p=p,
 75 |                                metric=metric,
 76 |                                metric_params=metric_params)
 77 |     clf = clf.fit(X, y)
 78 |     print 'KNN completed!'
 79 |     return clf
 80 | 
 81 | def train_naive_bayes(X, y, distribution='Gaussian'):
 82 |     """
 83 |     Builds a naive bayes classification model
 84 | 
 85 |     Returns:
 86 |     clf: Fitted naive bayes model
 87 |     """
 88 |     if (distribution == 'Guassian'):
 89 |         clf = GaussianNB()
 90 |     elif (distribution == 'Multinomial'):
 91 |         clf = MultinomialNB()
 92 |     else:
 93 |         clf = BernoulliNB()
 94 |     clf = clf.fit(X,y)
 95 |     print 'Naive Bayes completed!'
 96 |     return clf
 97 | 
 98 | def train_adaboost(X, y, base_estimator=DecisionTreeClassifier, n_estimators=50, learning_rate=1.0,
 99 |                    algorithm='SAMME.R', random_state=None):
100 |     """
101 |     Builds a Boost classifier with decision tree as base estimator
102 | 
103 |     Returns:
104 |     clf: Fitted ada boost model
105 |     """
106 |     clf = AdaBoostClassifier(base_estimator=base_estimator,
107 |                              n_estimators=n_estimators,
108 |                              learning_rate=learning_rate,
109 |                              algorithm=algorithm,
110 |                              random_state=random_state)
111 |     clf = clf.fit(X,y)
112 |     print 'AdaBoost completed!'
113 |     return clf
114 | 
def train_lda(X, y, solver='svd', shrinkage=None, priors=None, n_components=None,
              store_covariance=False, tol=0.0001):
    """
    Fits a linear discriminant analysis model on (X, y).

    Every keyword argument is forwarded unchanged to LDA.

    Returns:
    clf: Fitted LDA model
    """
    lda_options = dict(solver=solver,
                       shrinkage=shrinkage,
                       priors=priors,
                       n_components=n_components,
                       store_covariance=store_covariance,
                       tol=tol)
    fitted_lda = LDA(**lda_options).fit(X, y)
    print('Linear Discriminant Analysis completed!')
    return fitted_lda
132 | 
def train_qda(X, y, priors=None, reg_param=0.0):
    """
    Fits a quadratic discriminant analysis model on (X, y).

    priors and reg_param are forwarded unchanged to QDA.

    Returns:
    clf: Fitted QDA model
    """
    discriminant = QDA(priors=priors, reg_param=reg_param)
    fitted_qda = discriminant.fit(X, y)
    print('Quadratic Discriminant Analysis completed!')
    return fitted_qda
145 | 
def train_bagging(X, y, base_estimator=None, n_estimators=10, max_samples=1.0,
                  max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, n_jobs=1,
                  random_state=None, verbose=0):
    """
    Fits a Bagging ensemble classifier on (X, y).

    Every keyword argument is forwarded unchanged to BaggingClassifier.

    Returns:
    clf: Fitted Bagging classifier
    """
    bagging_options = {
        'base_estimator': base_estimator,
        'n_estimators': n_estimators,
        'max_samples': max_samples,
        'max_features': max_features,
        'bootstrap': bootstrap,
        'bootstrap_features': bootstrap_features,
        'oob_score': oob_score,
        'n_jobs': n_jobs,
        'random_state': random_state,
        'verbose': verbose,
    }
    ensemble = BaggingClassifier(**bagging_options)
    return ensemble.fit(X, y)
167 | 
def train_randomForest(X, y, n_estimators=10, criterion='gini', max_depth=None,
                       min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                       max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False,
                       n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None):
    """
    Fits a random forest classifier on (X, y).

    Every keyword argument is forwarded unchanged to
    RandomForestClassifier.

    Returns: Fitted random forest model
    """
    forest_options = {
        'n_estimators': n_estimators,
        'criterion': criterion,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'min_weight_fraction_leaf': min_weight_fraction_leaf,
        'max_features': max_features,
        'max_leaf_nodes': max_leaf_nodes,
        'bootstrap': bootstrap,
        'oob_score': oob_score,
        'n_jobs': n_jobs,
        'random_state': random_state,
        'verbose': verbose,
        'warm_start': warm_start,
        'class_weight': class_weight,
    }
    forest = RandomForestClassifier(**forest_options)
    return forest.fit(X, y)
195 | 
def train_stochaticGradientDescent(X, y, loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                                   fit_intercept=True, n_iter=5, shuffle=True, verbose=0,
                                   epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal',
                                   eta0=0.0, power_t=0.5, class_weight=None, warm_start=False,
                                   average=False):
    """
    Fits a linear classifier trained with stochastic gradient descent.

    Every keyword argument is forwarded unchanged to SGDClassifier.

    Returns:
    clf: Fitted SGD classifier
    """
    sgd_options = {
        'loss': loss,
        'penalty': penalty,
        'alpha': alpha,
        'l1_ratio': l1_ratio,
        'fit_intercept': fit_intercept,
        'n_iter': n_iter,
        'shuffle': shuffle,
        'verbose': verbose,
        'epsilon': epsilon,
        'n_jobs': n_jobs,
        'random_state': random_state,
        'learning_rate': learning_rate,
        'eta0': eta0,
        'power_t': power_t,
        'class_weight': class_weight,
        'warm_start': warm_start,
        'average': average,
    }
    sgd_model = SGDClassifier(**sgd_options)
    return sgd_model.fit(X, y)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Scikit-learn GUI
  2 | ================
  3 | A simple GUI for doing fast-paced machine learning in Python.
  4 | 
  5 | Release Notes
  6 | -------------
  7 | ### Version 0.1
  8 | ---------------
  9 | **Features**
 10 | * Various dataset loading utilities (see pk/utils/loading.py)
 11 | * Preprocessing dataset commands
 12 | * Visualizations of input data (2d-dist, histogram of class frequencies, Andrews curve, radial plot, etc.)
 13 | * Supervised learning features
 14 | * Clustering
 15 | 
 16 | **Known bugs**
 17 | * Concurrency issue caused by running two instances of cl_gui.py simultaneously
 18 | * Extra blank window when plotting confusion matrix
 19 | 
 20 | Installation Guide
 21 | ------------------
 22 | The installation instructions are written primarily for Mac OS X machines, although they may work on Linux systems too.
 23 | 
 24 | ### Mac OSX
 25 |   
 26 | In order to run the project, install the following dependencies:
 27 | 
 28 | 1. [Python](https://drive.google.com/open?id=1TYYzeYfbz6GQZPwTKHsBl528ujDL7akzCeEhxbaLMls)
 29 | 2. [pip](https://pip.pypa.io/en/stable/installing/)
 30 | 3. [scikit-Learn](http://scikit-learn.org/stable/install.html)
 31 | 4. [numpy](https://drive.google.com/open?id=1TYYzeYfbz6GQZPwTKHsBl528ujDL7akzCeEhxbaLMls)
 32 | 5. [scipy](https://drive.google.com/open?id=1TYYzeYfbz6GQZPwTKHsBl528ujDL7akzCeEhxbaLMls)
 33 | 6. [matplotlib](http://matplotlib.org/users/installing.html)
 34 | 7. [seaborn](http://stanford.edu/~mwaskom/software/seaborn/installing.html)
 35 | 8. [PIL](http://www.pythonware.com/products/pil/)
 36 | 9. [pandas](http://pandas.pydata.org/pandas-docs/stable/install.html)
 37 | 10. [nose](https://nose.readthedocs.org/en/latest/)
 38 | 11. [PyQt](http://pyqt.sourceforge.net/Docs/PyQt4/installation.html)
 39 | 
 40 | Alternatively, you can enter the makefile command
 41 | ```
 42 | make install
 43 | ```
 44 | to automatically install dependencies 3-10.
 45 | 
 46 | 
 47 | ## User Manual
 48 | To start the application, enter the command at the root directory of the project:
 49 | 
 50 | ```
 51 | python cl_gui.py
 52 | ```
 53 | A list of commands for the Pykit-Learn application is provided in the table below: 
 54 | 
 55 | Commands                     | Example                               | Description 
 56 | ---------------------------- | ------------------------------------- | ----------------
 57 | **load** [file]              | `load ~/Downloads/data.csv`           | Loads the dataset at the path specified by [file]. No quotes "" around the filename!
 58 | **load_random**              | `load_random`                         | Load a randomly generated dataset with 3 classes.
 59 | **load_file_gui**            |                                       | Opens a file dialog for selecting the desired file.
 60 | **plot_2d**                  |                                       | Plot a 2-D distribution of the dataset.
 61 | **plot_andrews**             |                                       | Plot an Andrews curve of the dataset.
 62 | **plot_frequency**           |                                       | View the frequency of each class label.
 63 | **plot_feature_matrix**      |                                       | Generate a matrix plot of feature-feature relationships.
 64 | **plot_scatter_matrix**      |                                       | Matrix plot with KDEs along the diagonal.
 65 | **plot_radial**              |                                       | Plot a radial chart of the dataset.
 66 | **preprocess** [flags]       | `preprocess -std`                     | Preprocesses a dataset. Flags: **-std** Standardize to mean 0 and variance 1. **-norm** Normalize each feature to range [0,1]
 67 | **run** -A [alg] -test_ratio [0-1] -cv [int] | `run -A dt -test_ratio .3 -cv 5` | Runs the ML alg on the loaded dataset. **alg** = dt (Decision Tree). Can specify the test-train ratio. **-cv** enables k-fold cross validation.
 68 | **visualize** --suppress     |                                       | Plots all possible visualizations for input data. **--suppress** disables all plotting output.
 69 | **help**                     |                                       | Provides a help screen of available commands.
 70 | **quit**                     |                                       | Quits the command line GUI.
 71 | 
 72 | 
 73 | ## Examples
 74 | ### Supervised Learning with the Iris Dataset
 75 | **Step 1: Loading the file**
 76 | ```
 77 | >> load pk/tests/iris2.csv
 78 | Feature Array:
 79 |  [[ 5.1  3.5  1.4  0.2]
 80 |  [ 4.9  3.   1.4  0.2]
 81 |  [ 4.7  3.2  1.3  0.2]
 82 |   ...
 83 |  [ 6.2  3.4  5.4  2.3]
 84 |  [ 5.9  3.   5.1  1.8]]
 85 | Target classifications:
 86 |  ['setosa' 'setosa' 'setosa' ...
 87 |  'versicolor' 'versicolor' ...
 88 |  'virginica']
 89 | ```
 90 | **Step 2: Visualizing the input in 2-D**
 91 | ```
 92 | >> plot_2d
 93 | Creating visualization(s).
 94 | Viewing generated plots...
 95 | ```
 96 | ![1](http://i.imgur.com/94F1iXg.png)
 97 | 
 98 | **Step 3: Preprocessing the input dataset**
 99 | ```
100 | >> preprocess -h
101 | usage: cl_gui.py [-h] [-std] [-norm]
102 | 
103 | optional arguments:
104 |   -h, --help  show this help message and exit
105 |   -std        Standardize the feature array.
106 |   -norm       Normalize the values of each feature.
107 | 0
108 | >> preprocess -std
109 | Standardizing feature array...
110 | [[ -9.00681170e-01   1.03205722e+00  -1.34127240e+00  -1.31297673e+00]
111 |  [ -1.14301691e+00  -1.24957601e-01  -1.34127240e+00  -1.31297673e+00]
112 |  [ -1.38535265e+00   3.37848329e-01  -1.39813811e+00  -1.31297673e+00]
113 |  [ -1.50652052e+00   1.06445364e-01  -1.28440670e+00  -1.31297673e+00]
114 |  ...
115 |  [  6.86617933e-02  -1.24957601e-01   7.62758643e-01   7.90590793e-01]]
116 | ```
117 | 
118 | **Step 4: Fitting a decision tree learner on Iris**
119 | ```
120 | >> run -A dt -test_ratio .3
121 | Running decision tree algorithm on dataset...
122 | Decision Tree done!
123 | Train accuracy: 100.000000
124 | Test accuracy: 96.000000%
125 | Confusion Matrix is:
126 | [[16  0  0]
127 |  [ 0 18  1]
128 |  [ 0  1 14]]
129 | ```
130 | ![2](http://i.imgur.com/jXRDZhV.png)
131 | 
132 | ## Testing
133 | The unit-testing framework used in this project is the **nose** Python module. Running the unit tests yourself
134 | is as simple as entering the following command in the root directory of the project:
135 | ```
136 | make test
137 | ```
138 | To run all the unit tests (this might take some time), type
139 | ```
140 | make test-all
141 | ```
142 | 
143 | ## Todo List
144 | - [x] MVC Components
145 |     - [x] Model Classes
146 |         - [x] Algorithm
147 |         - [x] SupervisedAlgorithm
148 |         - [x] UnsupervisedAlgorithm
149 |         - [x] RegressionAlgorithm
150 |         - [x] ExecutionReport
151 |     - [x] Controller Classes
152 |         - [x] AlgorithmEngine
153 |         - [x] DatasetIO
154 |         - [x] PreprocessingEngine
155 |         - [x] Visualizer
156 |     - [x] View Classes
157 |         - [x] BaseView
158 | - [x] Demos
159 |   - [x] Image segmentation demo
160 |   - [x] Command-line GUI
161 | - [x] Loading
162 |     - [x] File formats
163 |       - [x] .arff
164 |       - [x] .csv
165 |       - [x] .xls/.xlsx
166 |   - [x] Generate random Gaussian data w/ labels
167 |   - [x] Download dataset from mldata.org
168 | - [x] Preprocessing data
169 |   - [x] Standardization
170 |   - [x] Normalization of training examples
171 |   - [x] Feature Binarization
172 |   - [x] Remove examples with '?' missing values
173 |   - [x] Imputation of missing values
174 |   - [x] Numerical encoding of categorical features
175 | - [x] Supervised Learning
176 |   - [x] Linear & Quadratic Discriminant Analysis
177 |   - [x] SVMs
178 |   - [x] Stochastic Gradient Descent
179 |   - [x] kNN
180 |   - [x] Decision Trees
181 |   - [x] Ensemble Methods
182 |     - [x] Bagging
183 |     - [x] Randomized Trees
184 |     - [x] AdaBoost
185 |   - [x] Multiclass and Multilabel Algorithms
186 |   - [x] Feature Selection
187 |     - [x] Variance thresholding
188 |     - [x] Univariate feature selection
189 |   - [x] Generalized Linear Models
190 |     - [x] Least Squares
191 |     - [x] RANSAC
192 |     - [x] Bayesian
193 |     - [x] Logistic
194 |     - [x] Polynomial
195 |   - [x] Kernel Ridge Regression
196 | - [x] Unsupervised Learning
197 |   - [x] Gaussian Mixture Models
198 |     - [x] GMM
199 |     - [x] DPGMM
200 |   - [x] Manifold Learning
201 |   - [x] Clustering
202 |     - [x] K-means
203 |     - [x] Spectral clustering
204 |     - [x] Hierarchical clustering
205 |     - [x] DBSCAN
206 |   - [x] Decomposing signals into components
207 |     - [x] PCA
208 |     - [x] ICA
209 |     - [x] Factor Analysis
210 |   - [x] Covariance Estimation
211 |   - [x] Novelty and Outlier Detection
212 |   - [x] Restricted Boltzmann Machines
213 | - [x] Model Selection and Evaluation
214 |   - [x] Cross Validation
215 |   - [x] Grid Search
216 |   - [x] Prediction Metrics
217 |     - [x] Classification Metrics
218 |       - [x] ROC
219 |       - [x] Accuracy Score
220 |       - [x] Confusion Matrix
221 |     - [x] Regression Metrics
222 |       - [x] MAE, MSE, R2
223 |     - [x] Clustering Metrics
224 |       - [x] Adjusted Rand index
225 |       - [x] Homogeneity (similarity of items within cluster)
226 |       - [x] Completeness (same class items all go in one cluster)
227 |   - [x] Validation Curves
228 | - [x] Dataset Transformations
229 |   - [x] Pipelining
230 |   - [x] Feature Extraction
231 |     - [x] Dictionary Vectorization
232 |   - [x] Kernel Approximation
233 | - [x] Visualizations
234 |     - [x] Plotting features (2d, frequency chart, radial plot, etc.)
235 | 
236 | 
237 | 
238 | 


--------------------------------------------------------------------------------
/pk/utils/clustering.py:
--------------------------------------------------------------------------------
  1 | """This module provides clustering utility functions.
  2 |     Author: Bhavesh
  3 | """
  4 | from sklearn import cluster
  5 | from sklearn.mixture import GMM, DPGMM
  6 | 
def train_gmm(X, n_components=3, covariance_type='diag', random_state=None,
              thresh=None, tol=0.001, min_covar=0.001, n_iter=100, n_init=1,
              params='wmc', init_params='wmc'):
    """Train a finite Gaussian Mixture Model (sklearn.mixture.GMM) on X.

    All keyword arguments are forwarded unchanged to the GMM constructor;
    the model is then fitted on X and returned.

    Parameters
    ----------
    X:
        Data to fit the mixture model on.

    n_components: int, optional
        Number of mixture components. Defaults to 3.

    covariance_type: string, optional
        String describing the type of covariance parameters to
        use.  Must be one of 'spherical', 'tied', 'diag', 'full'.
        Defaults to 'diag'.

    random_state: optional
        Forwarded to GMM. Defaults to None.

    thresh : float, optional
        Convergence threshold. Defaults to None.

    tol : float, optional
        Forwarded to GMM. Defaults to 0.001.

    min_covar : float, optional
        Forwarded to GMM. Defaults to 0.001.

    n_iter : int, optional
        Maximum number of iterations to perform before convergence.
        Defaults to 100.

    n_init : int, optional
        Forwarded to GMM. Defaults to 1.

    params : string, optional
        Controls which parameters are updated in the training
        process.  Can contain any combination of 'w' for weights,
        'm' for means, and 'c' for covars.  Defaults to 'wmc'.

    init_params : string, optional
        Controls which parameters are updated in the initialization
        process.  Can contain any combination of 'w' for weights,
        'm' for means, and 'c' for covars.  Defaults to 'wmc'.

    Returns
    -------
    model : the value returned by ``GMM(...).fit(X)``.

    See Also
    --------
    train_dpgmm : trains a DPGMM (infinite Gaussian mixture) model.
    """

    model = GMM(n_components=n_components,
                covariance_type=covariance_type,
                random_state=random_state,
                thresh=thresh,
                tol=tol,
                min_covar=min_covar,
                n_iter=n_iter,
                n_init=n_init,
                params=params,
                init_params=init_params)
    model = model.fit(X)
    return model
107 | 
def train_dpgmm(X, n_components=3, covariance_type='diag', alpha=1.0,
                random_state=None, thresh=None, tol=0.001, verbose=False,
                min_covar=None, n_iter=10, params='wmc', init_params='wmc'):
    """
    This function trains a Infinite Gaussian Mixture Model for clustering

    All keyword arguments are forwarded unchanged to sklearn.mixture.DPGMM.

    :param X: data to fit the model on
    :param n_components: number of mixture components (default 3)
    :param covariance_type: covariance parameterisation (default 'diag')
    :param alpha: forwarded to DPGMM (default 1.0)
    :param random_state: forwarded to DPGMM
    :param thresh: forwarded to DPGMM
    :param tol: forwarded to DPGMM (default 0.001)
    :param verbose: forwarded to DPGMM
    :param min_covar: forwarded to DPGMM
    :param n_iter: forwarded to DPGMM (default 10)
    :param params: forwarded to DPGMM (default 'wmc')
    :param init_params: forwarded to DPGMM (default 'wmc')
    :return: a trained DPGMM clustering model
    """
    model = DPGMM(n_components=n_components,
                  covariance_type=covariance_type,
                  alpha=alpha,
                  random_state=random_state,
                  thresh=thresh,
                  # `tol` was accepted by this function but never passed
                  # on, so a caller-supplied tolerance had no effect.
                  tol=tol,
                  verbose=verbose,
                  min_covar=min_covar,
                  n_iter=n_iter,
                  params=params,
                  init_params=init_params)
    model = model.fit(X)
    return model
139 | 
140 | 
def train_kmeans(X, n_clusters=3, init='k-means++', n_init=10,
                 max_iter=300, tol=0.0001, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True, n_jobs=1):
    """
    This functions trains a simple kmeans clustering model

    All keyword arguments are forwarded unchanged to
    sklearn.cluster.KMeans.

    :param X: data to cluster
    :param n_clusters: number of clusters (default 3)
    :param init: initialisation method (default 'k-means++')
    :param n_init: forwarded to KMeans as ``n_init`` (default 10)
    :param max_iter: forwarded to KMeans (default 300)
    :param tol: forwarded to KMeans (default 0.0001)
    :param precompute_distances: forwarded to KMeans (default 'auto')
    :param verbose: forwarded to KMeans
    :param random_state: forwarded to KMeans
    :param copy_x: forwarded to KMeans
    :param n_jobs: forwarded to KMeans
    :return: trained kmeans model for clustering
    """
    model = cluster.KMeans(n_clusters=n_clusters,
                           init=init,
                           # Previously `n_init=init`, which handed the
                           # init-method string to n_init and ignored the
                           # caller's n_init value.
                           n_init=n_init,
                           max_iter=max_iter,
                           tol=tol,
                           precompute_distances=precompute_distances,
                           verbose=verbose,
                           random_state=random_state,
                           copy_x=copy_x,
                           n_jobs=n_jobs)
    model = model.fit(X)
    return model
171 | 
def train_spectral(X, n_clusters=3, eigen_solver=None, random_state=None,
                   n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10,
                   eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1,
                   kernel_params=None):
    """
    Fits a spectral clustering model on X.

    All keyword arguments are forwarded unchanged to
    sklearn.cluster.SpectralClustering.

    :param X: data to cluster
    :param n_clusters: number of clusters (default 3)
    :param eigen_solver: forwarded to SpectralClustering
    :param random_state: forwarded to SpectralClustering
    :param n_init: forwarded to SpectralClustering (default 10)
    :param gamma: forwarded to SpectralClustering (default 1.0)
    :param affinity: forwarded to SpectralClustering (default 'rbf')
    :param n_neighbors: forwarded to SpectralClustering (default 10)
    :param eigen_tol: forwarded to SpectralClustering (default 0.0)
    :param assign_labels: forwarded to SpectralClustering (default 'kmeans')
    :param degree: forwarded to SpectralClustering (default 3)
    :param coef0: forwarded to SpectralClustering (default 1)
    :param kernel_params: forwarded to SpectralClustering
    :return: a trained Spectral Model for clustering
    """
    spectral_options = {
        'n_clusters': n_clusters,
        'eigen_solver': eigen_solver,
        'random_state': random_state,
        'n_init': n_init,
        'gamma': gamma,
        'affinity': affinity,
        'n_neighbors': n_neighbors,
        'eigen_tol': eigen_tol,
        'assign_labels': assign_labels,
        'degree': degree,
        'coef0': coef0,
        'kernel_params': kernel_params,
    }
    clusterer = cluster.SpectralClustering(**spectral_options)
    return clusterer.fit(X)
207 | 
def train_agglomerative(X, n_clusters=3, affinity='euclidean',
                        connectivity=None, n_components=None,
                        compute_full_tree='auto', linkage='ward'):
    """
    Build a ``cluster.AgglomerativeClustering`` estimator and fit it on X.

    Every keyword argument is handed to the estimator constructor under
    the same name, without modification.
    :param X: data passed to ``fit``
    :param n_clusters: forwarded to AgglomerativeClustering
    :param affinity: forwarded to AgglomerativeClustering
    :param connectivity: forwarded to AgglomerativeClustering
    :param n_components: forwarded to AgglomerativeClustering
    :param compute_full_tree: forwarded to AgglomerativeClustering
    :param linkage: forwarded to AgglomerativeClustering
    :return: whatever ``fit(X)`` returns (the trained hierarchical model)
    """
    settings = dict(
        n_clusters=n_clusters,
        affinity=affinity,
        connectivity=connectivity,
        n_components=n_components,
        compute_full_tree=compute_full_tree,
        linkage=linkage,
    )
    estimator = cluster.AgglomerativeClustering(**settings)
    return estimator.fit(X)
230 | 
def train_dbscan(X, eps=0.5, min_samples=5, metric='euclidean',
                 algorithm='auto', leaf_size=30, p=None, random_state=None):
    """
    Build a ``cluster.DBSCAN`` estimator and fit it on X.

    Every keyword argument is handed to the estimator constructor under
    the same name, without modification.
    :param X: data passed to ``fit``
    :param eps: forwarded to DBSCAN
    :param min_samples: forwarded to DBSCAN
    :param metric: forwarded to DBSCAN
    :param algorithm: forwarded to DBSCAN
    :param leaf_size: forwarded to DBSCAN
    :param p: forwarded to DBSCAN
    :param random_state: forwarded to DBSCAN
    :return: whatever ``fit(X)`` returns (the trained DBSCAN model)
    """
    settings = dict(
        eps=eps,
        min_samples=min_samples,
        metric=metric,
        algorithm=algorithm,
        leaf_size=leaf_size,
        p=p,
        random_state=random_state,
    )
    estimator = cluster.DBSCAN(**settings)
    return estimator.fit(X)
254 | 
255 | 


--------------------------------------------------------------------------------
/pk/utils/loading.py:
--------------------------------------------------------------------------------
  1 | """ This module provides loading utilities for data set files with extensions .csv, .arff, .xls, .xlsx.
  2 |     Author: Sean Dai
  3 |     """
  4 | from __future__ import print_function
  5 | import cPickle
  6 | import numpy as np
  7 | import os
  8 | import pandas as pd
  9 | 
 10 | 
 11 | from sklearn.feature_extraction import DictVectorizer
 12 | from sklearn.datasets import fetch_mldata
 13 | from sklearn.datasets import make_blobs
 14 | from scipy.io.arff import loadarff
 15 | 
 16 | 
 17 | def _load_arff(filename):
 18 |     """
 19 |     Base function to load arff files.
 20 |     """
 21 |     dataset = loadarff(open(filename, 'r'))
 22 |     features = dataset[1].names()
 23 |     class_attr = features[-1]
 24 |     y = np.array(dataset[0][class_attr])
 25 |     X = np.array(dataset[0][features[:-1]])
 26 |     X = np.array([list(fv) for fv in X])
 27 |     return X, y, features
 28 | 
 29 | 
 30 | def is_numeric_type(array):
 31 |     """
 32 |     Checks if the array's datatype is a number data type.
 33 | 
 34 |     Args:
 35 |         array: numpy array
 36 | 
 37 |     Returns:
 38 |         True if array.dtype is type float, int, uint, complex, or bool
 39 |         Otherwise, we say it's a string.
 40 |     """
 41 |     numeric_dtypes = []
 42 |     numeric_strings = {'uint', 'complex', 'float', 'int'}
 43 |     for dtype, entries in np.sctypes.items():
 44 |         if dtype in numeric_strings:
 45 |             numeric_dtypes.extend(entries)
 46 |     return array.dtype.type in numeric_dtypes
 47 | 
 48 | 
 49 | def vectorize_categorical_data(X, y, features):
 50 |     """
 51 |     One-hot encoding for categorical attributes in the feature array.
 52 | 
 53 |     Args:
 54 |         X: (num_examples, num_features) numpy array of all the examples
 55 |         y: the class labels of size (1, num_examples)
 56 |         features: list of feature names
 57 | 
 58 |     Returns:
 59 |         X: new numpy array with all categorical labels becoming 1-hot encoded
 60 |         y: class labels, changed to 1-hot if labels were categorical
 61 |     """
 62 |     vec = DictVectorizer()
 63 |     assert (len(features) - 1) == len(X[0])
 64 | 
 65 |     # Create a dictionary for each example with the feature name as the key.
 66 |     # DictVectorizer requires feature arrays to be represented as a list
 67 |     # of dict objects. Each element of the list is 1 feature vector example from
 68 |     # the dataset.
 69 |     measurements = []
 70 |     for ex in X:
 71 |         ex_dict = dict(zip(features, ex.tolist()))
 72 |         measurements.append(ex_dict)
 73 |     measurements = _convert_dict_values_to_num(measurements)
 74 | 
 75 |     if not is_numeric_type(y):
 76 |         y = _convert_target_to_num(y)
 77 | 
 78 |     X = vec.fit_transform(measurements, y).toarray()
 79 |     return X, y
 80 | 
 81 | 
 82 | def _convert_dict_values_to_num(examples):
 83 |     """
 84 |     Convert only the numeric values formatted as strings to actual
 85 |     numeric datatypes in the feature array of dicts.
 86 | 
 87 |     examples - list<dict>
 88 |     """
 89 |     new_examples = examples[:]
 90 |     for dct in new_examples:
 91 |         for key in dct:
 92 |             value = dct[key]
 93 |             if is_number(value):
 94 |                 dct[key] = float(value)
 95 |     return new_examples
 96 | 
 97 | def is_number(s):
 98 |         """ True if string s can be converted to a number type.
 99 |         """
100 |         try:
101 |             float(s)
102 |             return True
103 |         except ValueError:
104 |             return False
105 | 
def _convert_target_to_num(target):
    """
    Cast the target array to float when every entry parses as a number.

    target - nd.array of class values

    Returns:
        target.astype(float) if is_number() accepts every element,
        otherwise the target array unchanged
    """
    every_entry_numeric = all(is_number(label) for label in target)
    return target.astype(float) if every_entry_numeric else target
121 | 
122 | 
def load_arff(filename, vectorize_data=False, is_supervised=True):
    """
    Loads .arff dataset files.

    Args:
        filename: str
        vectorize_data: when True and the feature array is not numeric,
            the features are run through vectorize_categorical_data
        is_supervised: accepted for interface compatibility; not used by
            this function

    Returns:
        X : a (num_examples, num_features) numpy array of examples X
        y : the class labels y, one per example
        data: DataFrame object of features concatenated with target values
            (built from the raw, un-vectorized arrays)
    """
    X, y, features = _load_arff(filename)
    df = stack_to_data_frame(X, y)

    # For categorical data, we want the feature label names
    # in order to create a 1-hot encoding of the categorical
    # values in our feature array of examples.
    if vectorize_data and not is_numeric_type(X):
        # Unpack the (X, y) pair so both branches return a flat 3-tuple;
        # returning the pair directly produced ((X, y), df).
        X, y = vectorize_categorical_data(X, y, features)
    return X, y, df
145 | 
def load_csv(filename, vectorize_data=False):
    """
    Loads csv dataset files.

    The last column of the file is used as the class label.

    Args:
        filename: str
        vectorize_data: when True, the features are run through
            vectorize_categorical_data

    Returns:
        X : a (num_examples, num_features) numpy array of examples X
        y : the class labels y, one per example
        dataset: DataFrame object for dataset file

        If an OSError is raised while loading, a message is printed and
        None is returned instead of the tuple.
    """
    try:
        dataset = pd.read_csv(filename, sep=',')
        # Positional selection of the last column. `.iloc` replaces the
        # `.ix` indexer, which pandas deprecated and removed in 1.0.
        last_col = dataset.iloc[:, -1]
        y = np.array(last_col.tolist()).T
        column_names = dataset.dtypes.index
        X = np.array(dataset[column_names[:-1]])

        if is_numeric_type(X):
            X = X.astype(float)
        if is_numeric_type(y):
            y = y.astype(float)

        # Change categorical attributes to 1-hot numerical encoding
        if vectorize_data:
            X, y = vectorize_categorical_data(X, y, column_names)
        return X, y, dataset
    except OSError:
        print('File does not exist')
176 | 
def load_excel(filename, vectorize_data=False):
    """
    Loads Excel dataset files.

    Only the first sheet of the workbook is read, and its last column is
    used as the class label.

    Args:
        filename: str
        vectorize_data: when True, the features are run through
            vectorize_categorical_data

    Returns:
        X : a (num_examples, num_features) numpy array of examples X
        y : the class labels y, one per example
        data: DataFrame object

        If an OSError is raised while loading, a message is printed and
        None is returned instead of the tuple.
    """
    try:
        xl = pd.ExcelFile(filename)
        sheets = xl.sheet_names
        data = xl.parse(sheets[0])
        # Assumes last column contains class value. `.iloc` replaces the
        # `.ix` indexer, which pandas deprecated and removed in 1.0.
        last_col = data.iloc[:, -1]
        y = np.array(last_col.tolist()).T
        column_names = data.dtypes.index
        X = np.array(data[column_names[:-1]])

        if is_numeric_type(X):
            X = X.astype(float)
        if is_numeric_type(y):
            y = y.astype(float)

        # Change categorical attributes to 1-hot numerical encoding
        if vectorize_data:
            X, y = vectorize_categorical_data(X, y, column_names)
        return X, y, data
    except OSError:
        print('File does not exist')
210 | 
def generate_random_points(n_samples=100, n_features=2, centers=3):
    """
    Create a random dataset with make_blobs.

    Args:
        n_samples: passed to make_blobs as n_samples
        n_features: passed to make_blobs as n_features
        centers: passed to make_blobs as centers

    Returns:
        X: feature array returned by make_blobs
        y: target array returned by make_blobs
        data_frame: DataFrame built by stack_to_data_frame(X, y)
    """
    points, labels = make_blobs(n_samples=n_samples,
                                n_features=n_features,
                                centers=centers)
    frame = stack_to_data_frame(points, labels)
    return points, labels, frame
228 | 
def stack_to_data_frame(X, y):
    """
    Append the class labels to the feature array as one extra column.

    Args:
        X: feature array (n_samples, n_features)
        y: array of target labels; it is given a trailing axis so it can
            be stacked next to X

    Returns:
        Pandas DataFrame holding X with y as the last column.

    Examples:
    X = [[1,2,3]
         [4,5,6]
         [7,8,9]]
    y = ['yes', 'no', 'yes']
    stack_to_data_frame(X,y) = [[1,2,3,'yes']
                                [4,5,6,'no']
                                [7,8,9,'yes']]
    """
    label_column = y[:, np.newaxis]
    combined = np.hstack((X, label_column))
    return pd.DataFrame(combined)
250 | 
class DatasetIO(object):
    """
    This class performs loading and saving of dataset files.
    """
    def load_file(self, filename):
        """
        Dispatch to the loader that matches the file extension.

        The extension is compared case-insensitively, so 'DATA.CSV' is
        handled like 'data.csv'.

        Args:
            filename: path of the dataset file (str)

        Returns:
            Whatever load_csv, load_arff or load_excel returns for the file.

        Raises:
            IOError: if the extension is not .csv, .arff, .xls or .xlsx
        """
        extension = os.path.splitext(filename)[1].lower()
        if extension == '.csv':
            return load_csv(filename)
        elif extension == '.arff':
            return load_arff(filename)
        elif extension in ('.xls', '.xlsx'):
            return load_excel(filename)
        else:
            raise IOError('{} is not a valid filename!'.format(filename))

    def load_from_mldata(self, dataname):
        """
        Loads a dataset from the mldata.org repository.

        Args:
            dataname: Name of the dataset on mldata.org (str)
                Eg. "regression-datasets stock", "leukemia"

        Returns:
            X : a (num_examples, num_features) numpy array of examples X
            y : the class labels y of size (1, num_examples)
            data_frame: Pandas DataFrame object

        Raises:
            Exception: if fetch_mldata fails for any reason; the original
                error text is attached as the second argument
        """
        import tempfile
        import shutil

        # Create a temporary directory to store the downloaded dataset.
        test_data_home = tempfile.mkdtemp()
        # Fetch the dataset from ml data
        try:
            dataset = fetch_mldata(dataname, data_home=test_data_home,
                                   transpose_data=True)
        except Exception as e:
            # str(e) is used because exceptions have no `.message`
            # attribute on Python 3.
            raise Exception("No connection to mldata.org server!", str(e))
        else:
            X, y = dataset.data, dataset.target
        finally:
            # Remove the temporary directory on success and on failure.
            shutil.rmtree(test_data_home, ignore_errors=True)
        data_frame = stack_to_data_frame(X, y)
        return X, y, data_frame

    def pickle_files(self, files_to_save, save_dir):
        """
        Pickles each object into its own file inside save_dir.

        Args:
            files_to_save: List of tuples in form (obj, filename_to_save)
            save_dir: Directory to save the files (str)
        """
        for obj, filename in files_to_save:
            with open(os.path.join(save_dir, filename), 'wb') as f:
                cPickle.dump(obj, f)
309 | 


--------------------------------------------------------------------------------
/pk/tests/credit-g.arff:
--------------------------------------------------------------------------------
  1 | % Description of the German credit dataset.
  2 | %
  3 | % 1. Title: German Credit data
  4 | %
  5 | % 2. Source Information
  6 | %
  7 | % Professor Dr. Hans Hofmann
  8 | % Institut f"ur Statistik und "Okonometrie
  9 | % Universit"at Hamburg
 10 | % FB Wirtschaftswissenschaften
 11 | % Von-Melle-Park 5
 12 | % 2000 Hamburg 13
 13 | %
 14 | % 3. Number of Instances:  1000
 15 | %
 16 | % Two datasets are provided.  the original dataset, in the form provided
 17 | % by Prof. Hofmann, contains categorical/symbolic attributes and
 18 | % is in the file "german.data".
 19 | %
 20 | % For algorithms that need numerical attributes, Strathclyde University
 21 | % produced the file "german.data-numeric".  This file has been edited
 22 | % and several indicator variables added to make it suitable for
 23 | % algorithms which cannot cope with categorical variables.   Several
 24 | % attributes that are ordered categorical (such as attribute 17) have
 25 | % been coded as integer.    This was the form used by StatLog.
 26 | %
 27 | %
 28 | % 6. Number of Attributes german: 20 (7 numerical, 13 categorical)
 29 | %    Number of Attributes german.numer: 24 (24 numerical)
 30 | %
 31 | %
 32 | % 7.  Attribute description for german
 33 | %
 34 | % Attribute 1:  (qualitative)
 35 | % 	       Status of existing checking account
 36 | %                A11 :      ... <    0 DM
 37 | % 	       A12 : 0 <= ... <  200 DM
 38 | % 	       A13 :      ... >= 200 DM /
 39 | % 		     salary assignments for at least 1 year
 40 | %                A14 : no checking account
 41 | %
 42 | % Attribute 2:  (numerical)
 43 | % 	      Duration in month
 44 | %
 45 | % Attribute 3:  (qualitative)
 46 | % 	      Credit history
 47 | % 	      A30 : no credits taken/
 48 | % 		    all credits paid back duly
 49 | %               A31 : all credits at this bank paid back duly
 50 | % 	      A32 : existing credits paid back duly till now
 51 | %               A33 : delay in paying off in the past
 52 | % 	      A34 : critical account/
 53 | % 		    other credits existing (not at this bank)
 54 | %
 55 | % Attribute 4:  (qualitative)
 56 | % 	      Purpose
 57 | % 	      A40 : car (new)
 58 | % 	      A41 : car (used)
 59 | % 	      A42 : furniture/equipment
 60 | % 	      A43 : radio/television
 61 | % 	      A44 : domestic appliances
 62 | % 	      A45 : repairs
 63 | % 	      A46 : education
 64 | % 	      A47 : (vacation - does not exist?)
 65 | % 	      A48 : retraining
 66 | % 	      A49 : business
 67 | % 	      A410 : others
 68 | %
 69 | % Attribute 5:  (numerical)
 70 | % 	      Credit amount
 71 | %
 72 | % Attribute 6:  (qualitative)
 73 | % 	      Savings account/bonds
 74 | % 	      A61 :          ... <  100 DM
 75 | % 	      A62 :   100 <= ... <  500 DM
 76 | % 	      A63 :   500 <= ... < 1000 DM
 77 | % 	      A64 :          .. >= 1000 DM
 78 | %               A65 :   unknown/ no savings account
 79 | %
 80 | % Attribute 7:  (qualitative)
 81 | % 	      Present employment since
 82 | % 	      A71 : unemployed
 83 | % 	      A72 :       ... < 1 year
 84 | % 	      A73 : 1  <= ... < 4 years
 85 | % 	      A74 : 4  <= ... < 7 years
 86 | % 	      A75 :       .. >= 7 years
 87 | %
 88 | % Attribute 8:  (numerical)
 89 | % 	      Installment rate in percentage of disposable income
 90 | %
 91 | % Attribute 9:  (qualitative)
 92 | % 	      Personal status and sex
 93 | % 	      A91 : male   : divorced/separated
 94 | % 	      A92 : female : divorced/separated/married
 95 | %               A93 : male   : single
 96 | % 	      A94 : male   : married/widowed
 97 | % 	      A95 : female : single
 98 | %
 99 | % Attribute 10: (qualitative)
100 | % 	      Other debtors / guarantors
101 | % 	      A101 : none
102 | % 	      A102 : co-applicant
103 | % 	      A103 : guarantor
104 | %
105 | % Attribute 11: (numerical)
106 | % 	      Present residence since
107 | %
108 | % Attribute 12: (qualitative)
109 | % 	      Property
110 | % 	      A121 : real estate
111 | % 	      A122 : if not A121 : building society savings agreement/
112 | % 				   life insurance
113 | %               A123 : if not A121/A122 : car or other, not in attribute 6
114 | % 	      A124 : unknown / no property
115 | %
116 | % Attribute 13: (numerical)
117 | % 	      Age in years
118 | %
119 | % Attribute 14: (qualitative)
120 | % 	      Other installment plans
121 | % 	      A141 : bank
122 | % 	      A142 : stores
123 | % 	      A143 : none
124 | %
125 | % Attribute 15: (qualitative)
126 | % 	      Housing
127 | % 	      A151 : rent
128 | % 	      A152 : own
129 | % 	      A153 : for free
130 | %
131 | % Attribute 16: (numerical)
132 | %               Number of existing credits at this bank
133 | %
134 | % Attribute 17: (qualitative)
135 | % 	      Job
136 | % 	      A171 : unemployed/ unskilled  - non-resident
137 | % 	      A172 : unskilled - resident
138 | % 	      A173 : skilled employee / official
139 | % 	      A174 : management/ self-employed/
140 | % 		     highly qualified employee/ officer
141 | %
142 | % Attribute 18: (numerical)
143 | % 	      Number of people being liable to provide maintenance for
144 | %
145 | % Attribute 19: (qualitative)
146 | % 	      Telephone
147 | % 	      A191 : none
148 | % 	      A192 : yes, registered under the customers name
149 | %
150 | % Attribute 20: (qualitative)
151 | % 	      foreign worker
152 | % 	      A201 : yes
153 | % 	      A202 : no
154 | %
155 | %
156 | %
157 | % 8.  Cost Matrix
158 | %
159 | % This dataset requires use of a cost matrix (see below)
160 | %
161 | %
162 | %       1        2
163 | % ----------------------------
164 | %   1   0        1
165 | % -----------------------
166 | %   2   5        0
167 | %
168 | % (1 = Good,  2 = Bad)
169 | %
170 | % the rows represent the actual classification and the columns
171 | % the predicted classification.
172 | %
173 | % It is worse to class a customer as good when they are bad (5),
174 | % than it is to class a customer as bad when they are good (1).
175 | %
176 | %
177 | %
178 | %
179 | %
180 | % Relabeled values in attribute checking_status
181 | %    From: A11                     To: '<0'
182 | %    From: A12                     To: '0<=X<200'
183 | %    From: A13                     To: '>=200'
184 | %    From: A14                     To: 'no checking'
185 | %
186 | %
187 | % Relabeled values in attribute credit_history
188 | %    From: A30                     To: 'no credits/all paid'
189 | %    From: A31                     To: 'all paid'
190 | %    From: A32                     To: 'existing paid'
191 | %    From: A33                     To: 'delayed previously'
192 | %    From: A34                     To: 'critical/other existing credit'
193 | %
194 | %
195 | % Relabeled values in attribute purpose
196 | %    From: A40                     To: 'new car'
197 | %    From: A41                     To: 'used car'
198 | %    From: A42                     To: furniture/equipment
199 | %    From: A43                     To: radio/tv
200 | %    From: A44                     To: 'domestic appliance'
201 | %    From: A45                     To: repairs
202 | %    From: A46                     To: education
203 | %    From: A47                     To: vacation
204 | %    From: A48                     To: retraining
205 | %    From: A49                     To: business
206 | %    From: A410                    To: other
207 | %
208 | %
209 | % Relabeled values in attribute savings_status
210 | %    From: A61                     To: '<100'
211 | %    From: A62                     To: '100<=X<500'
212 | %    From: A63                     To: '500<=X<1000'
213 | %    From: A64                     To: '>=1000'
214 | %    From: A65                     To: 'no known savings'
215 | %
216 | %
217 | % Relabeled values in attribute employment
218 | %    From: A71                     To: unemployed
219 | %    From: A72                     To: '<1'
220 | %    From: A73                     To: '1<=X<4'
221 | %    From: A74                     To: '4<=X<7'
222 | %    From: A75                     To: '>=7'
223 | %
224 | %
225 | % Relabeled values in attribute personal_status
226 | %    From: A91                     To: 'male div/sep'
227 | %    From: A92                     To: 'female div/dep/mar'
228 | %    From: A93                     To: 'male single'
229 | %    From: A94                     To: 'male mar/wid'
230 | %    From: A95                     To: 'female single'
231 | %
232 | %
233 | % Relabeled values in attribute other_parties
234 | %    From: A101                    To: none
235 | %    From: A102                    To: 'co applicant'
236 | %    From: A103                    To: guarantor
237 | %
238 | %
239 | % Relabeled values in attribute property_magnitude
240 | %    From: A121                    To: 'real estate'
241 | %    From: A122                    To: 'life insurance'
242 | %    From: A123                    To: car
243 | %    From: A124                    To: 'no known property'
244 | %
245 | %
246 | % Relabeled values in attribute other_payment_plans
247 | %    From: A141                    To: bank
248 | %    From: A142                    To: stores
249 | %    From: A143                    To: none
250 | %
251 | %
252 | % Relabeled values in attribute housing
253 | %    From: A151                    To: rent
254 | %    From: A152                    To: own
255 | %    From: A153                    To: 'for free'
256 | %
257 | %
258 | % Relabeled values in attribute job
259 | %    From: A171                    To: 'unemp/unskilled non res'
260 | %    From: A172                    To: 'unskilled resident'
261 | %    From: A173                    To: skilled
262 | %    From: A174                    To: 'high qualif/self emp/mgmt'
263 | %
264 | %
265 | % Relabeled values in attribute own_telephone
266 | %    From: A191                    To: none
267 | %    From: A192                    To: yes
268 | %
269 | %
270 | % Relabeled values in attribute foreign_worker
271 | %    From: A201                    To: yes
272 | %    From: A202                    To: no
273 | %
274 | %
275 | % Relabeled values in attribute class
276 | %    From: 1                       To: good
277 | %    From: 2                       To: bad
278 | %
279 | @relation german_credit
280 | @attribute checking_status { '<0', '0<=X<200', '>=200', 'no checking'}
281 | @attribute duration real
282 | @attribute credit_history { 'no credits/all paid', 'all paid', 'existing paid', 'delayed previously', 'critical/other existing credit'}
283 | @attribute purpose { 'new car', 'used car', furniture/equipment, radio/tv, 'domestic appliance', repairs, education, vacation, retraining, business, other}
284 | @attribute credit_amount real
285 | @attribute savings_status { '<100', '100<=X<500', '500<=X<1000', '>=1000', 'no known savings'}
286 | @attribute employment { unemployed, '<1', '1<=X<4', '4<=X<7', '>=7'}
287 | @attribute installment_commitment real
288 | @attribute personal_status { 'male div/sep', 'female div/dep/mar', 'male single', 'male mar/wid', 'female single'}
289 | @attribute other_parties { none, 'co applicant', guarantor}
290 | @attribute residence_since real
291 | @attribute property_magnitude { 'real estate', 'life insurance', car, 'no known property'}
292 | @attribute age real
293 | @attribute other_payment_plans { bank, stores, none}
294 | @attribute housing { rent, own, 'for free'}
295 | @attribute existing_credits real
296 | @attribute job { 'unemp/unskilled non res', 'unskilled resident', skilled, 'high qualif/self emp/mgmt'}
297 | @attribute num_dependents real
298 | @attribute own_telephone { none, yes}
299 | @attribute foreign_worker { yes, no}
300 | @attribute class { good, bad}
301 | @data
302 | '<0',6,'critical/other existing credit',radio/tv,1169,'no known savings','>=7',4,'male single',none,4,'real estate',67,none,own,2,skilled,1,yes,yes,good
303 | '0<=X<200',48,'existing paid',radio/tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,2,'real estate',22,none,own,1,skilled,1,none,yes,bad
304 | 'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,3,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
305 | '<0',42,'existing paid',furniture/equipment,7882,'<100','4<=X<7',2,'male single',guarantor,4,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
306 | '<0',24,'delayed previously','new car',4870,'<100','1<=X<4',3,'male single',none,4,'no known property',53,none,'for free',2,skilled,2,none,yes,bad
307 | 'no checking',36,'existing paid',education,9055,'no known savings','1<=X<4',2,'male single',none,4,'no known property',35,none,'for free',1,'unskilled resident',2,yes,yes,good
308 | 'no checking',24,'existing paid',furniture/equipment,2835,'500<=X<1000','>=7',3,'male single',none,4,'life insurance',53,none,own,1,skilled,1,none,yes,good
309 | '0<=X<200',36,'existing paid','used car',6948,'<100','1<=X<4',2,'male single',none,2,car,35,none,rent,1,'high qualif/self emp/mgmt',1,yes,yes,good


--------------------------------------------------------------------------------
/pk/main/ui/main_gui.ui:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <ui version="4.0">
  3 |  <class>main_tab</class>
  4 |  <widget class="QTabWidget" name="main_tab">
  5 |   <property name="geometry">
  6 |    <rect>
  7 |     <x>0</x>
  8 |     <y>0</y>
  9 |     <width>987</width>
 10 |     <height>737</height>
 11 |    </rect>
 12 |   </property>
 13 |   <property name="windowTitle">
 14 |    <string>Scikit GUI</string>
 15 |   </property>
 16 |   <property name="currentIndex">
 17 |    <number>0</number>
 18 |   </property>
 19 |   <widget class="QWidget" name="upload_tab">
 20 |    <attribute name="title">
 21 |     <string>Upload</string>
 22 |    </attribute>
 23 |    <widget class="QWidget" name="horizontalWidget" native="true">
 24 |     <property name="geometry">
 25 |      <rect>
 26 |       <x>0</x>
 27 |       <y>0</y>
 28 |       <width>501</width>
 29 |       <height>51</height>
 30 |      </rect>
 31 |     </property>
 32 |     <layout class="QHBoxLayout" name="horizontalLayout">
 33 |      <item>
 34 |       <widget class="QPushButton" name="openfile_btn">
 35 |        <property name="text">
 36 |         <string>Open File</string>
 37 |        </property>
 38 |       </widget>
 39 |      </item>
 40 |      <item>
 41 |       <widget class="QPushButton" name="openurl_btn">
 42 |        <property name="text">
 43 |         <string>Open URL</string>
 44 |        </property>
 45 |       </widget>
 46 |      </item>
 47 |      <item>
 48 |       <widget class="QPushButton" name="generate_btn">
 49 |        <property name="text">
 50 |         <string>Generate</string>
 51 |        </property>
 52 |       </widget>
 53 |      </item>
 54 |     </layout>
 55 |    </widget>
 56 |    <widget class="QWidget" name="verticalLayoutWidget">
 57 |     <property name="geometry">
 58 |      <rect>
 59 |       <x>20</x>
 60 |       <y>50</y>
 61 |       <width>391</width>
 62 |       <height>621</height>
 63 |      </rect>
 64 |     </property>
 65 |     <layout class="QVBoxLayout" name="verticalLayout">
 66 |      <item>
 67 |       <widget class="QLabel" name="datainfo_label">
 68 |        <property name="text">
 69 |         <string>Dataset Information</string>
 70 |        </property>
 71 |       </widget>
 72 |      </item>
 73 |      <item>
 74 |       <widget class="QTextBrowser" name="datainfotext"/>
 75 |      </item>
 76 |     </layout>
 77 |    </widget>
 78 |    <widget class="QWidget" name="verticalLayoutWidget_2">
 79 |     <property name="geometry">
 80 |      <rect>
 81 |       <x>420</x>
 82 |       <y>50</y>
 83 |       <width>541</width>
 84 |       <height>621</height>
 85 |      </rect>
 86 |     </property>
 87 |     <layout class="QVBoxLayout" name="verticalLayout_2">
 88 |      <item>
 89 |       <widget class="QLabel" name="dataplotter_label">
 90 |        <property name="text">
 91 |         <string>Dataset Plotter</string>
 92 |        </property>
 93 |       </widget>
 94 |      </item>
 95 |      <item>
 96 |       <widget class="QGraphicsView" name="dataplottergraphics"/>
 97 |      </item>
 98 |     </layout>
 99 |    </widget>
100 |    <widget class="QProgressBar" name="progressBar">
101 |     <property name="geometry">
102 |      <rect>
103 |       <x>20</x>
104 |       <y>683</y>
105 |       <width>941</width>
106 |       <height>16</height>
107 |      </rect>
108 |     </property>
109 |     <property name="value">
110 |      <number>0</number>
111 |     </property>
112 |    </widget>
113 |   </widget>
114 |   <widget class="QWidget" name="preprocess_tab">
115 |    <property name="enabled">
116 |     <bool>true</bool>
117 |    </property>
118 |    <property name="autoFillBackground">
119 |     <bool>false</bool>
120 |    </property>
121 |    <attribute name="title">
122 |     <string>Preprocess</string>
123 |    </attribute>
124 |    <widget class="QWidget" name="horizontalWidget_2" native="true">
125 |     <property name="geometry">
126 |      <rect>
127 |       <x>0</x>
128 |       <y>0</y>
129 |       <width>981</width>
130 |       <height>51</height>
131 |      </rect>
132 |     </property>
133 |     <layout class="QHBoxLayout" name="horizontalLayout_2">
134 |      <item>
135 |       <widget class="QTabWidget" name="preprocess_tab_2">
136 |        <property name="currentIndex">
137 |         <number>0</number>
138 |        </property>
139 |        <widget class="QWidget" name="normalize_tab">
140 |         <attribute name="title">
141 |          <string>Normalize</string>
142 |         </attribute>
143 |        </widget>
144 |        <widget class="QWidget" name="standardize_tab">
145 |         <attribute name="title">
146 |          <string>Standardize</string>
147 |         </attribute>
148 |        </widget>
149 |        <widget class="QWidget" name="binarize_tab">
150 |         <attribute name="title">
151 |          <string>Binarize</string>
152 |         </attribute>
153 |        </widget>
154 |        <widget class="QWidget" name="impute_tab">
155 |         <attribute name="title">
156 |          <string>Impute</string>
157 |         </attribute>
158 |        </widget>
159 |        <widget class="QWidget" name="noise_tab">
160 |         <attribute name="title">
161 |          <string>Add Noise</string>
162 |         </attribute>
163 |        </widget>
164 |       </widget>
165 |      </item>
166 |     </layout>
167 |    </widget>
168 |    <widget class="QLabel" name="attr_label">
169 |     <property name="geometry">
170 |      <rect>
171 |       <x>20</x>
172 |       <y>60</y>
173 |       <width>81</width>
174 |       <height>17</height>
175 |      </rect>
176 |     </property>
177 |     <property name="text">
178 |      <string>Attributes</string>
179 |     </property>
180 |    </widget>
181 |    <widget class="QWidget" name="verticalLayoutWidget_3">
182 |     <property name="geometry">
183 |      <rect>
184 |       <x>10</x>
185 |       <y>80</y>
186 |       <width>421</width>
187 |       <height>251</height>
188 |      </rect>
189 |     </property>
190 |     <layout class="QVBoxLayout" name="verticalLayout_3">
191 |      <item>
192 |       <widget class="QCheckBox" name="attr_checkbox1">
193 |        <property name="text">
194 |         <string>Attribute 1</string>
195 |        </property>
196 |       </widget>
197 |      </item>
198 |      <item>
199 |       <widget class="QCheckBox" name="attr_checkbox2">
200 |        <property name="text">
201 |         <string>Attribute 2</string>
202 |        </property>
203 |       </widget>
204 |      </item>
205 |      <item>
206 |       <widget class="QCheckBox" name="attr_checkbox3">
207 |        <property name="text">
208 |         <string>Attribute 3</string>
209 |        </property>
210 |       </widget>
211 |      </item>
212 |      <item>
213 |       <widget class="QCheckBox" name="attr_checkbox4">
214 |        <property name="text">
215 |         <string>Attribute 4</string>
216 |        </property>
217 |       </widget>
218 |      </item>
219 |      <item>
220 |       <widget class="QCheckBox" name="attr_checkbox5">
221 |        <property name="text">
222 |         <string>Attribute 5</string>
223 |        </property>
224 |       </widget>
225 |      </item>
226 |     </layout>
227 |    </widget>
228 |    <widget class="QPushButton" name="remove_attr_btn">
229 |     <property name="geometry">
230 |      <rect>
231 |       <x>10</x>
232 |       <y>330</y>
233 |       <width>419</width>
234 |       <height>26</height>
235 |      </rect>
236 |     </property>
237 |     <property name="text">
238 |      <string>Remove</string>
239 |     </property>
240 |    </widget>
241 |   </widget>
242 |   <widget class="QWidget" name="regression_tab">
243 |    <attribute name="title">
244 |     <string>Regression</string>
245 |    </attribute>
246 |    <widget class="QWidget" name="horizontalWidget_4" native="true">
247 |     <property name="geometry">
248 |      <rect>
249 |       <x>0</x>
250 |       <y>0</y>
251 |       <width>981</width>
252 |       <height>51</height>
253 |      </rect>
254 |     </property>
255 |     <layout class="QHBoxLayout" name="horizontalLayout_6">
256 |      <item>
257 |       <widget class="QTabWidget" name="regression_tab_2">
258 |        <property name="currentIndex">
259 |         <number>0</number>
260 |        </property>
261 |        <widget class="QWidget" name="linearreg_tab">
262 |         <attribute name="title">
263 |          <string>Linear</string>
264 |         </attribute>
265 |        </widget>
266 |        <widget class="QWidget" name="polyreg_tab">
267 |         <attribute name="title">
268 |          <string>Polynomial</string>
269 |         </attribute>
270 |        </widget>
271 |        <widget class="QWidget" name="leastsqreg_tab">
272 |         <attribute name="title">
273 |          <string>Least Square</string>
274 |         </attribute>
275 |        </widget>
276 |        <widget class="QWidget" name="logisticreg_tab">
277 |         <attribute name="title">
278 |          <string>Logistic</string>
279 |         </attribute>
280 |        </widget>
281 |        <widget class="QWidget" name="gdreg_tab">
282 |         <attribute name="title">
283 |          <string>Gradient Descent</string>
284 |         </attribute>
285 |        </widget>
286 |       </widget>
287 |      </item>
288 |     </layout>
289 |    </widget>
290 |   </widget>
291 |   <widget class="QWidget" name="classify_tab">
292 |    <attribute name="title">
293 |     <string>Classify</string>
294 |    </attribute>
295 |    <widget class="QWidget" name="horizontalWidget_3" native="true">
296 |     <property name="geometry">
297 |      <rect>
298 |       <x>0</x>
299 |       <y>0</y>
300 |       <width>981</width>
301 |       <height>51</height>
302 |      </rect>
303 |     </property>
304 |     <layout class="QHBoxLayout" name="horizontalLayout_5">
305 |      <item>
306 |       <widget class="QTabWidget" name="classifymethods_tab">
307 |        <property name="currentIndex">
308 |         <number>0</number>
309 |        </property>
310 |        <property name="elideMode">
311 |         <enum>Qt::ElideNone</enum>
312 |        </property>
313 |        <widget class="QWidget" name="dt_tab">
314 |         <attribute name="title">
315 |          <string>Decision Tree</string>
316 |         </attribute>
317 |        </widget>
318 |        <widget class="QWidget" name="ensemble_tab">
319 |         <attribute name="title">
320 |          <string>Ensemble</string>
321 |         </attribute>
322 |        </widget>
323 |        <widget class="QWidget" name="nn_tab">
324 |         <attribute name="title">
325 |          <string>Neural Networks</string>
326 |         </attribute>
327 |        </widget>
328 |        <widget class="QWidget" name="svm_tab">
329 |         <attribute name="title">
330 |          <string>SVM</string>
331 |         </attribute>
332 |        </widget>
333 |        <widget class="QWidget" name="bn_tab">
334 |         <attribute name="title">
335 |          <string>Bayes Nets</string>
336 |         </attribute>
337 |        </widget>
338 |        <widget class="QWidget" name="knn_tab">
339 |         <attribute name="title">
340 |          <string>kNN</string>
341 |         </attribute>
342 |        </widget>
343 |        <widget class="QWidget" name="otherclassify_tab">
344 |         <attribute name="title">
345 |          <string>Others</string>
346 |         </attribute>
347 |        </widget>
348 |       </widget>
349 |      </item>
350 |     </layout>
351 |    </widget>
352 |   </widget>
353 |   <widget class="QWidget" name="cluster_tab">
354 |    <attribute name="title">
355 |     <string>Cluster</string>
356 |    </attribute>
357 |    <widget class="QWidget" name="horizontalWidget_5" native="true">
358 |     <property name="geometry">
359 |      <rect>
360 |       <x>0</x>
361 |       <y>0</y>
362 |       <width>981</width>
363 |       <height>51</height>
364 |      </rect>
365 |     </property>
366 |     <layout class="QHBoxLayout" name="horizontalLayout_7">
367 |      <item>
368 |       <widget class="QTabWidget" name="clustermethod_tab">
369 |        <property name="currentIndex">
370 |         <number>0</number>
371 |        </property>
372 |        <widget class="QWidget" name="kmeans_tab">
373 |         <attribute name="title">
374 |          <string>kMeans</string>
375 |         </attribute>
376 |        </widget>
377 |        <widget class="QWidget" name="em_tab">
378 |         <attribute name="title">
379 |          <string>EM</string>
380 |         </attribute>
381 |        </widget>
382 |        <widget class="QWidget" name="propcluster_tab">
383 |         <attribute name="title">
384 |          <string>Affinity Propagation</string>
385 |         </attribute>
386 |        </widget>
387 |        <widget class="QWidget" name="spectralcluster_tab">
388 |         <attribute name="title">
389 |          <string>Spectral</string>
390 |         </attribute>
391 |        </widget>
392 |        <widget class="QWidget" name="aggcluster_tab">
393 |         <attribute name="title">
394 |          <string>Agglomerative</string>
395 |         </attribute>
396 |        </widget>
397 |        <widget class="QWidget" name="dbscan_tab">
398 |         <attribute name="title">
399 |          <string>DBSCAN</string>
400 |         </attribute>
401 |        </widget>
402 |       </widget>
403 |      </item>
404 |     </layout>
405 |    </widget>
406 |    <zorder>horizontalWidget_5</zorder>
407 |    <zorder>clustermethod_tab</zorder>
408 |   </widget>
409 |   <widget class="QWidget" name="reduce_tab">
410 |    <attribute name="title">
411 |     <string>Reduce</string>
412 |    </attribute>
413 |    <widget class="QWidget" name="horizontalWidget_6" native="true">
414 |     <property name="geometry">
415 |      <rect>
416 |       <x>0</x>
417 |       <y>0</y>
418 |       <width>981</width>
419 |       <height>51</height>
420 |      </rect>
421 |     </property>
422 |     <layout class="QHBoxLayout" name="horizontalLayout_8">
423 |      <item>
424 |       <widget class="QTabWidget" name="reducemethods_tab">
425 |        <property name="currentIndex">
426 |         <number>0</number>
427 |        </property>
428 |        <widget class="QWidget" name="pca_tab">
429 |         <attribute name="title">
430 |          <string>PCA</string>
431 |         </attribute>
432 |        </widget>
433 |        <widget class="QWidget" name="ica_tab">
434 |         <attribute name="title">
435 |          <string>ICA</string>
436 |         </attribute>
437 |        </widget>
438 |        <widget class="QWidget" name="rpa_tab">
439 |         <attribute name="title">
440 |          <string>Random Projection</string>
441 |         </attribute>
442 |        </widget>
443 |       </widget>
444 |      </item>
445 |     </layout>
446 |    </widget>
447 |   </widget>
448 |   <widget class="QWidget" name="visualize_tab">
449 |    <attribute name="title">
450 |     <string>Visualize</string>
451 |    </attribute>
452 |   </widget>
453 |   <widget class="QWidget" name="other_tab">
454 |    <attribute name="title">
455 |     <string>Other</string>
456 |    </attribute>
457 |   </widget>
458 |  </widget>
459 |  <resources/>
460 |  <connections/>
461 | </ui>
462 | 


--------------------------------------------------------------------------------
/pk/tests/correct_array.pkl:
--------------------------------------------------------------------------------
 1 | (cnumpy.core.multiarray
 2 | _reconstruct
 3 | p1
 4 | (cnumpy
 5 | ndarray
 6 | p2
 7 | (I0
 8 | tS'b'
 9 | tRp3
10 | (I1
11 | (I56
12 | I10
13 | tcnumpy
14 | dtype
15 | p4
16 | (S'f8'
17 | I0
18 | I1
19 | tRp5
20 | (I3
21 | S'<'
22 | NNNI-1
23 | I-1
24 | I0
25 | tbI00
26 | S'\xd5A^\x0f&\xc5\xcb?;\xc4?l\xe9\xd1\xc8?\x82\xe6s\xeev\xbd\xac?\xd9wE\xf0\xbf\x95\xb4?_\n\x0f\x9a]\xf7\xdc?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00,@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\xdfQcB\xcc%\xb1?\x00\x00\x00\x00\x00\x00\x00\x00q\x90\x10\xe5\x0bZ\xd5?\x89\x95\xd1\xc8\xe7\x15\xdb?\x1c\x0b\n\x832\x8d\xc6?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\xa1\x11l\\\xff\xae\xc9?~\x18!<\xda8\xc6?x\xb5\xdc\x99\t\x06\xe4?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x1c@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x1c@\x1f\xb95\xe9\xb6D\xbe?\xceQG\xc7\xd5\xc8\xba?u\x1e\x15\xffwD\xbd?`"\xde:\xffv\xd6?\x1ak\x7fg{\xf4\xd3?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00,@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@8\x13\xd3\x85X\xfd\xc7?\x8e\xcaM\xd4\xd2\xdc\xd9?\xb1\xc0Wt\xeb5\xd1?\x00\x00\x00\x00\x00\x00\x00\x00\xa4\xa6]L3\xdd\xc1?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00&@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x000@lA\xef\x8d!\x00\xdf?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00J_\x089\xef\x7f\xe0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x08@\x0f\xba\x84Co\xf1\xce?\x1f\xbeL\x14!u\xbb?\xe6\xae%\xe4\x83\x9e\xb9?\x91E\x9ax\x07x\xdb?\x9a\x08\x1b\x9e^)\xbf?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00O\xea\xcb\xd2N\xcd\xad?\xdbl\xac\xc4<+\xd6?\xee\n}\xb0\x8c\r\xe3?\x00\x00\x00\
x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00=@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x18@L3\xdd\xeb\xa4\xbe\xd7?\xb8\xc9\xa82\x8c\xbb\x91?<K\x90\x11P\xe1\xdc?\x00\x00\x00\x00\x00\x00\x00\x00\xb8\xe9\xcf~\xa4\x88\xc4?\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80G@\x0ek*\x8b\xc2.\xce?\xaa\x9b\x8b\xbf\xed\tb?\xd3\x88\x99}\x1e#\xe7?\x00\x00\x00\x00\x00\x00\x00\x00QN\xb4\xab\x90\xf2\xa3?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00&@\x16i\xe2\x1d\xe0I\xd8?yZ~\xe0*O\x90?\x9a\xeeuR_\x96\xd7?\xdf\x15\xc1\xffV\xb2\xc5?\x99\x7f\xf4M\x9a\x06\xb1?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x1c@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\xf9\x9d&3\xdeV\xb6?\x00\x00\x00\x00\x00\x00\x00\x00\xf4\xbf\\\x8b\x16\xa0\xdd?\x04\xe5\xb6}\x8f\xfa\xbb?L\xdfk\x08\x8e\xcb\xd5?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c@\x00\x00\x00\x00\x00\x001@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\xdd\xefP\x14\xe8\x93\xe6?\xb8\x93\x88\xf0/\x82\x96?\xa7;O<g\x0b\xa8?\xe7\xc7_Z\xd4\'\xc9?\x08=\x9bU\x9f\xab\x9d?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\xee\xe8\x7f\xb9\x16-\xd1?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x89\x0b@\xa3ti\xe7?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18@U\xa0\x16\x83\x87i\xb7?\x10=)\x93\x1a\xda\xca?\x0fH\xc2\xbe\x9dD\xca?\xc5T\xfa\tg\xb7\x86?\xb5\xc2\xf4\xbd\x86\xe0\xde?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x009@\x00\x00\x00\x00\x00\x00\x08@\
x00\x00\x00\x00\x00\x002@R\x10<\xbe\xbdk\xa8?\xc7e\xdc\xd4@\xf3\xcb?\xf6\xb6\x99\n\xf1H\xb0?\xa3\xcc\x06\x99d\xe4\xd1?\xdf\xf8\xda3K\x02\xd9?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00"@\x00\x00\x00\x00\x00\x001@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00P:\x91`\xaa\x99\x95?\x1b\xd9\x95\x96\x91z\xcb?r\xa7t\xb0\xfe\xcf\xe3?\xd4a\x85[>\x92\xc2?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x1c@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00&@+\xfc\x19\xde\xacA\xe4?R\x0c\x90h\x02E\xac?!\x06\xba\xf6\x05\xf4\xd3?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x18\\sG\xff\xcb\xd0?\x00\x00\x00\x00\x00\x00\x00\x00\xd68\x9b\x8e\x00n\xdf?$\xd6\xe2S\x00\x8c\xcf?\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00(@\x0c\xb0\x8fN]\xf9\xa4?>\xb1N\x95\xef\x99\xe4?0\x81[w\xf3T\xaf?\x00\x00\x00\x00\x00\x00\x00\x00]7\xa5\xbcVB\xd0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00(@\x00\x00\x00\x00\x00\x80C@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00<\xa5\x83\xf5\x7f\x0e\xbb?\x93\xa9\x82QI\x9d\xde?T5A\xd4}\x00\xc4?\x00\x00\x00\x00\x00\x00\x00\x00t\xd2\xfb\xc6\xd7\x9e\xd0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00"@\x00\x00\x00\x00\x00\x000@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c@s\xf4\xf8\xbdM\x7f\xd7?a2U0*\xa9\xbb?\xaa\x82QI\x9d\x80\xd2?\x00\x00\x00\x00\x00\x00\x00\x00\x96x@\xd9\x94+\xce?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x8e@\xbc\xae_\xb0\xcb?\xd6\x8dwG\xc6j\xc1?\xa2a1\xeaZ{\xc5?N} y\xe7P\xb6?\xaaH\x85\xb1\x85 
\xd9?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00;@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x009\x0c\xe6\xaf\x90\xb9\xc6?\x84\xf1\xd3\xb87\xbf\xd3?j\xdeq\x8a\x8e\xe4\xc4?\xc0%\x00\xff\x94*\xcd?\xe0\xb9\xf7p\xc9q\xbf?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18@\x80\x99\xef\xe0\'\x0e\xce?Uh \x96\xcd\x1c\xb2?\xb3\xeb\xde\x8a\xc4\x04\xd3?w-!\x1f\xf4l\xd9?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00K@\xe4\xf9\x0c\xa87\xa3\xce?\xe9\x7f\xb9\x16-@\xc7?\x00\x00\x00\x00\x00\x00\x00\x00\x1c\t4\xd8\xd4y\xd9?\xfbs\xd1\x90\xf1(\xc7?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00*@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00@"\xa5\xd9<\x0e\x83\xe4?\x0e\xa3 x|{\xc7?\x00\x00\x00\x00\x00\x00\x00\x00i\xc8x\x94Jx\xc6?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08@\xe8-\x1e\xdes`\xb5?\xefq\xa6\t\xdbO\xb6?\xa1\x83.\xe1\xd0[\xc4?\xa6(\x97\xc6/\xbc\xdd?\x82\xab<\x81\xb0S\xca?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00(@\x00\x00\x00\x00\x00\x00\x00\x00Ul\xcc\xeb\x88C\xd4?\x00\x00\x00\x00\x00\x00\x00\x00\xf7\x8f\x85\xe8\x108\xda?\xb4\x03\xae+f\x84\xd1?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00,@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@\x90\xa1c\x07\x95\xb8\x9e?\xe7\xa6\xcd8\rQ\xc1?Z)\x04r\x89\xa3\xe0?\xbe\x9f\x1a/\xdd$\xd4?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00
\x00@\x00\x00\x00\x00\x00\x000@\xe4\x9f\x19\xc4\x07v\xca?m\x1d\x1c\xecM\x0c\xe3?\xcf\xbaF\xcb\x81\x1e\xc2?b\xbe\xbc\x00\xfb\xe8\xac?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x1c@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x90\xd7\x83I\xf1\xf1\xcd?\x00\x00\x00\x00\x00\x00\x00\x00`\xab\x04\x8b\xc3\x19\xe3?\xeezi\x8a\x00\xa7\xc5?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00&@ \x96\xcd\x1c\x92Z\x98?\xc0{G\x8d\t1\xdb?Ou\xc8\xcdp\x03\xca?\xa9/K;5\x97\xa3?B\n\x9eB\xae\xd4\xd3?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00 @\x00\x00\x00\x00\x00\x009@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x00\x00c\xeeZB>\xe8\xe6?\x00\x00\x00\x00\x00\x00\x00\x00J\xb8\x90Gp#\xd2?r\xe0\xd5rg&H?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00H\xc4\x94H\xa2\x17\xe4?pw\xd6n\xbb\xd0\xd7?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x1c@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00&@A\xef\x8d!\x008\xc8?\xd3\xd8^\x0bzo\xc4?\x83/L\xa6\nF\xc5?\xf8\x88\x98\x12I\xf4\xd4?w\xf6\x95\x07\xe9)\xc4?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80@@\xc7\xd5\xc8\xae\xb4\x8c\xc0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\xa6\xd3\xba\r\xea\xe3?%\x92\xe8e\x14\xcb\xcf?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00,@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c@\xf9\xbc\xe2\xa9G\x1a\xc6?O\xe5\xb4\xa7\xe4\x9c\xd6?8\x82T\x8a\x1d\x8d\xc
f?\xb2\x12\xf3\xac\xa4\x15\xcd?\xd2\xfb\xc6\xd7\x9eY2?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00\x00\x00I\x80\x9aZ\xb6\xd6W?\xcdZ\nH\xfb\x1f\xc6?\xbbF\xcb\x81\x1ej\xd3?\xc6l\xc9\xaa\x08\xb7\xe0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x000@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00"@\x84f\xd7\xbd\x15\x89\xd7?\x00\x00\x00\x00\x00\x00\x00\x00w\x87\x14\x03$\x9a\xca?\xb1Pk\x9aw\x9c\xd1? \nfL\xc1\x1a\xc3?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xad\xc3\xd1U\xba;\xe2?\xfa\xb7\xcb~\xdd\xe9\x9e?a\xffun\xda\x8c\xcb?\xeezi\x8a\x00\xa7\xc7?\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@\xf6Cl\xb0p\x92\xa6?\x00\x00\x00\x00\x00\x00\x00\x00\xa5\x84`U\xbd\xfc\xce?.8\x83\xbf_\xcc\xe3?\xff\xb3\xe6\xc7_Z\xb8?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x8b\xc5o\n+\x15\xb0?q9^\x81\xe8I\x89?zrM\x81\xcc\xce\xba?\xca\xfc\xa3o\xd24\xc0?\xd0\xec\xba\xb7"1\xe6?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x002@\x00\x00\x00\x00\x00\x80C@\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x00\x00t\xf0Lh\x92X\xd3?u\x8e\x01\xd9\xeb\xdd\xdc?\x00\x00\x00\x00\x00\x00\x00\x00\xda\xc9\xe0(yu\xc8?\xe4\x9f\x19\xc4\x07v\xac?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x000
@\x00\x00\x00\x00\x00\x00H@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00C@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\xf4\xc2\x9d\x0b#\xd5?\xc6\xfdG\xa6C\xa7\xda?4\x0e\xf5\xbb\xb05\xd0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00"@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x004@-[\xeb\x8b\x84\xb6\xc0?\xb2\xf2\xcb`\x8cH\xb4?\n\x82\xc7\xb7w\x8d\xe6?\r\xdf\xc2\xba\xf1\xee\xa8?\xa2\xefne\x89\xce\xa2?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00.@\xbbc\xb1M*\x1a\xcb?\xe9\xf3QF\\\x00\xc6?\xc1\x1e\x13)\xcd\xe6\xcb?\xc1\x8d\x94-\x92v\xd3?3\xdc\x80\xcf\x0f#\xb8?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x003@\x00\x00\x00\x00\x00\x00\x00\x00\xd6t=\xd1u\xe1\xde?\x14\xd0D\xd8\xf0\xf4\xc6?[\\\xe33\xd9?\xc7?\xe6\xe9\\QJ\x08\xc4?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00(@0\x10\x04\xc8\xd0\xb1\xc5?<\x872T\xc5T\xc0?5\r\x8a\xe6\x01,\xd7?~\xc6\x85\x03!Y\xc0?R\xb7\xb3\xaf<H\xcb?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00.@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@\x19tB\xe8\xa0K\xb0?E\x9e$]3\xf9\xde?\xdb\x18;\xe1%8\xb5?C\xa9\xbd\x88\xb6c\xd1?\xebT\xf9\x9e\x91\x08\xb9?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00"@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x000@\xcb\xa2\xb0\x8b\xa2\x07\xc8?\xe4\x83\x9e\xcd\xaa\xcf\xdc?\xd2\x8cE\xd3\xd9\xc9\xd2?\x00\x00\x00\x00\x00\x00\x00\x00\x92w\x0ee\xa8\x8a\xb1?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001@2\x92=B\xcd\x90\xb2?+\xf6\x97\xdd\x93\x87\xe5?\xa7\xad\x11\xc18\xb8\xb4?\x13\xba
K\xe2\xac\x88\xc4?0\xd5\xccZ\nH\x8b?\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x006@\x9b\x90\xd6\x18tB\xc4?\x89\x9a\xe8\xf3QF\xb0?N\xd5=\xb2\xb9j\xdb?\x95\xd3\x9e\x92sb\xd6?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x003@\x00\x00\x00\x00\x00\x00\x00\x00"\x89^F\xb1\xdc\xda?\xc0=\xcf\x9f6\xaa\xcf?<\xa0l\xca\x15\xde\xd3?<\xf9\xf4\xd8\x96\x01\x97?\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x005@>\xcd\xc9\x8bL\xc0\xc5?\x84\rO\xaf\x94e\xc2?\x8c0E\xb94~\xcf?Y\xfa\xd0\x05\xf5-\xdc?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00I@'
27 | tbg1
28 | (g2
29 | (I0
30 | tS'b'
31 | tRp6
32 | (I1
33 | (I56
34 | tg5
35 | I00
36 | S'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00'
37 | tbt.


--------------------------------------------------------------------------------
/cl_gui.py:
--------------------------------------------------------------------------------
  1 | """ This module contains an executable command-line version of the Pykit-Learn
  2 | GUI.
  3 |     Author: Sean Dai
  4 | """
  5 | 
  6 | # Ignore any warnings issued by third-party modules
  7 | import warnings
  8 | warnings.filterwarnings("ignore")
  9 | 
 10 | import cPickle
 11 | import logging
 12 | import multiprocessing
 13 | import os
 14 | import shutil
 15 | import sys
 16 | import traceback
 17 | from argparse import ArgumentParser
 18 | from collections import Counter
 19 | from glob import glob
 20 | from os.path import join
 21 | 
 22 | from pandas.tools.plotting import radviz
 23 | from pandas.tools.plotting import scatter_matrix
 24 | from pandas.tools.plotting import andrews_curves
 25 | from PyQt4 import QtGui
 26 | from sklearn import cross_validation
 27 | 
 28 | import matplotlib.pyplot as plt
 29 | import seaborn as sns
 30 | import wx
 31 | from PIL import Image
 32 | 
 33 | from pk.utils.loading import *
 34 | from pk.utils.preprocess import *
 35 | from pk.utils.prygress import progress
 36 | from pk.utils.classification_utils import *
 37 | from pk.utils.metrics import *
 38 | from pk.controller import ViewGenerator
 39 | 
# Module-level Qt application object. It is passed to the ViewGenerator
# dialogs opened by load_file_gui() and by the 'preprocess_gui' command.
app = QtGui.QApplication(sys.argv)
 41 | 
class Status(object):
    """
    Session state and constants shared by the command-line front end.

    The attributes are read and written directly on the class; this module
    never creates an instance.
    """
    # Set to True by load_file() / load_random() once a dataset is pickled.
    DATASET_LOADED = False
    # Base name of the loaded file, or 'RANDOM' for a generated dataset.
    FILENAME = ''
    # Set by load_file() from the loaded file's name.
    EXTENSION = None
    # Directory holding the pickled dataset and the saved plots; removed by
    # quit_gui().
    TEMP_DIR = '_temp/'
    # Value quit_gui() passes to sys.exit(); main() matches on it to stop.
    USER_QUIT = 'user_quit'
    # File names (inside TEMP_DIR) that the plot_* functions save to.
    RADIAL_NAME = 'plot_radial.png'
    SCM_NAME = 'plot_scatter_matrix.png'
    FREQ_NAME = 'plot_frequency.png'
    FM_NAME = 'plot_feature_matrix.png'
    ANDREWS_NAME = 'plot_andrews.png'
    TD_NAME = 'plot_2d.png'
    # Reset to False by reset_plot_status(); no other use is visible here.
    FINISH_PLOTS = False
    # Commands that process() routes to visualize_dataset().
    PLOT_COMMANDS = {'plot_frequency', 'plot_feature_matrix', 'plot_radial',
                     'plot_andrews', 'plot_scatter_matrix', 'plot_2d'}
    # Command names offered by the tab completer installed in setup().
    # NOTE(review): process() also accepts 'preprocess_gui', which is not
    # listed here -- confirm whether that omission is intentional.
    ALL_COMMANDS = list(PLOT_COMMANDS) + ['load', 'load_file_gui', 'load_random', 'preprocess',
                                          'run', 'visualize', 'help', 'quit',
                                          'see_images']
 60 | 
 61 | 
 62 | class InvalidCommandException(Exception):
 63 |     def __init__(self, message, errors=None):
 64 |         super(InvalidCommandException, self).__init__(message)
 65 |         self.errors = errors
 66 | 
 67 | def _load_file(filename):
 68 |     loader = DatasetIO()
 69 |     return loader.load_file(filename)
 70 | 
 71 | def load_file(filename):
 72 |     """
 73 |     Function to load a dataset file.
 74 |     """
 75 |     X, y, df = _load_file(filename)
 76 | 
 77 |     loader = DatasetIO()
 78 |     loader.pickle_files([(X, 'load_X.pkl'), (y, 'load_y.pkl'), (df, 'df.pkl')],
 79 |                         Status.TEMP_DIR)
 80 | 
 81 |     # Update appropriate status flags.
 82 |     Status.DATASET_LOADED = True
 83 |     Status.FILENAME = os.path.basename(filename)
 84 |     Status.EXTENSION = filename[filename.rfind('.')]
 85 | 
 86 |     print 'Feature Array:\n %s' % X
 87 |     print 'Target classifications:\n %s' % y
 88 | 
 89 | def load_file_gui():
 90 |     from pk.controller import ViewGenerator
 91 |     popup = ViewGenerator()
 92 |     filter = "CSV files (*.csv);;XLS files (*.xls);;ARFF files (*.arff)"
 93 |     filename = popup.open_file_dialog(app, filter)
 94 | 
 95 |     if filename == '':
 96 |         return
 97 |     load_file(filename)
 98 | 
 99 | def load_random():
100 |     """
101 |     Generates a random dataset with 100 samples, 2 features, and 3 classes.
102 |     """
103 |     X, y, df = generate_random_points()
104 |     loader = DatasetIO()
105 |     loader.pickle_files([(X, 'load_X.pkl'), (y, 'load_y.pkl'), (df, 'df.pkl')],
106 |                         Status.TEMP_DIR)
107 | 
108 |     # Update appropriate status flags.
109 |     Status.DATASET_LOADED = True
110 |     Status.FILENAME = 'RANDOM'
111 | 
112 |     print 'Feature Array:\n %s' % X
113 |     print 'Target classifications:\n %s' % y
114 | 
def get_pickled_dataset():
    """
    Return the X, y and data_frame objects stored in the _temp directory.

    Reads '_temp/load_X.pkl', '_temp/load_y.pkl' and '_temp/df.pkl' and
    returns the unpickled objects as the tuple (X, y, data_frame). An
    IOError from open() propagates when a file is missing.
    """
    loaded = []
    for pickle_name in ('load_X.pkl', 'load_y.pkl', 'df.pkl'):
        # Binary mode matches the 'wb' used when these files are written,
        # and `with` closes the handle even if unpickling raises.
        with open('_temp/' + pickle_name, 'rb') as f:
            loaded.append(cPickle.load(f))

    X, y, data_frame = loaded
    return X, y, data_frame
131 | 
132 | 
def update_feature_array(changed_X):
    """
    Overwrite the stored feature array and data frame with `changed_X`.

    `changed_X` is pickled to '_temp/load_X.pkl', and a DataFrame built
    from it is pickled to '_temp/df.pkl'.

    NOTE(review): the frame is rebuilt from the features alone, so any
    target/class column in the previously stored frame is not carried
    over -- confirm this is intended.
    """
    feature_path, frame_path = '_temp/load_X.pkl', '_temp/df.pkl'

    with open(feature_path, 'wb') as out:
        cPickle.dump(changed_X, out)

    with open(frame_path, 'wb') as out:
        frame = pd.DataFrame(changed_X)
        cPickle.dump(frame, out)
138 | 
139 | 
140 | def visualize_dataset(command='', flags=(), plot_all=False, *args, **kwargs):
141 |     """
142 |     Create and display visualizations to user.
143 |     """
144 | 
145 |     # Build parser for visualization
146 |     parser = ArgumentParser()
147 |     parser.add_argument('--suppress', action='store_true', dest='suppress',
148 |                         help='Disable viewing of any generated plot(s).')
149 |     p_args = parser.parse_args(flags)
150 | 
151 |     if Status.DATASET_LOADED:
152 |         print "Creating visualization(s)",
153 |         make_visualizations(command, plot_all)
154 |         print ""
155 |         if not p_args.suppress:
156 |             print "Viewing generated plots..."
157 |             view_saved_plots(command)
158 |     else:
159 |         raise InvalidCommandException("Can't visualize an unloaded dataset!")
160 | 
def view_saved_plots(plot_name=''):
    """
    Open saved plot images from Status.TEMP_DIR in the image viewer.

    With the default empty `plot_name` every '*.png' file is shown;
    otherwise only files matching '<plot_name>.png'.
    """
    pattern = '*.png' if plot_name == '' else plot_name + '.png'
    for image_path in glob(join(Status.TEMP_DIR, pattern)):
        Image.open(image_path, 'r').show()
172 | 
def see_images(*args):
    """
    Open the given image files in the image viewer.

    When the literal argument '_temp/*.png' is among `args`, that pattern
    is expanded with glob and its matches are shown instead of `args`.
    """
    everything = '_temp/*.png'
    files = glob(everything) if everything in args else args

    for image_path in files:
        Image.open(image_path, 'r').show()
182 | 
@progress(char='.', pause=0.5)
def make_visualizations(command='', plot_all=False):
    """
    Save the requested plot(s) to the _temp directory.

    `command` selects one plot by its command name. With `plot_all` set,
    the frequency, radial, Andrews and 2-D plots are produced; the feature
    matrix and scatter matrix are only drawn when named explicitly.
    """
    X, y, data_frame = get_pickled_dataset()
    # The last column of the frame is taken as the class column.
    class_name = data_frame.dtypes.index[-1]

    # (command name, drawn by plot_all?, zero-argument plotting call)
    plotters = (
        ('plot_frequency', True,
         lambda: plot_class_frequency_bar(y)),
        ('plot_feature_matrix', False,
         lambda: plot_feature_matrix(data_frame)),
        ('plot_radial', True,
         lambda: plot_radial(data_frame, class_name)),
        ('plot_andrews', True,
         lambda: plot_andrews(data_frame, class_name)),
        ('plot_scatter_matrix', False,
         lambda: plot_scatter_matrix(data_frame)),
        ('plot_2d', True,
         lambda: plot_2d_dist(X, y)),
    )
    for name, in_overview, draw in plotters:
        if command == name or (in_overview and plot_all):
            draw()
203 | 
def reset_plot_status():
    """Set Status.FINISH_PLOTS back to False."""
    Status.FINISH_PLOTS = False
206 | 
def plot_class_frequency_bar(target, bar_width=.35):
    """
    Save a bar chart showing how often each class label occurs.

    `target` is the sequence of class labels and `bar_width` the width of
    each bar. The chart is written to Status.FREQ_NAME inside
    Status.TEMP_DIR.
    """
    # Count every label, then lay the bars out in np.unique's sorted order
    # so the chart does not depend on dict iteration order.
    target_counts = Counter(target)
    classes = np.unique(target)
    frequencies = [target_counts[label] for label in classes]

    # Plot the bar chart of class frequencies on a figure of its own.
    fig, ax = plt.subplots()
    ind = np.arange(len(classes))
    ax.bar(ind, frequencies, width=bar_width, align='center')
    ax.set_xticks(ind)
    ax.set_xticklabels(classes)
    ax.set_xlabel('Class')
    ax.set_ylabel('Frequency')
    ax.set_title('Bar Chart of Class Label Frequencies')
    fig.savefig(join(Status.TEMP_DIR, Status.FREQ_NAME))
    # Release the figure; otherwise each call leaves another one open.
    plt.close(fig)
224 | 
225 | 
def plot_feature_matrix(data_frame):
    """
    Save a seaborn PairGrid of scatter plots built from `data_frame`.

    The image is written to Status.FM_NAME inside Status.TEMP_DIR.
    """
    plt.clf()
    grid = sns.PairGrid(data_frame)
    grid.map(plt.scatter)
    plt.title('Feature Matrix')
    out_path = join(Status.TEMP_DIR, Status.FM_NAME)
    plt.savefig(out_path)
234 | 
def plot_radial(data_frame, class_name):
    """
    Save a radviz plot of `data_frame`, grouped by column `class_name`.

    The image is written to Status.RADIAL_NAME inside Status.TEMP_DIR.
    """
    plt.clf()
    radviz(data_frame, class_name)
    plt.title('Radial Plot')
    out_path = join(Status.TEMP_DIR, Status.RADIAL_NAME)
    plt.savefig(out_path)
241 | 
def plot_andrews(data_frame, class_name):
    """
    Save an Andrews-curves plot of `data_frame`, grouped by `class_name`.

    The image is written to Status.ANDREWS_NAME inside Status.TEMP_DIR.
    """
    plt.clf()
    andrews_curves(data_frame, class_name)
    plt.title('Andrews Curve')
    out_path = join(Status.TEMP_DIR, Status.ANDREWS_NAME)
    plt.savefig(out_path)
248 | 
def plot_scatter_matrix(data_frame):
    """
    Save a scatter matrix of `data_frame` with KDE plots on the diagonal.

    The image is written to Status.SCM_NAME inside Status.TEMP_DIR.
    """
    plt.clf()
    grid_axes = scatter_matrix(data_frame, alpha=0.2, figsize=(10, 10),
                               diagonal='kde')
    # The title is attached to the top-left subplot.
    top_left = grid_axes[0][0]
    top_left.set_title('Scatter Matrix with KDEs')
    out_path = join(Status.TEMP_DIR, Status.SCM_NAME)
    plt.savefig(out_path)
255 | 
def plot_2d_dist(X, y):
    """
    Save a scatter plot of the feature array, colour-coded by class label.

    When X has more than two features only the first two are plotted. The
    image is written to Status.TD_NAME inside Status.TEMP_DIR.
    """
    plt.clf()
    from itertools import cycle
    palette = cycle('bgrcmyk')

    # Restrict to the first two features when there are more.
    x_values = X[:, :2] if len(X[0]) > 2 else X

    # One scatter series per class label; colours repeat after seven.
    for class_label, colour in zip(np.unique(y), palette):
        members = x_values[np.where(y == class_label)]
        plt.scatter(members[:, 0], members[:, 1], c=colour, label=class_label)

    # Label the axes, add the legend and save.
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.title('Distribution of Dataset ({})'.format(Status.FILENAME))
    plt.legend(loc='best')
    out_path = join(Status.TEMP_DIR, Status.TD_NAME)
    plt.savefig(out_path)
282 | 
def dispatch_preprocess(args):
    """
    Apply the preprocessing steps selected by the flags in `args`.

    '-std' standardizes and '-norm' normalizes the stored feature array.
    With both flags, standardization runs first and normalization then
    works on its result. Each result is printed and written back with
    update_feature_array(). Raises InvalidCommandException when no dataset
    is loaded.
    """
    if not Status.DATASET_LOADED:
        raise InvalidCommandException("Can't preprocess an unloaded dataset!")

    flag_parser = ArgumentParser()
    flag_parser.add_argument('-std', dest='std', action='store_true',
                             help='Standardize the feature array.')
    flag_parser.add_argument('-norm', dest='norm', action='store_true',
                             help="Normalize the values of each feature.")
    options = flag_parser.parse_args(args)
    engine = PreprocessingEngine()

    # (parsed flag, progress message, PreprocessingEngine method name)
    steps = (
        ('std', "Standardizing feature array...", 'standardize'),
        ('norm', "Normalizing feature array...", 'normalize_data'),
    )
    for flag, banner, method_name in steps:
        if not getattr(options, flag):
            continue
        print(banner)
        features, _, _ = get_pickled_dataset()
        transformed = getattr(engine, method_name)(features)
        print(transformed)
        update_feature_array(transformed)
308 | 
309 | 
def dispatch_run(args):
    """
    Train and evaluate a classifier on the loaded dataset ("run" command).

    Flags in `args`:
        -A [alg]          algorithm to run; only 'dt' (decision tree)
        -test_ratio [f]   hold out this fraction of the data for testing
        -cv [int]         also report k-fold cross-validation scores

    Prints the train and test accuracy (plus the cross-validation scores
    when requested) and plots the confusion matrix.

    Raises InvalidCommandException when no dataset is loaded, when -A is
    missing, or when the algorithm is not supported.
    """
    # Same guard as dispatch_preprocess(): fail with a clear message rather
    # than an IOError from the missing pickle files.
    if not Status.DATASET_LOADED:
        raise InvalidCommandException(
            "Can't run an algorithm on an unloaded dataset!")

    # Build parser for "run" flags
    parser = ArgumentParser()
    parser.add_argument('-A', dest='A', help='Select the ML algorithm to run.')
    parser.add_argument('-test_ratio', type=float, dest='test_ratio',
                        help="Split data into training and test sets.")
    parser.add_argument('-cv', dest='cv', type=int,
                        help='Run with cross-validation.')
    p_args = parser.parse_args(args)

    # The help page documents -A as required; report its absence or an
    # unknown value instead of silently doing nothing.
    if not p_args.A:
        raise InvalidCommandException(
            'The -A flag is required, e.g. "run -A dt".')
    algorithm = p_args.A.strip()
    if algorithm != 'dt':
        raise InvalidCommandException(
            "{} is not a supported algorithm.".format(algorithm))

    print("Running decision tree algorithm on dataset...")
    X, y, _ = get_pickled_dataset()

    # Without -test_ratio the classifier is trained and evaluated on the
    # complete dataset.
    X_train, y_train = X, y
    X_test, y_test = X, y
    if p_args.test_ratio:
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            X, y, test_size=p_args.test_ratio,
            random_state=0)

    # Train the Decision Tree classifier
    clf = train_decision_tree(X_train, y_train)
    print("Train accuracy: %f" % get_train_accuracy(clf, X_train, y_train))
    print("Test accuracy: %f%%" % get_test_accuracy(clf, X_test, y_test))

    # Get cross-validation score(s)
    if p_args.cv:
        print("")
        print("Cross Validation Scores:")
        scores, avg = get_cv_accuracy(clf, X_train, y_train, cv=p_args.cv)
        print('Scores: ' + ', '.join(map(str, scores)))
        print('Average accuracy: %f (+/- %f)' % (avg, scores.std() * 2))

    # Plot the confusion matrix
    cm = get_confusion_matrix(clf, X_test, y_test)
    plot_confusion_matrix(cm, y=np.unique(y))
353 | 
def setup():
    """
    Prepare the working directory, logging and readline for a session.

    Creates the '_temp/' directory if needed, sends log output to
    'cl_gui.log', restores the command history from '~/.pythonhistory'
    (written back at exit, limited to 25 entries) and installs tab
    completion over the known commands and the paths below the current
    directory.
    """
    # Create temporary directory for storing serialized objects.
    if not os.path.exists("_temp/"):
        os.mkdir("_temp/")

    # Configure log file for the application.
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s: %(message)s',
                        filename='cl_gui.log')
    logging.info("Starting application...")

    # Code snippet for recalling previous commands with the
    # 'up' and 'down' arrow keys.
    import rlcompleter
    import atexit
    import readline

    # expanduser() honours $HOME when it is set and still resolves a home
    # directory when it is not, where os.environ['HOME'] would raise.
    hist_file = os.path.join(os.path.expanduser('~'), '.pythonhistory')
    try:
        readline.read_history_file(hist_file)
    except IOError:
        # No history file yet; it is written at exit.
        pass

    # Set a limit on the number of commands to remember.
    # High values will hog system memory!
    readline.set_history_length(25)
    atexit.register(readline.write_history_file, hist_file)

    # Matches for the completion in progress. readline asks for them one
    # at a time (state 0, 1, 2, ...), so they are computed once at state 0
    # instead of walking the directory tree again for every state.
    matches = []

    # Tab completion for GUI commands
    def completer(text, state):
        if state == 0:
            candidates = list(Status.ALL_COMMANDS)
            for dirname, dirnames, filenames in os.walk('.'):
                if '.git' in dirnames:
                    # don't go into any .git directories.
                    dirnames.remove('.git')
                # Sub-directories first, then files. normpath() drops the
                # leading './' without touching the rest of the name
                # (strip('./') also ate leading/trailing dots and slashes,
                # e.g. '.gitignore' -> 'gitignore').
                for name in dirnames + filenames:
                    candidates.append(
                        os.path.normpath(os.path.join(dirname, name)))
            matches[:] = [c for c in candidates if c.startswith(text)]

        try:
            return matches[state]
        except IndexError:
            return None

    readline.set_completer(completer)

    # Bind tab completer to specific platforms
    if readline.__doc__ and 'libedit' in readline.__doc__:
        readline.parse_and_bind("bind -e")
        readline.parse_and_bind("bind '\t' rl_complete")
    else:
        readline.parse_and_bind("tab: complete")
414 | 
415 | 
def quit_gui():
    """
    Remove the temporary directory, log the shutdown and exit.

    Always raises SystemExit carrying Status.USER_QUIT, which main()
    recognizes as a user-requested quit.
    """
    # The directory may already be gone (e.g. deleted by hand); that must
    # not turn quitting into an error.
    shutil.rmtree(Status.TEMP_DIR, ignore_errors=True)
    logging.info("Quitting application...")
    sys.exit(Status.USER_QUIT)
420 | 
421 | 
def help_page():
    """Return the multi-line help text listing the available commands."""
    # NOTE(review): process() also accepts 'load_file_gui' and
    # 'preprocess_gui', which are not described below -- confirm whether
    # they should be documented.
    output_page = """
Pykit-Learn Command Line GUI
--------------------------------
Commands:
    The following commands are available:

    load [file]             Loads the dataset at the path specified by [file].
                            No quotes "" around filename!
    load_random             Load a randomly generated dataset with 3 classes.
    plot_2d                 Plot a 2-D distribution of the dataset.
    plot_andrews            Plots an Andrews curve of the dataset.

    plot_frequency          View the frequency of each class label.
    plot_feature_matrix     Generate a matrix plot of feature-feature
                            relationships.
    plot_scatter_matrix     Matrix plot with KDEs along the diagonal.
    plot_radial             Plot a radial chart of the dataset.
    preprocess [flags]      Preprocesses a dataset. Flags are
                                -std Standardize to mean 0 and variance 1
                                -norm Normalize each feature to range [0,1]
                                Eg. "preprocess -std"
    see_images [files]      View temporarily stored plots.
                                Eg. "see_images _temp/plot_2d.png"
    run                     Runs the ML alg on the loaded dataset.
        -A [alg]            REQUIRED flag! Options for [alg]:
                                dt = (Decision Tree)
        -test_ratio [0-1]   User can specify the test-train ratio.
        -cv [int]           Enables k-fold cross validation.
                            Example: "run -A dt -test_ratio .3 -cv 5"
    visualize               Plots all possible visualizations for input data.
        --suppress          Disable plotting output.
    help                    Provides a help screen of available commands.
    quit                    Quits the command line GUI.
    """
    return output_page
458 | 
459 | 
def process(line):
    """
    Parse one line of user input and run the matching command.

    The line is split on whitespace; the first word is the command and the
    remaining words are its arguments. An empty or whitespace-only line is
    ignored.

    Raises InvalidCommandException for an unknown command or when 'load'
    is not given exactly one path.
    """
    # split() without an argument collapses runs of whitespace, so repeated
    # spaces no longer produce empty-string arguments.
    tokens = line.split()
    if not tokens:
        return
    command, args = tokens[0], tuple(tokens[1:])

    # Select the appropriate function to call
    if command == 'load':
        if len(args) != 1:
            raise InvalidCommandException(
                'Usage: load [file] (exactly one path, without quotes).')
        load_file(args[0])
    elif command == 'load_random':
        load_random()
    elif command == 'load_file_gui':
        load_file_gui()
    elif command == 'preprocess_gui':
        gen = ViewGenerator()
        gen.get_preprocess_options(app)
    elif command == 'preprocess':
        dispatch_preprocess(args)
    elif command in Status.PLOT_COMMANDS:
        visualize_dataset(command, args)
    elif command == 'visualize':
        visualize_dataset(flags=args, plot_all=True)
    elif command == 'see_images':
        see_images(*args)
    elif command == 'run':
        dispatch_run(args)
    elif command == 'help':
        print(help_page())
    elif command == 'quit':
        quit_gui()
    else:
        raise InvalidCommandException(
            "{} is not a recognized command.".format(command))
493 | 
494 | 
def main():
    """
    To run, type "python cl_gui.py".

    Runs the read-process loop until the user quits. Expected errors
    (IOError, InvalidCommandException, AttributeError) are reported as a
    single line, anything else as a traceback, and the session continues.
    The 'quit' command, Ctrl-C and end of input (Ctrl-D / closed stdin)
    all end the session through quit_gui().
    """
    print("Welcome to the command-line version of Pykit-Learn!")
    print("Type 'help' for a list of available commands")
    setup()

    while True:
        try:
            input_line = raw_input(">> ")
            process(input_line.strip())
        except (IOError, InvalidCommandException, AttributeError) as err:
            # str(err) rather than err.message: the latter is empty for
            # errno-style IOErrors such as a missing file.
            print(err)
        except (EOFError, KeyboardInterrupt):
            # EOFError must be handled before the generic handler below,
            # otherwise a closed stdin produces tracebacks forever.
            # quit_gui() exits via SystemExit; swallow it here so these
            # paths end the same way as the 'quit' command.
            try:
                quit_gui()
            except SystemExit:
                return
        except SystemExit as se:
            if se.code == Status.USER_QUIT:
                return
            # Any other exit request (e.g. argparse rejecting a flag) is
            # reported and the session stays alive.
            if se.code is not None:
                print(se.code)
        except Exception:
            traceback.print_exc()
523 | 
# Start the interactive session only when this file is run as a script.
if __name__ == "__main__":
    main()
526 | 


--------------------------------------------------------------------------------
/pk/main/ui/main_gui.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Form implementation generated from reading ui file 'main_gui.ui'
  4 | #
  5 | # Created by: PyQt4 UI code generator 4.11.4
  6 | #
  7 | # WARNING! All changes made in this file will be lost!
  8 | 
  9 | from PyQt4 import QtCore, QtGui
 10 | 
# Use QString.fromUtf8 when QtCore exposes QString; otherwise fall back to
# a helper that returns its argument unchanged.
try:
    _fromUtf8 = QtCore.QString.fromUtf8
except AttributeError:
    def _fromUtf8(s):
        return s
 16 | 
# Define _translate() as a thin wrapper around QApplication.translate. The
# UnicodeUTF8 encoding is passed along when QApplication provides it;
# otherwise translate() is called without an encoding argument.
try:
    _encoding = QtGui.QApplication.UnicodeUTF8
    def _translate(context, text, disambig):
        return QtGui.QApplication.translate(context, text, disambig, _encoding)
except AttributeError:
    def _translate(context, text, disambig):
        return QtGui.QApplication.translate(context, text, disambig)
 24 | 
 25 | class Ui_main_tab(object):
 26 |     def setupUi(self, main_tab):
 27 |         main_tab.setObjectName(_fromUtf8("main_tab"))
 28 |         main_tab.resize(987, 737)
 29 |         self.upload_tab = QtGui.QWidget()
 30 |         self.upload_tab.setObjectName(_fromUtf8("upload_tab"))
 31 |         self.horizontalWidget = QtGui.QWidget(self.upload_tab)
 32 |         self.horizontalWidget.setGeometry(QtCore.QRect(0, 0, 501, 51))
 33 |         self.horizontalWidget.setObjectName(_fromUtf8("horizontalWidget"))
 34 |         self.horizontalLayout = QtGui.QHBoxLayout(self.horizontalWidget)
 35 |         self.horizontalLayout.setObjectName(_fromUtf8("horizontalLayout"))
 36 |         self.openfile_btn = QtGui.QPushButton(self.horizontalWidget)
 37 |         self.openfile_btn.setObjectName(_fromUtf8("openfile_btn"))
 38 |         self.horizontalLayout.addWidget(self.openfile_btn)
 39 |         self.openurl_btn = QtGui.QPushButton(self.horizontalWidget)
 40 |         self.openurl_btn.setObjectName(_fromUtf8("openurl_btn"))
 41 |         self.horizontalLayout.addWidget(self.openurl_btn)
 42 |         self.generate_btn = QtGui.QPushButton(self.horizontalWidget)
 43 |         self.generate_btn.setObjectName(_fromUtf8("generate_btn"))
 44 |         self.horizontalLayout.addWidget(self.generate_btn)
 45 |         self.verticalLayoutWidget = QtGui.QWidget(self.upload_tab)
 46 |         self.verticalLayoutWidget.setGeometry(QtCore.QRect(20, 50, 391, 621))
 47 |         self.verticalLayoutWidget.setObjectName(_fromUtf8("verticalLayoutWidget"))
 48 |         self.verticalLayout = QtGui.QVBoxLayout(self.verticalLayoutWidget)
 49 |         self.verticalLayout.setObjectName(_fromUtf8("verticalLayout"))
 50 |         self.datainfo_label = QtGui.QLabel(self.verticalLayoutWidget)
 51 |         self.datainfo_label.setObjectName(_fromUtf8("datainfo_label"))
 52 |         self.verticalLayout.addWidget(self.datainfo_label)
 53 |         self.datainfotext = QtGui.QTextBrowser(self.verticalLayoutWidget)
 54 |         self.datainfotext.setObjectName(_fromUtf8("datainfotext"))
 55 |         self.verticalLayout.addWidget(self.datainfotext)
 56 |         self.verticalLayoutWidget_2 = QtGui.QWidget(self.upload_tab)
 57 |         self.verticalLayoutWidget_2.setGeometry(QtCore.QRect(420, 50, 541, 621))
 58 |         self.verticalLayoutWidget_2.setObjectName(_fromUtf8("verticalLayoutWidget_2"))
 59 |         self.verticalLayout_2 = QtGui.QVBoxLayout(self.verticalLayoutWidget_2)
 60 |         self.verticalLayout_2.setObjectName(_fromUtf8("verticalLayout_2"))
 61 |         self.dataplotter_label = QtGui.QLabel(self.verticalLayoutWidget_2)
 62 |         self.dataplotter_label.setObjectName(_fromUtf8("dataplotter_label"))
 63 |         self.verticalLayout_2.addWidget(self.dataplotter_label)
 64 |         self.dataplottergraphics = QtGui.QGraphicsView(self.verticalLayoutWidget_2)
 65 |         self.dataplottergraphics.setObjectName(_fromUtf8("dataplottergraphics"))
 66 |         self.verticalLayout_2.addWidget(self.dataplottergraphics)
 67 |         self.progressBar = QtGui.QProgressBar(self.upload_tab)
 68 |         self.progressBar.setGeometry(QtCore.QRect(20, 683, 941, 16))
 69 |         self.progressBar.setProperty("value", 0)
 70 |         self.progressBar.setObjectName(_fromUtf8("progressBar"))
 71 |         main_tab.addTab(self.upload_tab, _fromUtf8(""))
 72 |         self.preprocess_tab = QtGui.QWidget()
 73 |         self.preprocess_tab.setEnabled(True)
 74 |         self.preprocess_tab.setAutoFillBackground(False)
 75 |         self.preprocess_tab.setObjectName(_fromUtf8("preprocess_tab"))
 76 |         self.horizontalWidget_2 = QtGui.QWidget(self.preprocess_tab)
 77 |         self.horizontalWidget_2.setGeometry(QtCore.QRect(0, 0, 981, 51))
 78 |         self.horizontalWidget_2.setObjectName(_fromUtf8("horizontalWidget_2"))
 79 |         self.horizontalLayout_2 = QtGui.QHBoxLayout(self.horizontalWidget_2)
 80 |         self.horizontalLayout_2.setObjectName(_fromUtf8("horizontalLayout_2"))
 81 |         self.preprocess_tab_2 = QtGui.QTabWidget(self.horizontalWidget_2)
 82 |         self.preprocess_tab_2.setObjectName(_fromUtf8("preprocess_tab_2"))
 83 |         self.normalize_tab = QtGui.QWidget()
 84 |         self.normalize_tab.setObjectName(_fromUtf8("normalize_tab"))
 85 |         self.preprocess_tab_2.addTab(self.normalize_tab, _fromUtf8(""))
 86 |         self.standardize_tab = QtGui.QWidget()
 87 |         self.standardize_tab.setObjectName(_fromUtf8("standardize_tab"))
 88 |         self.preprocess_tab_2.addTab(self.standardize_tab, _fromUtf8(""))
 89 |         self.binarize_tab = QtGui.QWidget()
 90 |         self.binarize_tab.setObjectName(_fromUtf8("binarize_tab"))
 91 |         self.preprocess_tab_2.addTab(self.binarize_tab, _fromUtf8(""))
 92 |         self.impute_tab = QtGui.QWidget()
 93 |         self.impute_tab.setObjectName(_fromUtf8("impute_tab"))
 94 |         self.preprocess_tab_2.addTab(self.impute_tab, _fromUtf8(""))
 95 |         self.noise_tab = QtGui.QWidget()
 96 |         self.noise_tab.setObjectName(_fromUtf8("noise_tab"))
 97 |         self.preprocess_tab_2.addTab(self.noise_tab, _fromUtf8(""))
 98 |         self.horizontalLayout_2.addWidget(self.preprocess_tab_2)
 99 |         self.attr_label = QtGui.QLabel(self.preprocess_tab)
100 |         self.attr_label.setGeometry(QtCore.QRect(20, 60, 81, 17))
101 |         self.attr_label.setObjectName(_fromUtf8("attr_label"))
102 |         self.verticalLayoutWidget_3 = QtGui.QWidget(self.preprocess_tab)
103 |         self.verticalLayoutWidget_3.setGeometry(QtCore.QRect(10, 80, 421, 251))
104 |         self.verticalLayoutWidget_3.setObjectName(_fromUtf8("verticalLayoutWidget_3"))
105 |         self.verticalLayout_3 = QtGui.QVBoxLayout(self.verticalLayoutWidget_3)
106 |         self.verticalLayout_3.setObjectName(_fromUtf8("verticalLayout_3"))
107 |         self.attr_checkbox1 = QtGui.QCheckBox(self.verticalLayoutWidget_3)
108 |         self.attr_checkbox1.setObjectName(_fromUtf8("attr_checkbox1"))
109 |         self.verticalLayout_3.addWidget(self.attr_checkbox1)
110 |         self.attr_checkbox2 = QtGui.QCheckBox(self.verticalLayoutWidget_3)
111 |         self.attr_checkbox2.setObjectName(_fromUtf8("attr_checkbox2"))
112 |         self.verticalLayout_3.addWidget(self.attr_checkbox2)
113 |         self.attr_checkbox3 = QtGui.QCheckBox(self.verticalLayoutWidget_3)
114 |         self.attr_checkbox3.setObjectName(_fromUtf8("attr_checkbox3"))
115 |         self.verticalLayout_3.addWidget(self.attr_checkbox3)
116 |         self.attr_checkbox4 = QtGui.QCheckBox(self.verticalLayoutWidget_3)
117 |         self.attr_checkbox4.setObjectName(_fromUtf8("attr_checkbox4"))
118 |         self.verticalLayout_3.addWidget(self.attr_checkbox4)
119 |         self.attr_checkbox5 = QtGui.QCheckBox(self.verticalLayoutWidget_3)
120 |         self.attr_checkbox5.setObjectName(_fromUtf8("attr_checkbox5"))
121 |         self.verticalLayout_3.addWidget(self.attr_checkbox5)
122 |         self.remove_attr_btn = QtGui.QPushButton(self.preprocess_tab)
123 |         self.remove_attr_btn.setGeometry(QtCore.QRect(10, 330, 419, 26))
124 |         self.remove_attr_btn.setObjectName(_fromUtf8("remove_attr_btn"))
125 |         main_tab.addTab(self.preprocess_tab, _fromUtf8(""))
126 |         self.regression_tab = QtGui.QWidget()
127 |         self.regression_tab.setObjectName(_fromUtf8("regression_tab"))
128 |         self.horizontalWidget_4 = QtGui.QWidget(self.regression_tab)
129 |         self.horizontalWidget_4.setGeometry(QtCore.QRect(0, 0, 981, 51))
130 |         self.horizontalWidget_4.setObjectName(_fromUtf8("horizontalWidget_4"))
131 |         self.horizontalLayout_6 = QtGui.QHBoxLayout(self.horizontalWidget_4)
132 |         self.horizontalLayout_6.setObjectName(_fromUtf8("horizontalLayout_6"))
133 |         self.regression_tab_2 = QtGui.QTabWidget(self.horizontalWidget_4)
134 |         self.regression_tab_2.setObjectName(_fromUtf8("regression_tab_2"))
135 |         self.linearreg_tab = QtGui.QWidget()
136 |         self.linearreg_tab.setObjectName(_fromUtf8("linearreg_tab"))
137 |         self.regression_tab_2.addTab(self.linearreg_tab, _fromUtf8(""))
138 |         self.polyreg_tab = QtGui.QWidget()
139 |         self.polyreg_tab.setObjectName(_fromUtf8("polyreg_tab"))
140 |         self.regression_tab_2.addTab(self.polyreg_tab, _fromUtf8(""))
141 |         self.leastsqreg_tab = QtGui.QWidget()
142 |         self.leastsqreg_tab.setObjectName(_fromUtf8("leastsqreg_tab"))
143 |         self.regression_tab_2.addTab(self.leastsqreg_tab, _fromUtf8(""))
144 |         self.logisticreg_tab = QtGui.QWidget()
145 |         self.logisticreg_tab.setObjectName(_fromUtf8("logisticreg_tab"))
146 |         self.regression_tab_2.addTab(self.logisticreg_tab, _fromUtf8(""))
147 |         self.gdreg_tab = QtGui.QWidget()
148 |         self.gdreg_tab.setObjectName(_fromUtf8("gdreg_tab"))
149 |         self.regression_tab_2.addTab(self.gdreg_tab, _fromUtf8(""))
150 |         self.horizontalLayout_6.addWidget(self.regression_tab_2)
151 |         main_tab.addTab(self.regression_tab, _fromUtf8(""))
152 |         self.classify_tab = QtGui.QWidget()
153 |         self.classify_tab.setObjectName(_fromUtf8("classify_tab"))
154 |         self.horizontalWidget_3 = QtGui.QWidget(self.classify_tab)
155 |         self.horizontalWidget_3.setGeometry(QtCore.QRect(0, 0, 981, 51))
156 |         self.horizontalWidget_3.setObjectName(_fromUtf8("horizontalWidget_3"))
157 |         self.horizontalLayout_5 = QtGui.QHBoxLayout(self.horizontalWidget_3)
158 |         self.horizontalLayout_5.setObjectName(_fromUtf8("horizontalLayout_5"))
159 |         self.classifymethods_tab = QtGui.QTabWidget(self.horizontalWidget_3)
160 |         self.classifymethods_tab.setElideMode(QtCore.Qt.ElideNone)
161 |         self.classifymethods_tab.setObjectName(_fromUtf8("classifymethods_tab"))
162 |         self.dt_tab = QtGui.QWidget()
163 |         self.dt_tab.setObjectName(_fromUtf8("dt_tab"))
164 |         self.classifymethods_tab.addTab(self.dt_tab, _fromUtf8(""))
165 |         self.ensemble_tab = QtGui.QWidget()
166 |         self.ensemble_tab.setObjectName(_fromUtf8("ensemble_tab"))
167 |         self.classifymethods_tab.addTab(self.ensemble_tab, _fromUtf8(""))
168 |         self.nn_tab = QtGui.QWidget()
169 |         self.nn_tab.setObjectName(_fromUtf8("nn_tab"))
170 |         self.classifymethods_tab.addTab(self.nn_tab, _fromUtf8(""))
171 |         self.svm_tab = QtGui.QWidget()
172 |         self.svm_tab.setObjectName(_fromUtf8("svm_tab"))
173 |         self.classifymethods_tab.addTab(self.svm_tab, _fromUtf8(""))
174 |         self.bn_tab = QtGui.QWidget()
175 |         self.bn_tab.setObjectName(_fromUtf8("bn_tab"))
176 |         self.classifymethods_tab.addTab(self.bn_tab, _fromUtf8(""))
177 |         self.knn_tab = QtGui.QWidget()
178 |         self.knn_tab.setObjectName(_fromUtf8("knn_tab"))
179 |         self.classifymethods_tab.addTab(self.knn_tab, _fromUtf8(""))
180 |         self.otherclassify_tab = QtGui.QWidget()
181 |         self.otherclassify_tab.setObjectName(_fromUtf8("otherclassify_tab"))
182 |         self.classifymethods_tab.addTab(self.otherclassify_tab, _fromUtf8(""))
183 |         self.horizontalLayout_5.addWidget(self.classifymethods_tab)
184 |         main_tab.addTab(self.classify_tab, _fromUtf8(""))
185 |         self.cluster_tab = QtGui.QWidget()
186 |         self.cluster_tab.setObjectName(_fromUtf8("cluster_tab"))
187 |         self.horizontalWidget_5 = QtGui.QWidget(self.cluster_tab)
188 |         self.horizontalWidget_5.setGeometry(QtCore.QRect(0, 0, 981, 51))
189 |         self.horizontalWidget_5.setObjectName(_fromUtf8("horizontalWidget_5"))
190 |         self.horizontalLayout_7 = QtGui.QHBoxLayout(self.horizontalWidget_5)
191 |         self.horizontalLayout_7.setObjectName(_fromUtf8("horizontalLayout_7"))
192 |         self.clustermethod_tab = QtGui.QTabWidget(self.horizontalWidget_5)
193 |         self.clustermethod_tab.setObjectName(_fromUtf8("clustermethod_tab"))
194 |         self.kmeans_tab = QtGui.QWidget()
195 |         self.kmeans_tab.setObjectName(_fromUtf8("kmeans_tab"))
196 |         self.clustermethod_tab.addTab(self.kmeans_tab, _fromUtf8(""))
197 |         self.em_tab = QtGui.QWidget()
198 |         self.em_tab.setObjectName(_fromUtf8("em_tab"))
199 |         self.clustermethod_tab.addTab(self.em_tab, _fromUtf8(""))
200 |         self.propcluster_tab = QtGui.QWidget()
201 |         self.propcluster_tab.setObjectName(_fromUtf8("propcluster_tab"))
202 |         self.clustermethod_tab.addTab(self.propcluster_tab, _fromUtf8(""))
203 |         self.spectralcluster_tab = QtGui.QWidget()
204 |         self.spectralcluster_tab.setObjectName(_fromUtf8("spectralcluster_tab"))
205 |         self.clustermethod_tab.addTab(self.spectralcluster_tab, _fromUtf8(""))
206 |         self.aggcluster_tab = QtGui.QWidget()
207 |         self.aggcluster_tab.setObjectName(_fromUtf8("aggcluster_tab"))
208 |         self.clustermethod_tab.addTab(self.aggcluster_tab, _fromUtf8(""))
209 |         self.dbscan_tab = QtGui.QWidget()
210 |         self.dbscan_tab.setObjectName(_fromUtf8("dbscan_tab"))
211 |         self.clustermethod_tab.addTab(self.dbscan_tab, _fromUtf8(""))
212 |         self.horizontalLayout_7.addWidget(self.clustermethod_tab)
213 |         self.horizontalWidget_5.raise_()
214 |         self.clustermethod_tab.raise_()
215 |         main_tab.addTab(self.cluster_tab, _fromUtf8(""))
216 |         self.reduce_tab = QtGui.QWidget()
217 |         self.reduce_tab.setObjectName(_fromUtf8("reduce_tab"))
218 |         self.horizontalWidget_6 = QtGui.QWidget(self.reduce_tab)
219 |         self.horizontalWidget_6.setGeometry(QtCore.QRect(0, 0, 981, 51))
220 |         self.horizontalWidget_6.setObjectName(_fromUtf8("horizontalWidget_6"))
221 |         self.horizontalLayout_8 = QtGui.QHBoxLayout(self.horizontalWidget_6)
222 |         self.horizontalLayout_8.setObjectName(_fromUtf8("horizontalLayout_8"))
223 |         self.reducemethods_tab = QtGui.QTabWidget(self.horizontalWidget_6)
224 |         self.reducemethods_tab.setObjectName(_fromUtf8("reducemethods_tab"))
225 |         self.pca_tab = QtGui.QWidget()
226 |         self.pca_tab.setObjectName(_fromUtf8("pca_tab"))
227 |         self.reducemethods_tab.addTab(self.pca_tab, _fromUtf8(""))
228 |         self.ica_tab = QtGui.QWidget()
229 |         self.ica_tab.setObjectName(_fromUtf8("ica_tab"))
230 |         self.reducemethods_tab.addTab(self.ica_tab, _fromUtf8(""))
231 |         self.rpa_tab = QtGui.QWidget()
232 |         self.rpa_tab.setObjectName(_fromUtf8("rpa_tab"))
233 |         self.reducemethods_tab.addTab(self.rpa_tab, _fromUtf8(""))
234 |         self.horizontalLayout_8.addWidget(self.reducemethods_tab)
235 |         main_tab.addTab(self.reduce_tab, _fromUtf8(""))
236 |         self.visualize_tab = QtGui.QWidget()
237 |         self.visualize_tab.setObjectName(_fromUtf8("visualize_tab"))
238 |         main_tab.addTab(self.visualize_tab, _fromUtf8(""))
239 |         self.other_tab = QtGui.QWidget()
240 |         self.other_tab.setObjectName(_fromUtf8("other_tab"))
241 |         main_tab.addTab(self.other_tab, _fromUtf8(""))
242 | 
243 |         self.retranslateUi(main_tab)
244 |         main_tab.setCurrentIndex(0)
245 |         self.preprocess_tab_2.setCurrentIndex(0)
246 |         self.regression_tab_2.setCurrentIndex(0)
247 |         self.classifymethods_tab.setCurrentIndex(0)
248 |         self.clustermethod_tab.setCurrentIndex(0)
249 |         self.reducemethods_tab.setCurrentIndex(0)
250 |         QtCore.QMetaObject.connectSlotsByName(main_tab)
251 | 
252 |     def retranslateUi(self, main_tab):
253 |         main_tab.setWindowTitle(_translate("main_tab", "Scikit GUI", None))
254 |         self.openfile_btn.setText(_translate("main_tab", "Open File", None))
255 |         self.openurl_btn.setText(_translate("main_tab", "Open URL", None))
256 |         self.generate_btn.setText(_translate("main_tab", "Generate", None))
257 |         self.datainfo_label.setText(_translate("main_tab", "Dataset Information", None))
258 |         self.dataplotter_label.setText(_translate("main_tab", "Dataset Plotter", None))
259 |         main_tab.setTabText(main_tab.indexOf(self.upload_tab), _translate("main_tab", "Upload", None))
260 |         self.preprocess_tab_2.setTabText(self.preprocess_tab_2.indexOf(self.normalize_tab), _translate("main_tab", "Normalize", None))
261 |         self.preprocess_tab_2.setTabText(self.preprocess_tab_2.indexOf(self.standardize_tab), _translate("main_tab", "Standardize", None))
262 |         self.preprocess_tab_2.setTabText(self.preprocess_tab_2.indexOf(self.binarize_tab), _translate("main_tab", "Binarize", None))
263 |         self.preprocess_tab_2.setTabText(self.preprocess_tab_2.indexOf(self.impute_tab), _translate("main_tab", "Impute", None))
264 |         self.preprocess_tab_2.setTabText(self.preprocess_tab_2.indexOf(self.noise_tab), _translate("main_tab", "Add Noise", None))
265 |         self.attr_label.setText(_translate("main_tab", "Attributes", None))
266 |         self.attr_checkbox1.setText(_translate("main_tab", "Attribute 1", None))
267 |         self.attr_checkbox2.setText(_translate("main_tab", "Attribute 2", None))
268 |         self.attr_checkbox3.setText(_translate("main_tab", "Attribute 3", None))
269 |         self.attr_checkbox4.setText(_translate("main_tab", "Attribute 4", None))
270 |         self.attr_checkbox5.setText(_translate("main_tab", "Attribute 5", None))
271 |         self.remove_attr_btn.setText(_translate("main_tab", "Remove", None))
272 |         main_tab.setTabText(main_tab.indexOf(self.preprocess_tab), _translate("main_tab", "Preprocess", None))
273 |         self.regression_tab_2.setTabText(self.regression_tab_2.indexOf(self.linearreg_tab), _translate("main_tab", "Linear", None))
274 |         self.regression_tab_2.setTabText(self.regression_tab_2.indexOf(self.polyreg_tab), _translate("main_tab", "Polynomial", None))
275 |         self.regression_tab_2.setTabText(self.regression_tab_2.indexOf(self.leastsqreg_tab), _translate("main_tab", "Least Square", None))
276 |         self.regression_tab_2.setTabText(self.regression_tab_2.indexOf(self.logisticreg_tab), _translate("main_tab", "Logistic", None))
277 |         self.regression_tab_2.setTabText(self.regression_tab_2.indexOf(self.gdreg_tab), _translate("main_tab", "Gradient Descent", None))
278 |         main_tab.setTabText(main_tab.indexOf(self.regression_tab), _translate("main_tab", "Regression", None))
279 |         self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.dt_tab), _translate("main_tab", "Decision Tree", None))
280 |         self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.ensemble_tab), _translate("main_tab", "Ensemble", None))
281 |         self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.nn_tab), _translate("main_tab", "Neural Networks", None))
282 |         self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.svm_tab), _translate("main_tab", "SVM", None))
283 |         self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.bn_tab), _translate("main_tab", "Bayes Nets", None))
284 |         self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.knn_tab), _translate("main_tab", "kNN", None))
285 |         self.classifymethods_tab.setTabText(self.classifymethods_tab.indexOf(self.otherclassify_tab), _translate("main_tab", "Others", None))
286 |         main_tab.setTabText(main_tab.indexOf(self.classify_tab), _translate("main_tab", "Classify", None))
287 |         self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.kmeans_tab), _translate("main_tab", "kMeans", None))
288 |         self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.em_tab), _translate("main_tab", "EM", None))
289 |         self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.propcluster_tab), _translate("main_tab", "Affinity Propogation", None))
290 |         self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.spectralcluster_tab), _translate("main_tab", "Spectral", None))
291 |         self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.aggcluster_tab), _translate("main_tab", "Agglomerative", None))
292 |         self.clustermethod_tab.setTabText(self.clustermethod_tab.indexOf(self.dbscan_tab), _translate("main_tab", "DBSCAN", None))
293 |         main_tab.setTabText(main_tab.indexOf(self.cluster_tab), _translate("main_tab", "Cluster", None))
294 |         self.reducemethods_tab.setTabText(self.reducemethods_tab.indexOf(self.pca_tab), _translate("main_tab", "PCA", None))
295 |         self.reducemethods_tab.setTabText(self.reducemethods_tab.indexOf(self.ica_tab), _translate("main_tab", "ICA", None))
296 |         self.reducemethods_tab.setTabText(self.reducemethods_tab.indexOf(self.rpa_tab), _translate("main_tab", "Random Projection", None))
297 |         main_tab.setTabText(main_tab.indexOf(self.reduce_tab), _translate("main_tab", "Reduce", None))
298 |         main_tab.setTabText(main_tab.indexOf(self.visualize_tab), _translate("main_tab", "Visualize", None))
299 |         main_tab.setTabText(main_tab.indexOf(self.other_tab), _translate("main_tab", "Other", None))
300 | 
301 | 


--------------------------------------------------------------------------------