├── tests
│   ├── __init__.py
│   ├── Makefile
│   ├── test_s1_kmeans.py
│   ├── test_normalization.py
│   ├── test_boston_forest.py
│   ├── test_boston_tree.py
│   ├── test_sonar_forest.py
│   ├── test_pca.py
│   ├── test_cartpole_dqn.py
│   ├── test_heart_tree.py
│   ├── test_banknote_tree.py
│   ├── test_heart.py
│   ├── test_adult_tree.py
│   ├── test_adult_forest.py
│   ├── test_iris.py
│   ├── test_iris_bayes.py
│   ├── test_search.py
│   ├── test_iris_tree.py
│   ├── test_iris_neighbor.py
│   ├── test_adult.py
│   ├── test_fashion.py
│   ├── test_heart_forest.py
│   ├── test_banknote_forest.py
│   ├── test_heart_bayes.py
│   ├── test_iris_svm.py
│   ├── test_fishlength.py
│   ├── test_banknote.py
│   ├── test_mnist_svm.py
│   └── test_functions.py
├── pykitml
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── s1clustering.py
│   │   ├── fishlength.py
│   │   ├── boston.py
│   │   ├── banknote.py
│   │   ├── heartdisease.py
│   │   ├── sonar.py
│   │   ├── mnist.py
│   │   └── iris.py
│   ├── linear_regression.py
│   ├── _exceptions.py
│   ├── __init__.py
│   ├── pklhandler.py
│   ├── logistic_regression.py
│   ├── svm.py
│   ├── smote.py
│   ├── testing.py
│   ├── nearest_neighbor.py
│   ├── _shared_array.py
│   ├── _regressor.py
│   ├── pca.py
│   ├── cross_val.py
│   ├── _single_layer_model.py
│   ├── kmeans_clustering.py
│   ├── random_search.py
│   ├── _heatmap.py
│   ├── _functions.py
│   ├── fceux.py
│   ├── normalize.py
│   ├── preprocessing.py
│   └── random_forest.py
├── docs
│   ├── requirements.txt
│   ├── SMOTE.rst
│   ├── demo_pics
│   │   ├── tree.png
│   │   ├── kmeans.png
│   │   ├── pca_compressed.png
│   │   ├── pca_uncompressed.png
│   │   ├── bayes_confusion_matrix.png
│   │   ├── linear_svm_perf_graph.png
│   │   ├── tree_confusion_matrix.png
│   │   ├── forest_confusion_matrix.png
│   │   ├── gaussian_svm_perf_graph.png
│   │   ├── neighbor_confusion_matrix.png
│   │   ├── neural_network_perf_graph.png
│   │   ├── linear_svm_confusion_matrix.png
│   │   ├── forest_heart_confusion_matrix.png
│   │   ├── gaussian_svm_confusion_matrix.png
│   │   ├── linear_regression_perf_graph.png
│   │   ├── logistic_regression_perf_graph.png
│   │   ├── gaussian_bayes_confusion_matrix.png
│   │   ├── neural_network_confusion_matrix.png
│   │   └── logistic_regression_confusion_matrix.png
│   ├── CrossValidation.rst
│   ├── SavingAndLoading.rst
│   ├── PreprocessingDatasets.rst
│   ├── KMeans.rst
│   ├── RandomSearch.rst
│   ├── LSTM.rst
│   ├── Makefile
│   ├── PrincipalComponentAnalysis.rst
│   ├── DQN.rst
│   ├── Optimizers.rst
│   ├── make.bat
│   ├── Linear Regression.rst
│   ├── GaussianNaiveBayes.rst
│   ├── index.rst
│   ├── NearestNeighbor.rst
│   ├── NaiveBayes.rst
│   ├── DecisionTree.rst
│   ├── LogisticRegression.rst
│   ├── FeedForwardNetwork.rst
│   ├── RandomForest.rst
│   ├── SVM.rst
│   ├── Datasets.rst
│   ├── Normalization.rst
│   ├── FCEUX.rst
│   └── conf.py
├── pykitml128.png
├── requirements.txt
├── pyproject.toml
├── .readthedocs.yml
├── Pipfile
├── Makefile
├── setup.py
├── LICENSE
├── .gitignore
├── README.md
└── .vscode
    └── launch.json

/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/pykitml/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | sphinx_rtd_theme
--------------------------------------------------------------------------------
/docs/SMOTE.rst:
--------------------------------------------------------------------------------
1 | SMOTE
2 | =====
3 | 
4 | .. autofunction:: pykitml.smote
--------------------------------------------------------------------------------
/pykitml128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/pykitml128.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | matplotlib
3 | tqdm
4 | graphviz
5 | sphinx-rtd-theme
6 | 
--------------------------------------------------------------------------------
/docs/demo_pics/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/tree.png
--------------------------------------------------------------------------------
/docs/CrossValidation.rst:
--------------------------------------------------------------------------------
1 | Cross Validation
2 | ================
3 | 
4 | .. autofunction:: pykitml.cross_validate
--------------------------------------------------------------------------------
/docs/demo_pics/kmeans.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/kmeans.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel"]
3 | build-backend = "setuptools.build_meta:__legacy__"
--------------------------------------------------------------------------------
/docs/demo_pics/pca_compressed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/pca_compressed.png
--------------------------------------------------------------------------------
/docs/demo_pics/pca_uncompressed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/pca_uncompressed.png
--------------------------------------------------------------------------------
/docs/demo_pics/bayes_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/bayes_confusion_matrix.png
--------------------------------------------------------------------------------
/docs/demo_pics/linear_svm_perf_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/linear_svm_perf_graph.png
--------------------------------------------------------------------------------
/docs/demo_pics/tree_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/tree_confusion_matrix.png
--------------------------------------------------------------------------------
/docs/demo_pics/forest_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/forest_confusion_matrix.png
--------------------------------------------------------------------------------
/docs/demo_pics/gaussian_svm_perf_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/gaussian_svm_perf_graph.png
--------------------------------------------------------------------------------
/docs/demo_pics/neighbor_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/neighbor_confusion_matrix.png
--------------------------------------------------------------------------------
/docs/demo_pics/neural_network_perf_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/neural_network_perf_graph.png
--------------------------------------------------------------------------------
/docs/demo_pics/linear_svm_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/linear_svm_confusion_matrix.png
--------------------------------------------------------------------------------
/docs/demo_pics/forest_heart_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/forest_heart_confusion_matrix.png
--------------------------------------------------------------------------------
/docs/demo_pics/gaussian_svm_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/gaussian_svm_confusion_matrix.png
--------------------------------------------------------------------------------
/docs/demo_pics/linear_regression_perf_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/linear_regression_perf_graph.png
--------------------------------------------------------------------------------
/docs/demo_pics/logistic_regression_perf_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/logistic_regression_perf_graph.png
--------------------------------------------------------------------------------
/docs/demo_pics/gaussian_bayes_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/gaussian_bayes_confusion_matrix.png
--------------------------------------------------------------------------------
/docs/demo_pics/neural_network_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/neural_network_confusion_matrix.png
--------------------------------------------------------------------------------
/docs/demo_pics/logistic_regression_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RainingComputers/pykitml/HEAD/docs/demo_pics/logistic_regression_confusion_matrix.png
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | build:
3 |   os: ubuntu-22.04
4 |   tools:
5 |     python: "3.10"
6 | sphinx:
7 |   configuration: docs/conf.py
8 | python:
9 |   install:
10 |     - method: pip
11 |       path: .
12 |     - requirements: requirements.txt
13 | 
--------------------------------------------------------------------------------
/docs/SavingAndLoading.rst:
--------------------------------------------------------------------------------
1 | Saving and Loading Objects/Models
2 | =================================
3 | 
4 | Saving objects to file
5 | ----------------------
6 | .. autofunction:: pykitml.save
7 | 
8 | Loading objects from file
9 | -------------------------
10 | .. autofunction:: pykitml.load
--------------------------------------------------------------------------------
/docs/PreprocessingDatasets.rst:
--------------------------------------------------------------------------------
1 | Preprocessing Datasets
2 | ======================
3 | 
4 | Dealing with categorical/one-hot values
5 | ---------------------------------------
6 | 
7 | .. autofunction:: pykitml.onehot
8 | 
9 | .. autofunction:: pykitml.onehot_cols
10 | 
11 | .. autofunction:: pykitml.onehot_cols_traintest
12 | 
13 | Generating Polynomial Features
14 | ------------------------------
15 | 
16 | .. autofunction:: pykitml.polynomial
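
A minimal sketch of how the one-hot helpers above fit together, mirroring their use in tests/test_heart.py; the array values and column index here are made up for illustration:

    import numpy as np
    import pykitml as pk

    # Column 0 is continuous, column 1 is categorical (values 0, 1 or 2)
    inputs = np.array([[1.2, 0.0], [3.4, 2.0], [2.2, 1.0]])

    # Expand the categorical column into one-hot columns
    inputs = pk.onehot_cols(inputs, [1])

    # Convert integer class labels into one-hot target rows
    outputs = pk.onehot(np.array([0, 2, 1]))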
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 | 
6 | [dev-packages]
7 | 
8 | [packages]
9 | numpy = "*"
10 | matplotlib = "*"
11 | pytest = "*"
12 | pylint = "*"
13 | radon = "*"
14 | sphinx = "*"
15 | sphinx-rtd-theme = "*"
16 | sphinx-bootstrap-theme = "*"
17 | tqdm = "*"
18 | graphviz = "*"
19 | gprof2dot = "*"
20 | gymnasium = "*"
21 | autopep8 = "*"
22 | pygame = "*"
23 | 
24 | [requires]
25 | python_version = "3.10.0"
--------------------------------------------------------------------------------
/docs/KMeans.rst:
--------------------------------------------------------------------------------
1 | K-Means Clustering
2 | ==================
3 | 
4 | K-Means Function
5 | ----------------
6 | 
7 | .. autofunction:: pykitml.kmeans
8 | 
9 | Example: S1 Dataset
10 | -------------------
11 | 
12 | **Dataset**
13 | 
14 | :ref:`s1clustering_dataset`
15 | 
16 | **Training**
17 | 
18 | .. literalinclude:: ../tests/test_s1_kmeans.py
19 |    :pyobject: test_s1_kmeans
20 |    :lines: 3-
21 |    :end-before: # Assert
22 |    :dedent: 4
23 | 
24 | **Scatter Plot**
25 | 
26 | .. image:: ./demo_pics/kmeans.png
--------------------------------------------------------------------------------
/docs/RandomSearch.rst:
--------------------------------------------------------------------------------
1 | Random Search for Hyperparameters
2 | =================================
3 | 
4 | Class Reference
5 | ---------------
6 | 
7 | .. autoclass:: pykitml.RandomSearch
8 | 
9 |     .. automethod:: search
10 | 
11 |     .. automethod:: set_cost
12 | 
13 |     .. autoattribute:: best
14 | 
15 | Example: Tuning Feed-forward network for fashion-MNIST
16 | -------------------------------------------------------
17 | 
18 | .. literalinclude:: ../tests/test_search.py
19 |    :pyobject: test_search
20 |    :lines: 3-
21 |    :end-before: # Assert
22 |    :dedent: 4
--------------------------------------------------------------------------------
/docs/LSTM.rst:
--------------------------------------------------------------------------------
1 | Long short-term memory (LSTM) Network
2 | =====================================
3 | 
4 | Class Reference
5 | ---------------
6 | 
7 | .. autoclass:: pykitml.LSTM
8 | 
9 |     .. automethod:: __init__
10 | 
11 |     .. automethod:: feed
12 | 
13 |     .. automethod:: get_output
14 | 
15 |     .. automethod:: get_output_onehot
16 | 
17 |     .. automethod:: train
18 | 
19 |     .. automethod:: reset
20 | 
21 |     .. automethod:: plot_performance
22 | 
23 |     .. automethod:: cost
24 | 
25 |     .. automethod:: accuracy
26 | 
27 |     .. automethod:: r2score
28 | 
29 |     .. automethod:: confusion_matrix
30 | 
31 |     .. autoattribute:: nlayers
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS = 
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = .
8 | BUILDDIR = _build
9 | 
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 | 
14 | .PHONY: help Makefile
15 | 
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/pykitml/linear_regression.py:
--------------------------------------------------------------------------------
1 | from ._single_layer_model import SingleLayerModel
2 | from ._regressor import Regressor
3 | from . import _functions
4 | 
5 | 
6 | class LinearRegression(SingleLayerModel, Regressor):
7 |     '''
8 |     Implements linear regression.
9 |     '''
10 | 
11 |     @property
12 |     def _activ_func(self):
13 |         return _functions.identity
14 | 
15 |     @property
16 |     def _activ_func_prime(self):
17 |         return _functions.identity_prime
18 | 
19 |     @property
20 |     def _cost_func(self):
21 |         return _functions.mse
22 | 
23 |     @property
24 |     def _cost_func_prime(self):
25 |         return _functions.mse_prime
26 | 
--------------------------------------------------------------------------------
/pykitml/_exceptions.py:
--------------------------------------------------------------------------------
1 | class InvalidFeatureType(Exception):
2 |     '''
3 |     Raised when specified feature type is invalid for the model.
4 |     '''
5 | 
6 | 
7 | class InvalidDistributionType(Exception):
8 |     '''
9 |     Raised when specified distribution type is invalid for the model.
10 |     '''
11 | 
12 | 
13 | def _valid_list(input_list, valid_items):
14 |     '''
15 |     Used to check if items in a list are valid.
16 | 
17 |     Parameters
18 |     ----------
19 |     input_list : list
20 |         The list to check/validate.
21 |     valid_items : list
22 |         List of valid items the list can contain.
23 |     '''
24 |     return all(item in valid_items for item in input_list) and len(input_list) > 0
25 | 
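
A quick sketch of what this helper returns (the item values below are made up for illustration):

    from pykitml._exceptions import _valid_list

    _valid_list(['a', 'b'], ['a', 'b', 'c'])  # True, all items are valid
    _valid_list(['a', 'd'], ['a', 'b', 'c'])  # False, 'd' is not a valid item
    _valid_list([], ['a', 'b', 'c'])          # False, empty lists are rejected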
--------------------------------------------------------------------------------
/tests/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY : help viewprofiles clean $(wildcard *.dat)
2 | help :
3 | 	@echo "clean : Remove auto-generated files."
4 | 	@echo "test : Run tests."
5 | 	@echo "viewprofiles : View all *.dat profile files."
6 | 
7 | now=$(shell date +%d-%m_%H-%M-%S)
8 | 
9 | test:
10 | 	rm -f *.pkl
11 | 	python3 -m pytest
12 | 
13 | viewprofiles: $(wildcard *.dat)
14 | 
15 | $(wildcard *.dat):
16 | 	$(eval name=$(patsubst %64,%,$(basename $(notdir $@))))
17 | 	gprof2dot -f pstats $@ | dot -Tpng -o profile_$(name)_$(now).png
18 | 	xdg-open profile_$(name)_$(now).png
19 | 
20 | clean:
21 | 	rm -f *.pkl
22 | 	rm -f -r __pycache__
23 | 	rm -f *.dat
24 | 	rm -f profile_*.png
25 | 	rm -f *.gz
26 | 	rm -f *.gv.png
27 | 	rm -f *.gv.pdf
28 | 	rm -f *.gv
--------------------------------------------------------------------------------
/docs/PrincipalComponentAnalysis.rst:
--------------------------------------------------------------------------------
1 | Principal Component Analysis
2 | ============================
3 | 
4 | Class Reference
5 | ---------------
6 | 
7 | .. autoclass:: pykitml.PCA
8 | 
9 |     .. automethod:: __init__
10 | 
11 |     .. automethod:: transform
12 | 
13 |     .. automethod:: inverse_transform
14 | 
15 |     .. autoattribute:: retention
16 | 
17 | Example: Compressing Fashion MNIST dataset
18 | ------------------------------------------
19 | 
20 | .. literalinclude:: ../tests/test_pca.py
21 |    :pyobject: test_pca_compression
22 |    :lines: 3-
23 |    :dedent: 4
24 | 
25 | **Original/Uncompressed**
26 | 
27 | .. image:: ./demo_pics/pca_uncompressed.png
28 | 
29 | **Recovered/Compressed**
30 | 
31 | .. image:: ./demo_pics/pca_compressed.png
32 | 
--------------------------------------------------------------------------------
/docs/DQN.rst:
--------------------------------------------------------------------------------
1 | Deep Q Learning
2 | ===============
3 | 
4 | DQNAgent Class
5 | --------------
6 | 
7 | .. autoclass:: pykitml.DQNAgent
8 | 
9 |     .. automethod:: __init__
10 | 
11 |     .. automethod:: train
12 | 
13 |     .. automethod:: exploit
14 | 
15 |     .. automethod:: plot_performance
16 | 
17 | .. _environment:
18 | 
19 | Environment Class
20 | -----------------
21 | 
22 | .. autoclass:: pykitml.Environment
23 | 
24 |     .. automethod:: reset
25 | 
26 |     .. automethod:: step
27 | 
28 |     .. automethod:: close
29 | 
30 |     .. automethod:: render
31 | 
32 | Example: Cartpole using gymnasium
33 | ----------------------------------
34 | 
35 | .. literalinclude:: ../tests/test_cartpole_dqn.py
36 |    :pyobject: test_cartpole
37 |    :lines: 3-
38 |    :dedent: 4
39 | 
--------------------------------------------------------------------------------
/docs/Optimizers.rst:
--------------------------------------------------------------------------------
1 | .. _optimizers:
2 | 
3 | Optimizers
4 | ==========
5 | 
6 | Gradient descent
7 | ----------------
8 | 
9 | .. autoclass:: pykitml.GradientDescent
10 | 
11 |     .. automethod:: __init__
12 | 
13 | Momentum
14 | --------
15 | 
16 | .. autoclass:: pykitml.Momentum
17 | 
18 |     .. automethod:: __init__
19 | 
20 | Nesterov momentum
21 | -----------------
22 | 
23 | .. autoclass:: pykitml.Nesterov
24 | 
25 |     .. automethod:: __init__
26 | 
27 | Adagrad
28 | -------
29 | 
30 | .. autoclass:: pykitml.Adagrad
31 | 
32 |     .. automethod:: __init__
33 | 
34 | RMSprop
35 | -------
36 | 
37 | .. autoclass:: pykitml.RMSprop
38 | 
39 |     .. automethod:: __init__
40 | 
41 | Adam
42 | ----
43 | 
44 | .. autoclass:: pykitml.Adam
45 | 
46 |     .. automethod:: __init__
47 | 
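
All of these optimizers are constructed up front and then passed to a model's train() method. A short sketch based on the call in tests/test_heart.py; the GradientDescent line assumes it takes the same learning_rate keyword:

    import pykitml as pk

    # As used in tests/test_heart.py
    adam = pk.Adam(learning_rate=0.015, decay_rate=0.99)

    # Assumed to take the same learning_rate keyword
    sgd = pk.GradientDescent(learning_rate=0.01)

    # model.train(training_data=..., targets=..., optimizer=adam, ...)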
--------------------------------------------------------------------------------
/pykitml/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | pykitml (Python Kit for Machine Learning),
3 | a Machine Learning library written in Python and NumPy.
4 | Copyright (c) Vishnu Shankar
5 | MIT License (See LICENSE file)
6 | https://github.com/RainingComputers
7 | '''
8 | 
9 | from .network import *
10 | from .linear_regression import *
11 | from .logistic_regression import *
12 | from .svm import *
13 | from .naive_bayes import *
14 | from .decision_tree import *
15 | from .random_forest import *
16 | from .nearest_neighbor import *
17 | from .pca import *
18 | from .kmeans_clustering import *
19 | from .pklhandler import *
20 | from .normalize import *
21 | from .optimizers import *
22 | from .preprocessing import *
23 | from .cross_val import *
24 | from .lstm import *
25 | from .smote import *
26 | from .random_search import *
27 | from .fceux import *
28 | from .dqn import *
29 | 
30 | from . import testing
31 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY : help, checkmicc, clean, lint, test, gendocs
2 | help :
3 | 	@echo "checkmicc : Check maintainability-index and cyclomatic-complexity."
4 | 	@echo "clean : Remove auto-generated files."
5 | 	@echo "lint : Run pylint."
6 | 	@echo "test : Run tests."
7 | 	@echo "gendocs : Generate documentation."
8 | 	@echo "opendocs : Generate and open documentation in default browser."
9 | 
10 | checkmicc:
11 | 	python3 -m radon mi pykitml
12 | 	python3 -m radon cc pykitml
13 | 
14 | clean:
15 | 	rm -f *.pkl
16 | 	rm -f -r .pytest_cache
17 | 	rm -f -r pykitml/__pycache__
18 | 	make -C tests/ clean
19 | 	make -C docs/ clean
20 | 	rm -f -r build/
21 | 	rm -f -r dist/
22 | 
23 | lint:
24 | 	pylint pykitml tests --rcfile ./.pylintrc
25 | 
26 | test:
27 | 	make -C tests/ test
28 | 
29 | gendocs:
30 | 	make -C docs/ clean
31 | 	make -C docs/ html
32 | 
33 | opendocs: gendocs
34 | 	xdg-open docs/_build/html/index.html
35 | 
36 | 
37 | 
38 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for Sphinx documentation
6 | 
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 | 
34 | :end
35 | popd
36 | 
--------------------------------------------------------------------------------
/tests/test_s1_kmeans.py:
--------------------------------------------------------------------------------
1 | from pykitml.testing import pktest_graph
2 | 
3 | 
4 | @pktest_graph
5 | def test_s1_kmeans():
6 |     import os
7 | 
8 |     import pykitml as pk
9 |     from pykitml.datasets import s1clustering
10 |     import matplotlib.pyplot as plt
11 | 
12 |     # Download the dataset
13 |     if not os.path.exists('s1.pkl'):
14 |         s1clustering.get()
15 | 
16 |     # Load the dataset
17 |     train_data = s1clustering.load()
18 | 
19 |     # Run KMeans
20 |     clusters, cost = pk.kmeans(train_data, 15)
21 | 
22 |     # Plot dataset, x and y
23 |     plt.scatter(train_data[:, 0], train_data[:, 1])
24 | 
25 |     # Plot clusters, x and y
26 |     plt.scatter(clusters[:, 0], clusters[:, 1], c='red')
27 | 
28 |     # Show graph
29 |     plt.show()
30 | 
31 |     # Assert cost
32 |     assert cost <= 1790000000
33 | 
34 | 
35 | if __name__ == '__main__':
36 |     try:
37 |         test_s1_kmeans.__wrapped__()
38 |     except AssertionError:
39 |         pass
40 | 
--------------------------------------------------------------------------------
/tests/test_normalization.py:
--------------------------------------------------------------------------------
1 | # ================================================
2 | # = Test normalization/feature-scaling functions =
3 | # ================================================
4 | 
5 | 
6 | import numpy as np
7 | 
8 | import pykitml as pk
9 | 
10 | eg_array = np.array([
11 |     [0.1, 0.3434, 1.3434, 3],
12 |     [1.2, 4.54, 6.7, 3.456],
13 |     [5.678, 2.345, 2.453, 8.345],
14 |     [2.3, 6.2, 8.3, 1.2]
15 | ])
16 | 
17 | 
18 | def test_minmax():
19 |     expected_output = (np.array([0.1, 0.3434, 1.3434, 1.2]),
20 |                        np.array([5.678, 6.2, 8.3, 8.345]))
21 | 
22 |     assert np.allclose(pk.get_minmax(eg_array), expected_output)
23 | 
24 | 
25 | def test_normalize():
26 |     array_min, array_max = pk.get_minmax(eg_array)
27 | 
28 |     norm_array = pk.normalize_minmax(eg_array, array_min, array_max)
29 |     denorm_array = pk.denormalize_minmax(norm_array, array_min, array_max)
30 | 
31 |     assert np.allclose(denorm_array, eg_array)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | 
3 | with open('README.md', 'r') as f:
4 |     long_description = f.read()
5 | 
6 | setuptools.setup(
7 |     name='pykitml',
8 |     version='0.1.3',
9 |     author='RainingComputers',
10 |     author_email='vishnu.vish.shankar@gmail.com',
11 |     description='Machine Learning library written in Python and NumPy.',
12 |     long_description=long_description,
13 |     long_description_content_type='text/markdown',
14 |     url='https://github.com/RainingComputers/pykitml',
15 |     packages=setuptools.find_packages(exclude=['docs', 'tests']),
16 |     python_requires='>=3.10',
17 |     install_requires=[
18 |         'numpy', 'matplotlib', 'tqdm', 'graphviz'
19 |     ],
20 |     classifiers=[
21 |         'Programming Language :: Python :: 3',
22 |         'License :: OSI Approved :: MIT License',
23 |         'Operating System :: OS Independent',
24 |         'Development Status :: 3 - Alpha',
25 |         'Topic :: Scientific/Engineering :: Artificial Intelligence'
26 |     ],
27 |     keywords='pykitml'
28 | )
29 | 
--------------------------------------------------------------------------------
/docs/Linear Regression.rst:
--------------------------------------------------------------------------------
1 | Linear Regression
2 | =================
3 | 
4 | Class Reference
5 | ---------------
6 | 
7 | .. autoclass:: pykitml.LinearRegression
8 | 
9 |     .. automethod:: __init__
10 | 
11 |     .. automethod:: feed
12 | 
13 |     .. automethod:: get_output
14 | 
15 |     .. automethod:: train
16 | 
17 |     .. automethod:: plot_performance
18 | 
19 |     .. automethod:: r2score
20 | 
21 |     .. automethod:: cost
22 | 
23 | 
24 | Example: Predicting Fish Length
25 | -------------------------------
26 | **Dataset**
27 | 
28 | :ref:`fishlength_dataset`
29 | 
30 | **Training Model**
31 | 
32 | .. literalinclude:: ../tests/test_fishlength.py
33 |    :pyobject: test_fishlength
34 |    :lines: 3-
35 |    :end-before: # Assert
36 |    :dedent: 4
37 | 
38 | **Predict length of fish that is 28 days old at 25C**
39 | 
40 | .. literalinclude:: ../tests/test_fishlength.py
41 |    :pyobject: test_predict_fishlength
42 |    :lines: 3-
43 |    :dedent: 4
44 | 
45 | **Performance Graph**
46 | 
47 | .. image:: ./demo_pics/linear_regression_perf_graph.png
--------------------------------------------------------------------------------
/docs/GaussianNaiveBayes.rst:
--------------------------------------------------------------------------------
1 | Gaussian Naive Bayes
2 | ====================
3 | 
4 | Class Reference
5 | ---------------
6 | 
7 | .. autoclass:: pykitml.GaussianNaiveBayes
8 | 
9 |     .. automethod:: __init__
10 | 
11 |     .. automethod:: feed
12 | 
13 |     .. automethod:: get_output
14 | 
15 |     .. automethod:: get_output_onehot
16 | 
17 |     .. automethod:: train
18 | 
19 |     .. automethod:: accuracy
20 | 
21 |     .. automethod:: confusion_matrix
22 | 
23 | Example: Classifying Iris
24 | -------------------------
25 | 
26 | **Dataset**
27 | 
28 | :ref:`iris_dataset`
29 | 
30 | **Training**
31 | 
32 | .. literalinclude:: ../tests/test_iris_bayes.py
33 |    :pyobject: test_iris_bayes
34 |    :lines: 3-
35 |    :end-before: # Assert
36 |    :dedent: 4
37 | 
38 | **Predict type of species with sepal-length, sepal-width, petal-length, petal-width:
39 | 5.8, 2.7, 3.9, 1.2**
40 | 
41 | .. literalinclude:: ../tests/test_iris_bayes.py
42 |    :pyobject: test_predict_iris_bayes
43 |    :lines: 3-
44 |    :dedent: 4
45 | 
46 | **Confusion Matrix**
47 | 
48 | .. image:: ./demo_pics/gaussian_bayes_confusion_matrix.png
49 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. pykitml documentation master file, created by
2 |    sphinx-quickstart on Thu Feb 21 17:21:36 2019.
3 | 
4 | pykitml (Python Kit for Machine Learning) Docs
5 | ==============================================
6 | 
7 | .. image:: ../pykitml128.png
8 | 
9 | Documentation and reference for pykitml, a simple Machine Learning library written in Python and NumPy.
10 | 
11 | 
12 | Installation
13 | ------------
14 | 
15 | .. code-block:: bash
16 | 
17 |     python3 -m pip install pykitml
18 | 
19 | .. toctree::
20 |    :maxdepth: 1
21 |    :caption: Package Reference:
22 | 
23 |    Linear Regression
24 |    LogisticRegression
25 |    SVM
26 |    FeedForwardNetwork
27 |    LSTM
28 |    Optimizers
29 |    NearestNeighbor
30 |    DecisionTree
31 |    RandomForest
32 |    NaiveBayes
33 |    GaussianNaiveBayes
34 |    KMeans
35 |    PrincipalComponentAnalysis
36 |    RandomSearch
37 |    Normalization
38 |    PreprocessingDatasets
39 |    SMOTE
40 |    CrossValidation
41 |    Datasets
42 |    SavingAndLoading
43 |    FCEUX
44 |    DQN
45 | 
46 | 
47 | Indices and tables
48 | ==================
49 | 
50 | * :ref:`genindex`
51 | 
--------------------------------------------------------------------------------
/docs/NearestNeighbor.rst:
--------------------------------------------------------------------------------
1 | Nearest Neighbor
2 | =================
3 | 
4 | Class Reference
5 | ---------------
6 | 
7 | .. autoclass:: pykitml.NearestNeighbor
8 | 
9 |     .. automethod:: __init__
10 | 
11 |     .. automethod:: feed
12 | 
13 |     .. automethod:: get_output
14 | 
15 |     .. automethod:: get_output_onehot
16 | 
17 |     .. automethod:: train
18 | 
19 |     .. automethod:: accuracy
20 | 
21 |     .. automethod:: r2score
22 | 
23 |     .. automethod:: confusion_matrix
24 | 
25 | Example: Classifying Iris
26 | -------------------------
27 | 
28 | **Dataset**
29 | 
30 | :ref:`iris_dataset`
31 | 
32 | **Training**
33 | 
34 | .. literalinclude:: ../tests/test_iris_neighbor.py
35 |    :pyobject: test_iris_neighbor
36 |    :lines: 3-
37 |    :end-before: # Assert
38 |    :dedent: 4
39 | 
40 | **Predict type of species with sepal-length, sepal-width, petal-length, petal-width:
41 | 5.8, 2.7, 3.9, 1.2**
42 | 
43 | .. literalinclude:: ../tests/test_iris_neighbor.py
44 |    :pyobject: test_predict_iris_neighbor
45 |    :lines: 3-
46 |    :dedent: 4
47 | 
48 | **Confusion Matrix**
49 | 
50 | .. image:: ./demo_pics/neighbor_confusion_matrix.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 B Vishnu Shankar
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/docs/NaiveBayes.rst:
--------------------------------------------------------------------------------
1 | Naive Bayes
2 | ===========
3 | 
4 | Class Reference
5 | ---------------
6 | 
7 | .. autoclass:: pykitml.NaiveBayes
8 | 
9 |     .. automethod:: __init__
10 | 
11 |     .. automethod:: feed
12 | 
13 |     .. automethod:: get_output
14 | 
15 |     .. automethod:: get_output_onehot
16 | 
17 |     .. automethod:: train
18 | 
19 |     .. automethod:: accuracy
20 | 
21 |     .. automethod:: confusion_matrix
22 | 
23 | Example: Heart Disease Prediction
24 | ---------------------------------
25 | 
26 | **Dataset**
27 | 
28 | :ref:`heart_dataset`
29 | 
30 | **Training**
31 | 
32 | .. literalinclude:: ../tests/test_heart_bayes.py
33 |    :pyobject: test_heart_bayes
34 |    :lines: 3-
35 |    :end-before: # Assert
36 |    :dedent: 4
37 | 
38 | **Predict heartdisease for a person with
39 | age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal:
40 | 67, 1, 4, 160, 286, 0, 2, 108, 1, 1.5, 2, 3, 3**
41 | 
42 | .. literalinclude:: ../tests/test_heart_bayes.py
43 |    :pyobject: test_predict_heart_bayes
44 |    :lines: 3-
45 |    :dedent: 4
46 | 
47 | **Confusion Matrix**
48 | 
49 | .. image:: ./demo_pics/bayes_confusion_matrix.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Pickle files
2 | *.pkl
3 | 
4 | # Datasets
5 | *.data
6 | *.gz
7 | *.*data
8 | 
9 | # TODO list
10 | TODO.txt
11 | 
12 | # Python
13 | __pycache__/
14 | *.py[cod]
15 | *$py.class
16 | 
17 | # Sphinx
18 | docs/_build/*
19 | docs/_static/*
20 | 
21 | # Distribution / packaging
22 | .Python
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | .eggs/
29 | lib/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 | wheels/
35 | pip-wheel-metadata/
36 | share/python-wheels/
37 | *.egg-info/
38 | .installed.cfg
39 | *.egg
40 | MANIFEST
41 | 
42 | # pytest
43 | .pytest_cache/
44 | 
45 | # profiling
46 | *.dat
47 | profile_*.png
48 | 
49 | # pyenv
50 | .python-version
51 | 
52 | # Environments
53 | .env
54 | .venv
55 | env/
56 | venv/
57 | ENV/
58 | env.bak/
59 | venv.bak/
60 | 
61 | # Windows thumbnail cache files
62 | Thumbs.db
63 | ehthumbs.db
64 | ehthumbs_vista.db
65 | 
66 | # Dump file
67 | *.stackdump
68 | 
69 | # Folder config file
70 | [Dd]esktop.ini
71 | 
72 | # Recycle Bin used on file shares
73 | $RECYCLE.BIN/
74 | 
75 | # VS CODE
76 | .vscode/settings.json
77 | 
78 | # graphviz
79 | *.gv
80 | *.gv.png
81 | *.gv.pdf
82 | 
83 | # temp folder
84 | temp/
85 | 
--------------------------------------------------------------------------------
/pykitml/pklhandler.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | '''
4 | This module contains functions for saving and
5 | loading .pkl files
6 | '''
7 | 
8 | 
9 | def save(object_, file_name):
10 |     '''
11 |     Saves an object into a file.
12 | 
13 |     Parameters
14 |     ----------
15 |     object_ : object
16 |         The object to save
17 |     file_name : str
18 |         The name of the file to save the object in.
19 | 
20 |     Raises
21 |     ------
22 |     OSError
23 |         If the file cannot be created due to a system-related error.
24 |     '''
25 |     file = open(file_name, 'wb')
26 |     pickle.dump(object_, file)
27 |     file.close()
28 | 
29 | 
30 | def load(file_name):
31 |     '''
32 |     Loads an object from file.
33 | 
34 |     Parameters
35 |     ----------
36 |     file_name : str
37 |         The name of the file to load the object from.
38 | 
39 |     Returns
40 |     -------
41 |     object
42 |         The python object stored in the file.
43 | 
44 |     Raises
45 |     ------
46 |     FileNotFoundError
47 |         If the file does not exist.
48 |     '''
49 |     file = open(file_name, 'rb')
50 |     object_ = pickle.load(file)
51 |     file.close()
52 |     return object_
53 | 
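
A round-trip sketch of the two functions above, mirroring how the tests use them (the model and file name here are arbitrary):

    import pykitml as pk

    model = pk.LinearRegression(3, 1)

    # Save the object to disk, as the tests do after training...
    pk.save(model, 'model.pkl')

    # ...and load it back later
    model = pk.load('model.pkl')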
--------------------------------------------------------------------------------
/docs/DecisionTree.rst:
--------------------------------------------------------------------------------
1 | Decision Tree
2 | =============
3 | 
4 | Class Reference
5 | ---------------
6 | 
7 | .. autoclass:: pykitml.DecisionTree
8 | 
9 |     .. automethod:: __init__
10 | 
11 |     .. automethod:: feed
12 | 
13 |     .. automethod:: get_output
14 | 
15 |     .. automethod:: get_output_onehot
16 | 
17 |     .. automethod:: train
18 | 
19 |     .. automethod:: accuracy
20 | 
21 |     .. automethod:: confusion_matrix
22 | 
23 |     .. automethod:: r2score
24 | 
25 |     .. automethod:: show_tree
26 | 
27 | Example: Classifying Iris
28 | -------------------------
29 | 
30 | **Dataset**
31 | 
32 | :ref:`iris_dataset`
33 | 
34 | **Training**
35 | 
36 | .. literalinclude:: ../tests/test_iris_tree.py
37 |    :pyobject: test_iris_tree
38 |    :lines: 3-
39 |    :end-before: # Assert
40 |    :dedent: 4
41 | 
42 | **Predict type of species with sepal-length, sepal-width, petal-length, petal-width:
43 | 5.8, 2.7, 3.9, 1.2**
44 | 
45 | .. literalinclude:: ../tests/test_iris_tree.py
46 |    :pyobject: test_predict_iris_tree
47 |    :lines: 3-
48 |    :dedent: 4
49 | 
50 | **Tree Graph**
51 | 
52 | .. image:: ./demo_pics/tree.png
53 | 
54 | **Confusion Matrix**
55 | 
56 | .. image:: ./demo_pics/tree_confusion_matrix.png
--------------------------------------------------------------------------------
/docs/LogisticRegression.rst:
--------------------------------------------------------------------------------
1 | Logistic Regression
2 | ===================
3 | 
4 | Class Reference
5 | ---------------
6 | 
7 | .. autoclass:: pykitml.LogisticRegression
8 | 
9 |     .. automethod:: __init__
10 | 
11 |     .. automethod:: feed
12 | 
13 |     .. automethod:: get_output
14 | 
15 |     .. automethod:: get_output_onehot
16 | 
17 |     .. automethod:: train
18 | 
19 |     .. automethod:: plot_performance
20 | 
21 |     .. automethod:: cost
22 | 
23 |     .. automethod:: accuracy
24 | 
25 |     .. automethod:: confusion_matrix
26 | 
27 | Example: Banknote Authentication
28 | --------------------------------
29 | 
30 | **Dataset**
31 | 
32 | :ref:`banknote_dataset`
33 | 
34 | **Training**
35 | 
36 | .. literalinclude:: ../tests/test_banknote.py
37 |    :pyobject: test_banknote
38 |    :lines: 3-
39 |    :end-before: # Assert
40 |    :dedent: 4
41 | 
42 | **Predict banknote validity with variance, skewness, curtosis, entropy:
43 | -2.3, -9.3, 9.37, -0.86**
44 | 
45 | .. literalinclude:: ../tests/test_banknote.py
46 |    :pyobject: test_predict_banknote
47 |    :lines: 3-
48 |    :dedent: 4
49 | 
50 | **Performance Graph**
51 | 
52 | .. image:: ./demo_pics/logistic_regression_perf_graph.png
53 | 
54 | **Confusion Matrix**
55 | 
56 | .. image:: ./demo_pics/logistic_regression_confusion_matrix.png
57 | 
--------------------------------------------------------------------------------
/docs/FeedForwardNetwork.rst:
--------------------------------------------------------------------------------
1 | Feed-Forward Neural Network
2 | ===========================
3 | 
4 | Class Reference
5 | ---------------
6 | 
7 | .. autoclass:: pykitml.NeuralNetwork
8 | 
9 |     .. automethod:: __init__
10 | 
11 |     .. automethod:: feed
12 | 
13 |     .. automethod:: get_output
14 | 
15 |     .. automethod:: get_output_onehot
16 | 
17 |     .. automethod:: train
18 | 
19 |     .. automethod:: plot_performance
20 | 
21 |     .. automethod:: cost
22 | 
23 |     .. automethod:: accuracy
24 | 
25 |     .. automethod:: r2score
26 | 
27 |     .. automethod:: confusion_matrix
28 | 
29 |     .. autoattribute:: nlayers
30 | 
31 | Example: Handwritten Digit Recognition (MNIST)
32 | ----------------------------------------------
33 | 
34 | **Dataset**
35 | 
36 | :ref:`mnist_dataset`
37 | 
38 | **Training**
39 | 
40 | .. literalinclude:: ../tests/test_mnist.py
41 |    :pyobject: test_adam
42 |    :lines: 3-
43 |    :end-before: # Assert
44 |    :dedent: 4
45 | 
46 | **Predicting**
47 | 
48 | .. literalinclude:: ../tests/test_mnist.py
49 |    :pyobject: test_predict_mnist_adam
50 |    :lines: 3-
51 |    :dedent: 4
52 | 
53 | **Performance Graph**
54 | 
55 | .. image:: ./demo_pics/neural_network_perf_graph.png
56 | 
57 | **Confusion Matrix**
58 | 
59 | .. image:: ./demo_pics/neural_network_confusion_matrix.png
--------------------------------------------------------------------------------
/pykitml/logistic_regression.py:
--------------------------------------------------------------------------------
1 | from ._single_layer_model import SingleLayerModel
2 | from ._classifier import Classifier
3 | from . import _functions
4 | 
5 | 
6 | class LogisticRegression(SingleLayerModel, Classifier):
7 |     '''
8 |     Implements logistic regression for classification.
9 |     '''
10 | 
11 |     def __init__(self, input_size, output_size, reg_param=0):
12 |         # Initialize base class
13 |         super(LogisticRegression, self).__init__(input_size, output_size, reg_param)
14 | 
15 |         # Choose output activation function
16 |         if output_size == 1:
17 |             # For binary classification
18 |             self._afunc = _functions.sigmoid
19 |             self._afunc_prime = _functions.sigmoid_prime
20 |         else:
21 |             # For multiclass classification
22 |             self._afunc = _functions.softmax
23 |             self._afunc_prime = _functions.softmax_prime
24 | 
25 |     @property
26 |     def _activ_func(self):
27 |         return self._afunc
28 | 
29 |     @property
30 |     def _activ_func_prime(self):
31 |         return self._afunc_prime
32 | 
33 |     @property
34 |     def _cost_func(self):
35 |         return _functions.cross_entropy
36 | 
37 |     @property
38 |     def _cost_func_prime(self):
39 |         return _functions.cross_entropy_prime
40 | 
--------------------------------------------------------------------------------
/tests/test_boston_forest.py:
--------------------------------------------------------------------------------
1 | from pykitml.testing import pktest_nograph
2 | 
3 | 
4 | @pktest_nograph
5 | def test_boston_forest():
6 |     import pykitml as pk
7 |     from pykitml.datasets import boston
8 | 
9 |     import os
10 | 
11 |     # Download the dataset
12 |     if not os.path.exists('boston.pkl'):
13 |         boston.get()
14 | 
15 |     # Load boston data set
16 |     inputs_train, outputs_train, inputs_test, outputs_test = boston.load()
17 | 
18 |     # Create model
19 |     ftypes = [
20 |         'continues', 'continues', 'continues',
21 |         'categorical', 'continues', 'continues',
22 |         'continues', 'continues', 'continues',
23 |         'continues', 'continues', 'continues', 'continues'
24 |     ]
25 |     forest_boston = pk.RandomForest(13, 1, feature_type=ftypes, max_depth=4, min_split=20, regression=True)
26 | 
27 |     # Train
28 |     forest_boston.train(inputs_train, outputs_train)
29 | 
30 |     # Print r2score
31 |     r2score_train = forest_boston.r2score(inputs_train, outputs_train)
32 |     print('Train r2score:', r2score_train)
33 |     r2score = forest_boston.r2score(inputs_test, outputs_test)
34 |     print('Test r2score:', r2score)
35 | 
36 |     # Assert r2score
37 |     assert r2score_train > 0.7
38 | 
39 | 
40 | if __name__ == '__main__':
41 |     try:
42 |         test_boston_forest.__wrapped__()
43 |     except AssertionError:
44 |         pass
45 | 
--------------------------------------------------------------------------------
/tests/test_boston_tree.py:
--------------------------------------------------------------------------------
1 | from pykitml.testing import pktest_nograph
2 | 
3 | 
4 | @pktest_nograph
5 | def test_boston_tree():
6 |     import pykitml as pk
7 |     from pykitml.datasets import boston
8 | 
9 |     import os
10 | 
11 |     # Download the dataset
12 |     if not os.path.exists('boston.pkl'):
13 |         boston.get()
14 | 
15 |     # Load boston data set
16 |     inputs_train, outputs_train, inputs_test, outputs_test = boston.load()
17 | 
18 |     # Create model
19 |     ftypes = [
20 |         'continues', 'continues', 'continues',
21 |         'categorical', 'continues', 'continues',
22 |         'continues', 'continues', 'continues',
23 |         'continues', 'continues', 'continues', 'continues'
24 |     ]
25 |     tree_boston = pk.DecisionTree(13, 1, feature_type=ftypes, max_depth=8, min_split=20, regression=True)
26 | 
27 |     # Train
28 |     tree_boston.train(inputs_train, outputs_train)
29 | 
30 |     # Print r2score
31 |     r2score_train = tree_boston.r2score(inputs_train, outputs_train)
32 |     print('Train r2score:', r2score_train)
33 |     r2score = tree_boston.r2score(inputs_test, outputs_test)
34 |     print('Test r2score:', r2score)
35 | 
36 |     # Show the tree
37 |     tree_boston.show_tree()
38 | 
39 |     # Assert r2score
40 |     assert r2score_train > 0.9
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     try:
45 |         test_boston_tree.__wrapped__()
46 |     except AssertionError:
47 |         pass
48 | 
--------------------------------------------------------------------------------
/tests/test_sonar_forest.py:
--------------------------------------------------------------------------------
1 | from pykitml.testing import pktest_graph
2 | 
3 | 
4 | @pktest_graph
5 | def test_sonar_forest():
6 |     import os
7 | 
8 |     import pykitml as pk
9 |     from pykitml.datasets import sonar
10 | 
11 |     # Download the dataset
12 |     if not os.path.exists('sonar.pkl'):
13 |         sonar.get()
14 | 
15 |     # Load the sonar dataset
16 |     inputs_train, outputs_train, inputs_test, outputs_test = sonar.load()
17 |     outputs_train = pk.onehot(outputs_train)
18 |     outputs_test = pk.onehot(outputs_test)
19 | 
20 |     # Create model
21 |     forest_sonar_classifier = pk.RandomForest(60, 2, max_depth=9, feature_type=['continues']*60)
22 | 
23 |     # Train the model
24 |     forest_sonar_classifier.train(inputs_train, outputs_train, num_feature_bag=60)
25 | 
26 |     # Save it
27 |     pk.save(forest_sonar_classifier, 'forest_sonar_classifier.pkl')
28 | 
29 |     # Print accuracy
30 |     accuracy = forest_sonar_classifier.accuracy(inputs_train, outputs_train)
31 |     print('Train accuracy:', accuracy)
32 |     accuracy = forest_sonar_classifier.accuracy(inputs_test, outputs_test)
33 |     print('Test accuracy:', accuracy)
34 | 
35 |     # Plot confusion matrix
36 |     forest_sonar_classifier.confusion_matrix(inputs_test, outputs_test,
37 |                                              gnames=['False', 'True'])
38 | 
39 | 
40 | if __name__ == '__main__':
41 |     try:
42 |         test_sonar_forest.__wrapped__()
43 |     except AssertionError:
44 |         pass
45 | 
--------------------------------------------------------------------------------
/tests/test_pca.py:
--------------------------------------------------------------------------------
1 | from pykitml.testing import pktest_graph
2 | 
3 | 
4 | @pktest_graph
5 | def test_pca_compression():
6 |     import os.path
7 |     import random
8 | 
9 |     import matplotlib.pyplot as plt
10 |     import pykitml as pk
11 |     from pykitml.datasets import mnist
12 | 
13 |     # Download dataset
14 |     if not os.path.exists('mnist.pkl'):
15 |         mnist.get()
16 | 
17 |     # Load dataset
18 |     training_data, _, _, _ = mnist.load()
19 | 
20 |     # Train PCA, reduce 784 dimensions to 250 dimensions
21 |     pca = pk.PCA(training_data, 250)
22 |     print('Variance retention:', pca.retention)
23 | 
24 |     # Pick random datapoints
25 |     indices = random.sample(range(1, 1000), 16)
26 |     examples = training_data[indices]
27 | 
28 |     # Show the original images
29 |     plt.figure('Original', figsize=(10, 7))
30 |     for i in range(1, 17):
31 |         plt.subplot(4, 4, i)
32 |         plt.imshow(examples[i-1].reshape((28, 28)), cmap='gray')
33 | 
34 |     # Transform the example and compress
35 |     transformed_examples = pca.transform(examples)
36 | 
37 |     # Inverse transform and recover the examples
38 |     recovered_examples = pca.inverse_transform(transformed_examples)
39 | 
40 |     # Show the inverse transformed examples
41 |     plt.figure('Recovered', figsize=(10, 7))
42 |     for i in range(1, 17):
43 |         plt.subplot(4, 4, i)
44 |         plt.imshow(recovered_examples[i-1].reshape((28, 28)), cmap='gray')
45 | 
46 |     # Show results
47 |     plt.show()
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     test_pca_compression.__wrapped__()
52 | 
--------------------------------------------------------------------------------
/tests/test_cartpole_dqn.py:
--------------------------------------------------------------------------------
1 | from pykitml.testing import pktest_graph
2 | 
3 | @pktest_graph
4 | def test_cartpole():
5 |     import gymnasium as gym
6 |     import pykitml as pk
7 | 
8 |     # Wrapper class around the environment
9 |     class Environment:
10 |         def __init__(self):
11 |             self._env = gym.make('CartPole-v1', render_mode="human")
12 | 
13 |         def reset(self):
14 |             return self._env.reset()[0]
15 | 
16 |         def step(self, action):
17 |             obs, reward, done, _, _ = self._env.step(action)
18 | 
19 |             x, _, theta, _ = obs
20 |             x_threshold = self._env.env.env.env.x_threshold
21 |             theta_threshold_radians = self._env.env.env.env.theta_threshold_radians
22 | 
23 |             # Reward function, from
24 |             # https://github.com/keon/deep-q-learning/blob/master/ddqn.py
25 |             r1 = (x_threshold - abs(x)) / x_threshold - 0.8
26 |             r2 = (theta_threshold_radians - abs(theta)) / theta_threshold_radians - 0.5
27 |             reward = r1 + r2
28 | 
29 |             return obs, reward, done
30 | 
31 |         def close(self):
32 |             self._env.close()
33 | 
34 |         def render(self):
35 |             self._env.render()
36 | 
37 |     env = Environment()
38 | 
39 |     # Create DQN agent and train it
40 |     agent = pk.DQNAgent([4, 64, 64, 2])
41 |     agent.set_save_freq(100, 'cartpole_agent')
42 |     agent.train(env, 500, pk.Adam(0.001), render=True)
43 | 
44 |     # Plot reward graph
45 |     agent.plot_performance()
46 | 
47 | if __name__ == '__main__':
48 |     test_cartpole.__wrapped__()
49 | 
--------------------------------------------------------------------------------
/tests/test_heart_tree.py:
--------------------------------------------------------------------------------
1 | from pykitml.testing import pktest_graph
2 | 
3 | 
4 | @pktest_graph
5 | def test_heart_tree():
6 |     import os.path
7 | 
8 |     import pykitml as pk
9 |     from pykitml.datasets import heartdisease
10 | 
11 |     # Download the dataset
12 |     if not os.path.exists('heartdisease.pkl'):
13 |         heartdisease.get()
14 | 
15 |     # Load heart data set
16 |     inputs, outputs = heartdisease.load()
17 |     outputs = pk.onehot(outputs)
18 | 
19 |     # Create model
20 |     ftypes = [
21 |         'continues', 'categorical', 'categorical',
22 |         'continues', 'continues', 'categorical', 'categorical',
23 |         'continues', 'categorical', 'continues', 'categorical',
24 |         'categorical', 'categorical'
25 |     ]
26 |     tree_heart_classifier = pk.DecisionTree(13, 2, max_depth=7, feature_type=ftypes)
27 | 
28 |     # Train
29 |     tree_heart_classifier.train(inputs, outputs)
30 | 
31 |     # Save it
32 |     pk.save(tree_heart_classifier, 'tree_heart_classifier.pkl')
33 | 
34 |     # Print accuracy
35 |     accuracy = tree_heart_classifier.accuracy(inputs, outputs)
36 |     print('Accuracy:', accuracy)
37 | 
38 |     # Plot confusion matrix
39 |     tree_heart_classifier.confusion_matrix(inputs, outputs,
40 |                                            gnames=['False', 'True'])
41 | 
42 |     # Plot decision tree
43 |     tree_heart_classifier.show_tree()
44 | 
45 |     # Assert accuracy
46 |     assert (tree_heart_classifier.accuracy(inputs, outputs)) >= 94
47 | 
48 | 
49 | if __name__ == '__main__':
50 |     try:
51 |         test_heart_tree.__wrapped__()
52 |     except AssertionError:
53 |         pass
54 | 
--------------------------------------------------------------------------------
/pykitml/svm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | from ._single_layer_model import SingleLayerModel
4 | from ._classifier import Classifier
5 | from . import _functions
6 | 
7 | 
8 | def gaussian_kernel(input_data, training_inputs, sigma=1):
9 |     '''
10 |     Transforms the given input data using the gaussian kernel.
11 | 
12 |     Parameters
13 |     ----------
14 |     input_data : numpy.array
15 |         The input data points to transform.
16 |     training_inputs : numpy.array
17 |         The training data.
18 |     sigma : float
19 |         Hyperparameter that determines the 'spread' of the kernel.
20 | 
21 |     '''
22 |     # Calculate squared L2 norm of each data point with
23 |     # every other data point
24 |     distances = _functions.pdist(input_data, training_inputs)
25 |     # Apply gaussian kernel
26 |     transformed_inputs = np.exp((-1/(2*sigma**2))*distances)
27 |     # return
28 |     return transformed_inputs
29 | 
30 | 
31 | class SVM(SingleLayerModel, Classifier):
32 |     '''
33 |     Implements Support Vector Machine with Linear Kernel.
34 | 
35 |     Note
36 |     ----
37 |     The outputs/targets in the training/testing data should have :code:`-1` instead
38 |     of :code:`0` for training. See example for more details.
39 |     '''
40 | 
41 |     @property
42 |     def _activ_func(self):
43 |         return _functions.identity
44 | 
45 |     @property
46 |     def _activ_func_prime(self):
47 |         return _functions.identity_prime
48 | 
49 |     @property
50 |     def _cost_func(self):
51 |         return _functions.hinge_loss
52 | 
53 |     @property
54 |     def _cost_func_prime(self):
55 |         return _functions.hinge_loss_prime
56 | 
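
A sketch of how the kernel and the -1 target convention fit together; the data points are made up, and the SVM constructor is assumed to take (input_size, output_size) like the other single-layer models in this package:

    import numpy as np
    import pykitml as pk

    inputs_train = np.array([[0.0, 0.0], [1.0, 1.0], [0.9, 1.1]])
    outputs_train = np.array([0, 1, 1])

    # SVM targets use -1 instead of 0 (see the note in the class docstring)
    outputs_train = np.where(outputs_train == 0, -1, outputs_train)

    # Kernel-transform the inputs for a non-linear decision boundary
    transformed = pk.gaussian_kernel(inputs_train, inputs_train, sigma=1)

    # One feature per training point after the transform
    svm_classifier = pk.SVM(transformed.shape[1], 1)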
--------------------------------------------------------------------------------
/tests/test_banknote_tree.py:
--------------------------------------------------------------------------------
1 | from pykitml.testing import pktest_graph
2 | 
3 | 
4 | @pktest_graph
5 | def test_banknote_tree():
6 |     import os
7 | 
8 |     import pykitml as pk
9 |     from pykitml.datasets import banknote
10 | 
11 |     # Download the dataset
12 |     if not os.path.exists('banknote.pkl'):
13 |         banknote.get()
14 | 
15 |     # Load banknote data set
16 |     inputs_train, outputs_train, inputs_test, outputs_test = banknote.load()
17 | 
18 |     # Change 0/False to [1, 0]
19 |     # Change 1/True to [0, 1]
20 |     outputs_train = pk.onehot(outputs_train)
21 |     outputs_test = pk.onehot(outputs_test)
22 | 
23 |     # Create model
24 |     ftypes = ['continues']*4
25 |     tree_banknote_classifier = pk.DecisionTree(4, 2, max_depth=7, feature_type=ftypes)
26 | 
27 |     # Train
28 |     tree_banknote_classifier.train(inputs_train, outputs_train)
29 | 
30 |     # Save it
31 |     pk.save(tree_banknote_classifier, 'tree_banknote_classifier.pkl')
32 | 
33 |     # Print accuracy
34 |     accuracy = tree_banknote_classifier.accuracy(inputs_train, outputs_train)
35 |     print('Train accuracy:', accuracy)
36 |     accuracy = tree_banknote_classifier.accuracy(inputs_test, outputs_test)
37 |     print('Test accuracy:', accuracy)
38 | 
39 |     # Plot confusion matrix
40 |     tree_banknote_classifier.confusion_matrix(inputs_test, outputs_test,
41 |                                               gnames=['False', 'True'])
42 | 
43 |     # Plot decision tree
44 |     tree_banknote_classifier.show_tree()
45 | 
46 |     # Assert accuracy
47 |     assert (tree_banknote_classifier.accuracy(inputs_test, outputs_test)) >= 97
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     try:
52 |         test_banknote_tree.__wrapped__()
53 |     except AssertionError:
54 |         pass
55 | 
--------------------------------------------------------------------------------
/tests/test_heart.py:
--------------------------------------------------------------------------------
1 | from pykitml.testing import pktest_graph
2 | 
3 | 
4 | @pktest_graph
5 | def test_heart():
6 |     import os.path
7 | 
8 |     import pykitml as pk
9 |     from pykitml.datasets import heartdisease
10 | 
11 |     # Download the dataset
12 |     if not os.path.exists('heartdisease.pkl'):
13 |         heartdisease.get()
14 | 
15 |     # Load heartdisease data set
16 |     inputs, outputs = heartdisease.load()
17 | 
18 |     # Normalize inputs in the dataset
19 |     inputs_min, inputs_max = pk.get_minmax(inputs)
20 |     inputs = pk.normalize_minmax(inputs, inputs_min, inputs_max, cols=[0, 3, 4, 7, 9])
21 | 
22 |     # Change categorical values to onehot values
23 |     inputs = pk.onehot_cols(inputs, [1, 2, 5, 6, 8, 10, 11, 12])
24 | 
25 |     # Create model
26 |     heart_classifier = pk.LogisticRegression(35, 1)
27 | 
28 |     # Train the model
29 |     heart_classifier.train(
30 |         training_data=inputs,
31 |         targets=outputs,
32 |         batch_size=10,
33 |         epochs=1500,
34 |         optimizer=pk.Adam(learning_rate=0.015, decay_rate=0.99),
35 |         testing_freq=30,
36 |         decay_freq=40
37 |     )
38 | 
39 |     # Save it
40 |     pk.save(heart_classifier, 'heart_classifier.pkl')
41 | 
42 |     # Print accuracy and plot performance
43 |     heart_classifier.plot_performance()
44 |     accuracy = heart_classifier.accuracy(inputs, outputs)
45 | print('Accuracy:', accuracy) 46 | 47 | # Plot confusion matrix 48 | heart_classifier.confusion_matrix(inputs, outputs) 49 | 50 | # Assert if it has enough accuracy 51 | assert heart_classifier.accuracy(inputs, outputs) >= 87 52 | 53 | 54 | if __name__ == '__main__': 55 | try: 56 | test_heart.__wrapped__() 57 | except AssertionError: 58 | pass 59 | -------------------------------------------------------------------------------- /tests/test_adult_tree.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph 2 | 3 | 4 | @pktest_graph 5 | def test_adult_tree(): 6 | import os 7 | 8 | import pykitml as pk 9 | from pykitml.datasets import adult 10 | 11 | # Download the dataset 12 | if not os.path.exists('adult.data.pkl'): 13 | adult.get() 14 | 15 | # Load adult data set 16 | inputs_train, outputs_train, inputs_test, outputs_test = adult.load() 17 | outputs_train = pk.onehot(outputs_train) 18 | outputs_test = pk.onehot(outputs_test) 19 | 20 | # Create model 21 | ftypes = [ 22 | 'continues', 'categorical', 'continues', 'categorical', 23 | 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 24 | 'continues', 'continues', 'continues', 'categorical' 25 | ] 26 | tree_adult_classifier = pk.DecisionTree(13, 2, max_depth=100, min_split=100, feature_type=ftypes) 27 | 28 | # Train 29 | tree_adult_classifier.train(inputs_train, outputs_train) 30 | 31 | # Save it 32 | pk.save(tree_adult_classifier, 'tree_adult_classifier.pkl') 33 | 34 | # Print accuracy 35 | accuracy = tree_adult_classifier.accuracy(inputs_train, outputs_train) 36 | print('Train accuracy:', accuracy) 37 | accuracy = tree_adult_classifier.accuracy(inputs_test, outputs_test) 38 | print('Test accuracy:', accuracy) 39 | 40 | # Plot confusion matrix 41 | tree_adult_classifier.confusion_matrix(inputs_test, outputs_test, 42 | gnames=['False', 'True']) 43 | 44 | # Assert accuracy 45 | assert (tree_adult_classifier.accuracy(inputs_test, outputs_test)) >= 84 46 | 47 | 48 | if __name__ == '__main__': 49 | try: 50 | test_adult_tree.__wrapped__() 51 | except AssertionError: 52 | pass 53 | -------------------------------------------------------------------------------- /pykitml/smote.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ._functions import pdist 4 | 5 | 6 | def smote(minority_data_points, k=1): 7 | ''' 8 | SMOTE (Synthetic Minority Oversampling TEchnique). 9 | Used to generate more data points for the minority class in imbalanced learning. 10 | 11 | Parameters 12 | ---------- 13 | minority_data_points : numpy.array 14 | Inputs or data points corresponding to the minority class. 15 | k : int 16 | Number of neighbors to consider. 17 | 18 | Returns 19 | ------- 20 | new_points : numpy.array 21 | Newly generated data points (excluding data points passed to the 22 | function). :code:`k*minority_data_points.shape[0]` points will be 23 | generated.
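Example
-------
A minimal illustrative sketch with made-up values; per the
description above, :code:`k=2` on 3 input points generates 6 new
points:

>>> import numpy as np
>>> import pykitml as pk
>>> minority = np.array([[1.0, 2.0], [1.2, 1.9], [0.9, 2.1]])
>>> new_points = pk.smote(minority, k=2)
>>> new_points.shape
(6, 2)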
24 | ''' 25 | npoints = minority_data_points.shape[0] 26 | nfeatures = minority_data_points.shape[1] 27 | 28 | # Calculate distance between each point and every other point 29 | distances = pdist(minority_data_points, minority_data_points) 30 | 31 | # Get indices of closest k neighbours for each point 32 | indices = np.argsort(distances, axis=1)[:, 1:k+1] 33 | 34 | # Get the closest k neighbours for each point 35 | neighbours = minority_data_points[indices].squeeze() 36 | neighbours = neighbours.reshape(k*npoints, nfeatures) 37 | 38 | # Calculate difference between points and k neighbours 39 | minority_data_points_dups = minority_data_points[np.tile(np.arange(npoints).reshape(npoints, 1), k)] 40 | minority_data_points_dups = minority_data_points_dups.reshape(k*npoints, nfeatures) 41 | diff = neighbours - minority_data_points_dups 42 | 43 | # Create new data points 44 | random_floats = np.random.uniform(0, 1, (npoints*k)) 45 | new_points = minority_data_points_dups + (diff.T*random_floats).T 46 | 47 | return new_points 48 | -------------------------------------------------------------------------------- /tests/test_adult_forest.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph 2 | 3 | 4 | @pktest_graph 5 | def test_adult_forest(): 6 | import os 7 | 8 | import pykitml as pk 9 | from pykitml.datasets import adult 10 | 11 | # Download the dataset 12 | if not os.path.exists('adult.data.pkl'): 13 | adult.get() 14 | 15 | # Load adult data set 16 | inputs_train, outputs_train, inputs_test, outputs_test = adult.load() 17 | outputs_train = pk.onehot(outputs_train) 18 | outputs_test = pk.onehot(outputs_test) 19 | 20 | # Create model 21 | ftypes = [ 22 | 'continues', 'categorical', 'continues', 'categorical', 23 | 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 24 | 'continues', 'continues', 'continues', 'categorical' 25 | ] 26 | forest_adult_classifier = pk.RandomForest(13, 2, max_depth=1000, feature_type=ftypes) 27 | 28 | # Train 29 | forest_adult_classifier.train(inputs_train, outputs_train, num_trees=100) 30 | 31 | # Save it 32 | pk.save(forest_adult_classifier, 'forest_adult_classifier.pkl') 33 | 34 | # Print accuracy 35 | accuracy = forest_adult_classifier.accuracy(inputs_train, outputs_train) 36 | print('Train accuracy:', accuracy) 37 | accuracy = forest_adult_classifier.accuracy(inputs_test, outputs_test) 38 | print('Test accuracy:', accuracy) 39 | 40 | # Plot confusion matrix 41 | forest_adult_classifier.confusion_matrix(inputs_test, outputs_test, 42 | gnames=['False', 'True']) 43 | 44 | # Assert accuracy 45 | assert (forest_adult_classifier.accuracy(inputs_test, outputs_test)) >= 82 46 | 47 | 48 | if __name__ == '__main__': 49 | try: 50 | test_adult_forest.__wrapped__() 51 | except AssertionError: 52 | pass 53 | -------------------------------------------------------------------------------- /pykitml/datasets/s1clustering.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import zlib 3 | 4 | import numpy as np 5 | 6 | from .. import pklhandler 7 | from ._s1_compressed import encoded_data 8 | 9 | ''' 10 | This module contains helper functions to download and load 11 | the S1 clustering dataset. 12 | ''' 13 | 14 | 15 | def get(): 16 | ''' 17 | Downloads the s1 clustering dataset from 18 | http://cs.joensuu.fi/sipu/datasets/ 19 | and saves it as a pkl file `s1.pkl`.
20 | 21 | Raises 22 | ------ 23 | urllib.error.URLError 24 | If internet connection is not available or the URL is not accessible. 25 | OSError 26 | If the file cannot be created due to a system-related error. 27 | KeyError 28 | If invalid/unknown type. 29 | 30 | Note 31 | ---- 32 | You only need to call this method once, i.e., after the dataset has been downloaded 33 | and you have the `s1.pkl` file, you don't need to call this method again. 34 | ''' 35 | 36 | # Decompress the data and save it as a pkl file 37 | decoded_data = base64.decodebytes(encoded_data) 38 | uncompressed_data = zlib.decompress(decoded_data) 39 | data_array = np.frombuffer(uncompressed_data, dtype=np.int64).reshape(5000, 2) 40 | pklhandler.save(data_array, 's1.pkl') 41 | 42 | 43 | def load(): 44 | ''' 45 | Loads x, y points of the s1 clustering dataset from the saved pickle file `s1.pkl` into a 46 | numpy array. The S1 clustering dataset contains 15 clusters. 47 | 48 | Returns 49 | ------- 50 | training_data : numpy.array 51 | 5000x2 numpy array containing x, y points. 52 | 53 | Raises 54 | ------ 55 | FileNotFoundError 56 | If `s1.pkl` file does not exist, i.e., if the dataset was not 57 | downloaded and saved using the :py:func:`~get` method. 58 | ''' 59 | return pklhandler.load('s1.pkl') 60 | -------------------------------------------------------------------------------- /tests/test_iris.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph 2 | 3 | 4 | @pktest_graph 5 | def test_iris(): 6 | import pykitml as pk 7 | from pykitml.datasets import iris 8 | 9 | # Load iris data set 10 | inputs_train, outputs_train, inputs_test, outputs_test = iris.load() 11 | 12 | # Normalize inputs in the dataset 13 | inputs_min, inputs_max = pk.get_minmax(inputs_train) 14 | inputs_train = pk.normalize_minmax(inputs_train, inputs_min, inputs_max) 15 | inputs_test = pk.normalize_minmax(inputs_test, inputs_min, inputs_max) 16 | 17 | # Create model 18 | iris_classifier = pk.LogisticRegression(4, 3) 19 | 20 | # Train the model 21 | iris_classifier.train( 22 | training_data=inputs_train, 23 | targets=outputs_train, 24 | batch_size=10, 25 | epochs=1500, 26 | optimizer=pk.Adam(learning_rate=0.4, decay_rate=0.99), 27 | testing_data=inputs_test, 28 | testing_targets=outputs_test, 29 | testing_freq=30, 30 | decay_freq=20 31 | ) 32 | 33 | # Save it 34 | pk.save(iris_classifier, 'iris_classifier.pkl') 35 | 36 | # Print accuracy 37 | accuracy = iris_classifier.accuracy(inputs_train, outputs_train) 38 | print('Train accuracy:', accuracy) 39 | accuracy = iris_classifier.accuracy(inputs_test, outputs_test) 40 | print('Test accuracy:', accuracy) 41 | 42 | # Plot performance 43 | iris_classifier.plot_performance() 44 | 45 | # Plot confusion matrix 46 | iris_classifier.confusion_matrix(inputs_test, outputs_test, 47 | gnames=['Setosa', 'Versicolor', 'Virginica']) 48 | 49 | # Assert if it has enough accuracy 50 | assert iris_classifier.accuracy(inputs_train, outputs_train) >= 98 51 | 52 | 53 | if __name__ == '__main__': 54 | try: 55 | test_iris.__wrapped__() 56 | except AssertionError: 57 | pass 58 | -------------------------------------------------------------------------------- /docs/RandomForest.rst: -------------------------------------------------------------------------------- 1 | Random Forest 2 | ============= 3 | 4 | Class Reference 5 | --------------- 6 | 7 | .. autoclass:: pykitml.RandomForest 8 | 9 | .. automethod:: __init__ 10 | 11 | .. automethod:: feed 12 | 13 | .. 
automethod:: get_output 14 | 15 | .. automethod:: get_output_onehot 16 | 17 | .. automethod:: train 18 | 19 | .. automethod:: accuracy 20 | 21 | .. automethod:: r2score 22 | 23 | .. automethod:: confusion_matrix 24 | 25 | .. autoattribute:: trees 26 | 27 | Example: Banknote Authentication 28 | -------------------------------- 29 | 30 | **Dataset** 31 | 32 | :ref:`banknote_dataset` 33 | 34 | **Training** 35 | 36 | .. literalinclude:: ../tests/test_banknote_forest.py 37 | :pyobject: test_banknote_forest 38 | :lines: 3- 39 | :end-before: # Assert 40 | :dedent: 4 41 | 42 | **Predict banknote validity with variance, skewness, curtosis, entropy: 43 | -2.3, -9.3, 9.37, -0.86** 44 | 45 | .. literalinclude:: ../tests/test_banknote_forest.py 46 | :pyobject: test_predict_banknote_forest 47 | :lines: 3- 48 | :dedent: 4 49 | 50 | **Confusion Matrix** 51 | 52 | .. image:: ./demo_pics/forest_confusion_matrix.png 53 | 54 | Example: Heart Disease Prediction 55 | --------------------------------- 56 | 57 | **Dataset** 58 | 59 | :ref:`heart_dataset` 60 | 61 | **Training** 62 | 63 | .. literalinclude:: ../tests/test_heart_forest.py 64 | :pyobject: test_heart_forest 65 | :lines: 3- 66 | :end-before: # Assert 67 | :dedent: 4 68 | 69 | **Predict heartdisease for a person with 70 | age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal: 71 | 67, 1, 4, 160, 286, 0, 2, 108, 1, 1.5, 2, 3, 3** 72 | 73 | .. literalinclude:: ../tests/test_heart_forest.py 74 | :pyobject: test_predict_heart_forest 75 | :lines: 3- 76 | :dedent: 4 77 | 78 | **Confusion Matrix** 79 | 80 | .. image:: ./demo_pics/forest_heart_confusion_matrix.png -------------------------------------------------------------------------------- /tests/test_iris_bayes.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph, pktest_nograph 2 | 3 | 4 | @pktest_graph 5 | def test_iris_bayes(): 6 | import pykitml as pk 7 | from pykitml.datasets import iris 8 | 9 | # Load iris data set 10 | inputs_train, outputs_train, inputs_test, outputs_test = iris.load() 11 | 12 | # Create model 13 | bayes_iris_classifier = pk.GaussianNaiveBayes(4, 3) 14 | 15 | # Train 16 | bayes_iris_classifier.train(inputs_train, outputs_train) 17 | 18 | # Save it 19 | pk.save(bayes_iris_classifier, 'bayes_iris_classifier.pkl') 20 | 21 | # Print accuracy 22 | accuracy = bayes_iris_classifier.accuracy(inputs_train, outputs_train) 23 | print('Train accuracy:', accuracy) 24 | accuracy = bayes_iris_classifier.accuracy(inputs_test, outputs_test) 25 | print('Test accuracy:', accuracy) 26 | 27 | # Plot confusion matrix 28 | bayes_iris_classifier.confusion_matrix(inputs_test, outputs_test, 29 | gnames=['Setosa', 'Versicolor', 'Virginica']) 30 | 31 | # Assert accuracy 32 | assert (bayes_iris_classifier.accuracy(inputs_train, outputs_train)) >= 95 33 | 34 | 35 | @pktest_nograph 36 | def test_predict_iris_bayes(): 37 | import numpy as np 38 | import pykitml as pk 39 | 40 | # Predict type of species with 41 | # sepal-length sepal-width petal-length petal-width 42 | # 5.8, 2.7, 3.9, 1.2 43 | input_data = np.array([5.8, 2.7, 3.9, 1.2]) 44 | 45 | # Load the model 46 | bayes_iris_classifier = pk.load('bayes_iris_classifier.pkl') 47 | 48 | # Get output 49 | bayes_iris_classifier.feed(input_data) 50 | model_output = bayes_iris_classifier.get_output_onehot() 51 | 52 | # Print result 53 | print(model_output) 54 | 55 | 56 | if __name__ == '__main__': 57 | try: 58 | test_iris_bayes.__wrapped__() 59 | 
test_predict_iris_bayes.__wrapped__() 60 | except AssertionError: 61 | pass 62 | -------------------------------------------------------------------------------- /pykitml/testing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cProfile 3 | from unittest.mock import patch 4 | from functools import wraps 5 | 6 | import matplotlib.pyplot as plt 7 | 8 | import numpy as np 9 | 10 | 11 | def _profile(test_func): 12 | ''' 13 | Calls the test function and profiles it. 14 | 15 | Parameters 16 | ---------- 17 | test_func : function 18 | The function to test and profile. 19 | ''' 20 | # Reset random seed 21 | np.random.seed(0) 22 | # Call the test function and profile it 23 | profiler = cProfile.Profile() 24 | profiler.runcall(test_func) 25 | profiler.dump_stats(test_func.__name__+'.dat') 26 | 27 | 28 | def pktest_graph(test_func): 29 | ''' 30 | To test and profile a function under pytest. Will prevent 31 | :code:`matplotlib.pyplot.show()` from blocking other tests. 32 | 33 | Parameters 34 | ---------- 35 | test_func : function 36 | The function to test and profile. 37 | ''' 38 | # Create wrapper function for testing and profiling in pytest 39 | @wraps(test_func) 40 | def test_wrapper(): 41 | # Close any open plots 42 | plt.close() 43 | plt.clf() 44 | 45 | with patch('matplotlib.pyplot.show') as show_func, patch('graphviz.Digraph.view') as _: 46 | # Run the test function 47 | #_profile(test_func) 48 | test_func() 49 | 50 | # Test if graph worked 51 | if "PYTEST_CURRENT_TEST" in os.environ: 52 | assert show_func.called 53 | 54 | return test_wrapper 55 | 56 | 57 | def pktest_nograph(test_func): 58 | ''' 59 | To test and profile a function under pytest. 60 | 61 | Parameters 62 | ---------- 63 | test_func : function 64 | The function to test and profile.
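Example
-------
A minimal usage sketch; :code:`test_example` is a hypothetical test
function, decorated the same way as the tests in this repo:

>>> from pykitml.testing import pktest_nograph
>>> @pktest_nograph
... def test_example():
...     assert 1 + 1 == 2
>>> test_example()  # runs the test under cProfile, writes test_example.dat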
65 | ''' 66 | # Create wrapper function for testing and profiling in pytest 67 | @wraps(test_func) 68 | def test_wrapper(): 69 | _profile(test_func) 70 | 71 | return test_wrapper 72 | -------------------------------------------------------------------------------- /tests/test_search.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_nograph 2 | 3 | 4 | @pktest_nograph 5 | def test_search(): 6 | import os 7 | 8 | import pykitml as pk 9 | from pykitml.datasets import mnist 10 | 11 | # If the dataset is not available then download it 12 | if not os.path.exists('mnist.pkl'): 13 | mnist.get(type='fashion') 14 | 15 | # Load dataset 16 | training_data, training_targets, testing_data, testing_targets = mnist.load() 17 | 18 | # Search for hyperparameters 19 | # Learning rate alpha = 10^-4 to 10^-2 20 | # Decay rate = 0.8 to 1 21 | # Decay frequency = 10 to 30 22 | # Batch size = 10 to 100 23 | search = pk.RandomSearch() 24 | for alpha, decay, decay_freq, bsize in search.search( 25 | 10, 3, 5, [-4, -2, 'log'], [0.8, 1, 'float'], [10, 30, 'int'], [10, 100, 'int']): 26 | 27 | # Create a new neural network 28 | fashion_classifier = pk.NeuralNetwork([784, 100, 10]) 29 | 30 | # Train it 31 | fashion_classifier.train( 32 | training_data=training_data, 33 | targets=training_targets, 34 | batch_size=bsize, 35 | epochs=1200, 36 | optimizer=pk.Adam(learning_rate=alpha, decay_rate=decay), 37 | testing_freq=100, 38 | decay_freq=decay_freq 39 | ) 40 | 41 | cost = fashion_classifier.cost(testing_data, testing_targets) 42 | search.set_cost(cost) 43 | 44 | # Save the best model 45 | if search.best: 46 | pk.save(fashion_classifier, 'best.pkl') 47 | 48 | # Load the best model 49 | fashion_classifier = pk.load('best.pkl') 50 | 51 | # Show performance 52 | accuracy = fashion_classifier.accuracy(testing_data, testing_targets) 53 | print('Test Accuracy:', accuracy) 54 | 55 | # Assert accuracy 56 | assert accuracy > 84 57 | 58 | 59 | if __name__ == '__main__': 60 | try: 61 | test_search.__wrapped__() 62 | except AssertionError: 63 | pass 64 | -------------------------------------------------------------------------------- /tests/test_iris_tree.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph, pktest_nograph 2 | 3 | 4 | @pktest_graph 5 | def test_iris_tree(): 6 | import pykitml as pk 7 | from pykitml.datasets import iris 8 | 9 | # Load iris data set 10 | inputs_train, outputs_train, inputs_test, outputs_test = iris.load() 11 | 12 | # Create model 13 | tree_iris_classifier = pk.DecisionTree(4, 3, max_depth=5, feature_type=['continues']*4) 14 | 15 | # Train 16 | tree_iris_classifier.train(inputs_train, outputs_train) 17 | 18 | # Save it 19 | pk.save(tree_iris_classifier, 'tree_iris_classifier.pkl') 20 | 21 | # Print accuracy 22 | accuracy = tree_iris_classifier.accuracy(inputs_train, outputs_train) 23 | print('Train accuracy:', accuracy) 24 | accuracy = tree_iris_classifier.accuracy(inputs_test, outputs_test) 25 | print('Test accuracy:', accuracy) 26 | 27 | # Plot confusion matrix 28 | tree_iris_classifier.confusion_matrix(inputs_test, outputs_test, 29 | gnames=['Setosa', 'Versicolor', 'Virginica']) 30 | 31 | # Plot decision tree 32 | tree_iris_classifier.show_tree() 33 | 34 | # Assert accuracy 35 | assert (tree_iris_classifier.accuracy(inputs_train, outputs_train)) >= 98 36 | 37 | 38 | @pktest_nograph 39 | def test_predict_iris_tree(): 40 | import numpy as np 41 | import 
pykitml as pk 42 | 43 | # Predict type of species with 44 | # sepal-length sepal-width petal-length petal-width 45 | # 5.8, 2.7, 3.9, 1.2 46 | input_data = np.array([5.8, 2.7, 3.9, 1.2]) 47 | 48 | # Load the model 49 | tree_iris_classifier = pk.load('tree_iris_classifier.pkl') 50 | 51 | # Get output 52 | tree_iris_classifier.feed(input_data) 53 | model_output = tree_iris_classifier.get_output_onehot() 54 | 55 | # Print result 56 | print(model_output) 57 | 58 | 59 | if __name__ == '__main__': 60 | try: 61 | test_iris_tree.__wrapped__() 62 | test_predict_iris_tree.__wrapped__() 63 | except AssertionError: 64 | pass 65 | -------------------------------------------------------------------------------- /pykitml/nearest_neighbor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ._classifier import Classifier 4 | from ._regressor import Regressor 5 | from . import _functions 6 | 7 | 8 | class NearestNeighbor(Classifier, Regressor): 9 | ''' 10 | This class implements the nearest neighbor classifier. 11 | ''' 12 | 13 | def __init__(self, inputs_size, output_size, no_neighbors=1): 14 | ''' 15 | Parameters 16 | ---------- 17 | inputs_size : int 18 | Size of input data or number of input features. 19 | output_size : int 20 | Number of categories or groups. 21 | no_neighbors : int 22 | The number of nearest neighbors to consider. 23 | ''' 24 | self._k = no_neighbors 25 | self._output = None 26 | 27 | self._input_size = inputs_size 28 | self._output_size = output_size 29 | 30 | self._inputs = None 31 | self._outputs = None 32 | 33 | @property 34 | def _out_size(self): 35 | return self._output_size 36 | 37 | def train(self, training_data, targets): 38 | ''' 39 | Trains the model on the training data. 40 | 41 | Parameters 42 | ---------- 43 | training_data : numpy.array 44 | numpy array containing training data. 45 | targets : numpy.array 46 | numpy array containing training targets, corresponding to the training data.
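Example
-------
A minimal sketch with toy values (two 2D points, two one-hot
classes; shapes chosen arbitrarily):

>>> import numpy as np
>>> import pykitml as pk
>>> model = pk.NearestNeighbor(2, 2, no_neighbors=1)
>>> model.train(np.array([[0.0, 0.0], [1.0, 1.0]]), np.array([[1, 0], [0, 1]]))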
47 | ''' 48 | self._inputs = training_data 49 | self._outputs = targets 50 | 51 | def feed(self, input_data): 52 | # Make sure array is 2D 53 | if input_data.ndim == 1: 54 | input_data = np.array([input_data]) 55 | 56 | # Get pair wise distances 57 | distances = _functions.pdist(input_data, self._inputs) 58 | 59 | # Sort the distances 60 | indices = np.argsort(distances, axis=1)[:, 0:self._k] 61 | 62 | # Get output 63 | self._output = np.mean(self._outputs[indices], axis=1) 64 | 65 | def get_output(self): 66 | return self._output.squeeze() 67 | -------------------------------------------------------------------------------- /tests/test_iris_neighbor.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph, pktest_nograph 2 | 3 | 4 | @pktest_graph 5 | def test_iris_neighbor(): 6 | import pykitml as pk 7 | from pykitml.datasets import iris 8 | 9 | # Load iris data set 10 | inputs_train, outputs_train, inputs_test, outputs_test = iris.load() 11 | 12 | # Create model 13 | neighbor_iris_classifier = pk.NearestNeighbor(4, 3) 14 | 15 | # Train the model 16 | neighbor_iris_classifier.train( 17 | training_data=inputs_train, 18 | targets=outputs_train, 19 | ) 20 | 21 | # Save it 22 | pk.save(neighbor_iris_classifier, 'neighbor_iris_classifier.pkl') 23 | 24 | # Print accuracy 25 | accuracy = neighbor_iris_classifier.accuracy(inputs_train, outputs_train) 26 | print('Train accuracy:', accuracy) 27 | accuracy = neighbor_iris_classifier.accuracy(inputs_test, outputs_test) 28 | print('Test accuracy:', accuracy) 29 | 30 | # Plot confusion matrix 31 | neighbor_iris_classifier.confusion_matrix(inputs_test, outputs_test, 32 | gnames=['Setosa', 'Versicolor', 'Virginica']) 33 | 34 | # Assert if it has enough accuracy 35 | assert neighbor_iris_classifier.accuracy(inputs_train, outputs_train) >= 100 36 | 37 | 38 | @pktest_nograph 39 | def test_predict_iris_neighbor(): 40 | import numpy as np 41 | import pykitml as pk 42 | 43 | # Predict type of species with 44 | # sepal-length sepal-width petal-length petal-width 45 | # 5.8, 2.7, 3.9, 1.2 46 | input_data = np.array([5.8, 2.7, 3.9, 1.2]) 47 | 48 | # Load the model 49 | neighbor_iris_classifier = pk.load('neighbor_iris_classifier.pkl') 50 | 51 | # Get output 52 | neighbor_iris_classifier.feed(input_data) 53 | model_output = neighbor_iris_classifier.get_output_onehot() 54 | 55 | # Print result 56 | print(model_output) 57 | 58 | 59 | if __name__ == '__main__': 60 | try: 61 | test_iris_neighbor.__wrapped__() 62 | test_predict_iris_neighbor.__wrapped__() 63 | except AssertionError: 64 | pass 65 | -------------------------------------------------------------------------------- /docs/SVM.rst: -------------------------------------------------------------------------------- 1 | Support Vector Machine 2 | ====================== 3 | 4 | Class Reference 5 | --------------- 6 | 7 | .. autoclass:: pykitml.SVM 8 | 9 | .. automethod:: __init__ 10 | 11 | .. automethod:: feed 12 | 13 | .. automethod:: get_output 14 | 15 | .. automethod:: get_output_onehot 16 | 17 | .. automethod:: train 18 | 19 | .. automethod:: plot_performance 20 | 21 | .. automethod:: cost 22 | 23 | .. automethod:: accuracy 24 | 25 | .. automethod:: confusion_matrix 26 | 27 | Gaussian Kernel 28 | --------------- 29 | 30 | .. 
autofunction:: pykitml.gaussian_kernel 31 | 32 | Example: Classifying Iris Using SVM with Linear Kernel 33 | ------------------------------------------------------ 34 | 35 | **Dataset** 36 | 37 | :ref:`iris_dataset` 38 | 39 | **Training** 40 | 41 | .. literalinclude:: ../tests/test_iris_svm.py 42 | :pyobject: test_iris_svm 43 | :lines: 3- 44 | :end-before: # Assert 45 | :dedent: 4 46 | 47 | **Predict type of species with sepal-length, sepal-width, petal-length, petal-width: 48 | 5.8, 2.7, 3.9, 1.2** 49 | 50 | .. literalinclude:: ../tests/test_iris_svm.py 51 | :pyobject: test_predict_iris_svm 52 | :lines: 3- 53 | :dedent: 4 54 | 55 | **Performance Graph** 56 | 57 | .. image:: ./demo_pics/linear_svm_perf_graph.png 58 | 59 | **Confusion Matrix** 60 | 61 | .. image:: ./demo_pics/linear_svm_confusion_matrix.png 62 | 63 | Example: Handwritten Digit Recognition (MNIST) using Gaussian Kernel 64 | -------------------------------------------------------------------- 65 | 66 | **Dataset** 67 | 68 | :ref:`mnist_dataset` 69 | 70 | **Training** 71 | 72 | .. literalinclude:: ../tests/test_mnist_svm.py 73 | :pyobject: test_mnist_svm 74 | :lines: 3- 75 | :end-before: # Assert 76 | :dedent: 4 77 | 78 | **Predicting** 79 | 80 | .. literalinclude:: ../tests/test_mnist_svm.py 81 | :pyobject: test_predict_mnist_svm 82 | :lines: 3- 83 | :dedent: 4 84 | 85 | **Performance Graph** 86 | 87 | .. image:: ./demo_pics/gaussian_svm_perf_graph.png 88 | 89 | **Confusion Matrix** 90 | 91 | .. image:: ./demo_pics/gaussian_svm_confusion_matrix.png -------------------------------------------------------------------------------- /pykitml/_shared_array.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import multiprocessing 3 | 4 | import numpy 5 | 6 | ''' 7 | This module contains helper functions to share 8 | numpy arrays between python multiprocessing processes. 9 | 10 | See: https://stackoverflow.com/a/5034106/5516481 11 | ''' 12 | 13 | _ctypes_to_numpy = { 14 | ctypes.c_char: numpy.dtype(numpy.uint8), 15 | ctypes.c_wchar: numpy.dtype(numpy.int16), 16 | ctypes.c_byte: numpy.dtype(numpy.int8), 17 | ctypes.c_ubyte: numpy.dtype(numpy.uint8), 18 | ctypes.c_short: numpy.dtype(numpy.int16), 19 | ctypes.c_ushort: numpy.dtype(numpy.uint16), 20 | ctypes.c_int: numpy.dtype(numpy.int32), 21 | ctypes.c_uint: numpy.dtype(numpy.uint32), 22 | ctypes.c_long: numpy.dtype(numpy.int64), 23 | ctypes.c_ulong: numpy.dtype(numpy.uint64), 24 | ctypes.c_float: numpy.dtype(numpy.float32), 25 | ctypes.c_double: numpy.dtype(numpy.float64) 26 | } 27 | 28 | _numpy_to_ctypes = dict(zip(_ctypes_to_numpy.values(), _ctypes_to_numpy.keys())) 29 | 30 | 31 | def shm_as_ndarray(mp_array, shape=None): 32 | ''' 33 | Given a multiprocessing.Array, returns an ndarray pointing to 34 | the same data. 35 | ''' 36 | 37 | # support SynchronizedArray: 38 | if not hasattr(mp_array, '_type_'): 39 | mp_array = mp_array.get_obj() 40 | 41 | dtype = _ctypes_to_numpy[mp_array._type_] # pylint: disable=protected-access 42 | result = numpy.frombuffer(mp_array, dtype) 43 | 44 | if shape is not None: 45 | result = result.reshape(shape) 46 | 47 | return numpy.asarray(result) 48 | 49 | 50 | def ndarray_to_shm(array, lock=False): 51 | ''' 52 | Generate a 1D multiprocessing.Array containing the data from 53 | the passed ndarray. The data will be *copied* into shared 54 | memory.
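Example
-------
A round-trip sketch using the two helpers in this module:

>>> import numpy
>>> arr = numpy.arange(6, dtype=numpy.float64).reshape(2, 3)
>>> shm = ndarray_to_shm(arr)
>>> shm_as_ndarray(shm, shape=(2, 3))
array([[0., 1., 2.],
       [3., 4., 5.]])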
55 | ''' 56 | 57 | array1d = array.ravel(order='A') 58 | 59 | try: 60 | c_type = _numpy_to_ctypes[array1d.dtype] 61 | except KeyError: 62 | c_type = _numpy_to_ctypes[numpy.dtype(array1d.dtype)] 63 | 64 | result = multiprocessing.Array(c_type, array1d.size, lock=lock) 65 | shm_as_ndarray(result)[:] = array1d 66 | return result 67 | -------------------------------------------------------------------------------- /tests/test_adult.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph 2 | 3 | 4 | @pktest_graph 5 | def test_adult(): 6 | import os.path 7 | 8 | import pykitml as pk 9 | from pykitml.datasets import adult 10 | 11 | # Download the dataset 12 | if not os.path.exists('adult.data.pkl'): 13 | adult.get() 14 | 15 | # Load adult data set 16 | inputs_train, outputs_train, inputs_test, outputs_test = adult.load() 17 | 18 | # Normalize dataset 19 | array_min, array_max = pk.get_minmax(inputs_train) 20 | inputs_train = pk.normalize_minmax(inputs_train, array_min, array_max, cols=[0, 2, 9, 10, 11]) 21 | inputs_test = pk.normalize_minmax(inputs_test, array_min, array_max, cols=[0, 2, 9, 10, 11]) 22 | 23 | # Convert categorical values to one-hot values 24 | inputs_train, inputs_test = pk.onehot_cols_traintest(inputs_train, inputs_test, cols=[1, 3, 4, 5, 6, 7, 8, 9, 12]) 25 | 26 | # Create model 27 | adult_classifier = pk.LogisticRegression(104, 1) 28 | 29 | # Train the model 30 | adult_classifier.train( 31 | training_data=inputs_train, 32 | targets=outputs_train, 33 | batch_size=10, 34 | epochs=1500, 35 | optimizer=pk.Adam(learning_rate=0.015, decay_rate=0.99), 36 | testing_data=inputs_test, 37 | testing_targets=outputs_test, 38 | testing_freq=30, 39 | decay_freq=40 40 | ) 41 | 42 | # Save it 43 | pk.save(adult_classifier, 'adult_classifier.pkl') 44 | 45 | # Plot performance 46 | adult_classifier.plot_performance() 47 | 48 | # Print accuracy 49 | accuracy = adult_classifier.accuracy(inputs_train, outputs_train) 50 | print('Train accuracy:', accuracy) 51 | accuracy = adult_classifier.accuracy(inputs_test, outputs_test) 52 | print('Test accuracy:', accuracy) 53 | 54 | # Plot confusion matrix 55 | adult_classifier.confusion_matrix(inputs_test, outputs_test) 56 | 57 | # Assert if it has enough accuracy 58 | assert adult_classifier.accuracy(inputs_test, outputs_test) >= 82 59 | 60 | 61 | if __name__ == '__main__': 62 | try: 63 | test_adult.__wrapped__() 64 | except AssertionError: 65 | pass 66 | -------------------------------------------------------------------------------- /pykitml/_regressor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class Regressor(ABC): 5 | ''' 6 | Mix-in class for Regression models. 7 | ''' 8 | 9 | @abstractmethod 10 | def get_output(self): 11 | ''' 12 | Returns the output activations of the model. 13 | 14 | Returns 15 | ------- 16 | numpy.array 17 | The output activations. 18 | ''' 19 | 20 | @abstractmethod 21 | def feed(self, input_data): 22 | ''' 23 | Accepts input array and feeds it to the model. 24 | 25 | Parameters 26 | ---------- 27 | input_data : numpy.array 28 | The input to feed the model. 29 | 30 | Raises 31 | ------ 32 | ValueError 33 | If the input data has invalid dimensions/shape. 
34 | 35 | Note 36 | ---- 37 | This function only feeds the input data; to get the output after calling this 38 | function, use :py:func:`get_output` or :py:func:`get_output_onehot` 39 | ''' 40 | 41 | @property 42 | @abstractmethod 43 | def _out_size(self): 44 | ''' 45 | Returns the number of nodes/neurons in the output layer. 46 | ''' 47 | 48 | def r2score(self, testing_data, testing_targets): 49 | ''' 50 | Returns the R-squared (coefficient of determination) value. 51 | 52 | Parameters 53 | ---------- 54 | testing_data : numpy.array 55 | numpy array containing testing data. 56 | testing_targets : numpy.array 57 | numpy array containing testing targets, corresponding to the testing data. 58 | 59 | Returns 60 | ------- 61 | r2score : float 62 | The R-squared value of the model over the testing data. 63 | 64 | Raises 65 | ------ 66 | ValueError 67 | If :code:`testing_data` or :code:`testing_targets` has invalid dimensions/shape. 68 | ''' 69 | self.feed(testing_data) 70 | output = self.get_output() 71 | 72 | error = ((output-testing_targets)**2).sum() 73 | var = ((testing_targets-testing_targets.mean(axis=0)) ** 2).sum() 74 | 75 | return 1-error/var 76 | -------------------------------------------------------------------------------- /tests/test_fashion.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph 2 | 3 | 4 | def test_download(): 5 | from pykitml.datasets import mnist 6 | # Download the mnist data set 7 | mnist.get(type='fashion') 8 | # Test ran successfully 9 | assert True 10 | 11 | 12 | @pktest_graph 13 | def test_adam_fashion(): 14 | import os 15 | 16 | import pykitml as pk 17 | from pykitml.datasets import mnist 18 | 19 | # If the dataset is not available then download it 20 | if not os.path.exists('mnist.pkl'): 21 | mnist.get(type='fashion') 22 | 23 | # Load dataset 24 | training_data, training_targets, testing_data, testing_targets = mnist.load() 25 | 26 | # Create a new neural network 27 | fashion_classifier = pk.NeuralNetwork([784, 100, 10]) 28 | 29 | # Train it 30 | fashion_classifier.train( 31 | training_data=training_data, 32 | targets=training_targets, 33 | batch_size=50, 34 | epochs=1200, 35 | optimizer=pk.Adam(learning_rate=0.012, decay_rate=0.95), 36 | testing_data=testing_data, 37 | testing_targets=testing_targets, 38 | testing_freq=30, 39 | decay_freq=10 40 | ) 41 | 42 | # Save it 43 | pk.save(fashion_classifier, 'fashion_classifier_network.pkl') 44 | 45 | # Show performance 46 | accuracy = fashion_classifier.accuracy(training_data, training_targets) 47 | print('Train Accuracy:', accuracy) 48 | accuracy = fashion_classifier.accuracy(testing_data, testing_targets) 49 | print('Test Accuracy:', accuracy) 50 | 51 | # Plot performance 52 | fashion_classifier.plot_performance() 53 | 54 | # Show confusion matrix 55 | fashion_classifier.confusion_matrix( 56 | training_data, training_targets, 57 | gnames=['T-shirt/Top', 'Trouser', 'Pullover', 58 | 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 59 | 'Bag', 'Ankle Boot' 60 | ] 61 | ) 62 | 63 | # Assert if it has enough accuracy 64 | assert fashion_classifier.accuracy(training_data, training_targets) > 84 65 | 66 | 67 | if __name__ == '__main__': 68 | try: 69 | test_adam_fashion.__wrapped__() 70 | except AssertionError: 71 | pass 72 | -------------------------------------------------------------------------------- /pykitml/pca.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class PCA: 5 | ''' 6
| This class implements Principal Component Analysis. 7 | ''' 8 | 9 | def __init__(self, data_points, no_components): 10 | ''' 11 | This class implements Principal Component Analysis, used for 12 | dimensionality reduction. 13 | 14 | Parameters 15 | ---------- 16 | data_points : numpy.array 17 | The dataset to perform PCA (dimensionality reduction) on. 18 | no_components : int 19 | Number of principal components to use. 20 | ''' 21 | # Calculate covariance matrix 22 | covariance_matrix = (data_points.T) @ data_points 23 | covariance_matrix = covariance_matrix/data_points.shape[0] 24 | 25 | # Perform Singular Value Decomposition on the covariance matrix 26 | u, s, _ = np.linalg.svd(covariance_matrix, full_matrices=True) 27 | 28 | # Calculate amount of variance retained 29 | self._retention = np.sum(s[0:no_components])/np.sum(s) 30 | 31 | # The transformation matrix for PCA 32 | self._transform = u[:, 0:no_components] 33 | 34 | def transform(self, data_points): 35 | ''' 36 | Transforms the input dataset to lower dimensions. 37 | 38 | Parameters 39 | ---------- 40 | data_points : numpy.array 41 | The input dataset. 42 | 43 | Returns 44 | ------- 45 | transformed_data_points : numpy.array 46 | The transformed input. 47 | ''' 48 | # Transform the datapoints using principal components 49 | return data_points@self._transform 50 | 51 | def inverse_transform(self, pca_points): 52 | ''' 53 | Gets an approximation of the original dataset from transformed points. 54 | 55 | Parameters 56 | ---------- 57 | pca_points : numpy.array 58 | The transformed points. 59 | 60 | ''' 61 | # Transform from principal components back to approximate features 62 | return pca_points @ (self._transform.T) 63 | 64 | @property 65 | def retention(self): 66 | ''' 67 | Returns the amount of variance retained, between 0 and 1.
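Example
-------
Illustrative only; the value depends entirely on the data
(:code:`pca` is assumed to be an already-constructed :code:`PCA`
instance):

>>> pca.retention  # doctest: +SKIP
0.96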
68 | ''' 69 | return round(self._retention, 2) 70 | -------------------------------------------------------------------------------- /tests/test_heart_forest.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph, pktest_nograph 2 | 3 | 4 | @pktest_graph 5 | def test_heart_forest(): 6 | import os.path 7 | 8 | import pykitml as pk 9 | from pykitml.datasets import heartdisease 10 | 11 | # Download the dataset 12 | if not os.path.exists('heartdisease.pkl'): 13 | heartdisease.get() 14 | 15 | # Load heart data set 16 | inputs, outputs = heartdisease.load() 17 | outputs = pk.onehot(outputs) 18 | 19 | # Create model 20 | ftypes = [ 21 | 'continues', 'categorical', 'categorical', 22 | 'continues', 'continues', 'categorical', 'categorical', 23 | 'continues', 'categorical', 'continues', 'categorical', 24 | 'categorical', 'categorical' 25 | ] 26 | forest_heart_classifier = pk.RandomForest(13, 2, max_depth=8, feature_type=ftypes) 27 | 28 | # Train 29 | forest_heart_classifier.train(inputs, outputs) 30 | 31 | # Save it 32 | pk.save(forest_heart_classifier, 'forest_heart_classifier.pkl') 33 | 34 | # Print accuracy 35 | accuracy = forest_heart_classifier.accuracy(inputs, outputs) 36 | print('Accuracy:', accuracy) 37 | 38 | # Plot confusion matrix 39 | forest_heart_classifier.confusion_matrix(inputs, outputs, 40 | gnames=['False', 'True']) 41 | 42 | # Assert accuracy 43 | assert (forest_heart_classifier.accuracy(inputs, outputs)) >= 94 44 | 45 | 46 | @pktest_nograph 47 | def test_predict_heart_forest(): 48 | import numpy as np 49 | import pykitml as pk 50 | 51 | # Predict heartdisease for a person with 52 | # age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal 53 | # 67, 1, 4, 160, 286, 0, 2, 108, 1, 1.5, 2, 3, 3 54 | input_data = np.array([67, 1, 4, 160, 286, 0, 2, 108, 1, 1.5, 2, 3, 3], dtype=float) 55 | 56 | # Load the model 57 | forest_heart_classifier = pk.load('forest_heart_classifier.pkl') 58 | 59 | # Get output 60 | forest_heart_classifier.feed(input_data) 61 | model_output = forest_heart_classifier.get_output() 62 | 63 | # Print result (log of probabilities) 64 | print(model_output) 65 | 66 | 67 | if __name__ == '__main__': 68 | try: 69 | test_heart_forest.__wrapped__() 70 | test_predict_heart_forest.__wrapped__() 71 | except AssertionError: 72 | pass 73 | -------------------------------------------------------------------------------- /tests/test_banknote_forest.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph, pktest_nograph 2 | 3 | 4 | @pktest_graph 5 | def test_banknote_forest(): 6 | import os 7 | 8 | import pykitml as pk 9 | from pykitml.datasets import banknote 10 | 11 | # Download the dataset 12 | if not os.path.exists('banknote.pkl'): 13 | banknote.get() 14 | 15 | # Load banknote data set 16 | inputs_train, outputs_train, inputs_test, outputs_test = banknote.load() 17 | 18 | # Change 0/False to [1, 0] 19 | # Change 1/True to [0, 1] 20 | outputs_train = pk.onehot(outputs_train) 21 | outputs_test = pk.onehot(outputs_test) 22 | 23 | # Create model 24 | ftypes = ['continues']*4 25 | forest_banknote_classifier = pk.RandomForest(4, 2, max_depth=9, feature_type=ftypes) 26 | 27 | # Train 28 | forest_banknote_classifier.train(inputs_train, outputs_train) 29 | 30 | # Save it 31 | pk.save(forest_banknote_classifier, 'forest_banknote_classifier.pkl') 32 | 33 | # Print accuracy 34 | accuracy = 
forest_banknote_classifier.accuracy(inputs_train, outputs_train) 35 | print('Train accuracy:', accuracy) 36 | accuracy = forest_banknote_classifier.accuracy(inputs_test, outputs_test) 37 | print('Test accuracy:', accuracy) 38 | 39 | # Plot confusion matrix 40 | forest_banknote_classifier.confusion_matrix(inputs_test, outputs_test, 41 | gnames=['False', 'True']) 42 | 43 | # Assert accuracy 44 | assert (forest_banknote_classifier.accuracy(inputs_test, outputs_test)) >= 98 45 | 46 | 47 | @pktest_nograph 48 | def test_predict_banknote_forest(): 49 | import numpy as np 50 | import pykitml as pk 51 | 52 | # Predict banknote validity with variance, skewness, curtosis, entropy 53 | # of -2.3, -9.3, 9.37, -0.86 54 | input_data = np.array([-2.3, -9.3, 9.37, -0.86]) 55 | 56 | # Load the model 57 | forest_banknote_classifier = pk.load('forest_banknote_classifier.pkl') 58 | 59 | # Get output 60 | forest_banknote_classifier.feed(input_data) 61 | model_output = forest_banknote_classifier.get_output() 62 | 63 | # Print result 64 | print(model_output) 65 | 66 | 67 | if __name__ == '__main__': 68 | try: 69 | test_banknote_forest.__wrapped__() 70 | test_predict_banknote_forest.__wrapped__() 71 | except AssertionError: 72 | pass 73 | -------------------------------------------------------------------------------- /tests/test_heart_bayes.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph, pktest_nograph 2 | 3 | 4 | @pktest_graph 5 | def test_heart_bayes(): 6 | import os.path 7 | 8 | import pykitml as pk 9 | from pykitml.datasets import heartdisease 10 | 11 | # Download the dataset 12 | if not os.path.exists('heartdisease.pkl'): 13 | heartdisease.get() 14 | 15 | # Load heart data set 16 | inputs, outputs = heartdisease.load() 17 | 18 | # Change 0/False to [1, 0] 19 | # Change 1/True to [0, 1] 20 | outputs = pk.onehot(outputs) 21 | 22 | distributions = [ 23 | 'gaussian', 'binomial', 'multinomial', 24 | 'gaussian', 'gaussian', 'binomial', 'multinomial', 25 | 'gaussian', 'binomial', 'gaussian', 'multinomial', 26 | 'multinomial', 'multinomial' 27 | ] 28 | 29 | # Create model 30 | bayes_heart_classifier = pk.NaiveBayes(13, 2, distributions) 31 | 32 | # Train 33 | bayes_heart_classifier.train(inputs, outputs) 34 | 35 | # Save it 36 | pk.save(bayes_heart_classifier, 'bayes_heart_classifier.pkl') 37 | 38 | # Print accuracy 39 | accuracy = bayes_heart_classifier.accuracy(inputs, outputs) 40 | print('Accuracy:', accuracy) 41 | 42 | # Plot confusion matrix 43 | bayes_heart_classifier.confusion_matrix(inputs, outputs, 44 | gnames=['False', 'True']) 45 | 46 | # Assert accuracy 47 | assert (bayes_heart_classifier.accuracy(inputs, outputs)) > 84 48 | 49 | 50 | @pktest_nograph 51 | def test_predict_heart_bayes(): 52 | import numpy as np 53 | import pykitml as pk 54 | 55 | # Predict heartdisease for a person with 56 | # age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal 57 | # 67, 1, 4, 160, 286, 0, 2, 108, 1, 1.5, 2, 3, 3 58 | input_data = np.array([67, 1, 4, 160, 286, 0, 2, 108, 1, 1.5, 2, 3, 3], dtype=float) 59 | 60 | # Load the model 61 | bayes_heart_classifier = pk.load('bayes_heart_classifier.pkl') 62 | 63 | # Get output 64 | bayes_heart_classifier.feed(input_data) 65 | model_output = bayes_heart_classifier.get_output() 66 | 67 | # Print result (log of probabilities) 68 | print(model_output) 69 | 70 | 71 | if __name__ == '__main__': 72 | # Train 73 | try: 74 | test_heart_bayes.__wrapped__() 75 | 
test_predict_heart_bayes.__wrapped__() 76 | except AssertionError: 77 | pass 78 | -------------------------------------------------------------------------------- /tests/test_iris_svm.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph, pktest_nograph 2 | 3 | 4 | @pktest_graph 5 | def test_iris_svm(): 6 | import numpy as np 7 | import pykitml as pk 8 | from pykitml.datasets import iris 9 | 10 | # Load iris data set 11 | inputs_train, outputs_train, inputs_test, outputs_test = iris.load() 12 | 13 | # Format the outputs for svm training, zeros to -1 14 | svm_outputs_train = np.where(outputs_train == 0, -1, 1) 15 | svm_outputs_test = np.where(outputs_test == 0, -1, 1) 16 | 17 | # Create model 18 | svm_iris_classifier = pk.SVM(4, 3) 19 | 20 | # Train the model 21 | svm_iris_classifier.train( 22 | training_data=inputs_train, 23 | targets=svm_outputs_train, 24 | batch_size=20, 25 | epochs=1000, 26 | optimizer=pk.Adam(learning_rate=3, decay_rate=0.95), 27 | testing_data=inputs_test, 28 | testing_targets=svm_outputs_test, 29 | testing_freq=30, 30 | decay_freq=10 31 | ) 32 | 33 | # Save it 34 | pk.save(svm_iris_classifier, 'svm_iris_classifier.pkl') 35 | 36 | # Print accuracy 37 | accuracy = svm_iris_classifier.accuracy(inputs_train, outputs_train) 38 | print('Train accuracy:', accuracy) 39 | accuracy = svm_iris_classifier.accuracy(inputs_test, outputs_test) 40 | print('Test accuracy:', accuracy) 41 | 42 | # Plot performance 43 | svm_iris_classifier.plot_performance() 44 | 45 | # Plot confusion matrix 46 | svm_iris_classifier.confusion_matrix(inputs_test, outputs_test, 47 | gnames=['Setosa', 'Versicolor', 'Virginica']) 48 | 49 | # Assert if it has enough accuracy 50 | assert svm_iris_classifier.accuracy(inputs_train, outputs_train) >= 97 51 | 52 | 53 | @pktest_nograph 54 | def test_predict_iris_svm(): 55 | import numpy as np 56 | import pykitml as pk 57 | 58 | # Predict type of species with 59 | # sepal-length sepal-width petal-length petal-width 60 | # 5.8, 2.7, 3.9, 1.2 61 | input_data = np.array([5.8, 2.7, 3.9, 1.2]) 62 | 63 | # Load the model 64 | svm_iris_classifier = pk.load('svm_iris_classifier.pkl') 65 | 66 | # Get output 67 | svm_iris_classifier.feed(input_data) 68 | model_output = svm_iris_classifier.get_output_onehot() 69 | 70 | # Print result 71 | print(model_output) 72 | 73 | 74 | if __name__ == '__main__': 75 | try: 76 | test_iris_svm.__wrapped__() 77 | test_predict_iris_svm.__wrapped__() 78 | except AssertionError: 79 | pass 80 | -------------------------------------------------------------------------------- /pykitml/cross_val.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def cross_validate(inputs, outputs, folds=5): 5 | ''' 6 | Python generator function for making K-fold cross validation easier. 7 | 8 | Parameters 9 | ---------- 10 | inputs : numpy.array 11 | Inputs/features of training data. 12 | outputs : numpy.array 13 | Outputs/targets of training data. 14 | 15 | Yields 16 | ------ 17 | train_inputs : numpy.array 18 | Training data containing inputs. 19 | train_outputs : numpy.array 20 | Training data containing outputs. 21 | test_inputs : numpy.array 22 | Testing data containing inputs. 23 | test_outputs : numpy.array 24 | Testing data containing outputs. 25 | 26 | Example 27 | ------- 28 | >>> import numpy as np 29 | >>> import pykitml as pk 30 | >>> 31 | >>> # Mock training data 32 | ... 
x = np.arange(30).reshape((10, 3)) 33 | >>> y = x + 10 34 | >>> 35 | >>> # 5-fold cross validation 36 | ... # Training data is split into 5 blocks, each block takes its turn 37 | ... # to be the test data. 38 | ... for train_x, train_y, test_x, test_y in pk.cross_validate(x, y, 5): 39 | ... print(train_x) 40 | ... print(train_y) 41 | ... print(test_x) 42 | ... print(test_y) 43 | ''' 44 | size = inputs.shape[0] 45 | block_size = size//folds 46 | remainder = size % folds 47 | 48 | # Calculate block sizes 49 | def get_block_size(block): 50 | if block < remainder: 51 | return block_size+1 52 | else: 53 | return block_size 54 | 55 | block_sizes = [get_block_size(block) for block in range(folds)] 56 | 57 | # Calculate block indices 58 | block_indices = [sum(block_sizes[:block]) for block in range(folds)] 59 | 60 | # Generate blocks 61 | def make_block(i, array): 62 | start = block_indices[i] 63 | end = block_indices[i]+block_sizes[i] 64 | return array[start:end] 65 | 66 | for i in range(folds): 67 | # Create testing data 68 | test_inputs, test_outputs = make_block(i, inputs), make_block(i, outputs) 69 | 70 | # Create training data 71 | train_blocks_inputs = [make_block(j, inputs) for j in range(folds) if j != i] 72 | train_inputs = np.concatenate(train_blocks_inputs, axis=0) 73 | train_blocks_outputs = [make_block(j, outputs) for j in range(folds) if j != i] 74 | train_outputs = np.concatenate(train_blocks_outputs, axis=0) 75 | 76 | yield train_inputs, train_outputs, test_inputs, test_outputs 77 | -------------------------------------------------------------------------------- /pykitml/datasets/fishlength.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | ''' 4 | This module contains helper functions to load the fish length dataset. 5 | ''' 6 | 7 | inputs = np.array([ 8 | # Age Temperature 9 | [14, 25], 10 | [28, 25], 11 | [41, 25], 12 | [55, 25], 13 | [69, 25], 14 | [83, 25], 15 | [97, 25], 16 | [111, 25], 17 | [125, 25], 18 | [139, 25], 19 | [153, 25], 20 | [14, 27], 21 | [28, 27], 22 | [41, 27], 23 | [55, 27], 24 | [69, 27], 25 | [83, 27], 26 | [97, 27], 27 | [111, 27], 28 | [125, 27], 29 | [139, 27], 30 | [153, 27], 31 | [14, 29], 32 | [28, 29], 33 | [41, 29], 34 | [55, 29], 35 | [69, 29], 36 | [83, 29], 37 | [97, 29], 38 | [111, 29], 39 | [125, 29], 40 | [139, 29], 41 | [153, 29], 42 | [14, 31], 43 | [28, 31], 44 | [41, 31], 45 | [55, 31], 46 | [69, 31], 47 | [83, 31], 48 | [97, 31], 49 | [111, 31], 50 | [125, 31], 51 | [139, 31], 52 | [153, 31] 53 | ]) 54 | 55 | outputs = np.array([ 56 | # Fish-length 57 | 620, 58 | 1315, 59 | 2120, 60 | 2600, 61 | 3110, 62 | 3535, 63 | 3935, 64 | 4465, 65 | 4530, 66 | 4570, 67 | 4600, 68 | 625, 69 | 1215, 70 | 2110, 71 | 2805, 72 | 3255, 73 | 4015, 74 | 4315, 75 | 4495, 76 | 4535, 77 | 4600, 78 | 4600, 79 | 590, 80 | 1305, 81 | 2140, 82 | 2890, 83 | 3920, 84 | 3920, 85 | 4515, 86 | 4520, 87 | 4525, 88 | 4565, 89 | 4566, 90 | 590, 91 | 1205, 92 | 1915, 93 | 2140, 94 | 2710, 95 | 3020, 96 | 3030, 97 | 3040, 98 | 3180, 99 | 3257, 100 | 3214, 101 | ]) 102 | 103 | 104 | def load(): 105 | ''' 106 | Loads the fish length dataset without any preprocessing. 107 | Source: https://people.sc.fsu.edu/~jburkardt/datasets/regression/x06.txt 108 | 109 | The length of a species of fish is to be represented as a function 110 | of the age and water temperature. The fish are kept in tanks 111 | at 25, 27, 29 and 31 degrees Celsius. 
After birth, a test specimen 112 | is chosen at random every 14 days and its length measured. 113 | 114 | Returns 115 | ------- 116 | inputs : numpy.array 117 | 44x2 numpy array, each row having 2 features, 118 | :code:`age temperature` 119 | outputs : numpy.array 120 | Length of fish, numpy array with 44 elements. 121 | ''' 122 | return inputs, outputs 123 | -------------------------------------------------------------------------------- /tests/test_fishlength.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph, pktest_nograph 2 | 3 | 4 | @pktest_graph 5 | def test_fishlength(): 6 | import pykitml as pk 7 | from pykitml.datasets import fishlength 8 | 9 | # Load the dataset 10 | inputs, outputs = fishlength.load() 11 | 12 | # Normalize inputs 13 | array_min, array_max = pk.get_minmax(inputs) 14 | inputs = pk.normalize_minmax(inputs, array_min, array_max) 15 | 16 | # Create polynomial features 17 | inputs_poly = pk.polynomial(inputs) 18 | 19 | # Normalize outputs 20 | array_min, array_max = pk.get_minmax(outputs) 21 | outputs = pk.normalize_minmax(outputs, array_min, array_max) 22 | 23 | # Create model 24 | fish_classifier = pk.LinearRegression(inputs_poly.shape[1], 1) 25 | 26 | # Train the model 27 | fish_classifier.train( 28 | training_data=inputs_poly, 29 | targets=outputs, 30 | batch_size=22, 31 | epochs=200, 32 | optimizer=pk.Adam(learning_rate=0.02, decay_rate=0.99), 33 | testing_freq=1, 34 | decay_freq=10 35 | ) 36 | 37 | # Save model 38 | pk.save(fish_classifier, 'fish_classifier.pkl') 39 | 40 | # Plot performance 41 | fish_classifier.plot_performance() 42 | 43 | # Print r2 score 44 | print('r2score:', fish_classifier.r2score(inputs_poly, outputs)) 45 | 46 | # Assert if it has enough accuracy 47 | assert fish_classifier.cost(inputs_poly, outputs) <= 0 48 | 49 | 50 | @pktest_nograph 51 | def test_predict_fishlength(): 52 | import numpy as np 53 | import pykitml as pk 54 | from pykitml.datasets import fishlength 55 | 56 | # Predict length of fish that is 28 days old at 25C 57 | 58 | # Load the dataset 59 | inputs, outputs = fishlength.load() 60 | 61 | # Load the model 62 | fish_classifier = pk.load('fish_classifier.pkl') 63 | 64 | # Normalize inputs 65 | array_min, array_max = pk.get_minmax(inputs) 66 | input_data = pk.normalize_minmax(np.array([28, 25]), array_min, array_max) 67 | 68 | # Create polynomial features 69 | input_data_poly = pk.polynomial(input_data) 70 | 71 | # Get output 72 | fish_classifier.feed(input_data_poly) 73 | model_output = fish_classifier.get_output() 74 | 75 | # Denormalize output 76 | array_min, array_max = pk.get_minmax(outputs) 77 | model_output = pk.denormalize_minmax(model_output, array_min, array_max) 78 | 79 | # Print result 80 | print(model_output) 81 | 82 | 83 | if __name__ == '__main__': 84 | try: 85 | test_fishlength.__wrapped__() 86 | 87 | test_predict_fishlength.__wrapped__() 88 | except AssertionError: 89 | pass 90 | -------------------------------------------------------------------------------- /docs/Datasets.rst: -------------------------------------------------------------------------------- 1 | Datasets 2 | ======== 3 | 4 | 5 | .. _mnist_dataset: 6 | 7 | MNIST - pykitml.datasets.mnist module 8 | ------------------------------------- 9 | 10 | .. automodule:: pykitml.datasets.mnist 11 | 12 | .. autofunction:: pykitml.datasets.mnist.get 13 | 14 | .. autofunction:: pykitml.datasets.mnist.load 15 | 16 | 17 | .. 
_iris_dataset: 18 | 19 | Iris - pykitml.datasets.iris module 20 | ----------------------------------- 21 | 22 | .. automodule:: pykitml.datasets.iris 23 | 24 | .. autofunction:: pykitml.datasets.iris.load 25 | 26 | 27 | .. _fishlength_dataset: 28 | 29 | Fish Length - pykitml.datasets.fishlength module 30 | ------------------------------------------------ 31 | 32 | .. automodule:: pykitml.datasets.fishlength 33 | 34 | .. autofunction:: pykitml.datasets.fishlength.load 35 | 36 | 37 | .. _heart_dataset: 38 | 39 | Heart Disease - pykitml.datasets.heartdisease module 40 | ---------------------------------------------------- 41 | 42 | .. automodule:: pykitml.datasets.heartdisease 43 | 44 | .. autofunction:: pykitml.datasets.heartdisease.get 45 | 46 | .. autofunction:: pykitml.datasets.heartdisease.load 47 | 48 | 49 | .. _adult_dataset: 50 | 51 | Adult - pykitml.datasets.adult module 52 | ---------------------------------------------------- 53 | 54 | .. automodule:: pykitml.datasets.adult 55 | 56 | .. autofunction:: pykitml.datasets.adult.get 57 | 58 | .. autofunction:: pykitml.datasets.adult.load 59 | 60 | 61 | .. _banknote_dataset: 62 | 63 | Banknote - pykitml.datasets.banknote module 64 | ---------------------------------------------------- 65 | 66 | .. automodule:: pykitml.datasets.banknote 67 | 68 | .. autofunction:: pykitml.datasets.banknote.get 69 | 70 | .. autofunction:: pykitml.datasets.banknote.load 71 | 72 | 73 | .. _sonar_dataset: 74 | 75 | Sonar Rocks and Mines - pykitml.datasets.sonar module 76 | ------------------------------------------------------- 77 | 78 | .. automodule:: pykitml.datasets.sonar 79 | 80 | .. autofunction:: pykitml.datasets.sonar.get 81 | 82 | .. autofunction:: pykitml.datasets.sonar.load 83 | 84 | 85 | .. _boston_dataset: 86 | 87 | Boston Housing - pykitml.datasets.boston module 88 | ---------------------------------------------------- 89 | 90 | .. automodule:: pykitml.datasets.boston 91 | 92 | .. autofunction:: pykitml.datasets.boston.get 93 | 94 | .. autofunction:: pykitml.datasets.boston.load 95 | 96 | 97 | .. _s1clustering_dataset: 98 | 99 | S1 Clustering - pykitml.datasets.s1clustering module 100 | ---------------------------------------------------- 101 | 102 | .. automodule:: pykitml.datasets.s1clustering 103 | 104 | .. autofunction:: pykitml.datasets.s1clustering.get 105 | 106 | .. autofunction:: pykitml.datasets.s1clustering.load 107 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![pykitml logo](https://raw.githubusercontent.com/RainingComputers/pykitml/master/pykitml128.png) 2 | 3 | # pykitml (Python Kit for Machine Learning) 4 | Machine Learning library written in Python and NumPy.
5 | 6 | ### Installation 7 | 8 | ``` 9 | python3 -m pip install pykitml 10 | ``` 11 | 12 | ### Documentation 13 | 14 | https://pykitml.readthedocs.io/en/latest/ 15 | 16 | # Demo (MNIST) 17 | ### Training 18 | ``` python 19 | import os.path 20 | 21 | import numpy as np 22 | import pykitml as pk 23 | from pykitml.datasets import mnist 24 | 25 | # Download dataset 26 | if(not os.path.exists('mnist.pkl')): mnist.get() 27 | 28 | # Load dataset 29 | training_data, training_targets, testing_data, testing_targets = mnist.load() 30 | 31 | # Create a new neural network 32 | digit_classifier = pk.NeuralNetwork([784, 100, 10]) 33 | 34 | # Train it 35 | digit_classifier.train( 36 | training_data=training_data, 37 | targets=training_targets, 38 | batch_size=50, 39 | epochs=1200, 40 | optimizer=pk.Adam(learning_rate=0.012, decay_rate=0.95), 41 | testing_data=testing_data, 42 | testing_targets=testing_targets, 43 | testing_freq=30, 44 | decay_freq=15 45 | ) 46 | 47 | # Save it 48 | pk.save(digit_classifier, 'digit_classifier_network.pkl') 49 | 50 | # Show performance 51 | accuracy = digit_classifier.accuracy(training_data, training_targets) 52 | print('Train Accuracy:', accuracy) 53 | accuracy = digit_classifier.accuracy(testing_data, testing_targets) 54 | print('Test Accuracy:', accuracy) 55 | 56 | # Plot performance graph 57 | digit_classifier.plot_performance() 58 | 59 | # Show confusion matrix 60 | digit_classifier.confusion_matrix(training_data, training_targets) 61 | ``` 62 | 63 | ### Trying the model 64 | ```python 65 | import random 66 | 67 | import numpy as np 68 | import matplotlib.pyplot as plt 69 | import pykitml as pk 70 | from pykitml.datasets import mnist 71 | 72 | # Load dataset 73 | training_data, training_targets, testing_data, testing_targets = mnist.load() 74 | 75 | # Load the trained network 76 | digit_classifier = pk.load('digit_classifier_network.pkl') 77 | 78 | # Pick a random example from testing data 79 | index = random.randint(0, 9999) 80 | 81 | # Show the test data and the label 82 | plt.imshow(training_data[index].reshape(28, 28)) 83 | plt.show() 84 | print('Label: ', training_targets[index]) 85 | 86 | # Show prediction 87 | digit_classifier.feed(training_data[index]) 88 | model_output = digit_classifier.get_output_onehot() 89 | print('Predicted: ', model_output) 90 | ``` 91 | 92 | ### Performance Graph 93 | 94 | ![Performance Graph](https://raw.githubusercontent.com/RainingComputers/pykitml/master/docs/demo_pics/neural_network_perf_graph.png) 95 | 96 | ### Confusion Matrix 97 | 98 | ![Confusion Matrix](https://raw.githubusercontent.com/RainingComputers/pykitml/master/docs/demo_pics/neural_network_confusion_matrix.png) 99 | -------------------------------------------------------------------------------- /tests/test_banknote.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph, pktest_nograph 2 | 3 | 4 | @pktest_graph 5 | def test_banknote(): 6 | import os.path 7 | 8 | import pykitml as pk 9 | from pykitml.datasets import banknote 10 | 11 | # Download the dataset 12 | if not os.path.exists('banknote.pkl'): 13 | banknote.get() 14 | 15 | # Load banknote data set 16 | inputs_train, outputs_train, inputs_test, outputs_test = banknote.load() 17 | 18 | # Normalize dataset 19 | array_min, array_max = pk.get_minmax(inputs_train) 20 | inputs_train = pk.normalize_minmax(inputs_train, array_min, array_max) 21 | inputs_test = pk.normalize_minmax(inputs_test, array_min, array_max) 22 | 23 | # Create polynomial 
features 24 | inputs_train_poly = pk.polynomial(inputs_train) 25 | inputs_test_poly = pk.polynomial(inputs_test) 26 | 27 | # Create model 28 | banknote_classifier = pk.LogisticRegression(inputs_train_poly.shape[1], 1) 29 | 30 | # Train the model 31 | banknote_classifier.train( 32 | training_data=inputs_train_poly, 33 | targets=outputs_train, 34 | batch_size=10, 35 | epochs=1500, 36 | optimizer=pk.Adam(learning_rate=0.06, decay_rate=0.99), 37 | testing_data=inputs_test_poly, 38 | testing_targets=outputs_test, 39 | testing_freq=30, 40 | decay_freq=40 41 | ) 42 | 43 | # Save it 44 | pk.save(banknote_classifier, 'banknote_classifier.pkl') 45 | 46 | # Plot performance 47 | banknote_classifier.plot_performance() 48 | 49 | # Print accuracy 50 | accuracy = banknote_classifier.accuracy(inputs_train_poly, outputs_train) 51 | print('Train accuracy:', accuracy) 52 | accuracy = banknote_classifier.accuracy(inputs_test_poly, outputs_test) 53 | print('Test accuracy:', accuracy) 54 | 55 | # Plot confusion matrix 56 | banknote_classifier.confusion_matrix(inputs_test_poly, outputs_test) 57 | 58 | # Assert if it has enough accuracy 59 | assert banknote_classifier.accuracy(inputs_test_poly, outputs_test) >= 99 60 | 61 | 62 | @pktest_nograph 63 | def test_predict_banknote(): 64 | import numpy as np 65 | import pykitml as pk 66 | from pykitml.datasets import banknote 67 | 68 | # Predict banknote validity with variance, skewness, curtosis, entropy 69 | # of -2.3, -9.3, 9.37, -0.86 70 | 71 | # Load banknote data set 72 | inputs_train, _, _, _ = banknote.load() 73 | 74 | # Load the model 75 | banknote_classifier = pk.load('banknote_classifier.pkl') 76 | 77 | # Normalize the inputs 78 | array_min, array_max = pk.get_minmax(inputs_train) 79 | input_data = pk.normalize_minmax(np.array([-2.3, -9.3, 9.37, -0.86]), array_min, array_max) 80 | 81 | # Create polynomial features 82 | input_data_poly = pk.polynomial(input_data) 83 | 84 | # Get output 85 | banknote_classifier.feed(input_data_poly) 86 | model_output = banknote_classifier.get_output() 87 | 88 | # Print result 89 | print(model_output) 90 | 91 | 92 | if __name__ == '__main__': 93 | try: 94 | test_banknote.__wrapped__() 95 | test_predict_banknote.__wrapped__() 96 | except AssertionError: 97 | pass 98 | -------------------------------------------------------------------------------- /pykitml/datasets/boston.py: -------------------------------------------------------------------------------- 1 | import os 2 | from urllib import request 3 | 4 | from numpy import genfromtxt 5 | 6 | from .. import pklhandler 7 | 8 | ''' 9 | This module contains helper functions to download and load 10 | the boston housing dataset. 11 | ''' 12 | 13 | 14 | def get(): 15 | ''' 16 | Downloads the boston dataset from 17 | https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ 18 | and saves it as a pkl file `boston.pkl`. 19 | 20 | Raises 21 | ------ 22 | urllib.error.URLError 23 | If internet connection is not available or the URL is not accessible. 24 | OSError 25 | If the file cannot be created due to a system-related error. 28 | 29 | Note 30 | ---- 31 | You only need to call this method once, i.e, after the dataset has been downloaded 32 | and you have the `boston.pkl` file, you don't need to call this method again.
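Example
-------
A minimal sketch of the intended download-once workflow, mirroring the
pattern used in the test suite:

>>> import os.path
>>> from pykitml.datasets import boston
>>> if not os.path.exists('boston.pkl'):
...     boston.get()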
33 | ''' 34 | # Url to download the dataset from 35 | url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data' 36 | 37 | # Download the dataset 38 | print('Downloading housing.data...') 39 | request.urlretrieve(url, 'housing.data') 40 | print('Download complete.') 41 | 42 | # Parse the data and save it as a pkl file 43 | pklhandler.save(genfromtxt('housing.data'), 'boston.pkl') 44 | # Delete unnecessary files 45 | os.remove('housing.data') 46 | print('Deleted unnecessary files.') 47 | 48 | 49 | def load(): 50 | ''' 51 | Loads the boston housing dataset from pkl file. 52 | 53 | The inputs have the following columns: 54 | 55 | - CRIM : 56 | per capita crime rate by town 57 | - ZN : 58 | proportion of residential land zoned for lots over 25,000 sq.ft. 59 | - INDUS : 60 | proportion of non-retail business acres per town 61 | - CHAS : 62 | Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) 63 | - NOX : 64 | nitric oxides concentration (parts per 10 million) 65 | - RM : 66 | average number of rooms per dwelling 67 | - AGE : 68 | proportion of owner-occupied units built prior to 1940 69 | - DIS : 70 | weighted distances to five Boston employment centres 71 | - RAD : 72 | index of accessibility to radial highways 73 | - TAX : 74 | full-value property-tax rate per $10,000 75 | - PTRATIO : 76 | pupil-teacher ratio by town 77 | - B : 78 | 1000(Bk - 0.63)^2 where Bk is the proportion of black by town 79 | - LSTAT : 80 | % lower status of the population 81 | 82 | The outputs are 83 | 84 | - MEDV : 85 | Median value of owner-occupied homes in $1000's 86 | 87 | Returns 88 | ------- 89 | inputs_train : numpy.array 90 | outputs_train : numpy.array 91 | inputs_test : numpy.array 92 | outputs_test : numpy.array 93 | 94 | ''' 95 | data_array = pklhandler.load('boston.pkl') 96 | 97 | inputs_train = data_array[0:500, :-1] 98 | outputs_train = data_array[0:500, -1] 99 | inputs_test = data_array[500:, :-1] 100 | outputs_test = data_array[500:, -1] 101 | 102 | return inputs_train, outputs_train, inputs_test, outputs_test 103 | -------------------------------------------------------------------------------- /tests/test_mnist_svm.py: -------------------------------------------------------------------------------- 1 | from pykitml.testing import pktest_graph 2 | 3 | 4 | @pktest_graph 5 | def test_mnist_svm(): 6 | import os.path 7 | 8 | import numpy as np 9 | import pykitml as pk 10 | from pykitml.datasets import mnist 11 | 12 | # Download dataset 13 | if not os.path.exists('mnist.pkl'): 14 | mnist.get() 15 | 16 | # Load mnist data set 17 | inputs_train, outputs_train, inputs_test, outputs_test = mnist.load() 18 | 19 | # Train on only first 10000 20 | inputs_train = inputs_train[:10000] 21 | outputs_train = outputs_train[:10000] 22 | 23 | # Transform inputs using gaussian kernel 24 | sigma = 3.15 25 | gaussian_inputs_train = pk.gaussian_kernel(inputs_train, inputs_train, sigma) 26 | gaussian_inputs_test = pk.gaussian_kernel(inputs_test, inputs_train, sigma) 27 | 28 | # Format the outputs for svm training, zeros to -1 29 | svm_outputs_train = np.where(outputs_train == 0, -1, 1) 30 | svm_outputs_test = np.where(outputs_test == 0, -1, 1) 31 | 32 | # Create model 33 | svm_mnist_classifier = pk.SVM(gaussian_inputs_train.shape[1], 10) 34 | 35 | # Train the model 36 | svm_mnist_classifier.train( 37 | training_data=gaussian_inputs_train, 38 | targets=svm_outputs_train, 39 | batch_size=20, 40 | epochs=1000, 41 | optimizer=pk.Adam(learning_rate=3.5, decay_rate=0.95), 42 |
testing_data=gaussian_inputs_test, 43 | testing_targets=svm_outputs_test, 44 | testing_freq=30, 45 | decay_freq=10 46 | ) 47 | 48 | # Save it 49 | pk.save(svm_mnist_classifier, 'svm_mnist_classifier.pkl') 50 | 51 | # Print accuracy 52 | accuracy = svm_mnist_classifier.accuracy(gaussian_inputs_train, outputs_train) 53 | print('Train accuracy:', accuracy) 54 | accuracy = svm_mnist_classifier.accuracy(gaussian_inputs_test, outputs_test) 55 | print('Test accuracy:', accuracy) 56 | 57 | # Plot performance 58 | svm_mnist_classifier.plot_performance() 59 | 60 | # Plot confusion matrix 61 | svm_mnist_classifier.confusion_matrix(gaussian_inputs_test, outputs_test) 62 | 63 | # Assert if it has enough accuracy 64 | assert svm_mnist_classifier.accuracy(gaussian_inputs_train, outputs_train) >= 90 65 | 66 | 67 | @pktest_graph 68 | def test_predict_mnist_svm(): 69 | import random 70 | 71 | import matplotlib.pyplot as plt 72 | import pykitml as pk 73 | from pykitml.datasets import mnist 74 | 75 | # Load dataset 76 | inputs_train, outputs_train, _, _ = mnist.load() 77 | 78 | # Use only first 10000 79 | inputs_train = inputs_train[:10000] 80 | outputs_train = outputs_train[:10000] 81 | 82 | # Load the trained network 83 | svm_mnist_classifier = pk.load('svm_mnist_classifier.pkl') 84 | 85 | # Pick a random example from the training data 86 | index = random.randint(0, 9000) 87 | 88 | # Show the example and the label 89 | plt.imshow(inputs_train[index].reshape(28, 28)) 90 | plt.show() 91 | print('Label: ', outputs_train[index]) 92 | 93 | # Transform the input with the same sigma used for training 94 | input_data = pk.gaussian_kernel(inputs_train[index], inputs_train, 3.15) 95 | 96 | # Show prediction 97 | svm_mnist_classifier.feed(input_data) 98 | model_output = svm_mnist_classifier.get_output_onehot() 99 | print('Predicted: ', model_output) 100 | 101 | 102 | if __name__ == '__main__': 103 | try: 104 | test_mnist_svm.__wrapped__() 105 | test_predict_mnist_svm.__wrapped__() 106 | except AssertionError: 107 | pass 108 | -------------------------------------------------------------------------------- /docs/Normalization.rst: -------------------------------------------------------------------------------- 1 | Normalization/Feature-scaling 2 | ============================= 3 | 4 | Min-Max Normalization 5 | --------------------- 6 | 7 | .. autofunction:: pykitml.get_minmax 8 | 9 | .. autofunction:: pykitml.normalize_minmax 10 | 11 | .. autofunction:: pykitml.denormalize_minmax 12 | 13 | **Example** 14 | 15 | >>> import numpy as np 16 | >>> import pykitml as pk 17 | >>> a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]) 18 | >>> min_array, max_array = pk.get_minmax(a) 19 | >>> normalized_a = pk.normalize_minmax(a, min_array, max_array) 20 | >>> normalized_a 21 | array([[0. , 0. , 0. , 0. ], 22 | [0.33333333, 0.33333333, 0.33333333, 0.33333333], 23 | [0.66666667, 0.66666667, 0.66666667, 0.66666667], 24 | [1. , 1. , 1. , 1. ]]) 25 | >>> pk.denormalize_minmax(normalized_a, min_array, max_array) 26 | array([[ 1., 2., 3., 4.], 27 | [ 5., 6., 7., 8.], 28 | [ 9., 10., 11., 12.], 29 | [13., 14., 15., 16.]]) 30 | 31 | You can also normalize/denormalize only specific columns: 32 | 33 | >>> import numpy as np 34 | >>> import pykitml as pk 35 | >>> a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]) 36 | >>> min_array, max_array = pk.get_minmax(a) 37 | >>> normalized_a = pk.normalize_minmax(a, min_array, max_array, cols=[0, 2]) 38 | >>> normalized_a 39 | array([[ 0. , 2. , 0. , 4. ], 40 | [ 0.33333333, 6. , 0.33333333, 8.
], 41 | [ 0.66666667, 10. , 0.66666667, 12. ], 42 | [ 1. , 14. , 1. , 16. ]]) 43 | >>> pk.denormalize_minmax(normalized_a, min_array, max_array, cols=[0, 2]) 44 | array([[ 1., 2., 3., 4.], 45 | [ 5., 6., 7., 8.], 46 | [ 9., 10., 11., 12.], 47 | [13., 14., 15., 16.]]) 48 | 49 | Mean Normalization 50 | ------------------ 51 | 52 | .. autofunction:: pykitml.get_meanstd 53 | 54 | .. autofunction:: pykitml.normalize_mean 55 | 56 | .. autofunction:: pykitml.denormalize_mean 57 | 58 | **Example** 59 | 60 | >>> import numpy as np 61 | >>> import pykitml as pk 62 | >>> a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]) 63 | >>> array_mean, array_stddev = pk.get_meanstd(a) 64 | >>> normalized_a = pk.normalize_mean(a, array_mean, array_stddev) 65 | >>> normalized_a 66 | array([[-1.34164079, -1.34164079, -1.34164079, -1.34164079], 67 | [-0.4472136 , -0.4472136 , -0.4472136 , -0.4472136 ], 68 | [ 0.4472136 , 0.4472136 , 0.4472136 , 0.4472136 ], 69 | [ 1.34164079, 1.34164079, 1.34164079, 1.34164079]]) 70 | >>> pk.denormalize_mean(normalized_a, array_mean, array_stddev) 71 | array([[ 1., 2., 3., 4.], 72 | [ 5., 6., 7., 8.], 73 | [ 9., 10., 11., 12.], 74 | [13., 14., 15., 16.]]) 75 | 76 | You can also normalize/denormalize only specific columns: 77 | 78 | >>> import numpy as np 79 | >>> import pykitml as pk 80 | >>> a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]) 81 | >>> array_mean, array_stddev = pk.get_meanstd(a) 82 | >>> normalized_a = pk.normalize_mean(a, array_mean, array_stddev, cols=[0,2]) 83 | >>> normalized_a 84 | array([[-1.34164079, 2. , -1.34164079, 4. ], 85 | [-0.4472136 , 6. , -0.4472136 , 8. ], 86 | [ 0.4472136 , 10. , 0.4472136 , 12. ], 87 | [ 1.34164079, 14. , 1.34164079, 16. ]]) 88 | >>> pk.denormalize_mean(normalized_a, array_mean, array_stddev, cols=[0,2]) 89 | array([[ 1., 2., 3., 4.], 90 | [ 5., 6., 7., 8.], 91 | [ 9., 10., 11., 12.], 92 | [13., 14., 15., 16.]]) 93 | 94 | -------------------------------------------------------------------------------- /pykitml/_single_layer_model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import numpy as np 4 | 5 | from ._minimize_model import MinimizeModel 6 | 7 | 8 | class SingleLayerModel(MinimizeModel, ABC): 9 | ''' 10 | General base class for single layer models. 11 | ''' 12 | 13 | def __init__(self, input_size, output_size, reg_param=0): 14 | ''' 15 | Parameters 16 | ---------- 17 | input_size : int 18 | Size of input data or number of input features. 19 | output_size : int 20 | Number of categories or groups. 21 | reg_param : float 22 | Regularization parameter for the model, also known as 'weight decay'.
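Setting :code:`reg_param` to 0 (the default) disables regularization.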
23 | ''' 24 | # Save sizes 25 | self._input_size = input_size 26 | self._output_size = output_size 27 | 28 | # Initialize regularization parameter 29 | self._reg_param = reg_param 30 | self._reg_param_half = reg_param/2 31 | 32 | # Initialize weights and parameters 33 | epsilon = np.sqrt(6)/(np.sqrt(output_size) + np.sqrt(input_size)) 34 | weights = np.random.rand(output_size, input_size)*2*epsilon - epsilon 35 | biases = np.random.rand(output_size) * 2 * epsilon - epsilon 36 | 37 | # Numpy array to store activations 38 | self._inputa = np.array([]) 39 | self.a = np.array([]) 40 | self.z = np.array([]) 41 | 42 | # Put parameters in numpy dtype=object array 43 | W = 0 # Weights 44 | B = 1 # Biases 45 | self._params = np.array([None, None], dtype=object) 46 | self._params[W] = weights 47 | self._params[B] = biases 48 | 49 | @property 50 | def _mparams(self): 51 | return self._params 52 | 53 | @_mparams.setter 54 | def _mparams(self, mparams): 55 | self._params = mparams 56 | 57 | @property 58 | def _cost_function(self): 59 | return self._cost_func 60 | 61 | @property 62 | def _out_size(self): 63 | return self._output_size 64 | 65 | def feed(self, input_data): 66 | # Constants 67 | W = 0 # Weights 68 | B = 1 # Biases 69 | 70 | # feed 71 | self._inputa = input_data 72 | self.z = (input_data @ self._params[W].T) + self._params[B] 73 | self.a = self._activ_func(self.z) 74 | 75 | def get_output(self): 76 | return self.a.squeeze() 77 | 78 | def _backpropagate(self, index, targets): 79 | # Constants 80 | W = 0 # Weights 81 | B = 1 # Biases 82 | 83 | # Gradients 84 | da_dz = self._activ_func_prime(self.z[index], self.a[index]) 85 | dc_db = self._cost_func_prime(self.a[index], targets) * da_dz 86 | dc_dw = np.multiply.outer(dc_db, self._inputa[index]) 87 | 88 | # Add regularization 89 | dc_dw += self._reg_param*self._params[W] 90 | 91 | # Return gradient 92 | gradient = np.array([None, None], dtype=object) 93 | gradient[W] = dc_dw 94 | gradient[B] = dc_db 95 | return gradient 96 | 97 | @property 98 | def bptt(self): 99 | return False 100 | 101 | def _get_norm_weights(self): 102 | W = 0 103 | return self._reg_param_half*(self._params[W]**2).sum() 104 | 105 | @property 106 | @abstractmethod 107 | def _activ_func(self): 108 | pass 109 | 110 | @property 111 | @abstractmethod 112 | def _activ_func_prime(self): 113 | pass 114 | 115 | @property 116 | @abstractmethod 117 | def _cost_func(self): 118 | pass 119 | 120 | @property 121 | @abstractmethod 122 | def _cost_func_prime(self): 123 | pass 124 | -------------------------------------------------------------------------------- /pykitml/datasets/banknote.py: -------------------------------------------------------------------------------- 1 | import os 2 | from urllib import request 3 | 4 | import numpy as np 5 | from numpy import genfromtxt 6 | 7 | from .. import pklhandler 8 | 9 | ''' 10 | This module contains helper functions to download and load 11 | the banknote dataset. 12 | ''' 13 | 14 | 15 | def get(): 16 | ''' 17 | Downloads the banknote dataset from 18 | http://archive.ics.uci.edu/ml/datasets/banknote+authentication 19 | and saves it as a pkl file `banknote.pkl`. 20 | 21 | Raises 22 | ------ 23 | urllib.error.URLError 24 | If internet connection is not available or the URL is not accessible. 25 | OSError 26 | If the file cannot be created due to a system-related error.
29 | 30 | Note 31 | ---- 32 | You only need to call this method once, i.e, after the dataset has been downloaded 33 | and you have the `banknote.pkl` file, you don't need to call this method again. 34 | ''' 35 | # Url to download the dataset from 36 | url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt' 37 | 38 | # Download the dataset 39 | print('Downloading data_banknote_authentication.txt') 40 | request.urlretrieve(url, 'data_banknote_authentication.txt') 41 | print('Download complete.') 42 | 43 | # Parse the data and save it as a pkl file 44 | pklhandler.save(genfromtxt('data_banknote_authentication.txt', delimiter=','), 'banknote.pkl') 45 | 46 | # Delete unnecessary files 47 | os.remove('data_banknote_authentication.txt') 48 | print('Deleted unnecessary files.') 49 | 50 | 51 | def load(): 52 | ''' 53 | Loads the banknote data from pkl file. 54 | 55 | The inputs have the following columns: 56 | 57 | - Variance of Wavelet Transformed image (continuous) 58 | - Skewness of Wavelet Transformed image (continuous) 59 | - Curtosis of Wavelet Transformed image (continuous) 60 | - Entropy of image (continuous) 61 | 62 | The outputs are: 63 | 64 | - 0 = Real 65 | - 1 = Counterfeit 66 | 67 | Returns 68 | ------- 69 | inputs_train : numpy.array 70 | 1102x4 numpy array containing training inputs. 71 | outputs_train : numpy.array 72 | Numpy array of size 1102. 73 | inputs_test : numpy.array 74 | 270x4 numpy array containing testing inputs. 75 | outputs_test : numpy.array 76 | Numpy array of size 270. 77 | 78 | ''' 79 | data_array = pklhandler.load('banknote.pkl') 80 | 81 | # Separate data, positive and negative examples 82 | negative_examples = data_array[:762] 83 | positive_examples = data_array[762:] 84 | 85 | # Separate into training and testing 86 | negative_examples_test = negative_examples[:150] 87 | negative_examples_train = negative_examples[150:] 88 | positive_examples_test = positive_examples[:120] 89 | positive_examples_train = positive_examples[120:] 90 | 91 | # Join them to form training and testing dataset 92 | train = np.concatenate((negative_examples_train, positive_examples_train), axis=0) 93 | test = np.concatenate((negative_examples_test, positive_examples_test), axis=0) 94 | 95 | # Shuffle the dataset 96 | shuff_indices = np.arange(train.shape[0]) 97 | np.random.shuffle(shuff_indices) 98 | train = train[shuff_indices] 99 | shuff_indices = np.arange(test.shape[0]) 100 | np.random.shuffle(shuff_indices) 101 | test = test[shuff_indices] 102 | 103 | inputs_train = train[:, :-1] 104 | outputs_train = train[:, -1] 105 | inputs_test = test[:, :-1] 106 | outputs_test = test[:, -1] 107 | 108 | return inputs_train, outputs_train, inputs_test, outputs_test 109 | -------------------------------------------------------------------------------- /pykitml/kmeans_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tqdm 3 | 4 | from . import _functions 5 | 6 | 7 | def kmeans(training_data, nclusters, max_iter=1000, trials=50): 8 | ''' 9 | Identifies cluster centres on training data using k-means. 10 | 11 | Parameters 12 | ---------- 13 | training_data : numpy.array 14 | Numpy array containing training data. 15 | nclusters : int 16 | Number of clusters to find. 17 | max_iter : int 18 | Maximum number of iterations to run per trial. 19 | trials : int 20 | Number of times k-means should run, each with different 21 | random initialization.
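Cluster centres from the trial with the lowest cost are the ones returned.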
22 | 23 | Returns 24 | ------- 25 | clusters : numpy.array 26 | Numpy array containing cluster centres. 27 | cost : float 28 | The cost of the converged cluster centres. 29 | 30 | ''' 31 | 32 | # Keep track of trial with the least cost 33 | min_cost = float('infinity') 34 | distances = None 35 | clusters_min_cost = None 36 | clusters = None 37 | 38 | # Keep log of maximum number of iterations for convergence 39 | max_iter_log = 0 40 | 41 | pbar = tqdm.trange(0, trials, ncols=80, unit='trials') 42 | for _ in pbar: 43 | # Use kmeans++ to initialize cluster centres 44 | clusters = np.zeros((nclusters, training_data.shape[1])) 45 | 46 | # First cluster centre is random 47 | index = np.random.randint(training_data.shape[0], size=1) 48 | clusters[0] = training_data[index] 49 | 50 | # Loop for rest of cluster centres 51 | for i in range(1, nclusters): 52 | # Calculate distance between every data point and previous cluster centre 53 | prev_cluster_dists = _functions.pdist(clusters[i-1], training_data).squeeze() 54 | # Normalize distances 55 | prev_cluster_dists = prev_cluster_dists/prev_cluster_dists.sum() 56 | 57 | # Sample index with probability distribution proportional to distances 58 | index = np.random.choice(training_data.shape[0], 1, p=prev_cluster_dists) 59 | 60 | # Assign next cluster centre 61 | clusters[i] = training_data[index] 62 | 63 | # Start kmeans, keep looping and moving the cluster centres to the mean 64 | for iteration in range(max_iter): 65 | new_clusters = np.zeros((nclusters, training_data.shape[1])) 66 | 67 | # Calculate distances between clusters and every point in training data 68 | distances = _functions.pdist(training_data, clusters) 69 | 70 | # Assign a cluster index to each data point 71 | cluster_assignments = np.argmin(distances, axis=1) 72 | 73 | # Move cluster by taking mean of all the points assigned to that cluster 74 | for i in range(nclusters): 75 | cluster_points = training_data[cluster_assignments == i] 76 | if cluster_points.shape[0] == 0: 77 | continue 78 | new_clusters[i] = np.mean(cluster_points, axis=0) 79 | 80 | # Check for convergence 81 | if(np.abs(new_clusters-clusters) == 0).all(): 82 | break 83 | 84 | # Assign new clusters 85 | clusters = new_clusters 86 | 87 | # Select cluster centres with least cost 88 | cost = np.mean(np.min(distances, axis=1)) 89 | if cost < min_cost: 90 | clusters_min_cost = clusters 91 | min_cost = cost 92 | 93 | # Update maximum iterations for convergence 94 | if iteration > max_iter_log: 95 | max_iter_log = iteration 96 | 97 | # Update progress bar 98 | pbar.set_postfix(cost=min_cost, max_it=max_iter_log) 99 | 100 | return clusters_min_cost, min_cost 101 | -------------------------------------------------------------------------------- /pykitml/random_search.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class RandomSearch: 6 | ''' 7 | This class is used to search for hyperparameters. 8 | ''' 9 | 10 | def __init__(self): 11 | self._curr_cost = None 12 | self._best = False 13 | 14 | @property 15 | def best(self): 16 | ''' 17 | Whether the last generated set of hyperparameters is the best so far. 18 | 19 | Note 20 | ---- 21 | This property has to be used AFTER calling :py:func:`set_cost`. 22 | ''' 23 | return self._best 24 | 25 | def set_cost(self, cost): 26 | ''' 27 | Set the cost for the current set of hyperparameters. 28 | 29 | Parameters 30 | ---------- 31 | cost : float 32 | The cost corresponding to the current set of hyperparameters.
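Example
-------
A minimal sketch of the intended search loop; :code:`evaluate_model` is a
hypothetical helper that trains a model and returns its validation cost:

>>> search = RandomSearch()
>>> for lr, reg in search.search(10, 3, 2, [-4, -1, 'log'], [0, 1, 'float']):
...     search.set_cost(evaluate_model(lr, reg))
...     if search.best:
...         print('Best so far:', lr, reg)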
33 | ''' 34 | self._curr_cost = cost 35 | 36 | def search(self, nsamples, nzoom, zoomratio, *args): 37 | ''' 38 | Generator function to loop through randomly generated hyperparameters. 39 | Total number of hyperparameters sampled will be :code:`nsamples*nzoom`. 40 | First :code:`nsamples` points will be sampled, then the function will 41 | 'zoom in' around the best sample, and :code:`nsamples` more points will 42 | be sampled. This will be repeated :code:`nzoom` times. 43 | The range for each hyperparameter should be passed as a list to 44 | :code:`*args`. The range should be :code:`[from, to, 'type']`, 45 | e.g. :code:`[0.8, 1, 'float']`. Three range types are available: 46 | :code:`'float'`, :code:`'int'`, :code:`'log'`. 47 | 48 | Parameters 49 | ---------- 50 | nsamples : int 51 | Number of hyperparameters to sample. 52 | nzoom : int 53 | Number of times to zoom in. 54 | zoomratio : float 55 | How much to zoom in. 56 | *args 57 | Range specification for each hyperparameter. 58 | ''' 59 | best_params = None 60 | min_cost = float('inf') 61 | range_types = args 62 | 63 | for z in range(nzoom): 64 | for i in range(nsamples): 65 | params = [] 66 | # Generate hyperparameters 67 | for rtype in range_types: 68 | l = rtype[0] 69 | u = rtype[1] 70 | if rtype[2] == 'int': 71 | params.append(random.randint(int(l), int(u))) 72 | elif rtype[2] == 'float': 73 | params.append(random.uniform(l, u)) 74 | elif rtype[2] == 'log': 75 | params.append(10**random.uniform(l, u)) 76 | 77 | print(f'Testing {i+1}/{nsamples}, zoomlvl {z+1}, params = {params}') 78 | 79 | # Yield 80 | yield params 81 | 82 | # Track best ones 83 | if self._curr_cost < min_cost: 84 | min_cost = self._curr_cost 85 | best_params = params 86 | self._best = True 87 | else: 88 | self._best = False 89 | 90 | # Zoom in around the best set of hyperparams 91 | new_range_types = [] 92 | for best_param, rtype in zip(best_params, range_types): 93 | l = rtype[0] 94 | u = rtype[1] 95 | diff = u-l 96 | if rtype[2] == 'log': 97 | best_param = math.log10(best_param) 98 | new_l = best_param-(diff/zoomratio) 99 | new_u = best_param+(diff/zoomratio) 100 | new_range_types.append([new_l, new_u, rtype[2]]) 101 | range_types = new_range_types 102 | 103 | # Print the best one 104 | print('\nSearch Finished') 105 | print('===============') 106 | print('Best params:', best_params) 107 | print('Best cost:', min_cost) 108 | -------------------------------------------------------------------------------- /pykitml/datasets/heartdisease.py: -------------------------------------------------------------------------------- 1 | import os 2 | from urllib import request 3 | 4 | import numpy as np 5 | 6 | from .. import pklhandler 7 | 8 | ''' 9 | This module contains helper functions to download and load 10 | the heart disease dataset. 11 | ''' 12 | 13 | 14 | def get(): 15 | ''' 16 | Downloads the heartdisease dataset from 17 | https://archive.ics.uci.edu/ml/datasets/Heart+Disease 18 | and saves it as a pkl file `heartdisease.pkl`. 19 | 20 | Raises 21 | ------ 22 | urllib.error.URLError 23 | If internet connection is not available or the URL is not accessible. 24 | OSError 25 | If the file cannot be created due to a system-related error. 28 | 29 | Note 30 | ---- 31 | You only need to call this method once, i.e, after the dataset has been downloaded 32 | and you have the `heartdisease.pkl` file, you don't need to call this method again.
33 | ''' 34 | # Url to download the dataset from 35 | url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data' 36 | 37 | # Download the dataset 38 | print('Downloading processed.cleveland.data...') 39 | request.urlretrieve(url, 'processed.cleveland.data') 40 | print('Download complete.') 41 | 42 | # Parse data and save it as a pkl file. 43 | data_array = [] 44 | # Open the file and put the values in a list. 45 | with open('processed.cleveland.data', 'r') as datafile: 46 | for line in datafile: 47 | try: 48 | data_array.append(list(map(float, line.split(',')))) 49 | except ValueError: 50 | continue 51 | # Convert the list into a numpy array. 52 | heartdisease_data_array = np.array(data_array) 53 | # Save as a pkl file. 54 | pklhandler.save(heartdisease_data_array, 'heartdisease.pkl') 55 | 56 | # Delete unnecessary files. 57 | os.remove('processed.cleveland.data') 58 | print('Deleted unnecessary files.') 59 | 60 | 61 | def load(): 62 | ''' 63 | Loads heart disease dataset from saved pickle file `heartdisease.pkl` to numpy arrays. 64 | Loads data without any preprocessing. 65 | 66 | Returns 67 | ------- 68 | inputs : numpy.array 69 | 297x13 numpy array. 297 training examples, each example having 13 inputs (columns). 70 | The 13 columns correspond to: 71 | :code:`age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal`. 72 | 73 | - age : Age in years 74 | - sex : 1=male, 0=female 75 | - cp : Chest pain type (1=typical-angina, 2=atypical-angina, 3=non-anginal, 4=asymptomatic) 76 | - trestbps : Resting blood pressure in mmHg 77 | - chol : Serum cholesterol in mg/dl 78 | - fbs : Fasting blood sugar > 120 mg/dl? (1=true, 0=false) 79 | - restecg : Resting electrocardiographic results (0=normal, 1=ST-T-abnormality, 2=left-ventricular-hypertrophy) 80 | - thalach : Maximum heart rate achieved 81 | - exang : Exercise induced angina (1=yes, 0=no) 82 | - oldpeak : ST depression induced by exercise relative to rest 83 | - slope : Slope of the peak exercise ST segment (1=upsloping, 2=flat, 3=downsloping) 84 | - ca : Number of major vessels colored by fluoroscopy (0-3) 85 | - thal : 3=normal, 6=fixed-defect, 7=reversable-defect 86 | 87 | outputs : numpy.array 88 | Numpy array with 297 elements. 89 | 90 | - 0: < 50% diameter narrowing 91 | - 1: > 50% diameter narrowing 92 | 93 | Raises 94 | ------ 95 | FileNotFoundError 96 | If `heartdisease.pkl` file does not exist, i.e, if the dataset was not 97 | downloaded and saved using the :py:func:`~get` method. 98 | ''' 99 | # Load data from pkl file. 100 | heartdisease_data_array = pklhandler.load('heartdisease.pkl') 101 | inputs = heartdisease_data_array[:, :-1] 102 | outputs = (heartdisease_data_array[:, -1] > 0)*1 103 | 104 | # return data 105 | return inputs, outputs 106 | -------------------------------------------------------------------------------- /pykitml/datasets/sonar.py: -------------------------------------------------------------------------------- 1 | import os 2 | from urllib import request 3 | 4 | import numpy as np 5 | 6 | from .. import pklhandler 7 | 8 | ''' 9 | This module contains helper functions to download the sonar dataset. 10 | ''' 11 | 12 | 13 | def get(): 14 | ''' 15 | Downloads the sonar dataset from 16 | https://archive.ics.uci.edu/ml/datasets/Connectionist+Bench+(Sonar,+Mines+vs.+Rocks) 17 | and saves it as a pkl file `sonar.pkl`. 18 | 19 | Raises 20 | ------ 21 | urllib.error.URLError 22 | If internet connection is not available or the URL is not accessible.
23 | OSError 24 | If the file cannot be created due to a system-related error. 25 | KeyError 26 | If the downloaded data contains an unknown class label. 27 | 28 | Note 29 | ---- 30 | You only need to call this method once, i.e, after the dataset has been downloaded 31 | and you have the `sonar.pkl` file, you don't need to call 32 | this method again. 33 | ''' 34 | # Url to download the dataset from 35 | url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data' # pylint: disable=line-too-long 36 | 37 | # Download the dataset 38 | print('Downloading sonar.all-data...') 39 | request.urlretrieve(url, 'sonar.all-data') 40 | print('Download complete.') 41 | 42 | out_dict = { 43 | 'R\n': 0, 'M\n': 1 44 | } 45 | 46 | # Parse data and save it as pkl file 47 | data_array = [] 48 | # Open the file and put the values in a list. 49 | with open('sonar.all-data', 'r') as datafile: 50 | for line in datafile: 51 | values = line.split(',') 52 | values[-1] = out_dict[values[-1]] 53 | data_array.append(list(map(float, values))) 54 | # Convert the list to numpy array 55 | sonar_data_array = np.array(data_array) 56 | # Save it as a pkl file 57 | pklhandler.save(sonar_data_array, 'sonar.pkl') 58 | 59 | # Delete files 60 | os.remove('sonar.all-data') 61 | 62 | 63 | def load(): 64 | ''' 65 | Loads the sonar dataset from `sonar.pkl` file. 66 | 67 | Each pattern is a set of 60 numbers in the range 0.0 to 1.0. 68 | Each number represents the energy within a particular frequency band, 69 | integrated over a certain period of time. The integration aperture for 70 | higher frequencies occurs later in time, since these frequencies are 71 | transmitted later during the chirp. 72 | 73 | The label associated with each record contains the letter 74 | "R" if the object is a rock and "M" if it is a mine (metal cylinder). 75 | 76 | Returns 77 | ------- 78 | inputs_train : numpy.array 79 | 190x60 numpy array containing training inputs. 80 | outputs_train : numpy.array 81 | Numpy array of size 190. 82 | inputs_test : numpy.array 83 | 18x60 numpy array containing testing inputs. 84 | outputs_test : numpy.array 85 | Numpy array of size 18. 86 | 87 | Raises 88 | ------ 89 | FileNotFoundError 90 | If `sonar.pkl` file does not exist, 91 | i.e, if the dataset was not downloaded and saved using the 92 | :py:func:`~get` method.
93 | 94 | ''' 95 | # Load the data from pkl file 96 | sonar_data_array = pklhandler.load('sonar.pkl') 97 | 98 | # Split into train and test 99 | train_neg = sonar_data_array[0:90] 100 | train_pos = sonar_data_array[97:197] 101 | test_neg = sonar_data_array[90:97] 102 | test_pos = sonar_data_array[197:208] 103 | 104 | # Shuffle the dataset, join neg and pos examples 105 | train = np.concatenate((train_pos, train_neg), axis=0) 106 | np.random.shuffle(train) 107 | test = np.concatenate((test_pos, test_neg), axis=0) 108 | np.random.shuffle(test) 109 | 110 | # Split the dataset into inputs and outputs 111 | inputs_train = train[:, :-1] 112 | outputs_train = train[:, -1] 113 | inputs_test = test[:, :-1] 114 | outputs_test = test[:, -1] 115 | 116 | # return 117 | return inputs_train, outputs_train, inputs_test, outputs_test 118 | -------------------------------------------------------------------------------- /tests/test_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from pykitml import _functions 4 | 5 | eg_ws = np.array([[0.1, -0.2, 0.3], [-0.4, 0.5, -0.6]]) 6 | 7 | # ============================= 8 | # = Test activation functions = 9 | # ============================= 10 | 11 | 12 | def test_sigmoid(): 13 | expected_output = np.array([[0.52497919, 0.450166, 0.57444252], 14 | [0.40131234, 0.62245933, 0.35434369]]) 15 | 16 | assert np.allclose(_functions.sigmoid(eg_ws), expected_output) 17 | 18 | 19 | def test_tanh(): 20 | expected_output = np.array([[0.09966799, -0.19737532, 0.29131261], 21 | [-0.37994896, 0.46211716, -0.53704957]]) 22 | 23 | assert np.allclose(_functions.tanh(eg_ws), expected_output) 24 | 25 | 26 | def test_leakyrelu(): 27 | expected_output = np.array([[0.1, -0.002, 0.3], 28 | [-0.004, 0.5, -0.006]]) 29 | 30 | assert np.allclose(_functions.leakyrelu(eg_ws), expected_output) 31 | 32 | 33 | def test_relu(): 34 | expected_output = np.array([[0.1, 0, 0.3], [0, 0.5, 0]]) 35 | 36 | assert np.allclose(_functions.relu(eg_ws), expected_output) 37 | 38 | 39 | def test_softmax(): 40 | expected_output = np.array([[0.33758454, 0.25008878, 0.41232669], 41 | [0.23373585, 0.57489742, 0.19136673]]) 42 | 43 | assert np.allclose(_functions.softmax(eg_ws), expected_output) 44 | 45 | # =========================================== 46 | # = Test derivative of activation functions = 47 | # =========================================== 48 | 49 | 50 | def test_sigmoid_prime(): 51 | activ = _functions.sigmoid(eg_ws) 52 | 53 | expected_output = np.array([[0.24937604, 0.24751657, 0.24445831], 54 | [0.24026075, 0.23500371, 0.22878424]]) 55 | 56 | assert np.allclose(_functions.sigmoid_prime(eg_ws, activ), expected_output) 57 | 58 | 59 | def test_tanh_prime(): 60 | activ = _functions.tanh(eg_ws) 61 | 62 | expected_output = np.array([[0.99006629, 0.96104298, 0.91513696], 63 | [0.85563879, 0.78644773, 0.71157776]]) 64 | 65 | assert np.allclose(_functions.tanh_prime(eg_ws, activ), expected_output) 66 | 67 | 68 | def test_leakyrelu_prime(): 69 | activ = _functions.leakyrelu(eg_ws) 70 | 71 | expected_output = np.array([[1., 0.01, 1.], 72 | [0.01, 1., 0.01]]) 73 | 74 | assert np.allclose(_functions.leakyrelu_prime(eg_ws, activ), expected_output) 75 | 76 | 77 | def test_relu_prime(): 78 | activ = _functions.relu(eg_ws) 79 | 80 | expected_output = np.array([[1, 0, 1], [0, 1, 0]]) 81 | 82 | assert np.allclose(_functions.relu_prime(eg_ws, activ), expected_output) 83 | 84 | 85 | def test_softmax_prime(): 86 | activ = _functions.leakyrelu(eg_ws) 
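# Note: softmax_prime only reads its `activations` argument
# (it computes activations * (1 - activations)), so the leaky-ReLU values
# above serve as arbitrary test activations; expected_output below was
# computed from them.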
87 | 88 | expected_output = np.array([[0.09, -0.002004, 0.21], 89 | [-0.004016, 0.25, -0.006036]]) 90 | 91 | assert np.allclose(_functions.softmax_prime(eg_ws, activ), expected_output) 92 | 93 | # ======================= 94 | # = Test cost functions = 95 | # ======================= 96 | 97 | 98 | def test_mse(): 99 | eg_output = np.array([0.1, 0.4, -0.1, 0.3]) 100 | eg_target = np.array([0.2, 0.3, -0.5, 0.2]) 101 | expected_output = np.array([0.005, 0.005, 0.08, 0.005]) 102 | 103 | assert np.allclose(_functions.mse(eg_output, eg_target), expected_output) 104 | 105 | 106 | def test_cross_entropy(): 107 | eg_output = np.array([0.3, 0.1, 0.9, 0.7]) 108 | eg_target = np.array([1, 0, 1, 1]) 109 | expected_output = np.array([1.2039728, 0.10536052, 0.10536052, 0.35667494]) 110 | 111 | assert np.allclose(_functions.cross_entropy(eg_output, eg_target), expected_output) 112 | 113 | # ===================================== 114 | # = Test derivative of cost functions = 115 | # ===================================== 116 | 117 | 118 | def test_mse_prime(): 119 | eg_output = np.array([0.1, 0.4, -0.1, 0.3]) 120 | eg_target = np.array([0.2, 0.3, -0.5, 0.2]) 121 | expected_output = np.array([-0.1, 0.1, 0.4, 0.1]) 122 | 123 | assert np.allclose(_functions.mse_prime(eg_output, eg_target), expected_output) 124 | 125 | 126 | def test_cross_entropy_prime(): 127 | eg_output = np.array([0.3, 0.1, 0.9, 0.7]) 128 | eg_target = np.array([1, 0, 1, 1]) 129 | expected_output = np.array([-3.33333333, 1.11111111, -1.11111111, -1.42857143]) 130 | 131 | assert np.allclose(_functions.cross_entropy_prime(eg_output, eg_target), expected_output) 132 | -------------------------------------------------------------------------------- /pykitml/_heatmap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | import matplotlib.pyplot as plt 4 | 5 | ''' 6 | This module contains helper functions to draw heatmaps. 7 | REF: https://matplotlib.org/3.1.1/gallery/images_contours_and_fields/image_annotated_heatmap.html 8 | ''' 9 | 10 | 11 | def heatmap(data, row_labels, col_labels, ax=None, cbar_kw={}, cbarlabel='', **kwargs): 12 | ''' 13 | Create a heatmap from a numpy array and two lists of labels. 14 | 15 | Parameters 16 | ---------- 17 | data 18 | A 2D numpy array of shape (N, M). 19 | row_labels 20 | A list or array of length N with the labels for the rows. 21 | col_labels 22 | A list or array of length M with the labels for the columns. 23 | ax 24 | A `matplotlib.axes.Axes` instance to which the heatmap is plotted. If 25 | not provided, use current axes or create a new one. Optional. 26 | cbar_kw 27 | A dictionary with arguments to `matplotlib.Figure.colorbar`. Optional. 28 | cbarlabel 29 | The label for the colorbar. Optional. 30 | **kwargs 31 | All other arguments are forwarded to `imshow`. 32 | ''' 33 | 34 | if not ax: 35 | ax = plt.gca() 36 | 37 | # Plot the heatmap 38 | im = ax.imshow(data, **kwargs) 39 | 40 | # Create colorbar 41 | cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw) 42 | cbar.ax.set_ylabel(cbarlabel, rotation=-90, va='bottom') 43 | 44 | # We want to show all ticks... 45 | ax.set_xticks(np.arange(data.shape[1])) 46 | ax.set_yticks(np.arange(data.shape[0])) 47 | # ... and label them with the respective list entries. 48 | ax.set_xticklabels(col_labels) 49 | ax.set_yticklabels(row_labels) 50 | 51 | # Let the horizontal axes labeling appear on top. 
52 | ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False) 53 | 54 | # Rotate the tick labels and set their alignment. 55 | plt.setp(ax.get_xticklabels(), rotation=-30, ha='right', rotation_mode='anchor') 56 | 57 | # Turn spines off and create white grid. 58 | for _, spine in ax.spines.items(): 59 | spine.set_visible(False) 60 | 61 | ax.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True) 62 | ax.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True) 63 | ax.grid(which='minor', color='w', linestyle='-', linewidth=3) 64 | ax.tick_params(which='minor', bottom=False, left=False) 65 | 66 | return im, cbar 67 | 68 | 69 | def annotate_heatmap(im, data=None, valfmt='{x:.2f}', textcolors=['black', 'white'], 70 | threshold=None, **textkw): 71 | ''' 72 | A function to annotate a heatmap. 73 | 74 | Parameters 75 | ---------- 76 | im 77 | The AxesImage to be labeled. 78 | data 79 | Data used to annotate. If None, the image's data is used. Optional. 80 | valfmt 81 | The format of the annotations inside the heatmap. This should either 82 | use the string format method, e.g. '$ {x:.2f}', or be a 83 | `matplotlib.ticker.Formatter`. Optional. 84 | textcolors 85 | A list or array of two color specifications. The first is used for 86 | values below a threshold, the second for those above. Optional. 87 | threshold 88 | Value in data units according to which the colors from textcolors are 89 | applied. If None (the default) uses the middle of the colormap as 90 | separation. Optional. 91 | **kwargs 92 | All other arguments are forwarded to each call to `text` used to create 93 | the text labels. 94 | ''' 95 | 96 | if not isinstance(data, (list, np.ndarray)): 97 | data = im.get_array() 98 | 99 | # Normalize the threshold to the images color range. 100 | if threshold is not None: 101 | threshold = im.norm(threshold) 102 | else: 103 | threshold = im.norm(data.max())/2. 104 | 105 | # Set default alignment to center, but allow it to be 106 | # overwritten by textkw. 107 | kw = dict(horizontalalignment='center', verticalalignment='center') 108 | kw.update(textkw) 109 | 110 | # Get the formatter in case a string is supplied 111 | if isinstance(valfmt, str): 112 | valfmt = matplotlib.ticker.StrMethodFormatter(valfmt) 113 | 114 | # Loop over the data and create a `Text` for each 'pixel'. 115 | # Change the text's color depending on the data. 116 | texts = [] 117 | for i in range(data.shape[0]): 118 | for j in range(data.shape[1]): 119 | kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)]) 120 | text = im.axes.text(j, i, valfmt(data[i, j], None), **kw) 121 | texts.append(text) 122 | 123 | return texts 124 | -------------------------------------------------------------------------------- /pykitml/datasets/mnist.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module contains helper functions to download and load MNIST and MNIST like datasets. 3 | ''' 4 | 5 | # ============================================================ 6 | # = Forked from: https://github.com/hsjeong5/MNIST-for-Numpy = 7 | # = Modified with minor changes = 8 | # ============================================================ 9 | 10 | import gzip 11 | import os 12 | from urllib import request 13 | 14 | import numpy as np 15 | 16 | from .. import pklhandler 17 | 18 | 19 | def get(type='classic'): # pylint: disable=redefined-builtin 20 | ''' 21 | Downloads the MNIST dataset and saves it as a pickle file, `mnist.pkl`. 
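Both dataset types are saved under the same file name, so downloading one type will overwrite the other.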
22 | 23 | Parameters 24 | ---------- 25 | type : str 26 | The type of MNIST dataset to download. 27 | 28 | - 'classic' : Downloads the classic handwritten digits dataset from http://yann.lecun.com/exdb/mnist/ 29 | - 'fashion' : Downloads fashion MNIST from https://github.com/zalandoresearch/fashion-mnist 30 | 31 | 32 | Raises 33 | ------ 34 | urllib.error.URLError 35 | If internet connection is not available or the URL is not accessible. 36 | OSError 37 | If the file cannot be created due to a system-related error. 38 | KeyError 39 | If invalid/unknown type. 40 | 41 | Note 42 | ---- 43 | You only need to call this method once, i.e, after the dataset has been 44 | downloaded and you have the `mnist.pkl` file, you don't need to call this method again. 45 | ''' 46 | # dict of URLs containing MNIST-like datasets 47 | type_URLs = {'classic': 'https://ossci-datasets.s3.amazonaws.com/mnist/', 48 | 'fashion': 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/' 49 | } 50 | 51 | # MNIST files to download 52 | filename = [ 53 | ['training_images', 'train-images-idx3-ubyte.gz'], 54 | ['test_images', 't10k-images-idx3-ubyte.gz'], 55 | ['training_labels', 'train-labels-idx1-ubyte.gz'], 56 | ['test_labels', 't10k-labels-idx1-ubyte.gz'] 57 | ] 58 | 59 | def download_mnist(): 60 | # Download .gz files 61 | base_url = type_URLs[type] 62 | for name in filename: 63 | print('Downloading '+name[1]+'...') 64 | request.urlretrieve(base_url+name[1], name[1]) 65 | print('Download complete.') 66 | 67 | def save_mnist(): 68 | # Read .gz files and put them in a numpy array and save it as a pkl file 69 | mnist = {} 70 | for name in filename[:2]: 71 | with gzip.open(name[1], 'rb') as f: 72 | mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1, 28*28) 73 | for name in filename[-2:]: 74 | with gzip.open(name[1], 'rb') as f: 75 | mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=8) 76 | pklhandler.save(mnist, 'mnist.pkl') 77 | print('Save complete.') 78 | 79 | def clean(): 80 | # Remove unnecessary files 81 | os.remove('train-images-idx3-ubyte.gz') 82 | os.remove('t10k-images-idx3-ubyte.gz') 83 | os.remove('train-labels-idx1-ubyte.gz') 84 | os.remove('t10k-labels-idx1-ubyte.gz') 85 | print('Deleted unnecessary files.') 86 | 87 | download_mnist() 88 | save_mnist() 89 | clean() 90 | 91 | 92 | def load(): 93 | ''' 94 | Loads MNIST dataset from saved pickle file `mnist.pkl` to numpy arrays. 95 | 96 | Returns 97 | ------- 98 | training_data : numpy.array 99 | 60,000x784 numpy array, each row contains a flattened version of a training image. 100 | training_targets : numpy.array 101 | 60,000x10 numpy array that contains the one-hot target array of the corresponding 102 | training images. 103 | testing_data : numpy.array 104 | 10,000x784 numpy array, each row contains a flattened version of a test image. 105 | testing_targets : numpy.array 106 | 10,000x10 numpy array that contains the one-hot target array of the corresponding 107 | test images. 108 | 109 | Raises 110 | ------ 111 | FileNotFoundError 112 | If `mnist.pkl` file does not exist, i.e, if the dataset was not downloaded and 113 | saved using the :py:func:`~get` method.
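Example
-------
A short sketch, assuming `mnist.pkl` has already been downloaded using
:py:func:`~get`:

>>> from pykitml.datasets import mnist
>>> training_data, training_targets, testing_data, testing_targets = mnist.load()
>>> training_data.shape
(60000, 784)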
114 | ''' 115 | mnist = pklhandler.load('mnist.pkl') 116 | # Normalize data 117 | training_data = mnist['training_images']/255 118 | testing_data = mnist['test_images']/255 119 | # Create one-hot target array for training labels 120 | training_targets = np.zeros((60000, 10)) 121 | training_targets[np.arange(60000), mnist['training_labels']] = 1 122 | # Create one-hot target array for testing labels 123 | testing_targets = np.zeros((10000, 10)) 124 | testing_targets[np.arange(10000), mnist['test_labels']] = 1 125 | # return the data 126 | return training_data, training_targets, testing_data, testing_targets 127 | 128 | 129 | if __name__ == '__main__': 130 | get() 131 | -------------------------------------------------------------------------------- /pykitml/_functions.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-argument 2 | 3 | import numpy as np 4 | 5 | ''' 6 | This module contains utility functions 7 | ''' 8 | 9 | # ===================== 10 | # = Utility functions = 11 | # ===================== 12 | 13 | 14 | def pdist(x, y): 15 | ''' 16 | Calculate pairwise square distances between matrix x and y. 17 | See: https://stackoverflow.com/a/56084419/5516481 18 | ''' 19 | if x.ndim == 1: 20 | x = np.array([x]) 21 | 22 | nx, p = x.shape 23 | x_ext = np.empty((nx, 3*p)) 24 | x_ext[:, :p] = 1 25 | x_ext[:, p:2*p] = x 26 | x_ext[:, 2*p:] = np.square(x) 27 | 28 | ny = y.shape[0] 29 | y_ext = np.empty((3*p, ny)) 30 | y_ext[:p] = np.square(y).T 31 | y_ext[p:2*p] = -2*y.T 32 | y_ext[2*p:] = 1 33 | 34 | return x_ext.dot(y_ext) 35 | 36 | # ============================================== 37 | # = Activation functions and their derivatives = 38 | # ============================================== 39 | 40 | 41 | def sigmoid(weighted_sum): 42 | ''' 43 | Returns sigmoid of the weighted sum array of a layer. 44 | ''' 45 | return 1 / (1 + np.exp(-weighted_sum)) 46 | 47 | 48 | def sigmoid_prime(weighted_sum, activations): 49 | ''' 50 | Returns the derivative of sigmoid w.r.t layer's weighted sum. 51 | ''' 52 | return activations * (1 - activations) 53 | 54 | 55 | def tanh(weighted_sum): 56 | ''' 57 | Returns tanh of the weighted sum array of a layer. 58 | ''' 59 | return np.tanh(weighted_sum) 60 | 61 | 62 | def tanh_prime(weighted_sum, activations): 63 | ''' 64 | Returns the derivative of tanh w.r.t layer's weighted sum. 65 | ''' 66 | return 1 - (activations ** 2) 67 | 68 | 69 | def leakyrelu(weighted_sum): 70 | ''' 71 | Returns leaky-ReLU of the weighted sum array of a layer. 72 | ''' 73 | return np.where(weighted_sum > 0, weighted_sum, 0.01 * weighted_sum) 74 | 75 | 76 | def leakyrelu_prime(weighted_sum, activations): 77 | ''' 78 | Returns the derivative of leaky-ReLU w.r.t layer's weighted sum. 79 | ''' 80 | return np.where(weighted_sum > 0, 1, 0.01) 81 | 82 | 83 | def relu(weighted_sum): 84 | ''' 85 | Returns ReLU of the weighted sum array of a layer. 86 | ''' 87 | return np.where(weighted_sum > 0, weighted_sum, 0) 88 | 89 | 90 | def relu_prime(weighted_sum, activations): 91 | ''' 92 | Returns the derivative of ReLU w.r.t layer's weighted sum. 93 | ''' 94 | return np.where(weighted_sum > 0, 1, 0) 95 | 96 | 97 | def softmax(weighted_sum): 98 | ''' 99 | Returns softmax of the weighted sum array of a layer. 100 | If weighted_sum is a 2D array, then it performs softmax over each row. 
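Example (values rounded to numpy's default display precision):

>>> softmax(np.array([1.0, 2.0, 3.0]))
array([0.09003057, 0.24472847, 0.66524096])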
101 | ''' 102 | if weighted_sum.ndim == 1: 103 | exps = np.exp(weighted_sum - np.max(weighted_sum)) 104 | return exps / np.sum(exps) 105 | 106 | normalized = weighted_sum - np.expand_dims(np.max(weighted_sum, axis=1), axis=1) 107 | exps = np.exp(normalized) 108 | return exps / np.expand_dims(np.sum(exps, axis=1), 1) 109 | 110 | 111 | def identity(weighted_sum): 112 | ''' 113 | Returns identity of the weighted sum array of a layer. 114 | ''' 115 | return weighted_sum 116 | 117 | 118 | def identity_prime(weighted_sum, activations): 119 | ''' 120 | Returns the derivative of identity w.r.t layer's weighted sum. 121 | ''' 122 | return 1 123 | 124 | 125 | def softmax_prime(weighted_sum, activations): 126 | ''' 127 | Returns the derivative of softmax w.r.t layer's weighted sum. 128 | ''' 129 | return activations * (1 - activations) 130 | 131 | # ======================================== 132 | # = Cost functions and their derivatives = 133 | # ======================================== 134 | 135 | 136 | def mse(output, target): 137 | ''' 138 | Returns mean squared error cost of the output. 139 | ''' 140 | return 0.5 * ((output - target) ** 2) 141 | 142 | 143 | def mse_prime(output, target): 144 | ''' 145 | Returns the derivative of the mse cost. 146 | ''' 147 | return output-target 148 | 149 | 150 | def cross_entropy(output, target): 151 | ''' 152 | Returns cross entropy cost of the output. 153 | ''' 154 | return -(target * np.log(output)) - ((1-target) * np.log(1-output)) 155 | 156 | 157 | def cross_entropy_prime(output, target): 158 | ''' 159 | Returns the derivative of the cross entropy cost. 160 | ''' 161 | return (output-target) / (output * (1-output)) 162 | 163 | 164 | def hinge_loss(output, target): 165 | ''' 166 | Returns hinge loss of the output for SVMs. 167 | ''' 168 | return np.maximum(0, 1 - target*output) 169 | 170 | 171 | def hinge_loss_prime(output, target): 172 | ''' 173 | Returns derivative of hinge loss. 174 | ''' 175 | return np.where((target*output) > 1, 0, -1*target) 176 | 177 | 178 | def huber(output, target): 179 | ''' 180 | Returns huber loss for dqn 181 | ''' 182 | error = output - target 183 | 184 | is_small_error = np.abs(error) < 1 185 | 186 | squared_loss = np.square(error)/2 187 | linear_loss = np.abs(error) - 0.5 188 | 189 | return np.where(is_small_error, squared_loss, linear_loss) 190 | 191 | 192 | def huber_prime(output, target): 193 | ''' 194 | Returns derivative of huber loss. 195 | ''' 196 | error = output - target 197 | 198 | is_small_error = np.abs(error) < 1 199 | 200 | return np.where(is_small_error, error, np.sign(error)) 201 | -------------------------------------------------------------------------------- /docs/FCEUX.rst: -------------------------------------------------------------------------------- 1 | Creating NES bots using FCEUX emulator 2 | ====================================== 3 | 4 | FCEUX Server 5 | ------------ 6 | 7 | .. autoclass:: pykitml.FCEUXServer 8 | 9 | .. automethod:: __init__ 10 | 11 | .. automethod:: start 12 | 13 | .. automethod:: frame_advance 14 | 15 | .. automethod:: get_joypad 16 | 17 | .. automethod:: set_joypad 18 | 19 | .. automethod:: read_mem 20 | 21 | .. automethod:: reset 22 | 23 | .. automethod:: quit 24 | 25 | .. autoattribute:: info 26 | 27 | Lua client script 28 | ----------------- 29 | 30 | This script has to be loaded into the emulator after 31 | starting the server. (File > Load Lua Script) 32 | 33 | **fceux_client.lua** 34 | 35 | .. 
code-block:: lua 36 | 37 | local socket = require "socket" 38 | 39 | -- Edit to change 40 | ip = 'localhost' 41 | port = '1234' 42 | 43 | -- Table for holding lua code snippets from server 44 | func_table = {} 45 | 46 | -- Start connection with server 47 | s = socket.connect(ip, port) 48 | 49 | -- Helper function to convert table to string 50 | function table_to_string(table) 51 | str = '' 52 | 53 | for key, value in pairs(table) do 54 | str = str .. tostring(key) .. ' ' .. tostring(value) .. ' ' 55 | end 56 | 57 | return str 58 | end 59 | 60 | -- Helper function to split string into tokens 61 | function split(inputstr, sep) 62 | if sep == nil then 63 | sep = "%s" 64 | end 65 | local t={} 66 | for str in string.gmatch(inputstr, "([^"..sep.."]+)") do 67 | table.insert(t, str) 68 | end 69 | return t 70 | end 71 | 72 | -- Helper function to send message to the server 73 | function send(msg) 74 | s:send(msg) 75 | end 76 | 77 | -- Helper function to receive message from the server 78 | function recv() 79 | local resp, err = s:receive('*l') 80 | return resp 81 | end 82 | 83 | -- Helper function that waits for acknowledgement from server 84 | function wait_for_ack() 85 | while (recv() ~= 'ACK') do end 86 | end 87 | 88 | -- Set the speed of the emulator 89 | emu.speedmode('normal') 90 | 91 | -- Server info 92 | send('FCEUX Client '.._VERSION) 93 | wait_for_ack() 94 | 95 | -- Main loop 96 | while true do 97 | local resp = '' 98 | 99 | -- Log frame count 100 | fcount = string.format('%d', emu.framecount()) 101 | send(fcount) 102 | 103 | -- Parse commands from server 104 | while (resp ~= 'CONT') do 105 | resp = recv() 106 | 107 | if(resp == 'JOYPAD') then 108 | local controller = joypad.read(1) 109 | send(table_to_string(controller)) 110 | elseif(resp == 'SETJOYPAD') then 111 | local values = split(recv()) 112 | joypad.set(1, { 113 | up = (values[1]=='True'), down = (values[2]=='True'), 114 | left = (values[3]=='True'), right = (values[4]=='True'), 115 | A = (values[5]=='True'), B = (values[6]=='True'), 116 | start = (values[7]=='True'), select = (values[8]=='True'), 117 | }) 118 | elseif(resp == 'MEM') then 119 | local addr = tonumber(recv()) 120 | send(memory.readbyte(addr)) 121 | elseif(resp == 'RES') then 122 | emu.softreset() 123 | else 124 | break 125 | end 126 | end 127 | 128 | emu.frameadvance() 129 | end 130 | 131 | Example bot to spam the 'A' button 132 | ---------------------------------- 133 | 134 | .. code-block:: python 135 | 136 | import pykitml as pk 137 | 138 | def on_frame(server, frame): 139 | # Spam A and start button 140 | if(frame%10 < 5): server.set_joypad(A=True, start=True) 141 | else: server.set_joypad(A=False, start=False) 142 | 143 | # Print joypad 144 | print(server.get_joypad()) 145 | 146 | # Continue emulation 147 | server.frame_advance() 148 | 149 | # Initialize and start server 150 | server = pk.FCEUXServer(on_frame) 151 | print(server.info) 152 | server.start() 153 | 154 | Start this script, then run the FCEUX emulator. Open any NES ROM 155 | (File > Open ROM) and then load the lua client script (File > Load Lua Script). 156 | The bot will continuously spam the A button. 157 | 158 | Example bot to spam the 'A' button, second way 159 | ---------------------------------------------- 160 | 161 | .. 
code-block:: python 162 | 163 | import pykitml as pk 164 | 165 | # Instantiate server 166 | server = pk.FCEUXServer(None) 167 | 168 | try: 169 | while True: 170 | # Initialize frame, get frame count 171 | frame = server.init_frame() 172 | 173 | # Spam A and start button 174 | if(frame%10 < 5): server.set_joypad(A=True, start=True) 175 | else: server.set_joypad(A=False, start=False) 176 | 177 | # Print joypad 178 | print(server.get_joypad()) 179 | 180 | # Continue emulation 181 | server.frame_advance() 182 | 183 | except BrokenPipeError: 184 | server.quit('Client has quit.') 185 | except KeyboardInterrupt: 186 | server.quit() 187 | -------------------------------------------------------------------------------- /pykitml/fceux.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | 4 | class FCEUXServer: 5 | ''' 6 | Server class for making NES bots. Uses FCEUX emulator. 7 | Visit https://www.fceux.com for info. You will also need to 8 | load the client lua script in the emulator. 9 | ''' 10 | 11 | def __init__(self, frame_func, quit_func=None, ip='localhost', port=1234): 12 | ''' 13 | Parameters 14 | ---------- 15 | frame_func : function 16 | This function will be called every frame. The function should 17 | accept two arguments, :code:`server` (reference to this class) 18 | and :code:`frame` (number of frames executed). 19 | quit_func : function 20 | This function will be executed when the server disconnects from 21 | the emulator. 22 | ip : str 23 | IP address of the computer. 24 | port : int 25 | Port to listen to. 26 | ''' 27 | # Establish connection with client 28 | self._serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 29 | self._serversocket.bind((ip, port)) 30 | self._serversocket.listen(5) 31 | self._clientsocket, self._address = self._serversocket.accept() 32 | 33 | # This function will be called every frame 34 | self._on_frame_func = frame_func 35 | self._on_quit_func = quit_func 36 | 37 | self._server_info = self.recv() + ' ' + str(self._address) 38 | self.send('ACK') 39 | 40 | @property 41 | def info(self): 42 | ''' 43 | Emulator info and lua version. 44 | ''' 45 | return self._server_info 46 | 47 | def send(self, msg): 48 | ''' 49 | Send message to lua code running on the emulator. 50 | 51 | Parameters 52 | ---------- 53 | msg : str 54 | ''' 55 | if not isinstance(msg, str): 56 | self.quit() 57 | raise TypeError('Arguments have to be string') 58 | 59 | self._clientsocket.send(bytes(msg+'\n', 'utf-8')) 60 | 61 | def recv(self): 62 | ''' 63 | Receive message from lua code running on the emulator. 64 | 65 | Returns 66 | ------- 67 | str 68 | Received message from emulator. 69 | ''' 70 | return self._clientsocket.recv(4096).decode('utf-8') 71 | 72 | def init_frame(self): 73 | ''' 74 | Waits for the emulator to start the next frame and returns 75 | the frame count. 76 | 77 | Returns 78 | ------- 79 | int 80 | Frame count. 81 | ''' 82 | # Receive message from client 83 | frame_str = self.recv() 84 | if len(frame_str) == 0: 85 | self.quit('Client has quit') 86 | frame = int(frame_str) 87 | 88 | return frame 89 | 90 | def start(self): 91 | ''' 92 | Starts the server, waits for emulator to connect. 93 | Calls :code:`frame_func` every frame after connection 94 | has been established.
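This call blocks; it returns only through :py:func:`quit`, either when
the client disconnects or on a keyboard interrupt.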
95 |         '''
96 |         try:
97 |             # Keep receiving messages from FCEUX and acknowledge
98 |             while True:
99 |                 frame = self.init_frame()
100 |                 self._on_frame_func(self, frame)
101 | 
102 |         except BrokenPipeError:
103 |             self.quit('Client has quit.')
104 |         except KeyboardInterrupt:
105 |             self.quit()
106 | 
107 |     def frame_advance(self):
108 |         '''
109 |         Moves to the next frame; should be called at the end of
110 |         :code:`frame_func`.
111 |         '''
112 |         # Send back continue message
113 |         self.send('CONT')
114 | 
115 |     def get_joypad(self):
116 |         '''
117 |         Returns
118 |         -------
119 |         str
120 |             Joypad button states.
121 |         '''
122 |         self.send('JOYPAD')
123 |         return self.recv()
124 | 
125 |     def set_joypad(self, up=False, down=False, left=False,
126 |                    right=False, A=False, B=False, start=False, select=False):
127 |         '''
128 |         Set joypad button states.
129 |         '''
130 |         self.send('SETJOYPAD')
131 |         joypad = str(up)+' '+str(down)+' '+str(left)+' '+str(right)\
132 |             + ' '+str(A)+' '+str(B)+' '+str(start)+' '+str(select)
133 |         self.send(joypad)
134 | 
135 |     def read_mem(self, addr, signed=False):
136 |         '''
137 |         Read a memory address.
138 | 
139 |         Parameters
140 |         ----------
141 |         addr : int
142 |             The memory address to read.
143 |         signed : bool
144 |             If :code:`True`, returns a signed integer.
145 | 
146 |         Returns
147 |         -------
148 |         int
149 |             The byte at the address.
150 |         '''
151 |         self.send('MEM')
152 |         self.send(str(addr))
153 |         unsigned = int(self.recv())
154 | 
155 |         if signed:
156 |             return unsigned-256 if unsigned > 127 else unsigned
157 |         else:
158 |             return unsigned
159 | 
160 |     def reset(self):
161 |         '''
162 |         Resets the emulator (soft reset).
163 |         '''
164 |         self.send('RES')
165 | 
166 |     def quit(self, reason=''):
167 |         '''
168 |         Disconnect from emulator.
169 | 
170 |         Parameters
171 |         ----------
172 |         reason : str
173 |             Reason for quitting.
174 |         '''
175 |         if self._on_quit_func is not None:
176 |             self._on_quit_func()
177 |         self._serversocket.close()
178 |         self._clientsocket.close()
179 |         print(reason)
180 |         print('Server has quit.')
181 |         exit()
182 | 
183 | 
184 | if __name__ == '__main__':
185 |     def on_frame(server, frame):
186 |         print(frame)
187 |         print(server.get_joypad())
188 |         server.frame_advance()
189 | 
190 |     fceux_server = FCEUXServer(on_frame)
191 |     print(fceux_server.info)
192 |     fceux_server.start()
193 | 
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Configuration file for the Sphinx documentation builder.
4 | #
5 | # This file does only contain a selection of the most common options. For a
6 | # full list see the documentation:
7 | # http://www.sphinx-doc.org/en/master/config
8 | 
9 | # -- Path setup --------------------------------------------------------------
10 | 
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 | #
15 | import os
16 | import sys
17 | sys.path.insert(0, os.path.abspath('..'))
18 | 
19 | 
20 | # -- Project information -----------------------------------------------------
21 | 
22 | project = 'pykitml'
23 | copyright = '2019, Vishnu Shankar B'
24 | author = 'Vishnu Shankar B'
25 | 
26 | # The short X.Y version
27 | version = '0.1.1'
28 | # The full version, including alpha/beta/rc tags
29 | release = '0.1.1'
30 | 
31 | 
32 | # -- General configuration ---------------------------------------------------
33 | 
34 | # If your documentation needs a minimal Sphinx version, state it here.
35 | #
36 | # needs_sphinx = '1.0'
37 | 
38 | # Add any Sphinx extension module names here, as strings. They can be
39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
40 | # ones.
41 | extensions = [
42 |     'sphinx.ext.autodoc',
43 |     'sphinx.ext.doctest',
44 |     'sphinx.ext.napoleon'
45 | ]
46 | 
47 | # Add any paths that contain templates here, relative to this directory.
48 | templates_path = ['_templates']
49 | 
50 | # The suffix(es) of source filenames.
51 | # You can specify multiple suffixes as a list of strings:
52 | #
53 | # source_suffix = ['.rst', '.md']
54 | source_suffix = '.rst'
55 | 
56 | # The master toctree document.
57 | master_doc = 'index'
58 | 
59 | # The language for content autogenerated by Sphinx. Refer to documentation
60 | # for a list of supported languages.
61 | #
62 | # This is also used if you do content translation via gettext catalogs.
63 | # Usually you set "language" from the command line for these cases.
64 | language = None
65 | 
66 | # List of patterns, relative to source directory, that match files and
67 | # directories to ignore when looking for source files.
68 | # This pattern also affects html_static_path and html_extra_path.
69 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
70 | 
71 | # The name of the Pygments (syntax highlighting) style to use.
72 | pygments_style = None
73 | 
74 | 
75 | # -- Options for HTML output -------------------------------------------------
76 | 
77 | # The theme to use for HTML and HTML Help pages. See the documentation for
78 | # a list of builtin themes.
79 | #
80 | html_theme = 'sphinx_rtd_theme'
81 | 
82 | # Theme options are theme-specific and customize the look and feel of a theme
83 | # further. For a list of options available for each theme, see the
84 | # documentation.
85 | #
86 | #html_theme_options = {}
87 | 
88 | # Add any paths that contain custom static files (such as style sheets) here,
89 | # relative to this directory. They are copied after the builtin static files,
90 | # so a file named "default.css" will overwrite the builtin "default.css".
91 | html_static_path = ['_static']
92 | 
93 | # Custom sidebar templates, must be a dictionary that maps document names
94 | # to template names.
95 | #
96 | # The default sidebars (for documents that don't match any pattern) are
97 | # defined by theme itself. Builtin themes are using these templates by
98 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
99 | # 'searchbox.html']``.
100 | #
101 | # html_sidebars = {}
102 | 
103 | 
104 | # -- Options for HTMLHelp output ---------------------------------------------
105 | 
106 | # Output file base name for HTML help builder.
107 | htmlhelp_basename = 'pykitmldoc'
108 | 
109 | 
110 | # -- Options for LaTeX output ------------------------------------------------
111 | 
112 | latex_elements = {
113 |     # The paper size ('letterpaper' or 'a4paper').
114 |     #
115 |     # 'papersize': 'letterpaper',
116 | 
117 |     # The font size ('10pt', '11pt' or '12pt').
118 |     #
119 |     # 'pointsize': '10pt',
120 | 
121 |     # Additional stuff for the LaTeX preamble.
122 |     #
123 |     # 'preamble': '',
124 | 
125 |     # Latex figure (float) alignment
126 |     #
127 |     # 'figure_align': 'htbp',
128 | }
129 | 
130 | # Grouping the document tree into LaTeX files. List of tuples
131 | # (source start file, target name, title,
132 | #  author, documentclass [howto, manual, or own class]).
133 | latex_documents = [
134 |     (master_doc, 'pykitml.tex', 'pykitml Documentation',
135 |      'Vishnu Shankar B', 'manual'),
136 | ]
137 | 
138 | 
139 | # -- Options for manual page output ------------------------------------------
140 | 
141 | # One entry per manual page. List of tuples
142 | # (source start file, name, description, authors, manual section).
143 | man_pages = [
144 |     (master_doc, 'pykitml', 'pykitml Documentation',
145 |      [author], 1)
146 | ]
147 | 
148 | 
149 | # -- Options for Texinfo output ----------------------------------------------
150 | 
151 | # Grouping the document tree into Texinfo files. List of tuples
152 | # (source start file, target name, title, author,
153 | #  dir menu entry, description, category)
154 | texinfo_documents = [
155 |     (master_doc, 'pykitml', 'pykitml Documentation',
156 |      author, 'pykitml', 'Machine learning library written in Python and NumPy.',
157 |      'Miscellaneous'),
158 | ]
159 | 
160 | 
161 | # -- Options for Epub output -------------------------------------------------
162 | 
163 | # Bibliographic Dublin Core info.
164 | epub_title = project
165 | 
166 | # The unique identifier of the text. This can be an ISBN number
167 | # or the project homepage.
168 | #
169 | # epub_identifier = ''
170 | 
171 | # A unique identification for the text.
172 | #
173 | # epub_uid = ''
174 | 
175 | # A list of files that should not be packed into the epub file.
176 | epub_exclude_files = ['search.html']
177 | 
178 | 
179 | # -- Extension configuration -------------------------------------------------
--------------------------------------------------------------------------------
/pykitml/normalize.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | # ===============================================
4 | # = Functions for Normalization/Feature-scaling =
5 | # ===============================================
6 | 
7 | 
8 | def get_minmax(array):
9 |     '''
10 |     Returns two row arrays, one array containing minimum values of each column
11 |     and another one with maximum values.
12 | 
13 |     Parameters
14 |     ----------
15 |     array : numpy.array
16 |         The array to get minimum and maximum values for.
17 | 
18 |     Returns
19 |     -------
20 |     array_min : numpy.array
21 |         Array containing minimum values of each column.
22 |     array_max : numpy.array
23 |         Array containing maximum values of each column.
24 |     '''
25 |     return np.amin(array, axis=0), np.amax(array, axis=0)
26 | 
27 | 
28 | def normalize_minmax(array, array_min, array_max, cols=[]):
29 |     '''
30 |     Normalizes columns of the array to between 0 and 1 using min-max
31 |     normalization.
32 | 
33 |     Parameters
34 |     ----------
35 |     array : numpy.array
36 |         The array to normalize.
37 |     array_min : numpy.array
38 |         Array containing minimum values of each column.
39 |     array_max : numpy.array
40 |         Array containing maximum values of each column.
41 |     cols : list
42 |         The columns to normalize. If the list is empty (default),
43 |         all columns will be normalized.
44 | 
45 |     Returns
46 |     -------
47 |     numpy.array
48 |         The normalized array.
49 | 
50 |     Note
51 |     ----
52 |     You can use the :py:func:`~get_minmax` function to get :code:`array_min`
53 |     and :code:`array_max` parameters.
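
    Example
    -------
    A short doctest-style sketch of min-max scaling; this assumes the
    package-level exports (:code:`pk.get_minmax`, :code:`pk.normalize_minmax`)
    used by the other examples in this library:

    >>> import numpy as np
    >>> import pykitml as pk
    >>> a = np.array([[0.0, 10.0], [5.0, 20.0], [10.0, 30.0]])
    >>> a_min, a_max = pk.get_minmax(a)
    >>> pk.normalize_minmax(a, a_min, a_max)
    array([[0. , 0. ],
           [0.5, 0.5],
           [1. , 1. ]])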
54 |     '''
55 |     normalized_array = array.astype(float)
56 |     all_normalized = (array - array_min) / (array_max - array_min)
57 | 
58 |     if len(cols) == 0:
59 |         # Normalize all columns
60 |         normalized_array = all_normalized
61 |     elif array.ndim == 1:
62 |         # Normalize only specified columns, 1D array
63 |         normalized_array[cols] = all_normalized[cols]
64 |     else:
65 |         # Normalize only specified columns, 2D array
66 |         normalized_array[:, cols] = all_normalized[:, cols]
67 | 
68 |     return normalized_array
69 | 
70 | 
71 | def denormalize_minmax(array, array_min, array_max, cols=[]):
72 |     '''
73 |     Denormalizes columns of a min-max normalized array.
74 | 
75 |     Parameters
76 |     ----------
77 |     array : numpy.array
78 |         The array to denormalize.
79 |     array_min : numpy.array
80 |         Array containing minimum values of each column.
81 |     array_max : numpy.array
82 |         Array containing maximum values of each column.
83 |     cols : list
84 |         The columns to denormalize. If the list is empty (default),
85 |         all columns will be denormalized.
86 | 
87 |     Returns
88 |     -------
89 |     numpy.array
90 |         The denormalized array.
91 | 
92 |     Note
93 |     ----
94 |     You can use the :py:func:`~get_minmax` function to get :code:`array_min`
95 |     and :code:`array_max` parameters.
96 |     '''
97 |     denormalized_array = array.astype(float)
98 |     all_denormalized = (array * (array_max - array_min)) + array_min
99 | 
100 |     if len(cols) == 0:
101 |         # Denormalize all columns
102 |         denormalized_array = all_denormalized
103 |     elif array.ndim == 1:
104 |         # Denormalize only specified columns, 1D array
105 |         denormalized_array[cols] = all_denormalized[cols]
106 |     else:
107 |         # Denormalize only specified columns, 2D array
108 |         denormalized_array[:, cols] = all_denormalized[:, cols]
109 | 
110 |     return denormalized_array
111 | 
112 | 
113 | def get_meanstd(array):
114 |     '''
115 |     Returns two row arrays, one array containing mean of each column
116 |     and another one with standard deviation of each column.
117 | 
118 |     Parameters
119 |     ----------
120 |     array : numpy.array
121 |         The array to get mean and standard deviation values for.
122 | 
123 |     Returns
124 |     -------
125 |     array_mean : numpy.array
126 |         Array containing mean values of each column.
127 |     array_stddev : numpy.array
128 |         Array containing standard deviation values of each column.
129 |     '''
130 |     return np.mean(array, axis=0), np.std(array, axis=0)
131 | 
132 | 
133 | def normalize_mean(array, array_mean, array_stddev, cols=[]):
134 |     '''
135 |     Normalizes columns of the array with mean normalization.
136 | 
137 |     Parameters
138 |     ----------
139 |     array : numpy.array
140 |         The array to normalize.
141 |     array_mean : numpy.array
142 |         Array containing mean values of each column.
143 |     array_stddev : numpy.array
144 |         Array containing standard deviation values of each column.
145 |     cols : list
146 |         The columns to normalize. If the list is empty (default),
147 |         all columns will be normalized.
148 | 
149 | 
150 |     Returns
151 |     -------
152 |     numpy.array
153 |         The normalized array.
154 | 
155 |     Note
156 |     ----
157 |     You can use the :py:func:`~get_meanstd` function to get :code:`array_mean`
158 |     and :code:`array_stddev` parameters.
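
    Example
    -------
    A short doctest-style sketch of mean normalization, under the same
    package-level export assumption as the other examples here:

    >>> import numpy as np
    >>> import pykitml as pk
    >>> a = np.array([[1.0, 2.0], [3.0, 4.0]])
    >>> a_mean, a_stddev = pk.get_meanstd(a)
    >>> pk.normalize_mean(a, a_mean, a_stddev)
    array([[-1., -1.],
           [ 1.,  1.]])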
159 |     '''
160 |     normalized_array = array.astype(float)
161 |     all_normalized = (array-array_mean)/array_stddev
162 | 
163 |     if len(cols) == 0:
164 |         # Normalize all columns
165 |         normalized_array = all_normalized
166 |     elif array.ndim == 1:
167 |         # Normalize only specified columns, 1D array
168 |         normalized_array[cols] = all_normalized[cols]
169 |     else:
170 |         # Normalize only specified columns, 2D array
171 |         normalized_array[:, cols] = all_normalized[:, cols]
172 | 
173 |     return normalized_array
174 | 
175 | 
176 | def denormalize_mean(array, array_mean, array_stddev, cols=[]):
177 |     '''
178 |     Denormalizes a mean normalized array.
179 | 
180 |     Parameters
181 |     ----------
182 |     array : numpy.array
183 |         The array to denormalize.
184 |     array_mean : numpy.array
185 |         Array containing mean values of each column.
186 |     array_stddev : numpy.array
187 |         Array containing standard deviation values of each column.
188 |     cols : list
189 |         The columns to denormalize. If the list is empty (default),
190 |         all columns will be denormalized.
191 | 
192 |     Returns
193 |     -------
194 |     numpy.array
195 |         The denormalized array.
196 | 
197 |     Note
198 |     ----
199 |     You can use the :py:func:`~get_meanstd` function to get :code:`array_mean`
200 |     and :code:`array_stddev` parameters.
201 |     '''
202 |     denormalized_array = array.astype(float)
203 |     all_denormalized = (array*array_stddev) + array_mean
204 | 
205 |     if len(cols) == 0:
206 |         # Denormalize all columns
207 |         denormalized_array = all_denormalized
208 |     elif array.ndim == 1:
209 |         # Denormalize only specified columns, 1D array
210 |         denormalized_array[cols] = all_denormalized[cols]
211 |     else:
212 |         # Denormalize only specified columns, 2D array
213 |         denormalized_array[:, cols] = all_denormalized[:, cols]
214 | 
215 |     return denormalized_array
216 | 
--------------------------------------------------------------------------------
/pykitml/preprocessing.py:
--------------------------------------------------------------------------------
1 | from itertools import combinations_with_replacement
2 | 
3 | import numpy as np
4 | 
5 | '''
6 | This module contains helper functions for preprocessing data.
7 | '''
8 | 
9 | 
10 | def onehot(input_array):
11 |     '''
12 |     Converts input array to one-hot array.
13 | 
14 |     Parameters
15 |     ----------
16 |     input_array : numpy.array
17 |         The input numpy array.
18 | 
19 |     Returns
20 |     -------
21 |     one_hot : numpy.array
22 |         The converted onehot array.
23 | 
24 |     Example
25 |     -------
26 |     >>> import numpy as np
27 |     >>> import pykitml as pk
28 |     >>> a = np.array([0, 1, 2])
29 |     >>> pk.onehot(a)
30 |     array([[1., 0., 0.],
31 |            [0., 1., 0.],
32 |            [0., 0., 1.]])
33 |     '''
34 |     array = input_array.astype(int)
35 |     one_hot = np.zeros((array.size, array.max()+1))
36 |     one_hot[np.arange(array.size), array] = 1
37 |     return one_hot
38 | 
39 | 
40 | def onehot_cols(dataset, cols):
41 |     '''
42 |     Converts/replaces columns of dataset to one-hot values.
43 | 
44 |     Parameters
45 |     ----------
46 |     dataset : numpy.array
47 |         The input dataset.
48 |     cols : list
49 |         The columns which have to be replaced/converted
50 |         to one-hot values.
51 | 
52 |     Returns
53 |     -------
54 |     dataset_new : numpy.array
55 |         The new dataset with replaced columns.
56 | 
57 |     Example
58 |     -------
59 | 
60 |     >>> import pykitml as pk
61 |     >>> import numpy as np
62 |     >>> a = np.array([[0, 1, 2.2], [1, 2, 3.4], [0, 0, 1.1]])
63 |     >>> a
64 |     array([[0. , 1. , 2.2],
65 |            [1. , 2. , 3.4],
66 |            [0. , 0. , 1.1]])
67 |     >>> pk.onehot_cols(a, cols=[0, 1])
68 |     array([[1. , 0. , 0. , 1. , 0. , 2.2],
69 |            [0. , 1. , 0. , 0. , 1. , 3.4],
70 |            [1. , 0. , 1. , 0. , 0. , 1.1]])
71 | 
72 |     '''
73 |     offset = 0
74 |     dataset_new = dataset
75 |     for col in cols:
76 |         onehot_column = onehot(dataset_new[:, col+offset])
77 |         dataset_new = np.delete(dataset_new, col+offset, axis=1)
78 |         dataset_new = np.insert(dataset_new, [col+offset], onehot_column, axis=1)
79 |         offset += onehot_column.shape[1]-1
80 | 
81 |     return dataset_new
82 | 
83 | 
84 | def onehot_cols_traintest(dataset_train, dataset_test, cols):
85 |     '''
86 |     Converts/replaces columns of :code:`dataset_train` and
87 |     :code:`dataset_test` to one-hot values.
88 | 
89 |     Parameters
90 |     ----------
91 |     dataset_train : numpy.array
92 |         The training dataset.
93 |     dataset_test : numpy.array
94 |         The testing dataset.
95 |     cols : list
96 |         The columns which have to be replaced/converted
97 |         to one-hot values.
98 | 
99 |     Returns
100 |     -------
101 |     dataset_train_new : numpy.array
102 |         The new training dataset with replaced columns.
103 |     dataset_test_new : numpy.array
104 |         The new testing dataset with replaced columns.
105 | 
106 |     Example
107 |     -------
108 | 
109 |     >>> import pykitml as pk
110 |     >>> import numpy as np
111 |     >>> a_train = np.array([[0, 1, 3.2], [1, 2, 3.5], [0, 0, 3.4]])
112 |     >>> a_test = np.array([[0, 3, 3.2], [1, 2, 4.5], [1, 3, 4.5]])
113 |     >>> a_train_onehot, a_test_onehot = pk.onehot_cols_traintest(a_train, a_test, cols=[0,1])
114 |     >>> a_train_onehot
115 |     array([[1. , 0. , 0. , 1. , 0. , 0. , 3.2],
116 |            [0. , 1. , 0. , 0. , 1. , 0. , 3.5],
117 |            [1. , 0. , 1. , 0. , 0. , 0. , 3.4]])
118 |     >>> a_test_onehot
119 |     array([[1. , 0. , 0. , 0. , 0. , 1. , 3.2],
120 |            [0. , 1. , 0. , 0. , 1. , 0. , 4.5],
121 |            [0. , 1. , 0. , 0. , 0. , 1. , 4.5]])
122 | 
123 |     '''
124 |     # Combine the datasets
125 |     dataset_new = np.concatenate((dataset_train, dataset_test), axis=0)
126 | 
127 |     # Replace columns with one-hot values
128 |     offset = 0
129 |     for col in cols:
130 |         onehot_column = onehot(dataset_new[:, col+offset])
131 |         dataset_new = np.delete(dataset_new, col+offset, axis=1)
132 |         dataset_new = np.insert(dataset_new, [col+offset], onehot_column, axis=1)
133 |         offset += onehot_column.shape[1]-1
134 | 
135 |     split = dataset_train.shape[0]
136 |     return dataset_new[:split, :], dataset_new[split:, :]
137 | 
138 | 
139 | def polynomial(dataset_inputs, degree=3, cols=[]):
140 |     '''
141 |     Generates polynomial features from the input dataset.
142 |     For example, if an input sample is two dimensional and of the form [a, b],
143 |     the degree-2 polynomial features are :code:`[a, b, a^2, ab, b^2]`, and degree-3
144 |     polynomial features are
145 |     :code:`[a, b, a^2, ab, b^2, a^3, (a^2)*b, a*(b^2), b^3]`.
146 | 
147 |     Parameters
148 |     ----------
149 |     dataset_inputs : numpy.array
150 |         The input dataset to generate the polynomials from.
151 |     degree : int
152 |         The degree of the polynomial.
153 |     cols : list
154 |         The columns to use to generate polynomial features; columns
155 |         not in this list will be ignored. If empty (default), all columns will
156 |         be used to generate polynomial features.
157 | 
158 |     Returns
159 |     -------
160 |     numpy.array
161 |         The new dataset with polynomial features.
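
    Note
    ----
    The number of generated terms grows combinatorially; this follows from
    the :code:`combinations_with_replacement` call in the implementation
    below. For :code:`n` selected columns, degree :code:`d` contributes
    "n+d-1 choose d" new terms (the multisets of size d). For example,
    4 columns at :code:`degree=3` add C(5, 2) + C(6, 3) = 10 + 20 = 30
    new columns.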
162 | 
163 |     Example
164 |     -------
165 | 
166 |     >>> import numpy as np
167 |     >>> import pykitml as pk
168 |     >>> pk.polynomial(np.array([[1, 2], [2, 3]]), degree=2)
169 |     array([[1., 2., 1., 2., 4.],
170 |            [2., 3., 4., 6., 9.]])
171 |     >>> pk.polynomial(np.array([[1, 2], [2, 3]]), degree=3)
172 |     array([[ 1.,  2.,  1.,  2.,  4.,  1.,  2.,  4.,  8.],
173 |            [ 2.,  3.,  4.,  6.,  9.,  8., 12., 18., 27.]])
174 |     >>> pk.polynomial(np.array([[1, 4, 5, 2], [2, 5, 6, 3]]), degree=2, cols=[0, 3])
175 |     array([[1., 4., 5., 2., 1., 2., 4.],
176 |            [2., 5., 6., 3., 4., 6., 9.]])
177 | 
178 |     '''
179 |     # Make sure 2D array
180 |     if dataset_inputs.ndim == 1:
181 |         inputs = np.array([dataset_inputs])
182 |     else:
183 |         inputs = dataset_inputs
184 | 
185 |     # Choose the columns to generate polynomial features for
186 |     if len(cols) == 0:
187 |         cols = range(inputs.shape[1])
188 | 
189 |     poly_dataset = inputs
190 | 
191 |     # Generate degree terms
192 |     for d in range(2, degree+1):
193 |         # Generate term indices for degree d
194 |         term_indices = list(combinations_with_replacement(cols, r=d))
195 |         # Multiply them to form the term and concatenate
196 |         for indices in term_indices:
197 |             term = inputs[:, indices].prod(axis=1)
198 |             temp = np.zeros((poly_dataset.shape[0], poly_dataset.shape[1]+1))
199 |             temp[:, :-1] = poly_dataset
200 |             temp[:, -1] = term
201 |             poly_dataset = temp
202 | 
203 |     return poly_dataset.squeeze()
--------------------------------------------------------------------------------
/pykitml/datasets/iris.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | '''
4 | This module contains a helper function to load the iris dataset.
5 | '''
6 | 
7 | 
8 | # sepal-length, sepal-width, petal-length, petal-width
9 | # all in cm
10 | 
11 | 
12 | inputs_train = np.array([
13 |     [5.8, 2.7, 5.1, 1.9], [5.5, 2.3, 4.0, 1.3], [5.8, 2.7, 3.9, 1.2],
14 |     [5.5, 2.5, 4.0, 1.3], [6.4, 2.8, 5.6, 2.2], [6.8, 2.8, 4.8, 1.4],
15 |     [5.5, 2.4, 3.7, 1.0], [7.6, 3.0, 6.6, 2.1], [5.4, 3.0, 4.5, 1.5],
16 |     [5.5, 2.4, 3.8, 1.1], [5.9, 3.2, 4.8, 1.8], [6.3, 3.3, 4.7, 1.6],
17 |     [6.4, 3.2, 5.3, 2.3], [5.5, 3.5, 1.3, 0.2], [6.1, 2.8, 4.0, 1.3],
18 |     [6.0, 3.4, 4.5, 1.6], [5.1, 3.3, 1.7, 0.5], [7.7, 3.8, 6.7, 2.2],
19 |     [5.8, 2.7, 4.1, 1.0], [6.0, 2.9, 4.5, 1.5], [5.8, 2.8, 5.1, 2.4],
20 |     [5.2, 3.4, 1.4, 0.2], [6.7, 2.5, 5.8, 1.8], [7.0, 3.2, 4.7, 1.4],
21 |     [6.3, 3.3, 6.0, 2.5], [7.7, 2.6, 6.9, 2.3], [6.7, 3.3, 5.7, 2.1],
22 |     [7.2, 3.6, 6.1, 2.5], [6.3, 3.4, 5.6, 2.4], [5.1, 3.5, 1.4, 0.3],
23 |     [4.8, 3.4, 1.6, 0.2], [6.3, 2.7, 4.9, 1.8], [6.7, 3.1, 4.4, 1.4],
24 |     [5.0, 3.4, 1.6, 0.4], [6.6, 3.0, 4.4, 1.4], [5.5, 4.2, 1.4, 0.2],
25 |     [5.0, 3.4, 1.5, 0.2], [6.4, 3.1, 5.5, 1.8], [5.1, 3.8, 1.5, 0.3],
26 |     [6.1, 2.8, 4.7, 1.2], [5.0, 2.0, 3.5, 1.0], [4.6, 3.4, 1.4, 0.3],
27 |     [6.0, 2.2, 5.0, 1.5], [6.1, 2.6, 5.6, 1.4], [4.8, 3.4, 1.9, 0.2],
28 |     [6.6, 2.9, 4.6, 1.3], [6.1, 2.9, 4.7, 1.4], [6.4, 2.8, 5.6, 2.1],
29 |     [5.4, 3.7, 1.5, 0.2], [5.0, 3.2, 1.2, 0.2], [6.2, 2.8, 4.8, 1.8],
30 |     [6.5, 3.0, 5.8, 2.2], [5.6, 3.0, 4.5, 1.5], [6.9, 3.1, 5.4, 2.1],
31 |     [7.1, 3.0, 5.9, 2.1], [4.9, 3.1, 1.5, 0.1], [6.9, 3.2, 5.7, 2.3],
32 |     [5.8, 4.0, 1.2, 0.2], [6.3, 2.3, 4.4, 1.3], [6.4, 2.9, 4.3, 1.3],
33 |     [5.2, 2.7, 3.9, 1.4], [4.6, 3.1, 1.5, 0.2], [6.0, 2.7, 5.1, 1.6],
34 |     [5.1, 3.5, 1.4, 0.2], [6.0, 3.0, 4.8, 1.8], [5.4, 3.9, 1.7, 0.4],
35 |     [5.1, 3.4, 1.5, 0.2], [5.4, 3.4, 1.7, 0.2], [5.7, 2.5, 5.0, 2.0],
36 |     [6.7, 3.0, 5.0, 1.7], [5.6, 2.5, 3.9, 1.1], [6.5, 2.8, 4.6, 1.5],
37 |     [5.4, 3.9, 1.3,
0.4], [4.9, 3.0, 1.4, 0.2], [7.4, 2.8, 6.1, 1.9], 38 | [7.2, 3.2, 6.0, 1.8], [5.6, 2.9, 3.6, 1.3], [6.4, 3.2, 4.5, 1.5], 39 | [4.3, 3.0, 1.1, 0.1], [5.1, 3.7, 1.5, 0.4], [5.4, 3.4, 1.5, 0.4], 40 | [4.8, 3.1, 1.6, 0.2], [7.7, 3.0, 6.1, 2.3], [5.6, 3.0, 4.1, 1.3], 41 | [6.3, 2.5, 4.9, 1.5], [5.7, 4.4, 1.5, 0.4], [6.5, 3.2, 5.1, 2.0], 42 | [4.9, 3.1, 1.5, 0.1], [4.8, 3.0, 1.4, 0.1], [5.2, 3.5, 1.5, 0.2], 43 | [7.9, 3.8, 6.4, 2.0], [5.7, 3.8, 1.7, 0.3], [5.6, 2.8, 4.9, 2.0], 44 | [6.8, 3.0, 5.5, 2.1], [5.7, 2.8, 4.5, 1.3], [4.7, 3.2, 1.3, 0.2], 45 | [6.0, 2.2, 4.0, 1.0], [6.1, 3.0, 4.9, 1.8], [4.7, 3.2, 1.6, 0.2], 46 | [6.5, 3.0, 5.5, 1.8], [5.0, 3.6, 1.4, 0.2], [4.9, 2.4, 3.3, 1.0], 47 | [5.7, 2.6, 3.5, 1.0], [6.7, 3.1, 4.7, 1.5], [6.4, 2.7, 5.3, 1.9], 48 | [5.2, 4.1, 1.5, 0.1], [5.0, 3.0, 1.6, 0.2], [7.7, 2.8, 6.7, 2.0], 49 | [4.9, 2.5, 4.5, 1.7], [4.9, 3.1, 1.5, 0.1], [7.2, 3.0, 5.8, 1.6], 50 | [4.4, 2.9, 1.4, 0.2], [6.3, 2.9, 5.6, 1.8], [5.9, 3.0, 4.2, 1.5], 51 | [6.3, 2.8, 5.1, 1.5], [4.4, 3.0, 1.3, 0.2], [7.3, 2.9, 6.3, 1.8], 52 | [4.6, 3.6, 1.0, 0.2], [6.2, 2.2, 4.5, 1.5], [6.9, 3.1, 4.9, 1.5], 53 | ]) 54 | 55 | outputs_train = np.array([ 56 | [0, 0, 1], [0, 1, 0], [0, 1, 0], 57 | [0, 1, 0], [0, 0, 1], [0, 1, 0], 58 | [0, 1, 0], [0, 0, 1], [0, 1, 0], 59 | [0, 1, 0], [0, 1, 0], [0, 1, 0], 60 | [0, 0, 1], [1, 0, 0], [0, 1, 0], 61 | [0, 1, 0], [1, 0, 0], [0, 0, 1], 62 | [0, 1, 0], [0, 1, 0], [0, 0, 1], 63 | [1, 0, 0], [0, 0, 1], [0, 1, 0], 64 | [0, 0, 1], [0, 0, 1], [0, 0, 1], 65 | [0, 0, 1], [0, 0, 1], [1, 0, 0], 66 | [1, 0, 0], [0, 0, 1], [0, 1, 0], 67 | [1, 0, 0], [0, 1, 0], [1, 0, 0], 68 | [1, 0, 0], [0, 0, 1], [1, 0, 0], 69 | [0, 1, 0], [0, 1, 0], [1, 0, 0], 70 | [0, 0, 1], [0, 0, 1], [1, 0, 0], 71 | [0, 1, 0], [0, 1, 0], [0, 0, 1], 72 | [1, 0, 0], [1, 0, 0], [0, 0, 1], 73 | [0, 0, 1], [0, 1, 0], [0, 0, 1], 74 | [0, 0, 1], [1, 0, 0], [0, 0, 1], 75 | [1, 0, 0], [0, 1, 0], [0, 1, 0], 76 | [0, 1, 0], [1, 0, 0], [0, 1, 0], 77 | [1, 0, 0], [0, 0, 1], [1, 0, 0], 78 | [1, 0, 0], [1, 0, 0], [0, 0, 1], 79 | [0, 1, 0], [0, 1, 0], [0, 1, 0], 80 | [1, 0, 0], [1, 0, 0], [0, 0, 1], 81 | [0, 0, 1], [0, 1, 0], [0, 1, 0], 82 | [1, 0, 0], [1, 0, 0], [1, 0, 0], 83 | [1, 0, 0], [0, 0, 1], [0, 1, 0], 84 | [0, 1, 0], [1, 0, 0], [0, 0, 1], 85 | [1, 0, 0], [1, 0, 0], [1, 0, 0], 86 | [0, 0, 1], [1, 0, 0], [0, 0, 1], 87 | [0, 0, 1], [0, 1, 0], [1, 0, 0], 88 | [0, 1, 0], [0, 0, 1], [1, 0, 0], 89 | [0, 0, 1], [1, 0, 0], [0, 1, 0], 90 | [0, 1, 0], [0, 1, 0], [0, 0, 1], 91 | [1, 0, 0], [1, 0, 0], [0, 0, 1], 92 | [0, 0, 1], [1, 0, 0], [0, 0, 1], 93 | [1, 0, 0], [0, 0, 1], [0, 1, 0], 94 | [0, 0, 1], [1, 0, 0], [0, 0, 1], 95 | [1, 0, 0], [0, 1, 0], [0, 1, 0], 96 | ]) 97 | 98 | inputs_test = np.array([ 99 | [5.6, 2.7, 4.2, 1.3], [6.2, 3.4, 5.4, 2.3], [4.8, 3.0, 1.4, 0.3], 100 | [5.8, 2.7, 5.1, 1.9], [6.1, 3.0, 4.6, 1.4], [6.7, 3.3, 5.7, 2.5], 101 | [6.7, 3.0, 5.2, 2.3], [6.8, 3.2, 5.9, 2.3], [5.7, 2.8, 4.1, 1.3], 102 | [5.8, 2.6, 4.0, 1.2], [5.0, 3.5, 1.3, 0.3], [5.1, 3.8, 1.6, 0.2], 103 | [4.6, 3.2, 1.4, 0.2], [6.7, 3.1, 5.6, 2.4], [5.1, 3.8, 1.9, 0.4], 104 | [5.1, 2.5, 3.0, 1.1], [5.7, 2.9, 4.2, 1.3], [5.9, 3.0, 5.1, 1.8], 105 | [5.3, 3.7, 1.5, 0.2], [5.7, 3.0, 4.2, 1.2], [5.0, 2.3, 3.3, 1.0], 106 | [6.9, 3.1, 5.1, 2.3], [5.0, 3.3, 1.4, 0.2], [4.5, 2.3, 1.3, 0.3], 107 | [5.5, 2.6, 4.4, 1.2], [6.5, 3.0, 5.2, 2.0], [5.0, 3.5, 1.6, 0.6], 108 | [6.3, 2.5, 5.0, 1.9], [6.2, 2.9, 4.3, 1.3], [4.4, 3.2, 1.3, 0.2], 109 | ]) 110 | 111 | outputs_test = np.array([ 112 | [0, 1, 0], [0, 0, 1], [1, 0, 0], 113 | [0, 0, 1], [0, 1, 
0], [0, 0, 1],
114 |     [0, 0, 1], [0, 0, 1], [0, 1, 0],
115 |     [0, 1, 0], [1, 0, 0], [1, 0, 0],
116 |     [1, 0, 0], [0, 0, 1], [1, 0, 0],
117 |     [0, 1, 0], [0, 1, 0], [0, 0, 1],
118 |     [1, 0, 0], [0, 1, 0], [0, 1, 0],
119 |     [0, 0, 1], [1, 0, 0], [1, 0, 0],
120 |     [0, 1, 0], [0, 0, 1], [1, 0, 0],
121 |     [0, 0, 1], [0, 1, 0], [1, 0, 0],
122 | ])
123 | 
124 | 
125 | def load():
126 |     '''
127 |     Loads the iris dataset without any preprocessing.
128 |     The data set consists of 50 samples from each of three species of Iris
129 |     (Iris setosa, Iris virginica and Iris versicolor).
130 |     Four features were measured from each sample: the length and the width
131 |     of the sepals and petals.
132 | 
133 |     Inputs have the following features/columns:
134 | 
135 |     :code:`sepal-length sepal-width petal-length petal-width`
136 | 
137 |     Outputs:
138 | 
139 |     :code:`[1, 0, 0]` - Iris-setosa,
140 |     :code:`[0, 1, 0]` - Iris-versicolor,
141 |     :code:`[0, 0, 1]` - Iris-virginica.
142 | 
143 |     Returns
144 |     -------
145 |     inputs_train : numpy.array
146 |         120x4 numpy array, each row having 4 features.
147 |     outputs_train : numpy.array
148 |         120x3 numpy array, contains 120 one-hot vectors, each
149 |         corresponding to a category.
150 |     inputs_test : numpy.array
151 |         30x4 numpy array, each row having 4 features.
152 |     outputs_test : numpy.array
153 |         30x3 numpy array, contains 30 one-hot vectors, each
154 |         corresponding to a category.
155 |     '''
156 |     return inputs_train, outputs_train, inputs_test, outputs_test
--------------------------------------------------------------------------------
/pykitml/random_forest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import multiprocessing as mp
3 | from math import ceil
4 | from contextlib import redirect_stdout
5 | 
6 | import numpy as np
7 | import tqdm
8 | 
9 | from . import _shared_array
10 | from ._regressor import Regressor
11 | from ._classifier import Classifier
12 | from . import decision_tree
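
# Training architecture (describing the code below): trees to be grown are
# placed on an input queue; one worker process per CPU core pulls a tree off
# the queue, grows it on a fresh bootstrap sample of the shared-memory copy
# of the dataset, and pushes the trained tree onto a return queue.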
13 | 
14 | 
15 | def _train_trees(input_q, ret_q, inputs_sh, inputs_shape, outputs_sh, outputs_shape):
16 |     # Retrieve numpy arrays from multiprocessing shared arrays
17 |     inputs = _shared_array.shm_as_ndarray(inputs_sh, inputs_shape)
18 |     outputs = _shared_array.shm_as_ndarray(outputs_sh, outputs_shape)
19 | 
20 |     # Suppress print statements
21 |     with redirect_stdout(open(os.devnull, 'w')):
22 |         while True:
23 |             # Get tree from input queue
24 |             try:
25 |                 tree = input_q.get(block=False)
26 |             except mp.queues.Empty:
27 |                 break
28 | 
29 |             # Create bootstrapped dataset
30 |             indices = np.random.choice(inputs.shape[0], inputs.shape[0])
31 |             bootstrapped_inputs = inputs[indices]
32 |             bootstrapped_outputs = outputs[indices]
33 | 
34 |             # Grow the tree
35 |             tree.train(bootstrapped_inputs, bootstrapped_outputs)
36 | 
37 |             # Put the trained tree in output queue
38 |             ret_q.put(tree)
39 | 
40 | 
41 | class _RandomTree(decision_tree.DecisionTree):
42 |     def __init__(self, input_size, output_size, num_features, feature_type=[],
43 |                  max_depth=6, min_split=2, max_splits_eval=100, regression=False):
44 |         # Initialize parent class
45 |         super(_RandomTree, self).__init__(input_size, output_size, feature_type, max_depth,
46 |                                           min_split, max_splits_eval, regression)
47 | 
48 |         # Select only a few random columns of the dataset for training
49 |         self._cols_train = np.random.choice(input_size, num_features, replace=False)
50 | 
51 |         # Disable progress bar
52 |         self._pbardis = True
53 | 
54 | 
55 | class RandomForest(Classifier, Regressor):
56 |     def __init__(self, input_size, output_size, feature_type=[], max_depth=6, min_split=2,
57 |                  max_splits_eval=100, regression=False):
58 |         '''
59 |         Parameters
60 |         ----------
61 |         input_size : int
62 |             Size of input data or number of input features.
63 |         output_size : int
64 |             Number of categories or groups.
65 |         feature_type : list
66 |             List of strings describing the type of feature for
67 |             each column. Can be :code:`'continues'`,
68 |             :code:`'ranked'`, or :code:`'categorical'`.
69 |         max_depth : int
70 |             The maximum depth the trees can grow to.
71 |         min_split : int
72 |             The minimum number of data points a node should have to get
73 |             split.
74 |         max_splits_eval : int
75 |             The maximum number of split points to evaluate for an
76 |             attribute. If the number of candidate split points exceeds
77 |             this, :code:`max_splits_eval` split candidates will be
78 |             randomly sampled from the candidates and only the sampled
79 |             ones will be evaluated for finding the best split point.
80 |         regression : bool
81 |             If the model is being trained on a regression problem.
82 | 
83 |         Raises
84 |         ------
85 |         InvalidFeatureType
86 |             Invalid/Unknown feature type. Can only be :code:`'continues'`,
87 |             :code:`'ranked'`, or :code:`'categorical'`.
88 |         '''
89 |         # Save values
90 |         self._input_size = input_size
91 |         self._output_size = output_size
92 |         self._ftype = feature_type
93 |         self._max_depth = max_depth
94 |         self._min_split = min_split
95 |         self._regression = regression
96 |         self._max_splits_eval = max_splits_eval
97 | 
98 |         # List to store trees in
99 |         self._trees = []
100 | 
101 |         # Outputs
102 |         self._output = None
103 | 
104 |     @property
105 |     def _out_size(self):
106 |         return self._output_size
107 | 
108 |     @property
109 |     def trees(self):
110 |         '''
111 |         A list of decision trees used in the forest.
112 |         '''
113 |         return self._trees
114 | 
115 |     def train(self, inputs, outputs, num_trees=100, num_feature_bag=None):
116 |         '''
117 |         Trains the model on the training data.
118 | 
119 |         Parameters
120 |         ----------
121 |         inputs : numpy.array
122 |             Numpy array containing training data.
123 |         outputs : numpy.array
124 |             Numpy array containing training targets, corresponding to the training data.
125 |         num_trees : int
126 |             Number of trees to grow.
127 |         num_feature_bag : int or None
128 |             Number of random features to select when growing
129 |             a tree. If :code:`None` (default), :code:`ceil(sqrt(input_size))`
130 |             is chosen for classification and :code:`int(input_size/3)` for regression.
131 | 
132 |         Raises
133 |         ------
134 |         numpy.AxisError
135 |             If output_size is less than two. Use :py:func:`pykitml.onehot` to change
136 |             0/False to [1, 0] and 1/True to [0, 1] for binary classification.
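
        Example
        -------
        A hypothetical sketch (assumes the package-level export used in
        the docs; :code:`inputs_train`/:code:`outputs_train` are
        placeholder arrays, e.g. from one of the bundled datasets):

        >>> import pykitml as pk
        >>> forest = pk.RandomForest(4, 3, feature_type=['continues']*4)  # doctest: +SKIP
        >>> forest.train(inputs_train, outputs_train, num_trees=50)  # doctest: +SKIP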
137 |         '''
138 |         print('Training Model...')
139 | 
140 |         # Number of features to bag/choose for each tree
141 |         if num_feature_bag is None:
142 |             if not self._regression:
143 |                 num_feature_bag = ceil(np.sqrt(self._input_size))
144 |             else:
145 |                 num_feature_bag = int(self._input_size/3)
146 | 
147 |         # Create queues
148 |         input_q = mp.Queue()
149 |         ret_q = mp.Queue()
150 | 
151 |         # Initialize input queue
152 |         for _ in range(num_trees):
153 |             # Create tree
154 |             tree = _RandomTree(self._input_size, self._output_size, num_feature_bag,
155 |                                self._ftype, self._max_depth, self._min_split, self._max_splits_eval,
156 |                                self._regression)
157 |             # Put it in queue
158 |             input_q.put(tree)
159 | 
160 |         # Create shared multiprocess array for inputs and outputs
161 |         inputs_sh = _shared_array.ndarray_to_shm(inputs)
162 |         outputs_sh = _shared_array.ndarray_to_shm(outputs)
163 | 
164 |         # Start worker processes
165 |         for _ in range(os.cpu_count()):
166 |             p = mp.Process(
167 |                 target=_train_trees, args=(input_q, ret_q, inputs_sh, inputs.shape, outputs_sh, outputs.shape)
168 |             )
169 |             p.start()
170 | 
171 |         # Progress bar and append trained trees to list
172 |         pbar = tqdm.tqdm(total=num_trees, ncols=80, unit='trees')
173 | 
174 |         while len(self._trees) != num_trees:
175 |             tree = ret_q.get()
176 |             self._trees.append(tree)
177 |             pbar.update()
178 | 
179 |         # Close the progress bar when done
180 |         pbar.close()
181 | 
182 |     def feed(self, input_data):
183 |         # Loop through all the trees and total their outputs
184 |         total = 0
185 |         for tree in self._trees:
186 |             tree.feed(input_data)
187 |             total += tree.get_output()
188 | 
189 |         # Average
190 |         self._output = total/len(self._trees)
191 | 
192 |     def get_output(self):
193 |         return self._output.squeeze()
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 |     // Use IntelliSense to learn about possible attributes.
3 |     // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "PCA", 9 | "type": "python", 10 | "request": "launch", 11 | "cwd": "${workspaceFolder}/tests", 12 | "program": "${workspaceFolder}/tests/test_pca.py", 13 | "console": "integratedTerminal" 14 | }, 15 | { 16 | "name": "Banknote", 17 | "type": "python", 18 | "request": "launch", 19 | "cwd": "${workspaceFolder}/tests", 20 | "program": "${workspaceFolder}/tests/test_banknote.py", 21 | "console": "integratedTerminal" 22 | }, 23 | { 24 | "name": "Banknote Decision Tree", 25 | "type": "python", 26 | "request": "launch", 27 | "cwd": "${workspaceFolder}/tests", 28 | "program": "${workspaceFolder}/tests/test_banknote_tree.py", 29 | "console": "integratedTerminal" 30 | }, 31 | { 32 | "name": "Adult Random Forest", 33 | "type": "python", 34 | "request": "launch", 35 | "cwd": "${workspaceFolder}/tests", 36 | "program": "${workspaceFolder}/tests/test_adult_forest.py", 37 | "console": "integratedTerminal" 38 | }, 39 | { 40 | "name": "Adult Decision Tree", 41 | "type": "python", 42 | "request": "launch", 43 | "cwd": "${workspaceFolder}/tests", 44 | "program": "${workspaceFolder}/tests/test_adult_tree.py", 45 | "console": "integratedTerminal" 46 | }, 47 | { 48 | "name": "Adult", 49 | "type": "python", 50 | "request": "launch", 51 | "cwd": "${workspaceFolder}/tests", 52 | "program": "${workspaceFolder}/tests/test_adult.py", 53 | "console": "integratedTerminal" 54 | }, 55 | { 56 | "name": "Heart Disease", 57 | "type": "python", 58 | "request": "launch", 59 | "cwd": "${workspaceFolder}/tests", 60 | "program": "${workspaceFolder}/tests/test_heart.py", 61 | "console": "integratedTerminal" 62 | }, 63 | { 64 | "name": "Heart Disease Tree", 65 | "type": "python", 66 | "request": "launch", 67 | "cwd": "${workspaceFolder}/tests", 68 | "program": "${workspaceFolder}/tests/test_heart_tree.py", 69 | "console": "integratedTerminal" 70 | }, 71 | { 72 | "name": "Heart Disease Naive Bayes", 73 | "type": "python", 74 | "request": "launch", 75 | "cwd": "${workspaceFolder}/tests", 76 | "program": "${workspaceFolder}/tests/test_heart_bayes.py", 77 | "console": "integratedTerminal" 78 | }, 79 | { 80 | "name": "Fish Length", 81 | "type": "python", 82 | "request": "launch", 83 | "cwd": "${workspaceFolder}/tests", 84 | "program": "${workspaceFolder}/tests/test_fishlength.py", 85 | "console": "integratedTerminal" 86 | }, 87 | { 88 | "name": "Iris Nearest Neighbor", 89 | "type": "python", 90 | "request": "launch", 91 | "cwd": "${workspaceFolder}/tests", 92 | "program": "${workspaceFolder}/tests/test_iris_neighbor.py", 93 | "console": "integratedTerminal" 94 | }, 95 | { 96 | "name": "Iris Decision Tree", 97 | "type": "python", 98 | "request": "launch", 99 | "cwd": "${workspaceFolder}/tests", 100 | "program": "${workspaceFolder}/tests/test_iris_tree.py", 101 | "console": "integratedTerminal" 102 | }, 103 | { 104 | "name": "Boston Regression Tree", 105 | "type": "python", 106 | "request": "launch", 107 | "cwd": "${workspaceFolder}/tests", 108 | "program": "${workspaceFolder}/tests/test_boston_tree.py", 109 | "console": "integratedTerminal" 110 | }, 111 | { 112 | "name": "Boston Regression Forest", 113 | "type": "python", 114 | "request": "launch", 115 | "cwd": "${workspaceFolder}/tests", 116 | "program": "${workspaceFolder}/tests/test_boston_forest.py", 117 | "console": "integratedTerminal" 118 | }, 119 | { 120 | "name": "Iris Naive Bayes", 121 | "type": "python", 122 | "request": "launch", 
123 | "cwd": "${workspaceFolder}/tests", 124 | "program": "${workspaceFolder}/tests/test_iris_bayes.py", 125 | "console": "integratedTerminal" 126 | }, 127 | { 128 | "name": "Iris SVM", 129 | "type": "python", 130 | "request": "launch", 131 | "cwd": "${workspaceFolder}/tests", 132 | "program": "${workspaceFolder}/tests/test_iris_svm.py", 133 | "console": "integratedTerminal" 134 | }, 135 | { 136 | "name": "Iris", 137 | "type": "python", 138 | "request": "launch", 139 | "cwd": "${workspaceFolder}/tests", 140 | "program": "${workspaceFolder}/tests/test_iris.py", 141 | "console": "integratedTerminal" 142 | }, 143 | { 144 | "name": "MNIST", 145 | "type": "python", 146 | "request": "launch", 147 | "cwd": "${workspaceFolder}/tests", 148 | "program": "${workspaceFolder}/tests/test_mnist.py", 149 | "args": ["adam"], 150 | "console": "integratedTerminal" 151 | }, 152 | { 153 | "name": "MNIST SVM", 154 | "type": "python", 155 | "request": "launch", 156 | "cwd": "${workspaceFolder}/tests", 157 | "program": "${workspaceFolder}/tests/test_mnist_svm.py", 158 | "console": "integratedTerminal" 159 | }, 160 | { 161 | "name": "S1 KMEANS", 162 | "type": "python", 163 | "request": "launch", 164 | "cwd": "${workspaceFolder}/tests", 165 | "program": "${workspaceFolder}/tests/test_s1_kmeans.py", 166 | "console": "integratedTerminal" 167 | }, 168 | { 169 | "name": "LSTM ECG eye", 170 | "type": "python", 171 | "request": "launch", 172 | "cwd": "${workspaceFolder}/tests", 173 | "program": "${workspaceFolder}/tests/test_lstm_eye.py", 174 | "console": "integratedTerminal" 175 | }, 176 | { 177 | "name": "LSTM Punchout", 178 | "type": "python", 179 | "request": "launch", 180 | "cwd": "${workspaceFolder}/tests", 181 | "program": "${workspaceFolder}/tests/test_punchout.py", 182 | "console": "integratedTerminal" 183 | }, 184 | { 185 | "name": "Random Search", 186 | "type": "python", 187 | "request": "launch", 188 | "cwd": "${workspaceFolder}/tests", 189 | "program": "${workspaceFolder}/tests/test_search.py", 190 | "console": "integratedTerminal" 191 | }, 192 | { 193 | "name": "DQN Cartpole", 194 | "type": "python", 195 | "request": "launch", 196 | "cwd": "${workspaceFolder}/tests", 197 | "program": "${workspaceFolder}/tests/test_cartpole_dqn.py", 198 | "console": "integratedTerminal" 199 | } 200 | ] 201 | } --------------------------------------------------------------------------------