├── .gitignore ├── LICENSE ├── README.md ├── TO_DO.txt ├── data └── lovecraft.txt ├── example_use.ipynb ├── notebooks ├── anomaly_detection │ ├── box_covariance.ipynb │ ├── elliptic_covariance.ipynb │ └── isolation_forest.ipynb ├── assorted_algorithms │ ├── k_nearest_neighbors.ipynb │ ├── kde_approximator.ipynb │ └── markov_chain_text.ipynb ├── classifiers │ ├── bagging_classifier.ipynb │ ├── bernoulli_naive_bayes.ipynb │ ├── decision_tree_classifier.ipynb │ ├── gaussian_naive_bayes.ipynb │ ├── k_nearest_neighbors_classifier.ipynb │ ├── multinomial_naive_bayes.ipynb │ ├── random_forest_classifier.ipynb │ ├── stochastic_gradient_descent_classifier.ipynb │ ├── stochastic_gradient_descent_classifier_binary.ipynb │ ├── support_vector_machine.ipynb │ └── support_vector_machine_binary.ipynb ├── clustering │ ├── agglomerative_clustering.ipynb │ ├── dbscan.ipynb │ ├── dbscan_secondary_method.ipynb │ ├── kmeans.ipynb │ ├── mean_shift.ipynb │ └── spectral_clustering.ipynb ├── datasets │ ├── datasets.ipynb │ ├── iris.data │ ├── make_classification.ipynb │ └── make_regression.ipynb ├── dimensionality_reduction │ ├── PCA.ipynb │ └── SVD.ipynb ├── metrics │ ├── classification_metrics.ipynb │ ├── pairwise_distance.ipynb │ └── regression_metrics.ipynb ├── natural_language_processing │ ├── count_vectorizer.ipynb │ ├── latent_dirichlet_allocation.ipynb │ ├── latent_semantic_indexing.ipynb │ └── tfidf_vectorizer.ipynb ├── neural_net │ ├── nn_classifier.ipynb │ └── nn_regressor.ipynb ├── random_number_generators │ └── middle_square.ipynb ├── regressors │ ├── bagging_regressor.ipynb │ ├── decision_tree_regressor.ipynb │ ├── k_nearest_neighbors_regressor.ipynb │ ├── lasso_regressor.ipynb │ ├── linear_regression_closed_form.ipynb │ ├── random_forest_regressor.ipynb │ ├── ridge_regressor.ipynb │ ├── stochastic_gradient_descent_regression.ipynb │ └── stochastic_gradient_descent_regression_with_regularization.ipynb └── utilities │ ├── grid_search.ipynb │ ├── normalizer.ipynb │ ├── randomized_search.ipynb │ ├── standard_scaler.ipynb │ └── train_test_and_cross_validation.ipynb └── zwml ├── __init__.py ├── anomaly_detection ├── __init__.py ├── box_covariance.py ├── elliptic_covariance.py └── isolation_forest.py ├── clustering ├── __init__.py ├── agglomerative_clustering.py ├── dbscan.py ├── kmeans.py ├── mean_shift.py └── spectral_clustering.py ├── datasets ├── __init__.py ├── datasets.py ├── iris.data ├── make_classification.py └── make_regression.py ├── linear_models ├── __init__.py ├── elastic_net_regressor.py ├── lasso_regressor.py ├── linear_regression.py ├── ridge_regressor.py ├── sgd_classifier.py └── sgd_regressor.py ├── metrics ├── __init__.py ├── classification_metrics.py ├── pairwise_distance.py └── regression_metrics.py ├── naive_bayes ├── __init__.py ├── bernoulli_naive_bayes.py ├── gaussian_naive_bayes.py └── multinomial_naive_bayes.py ├── neighbors ├── __init__.py ├── k_neighbors.py ├── kde_approximator.py ├── knn_classifier.py └── knn_regressor.py ├── nlp ├── __init__.py ├── count_vectorizer.py ├── latent_semantic_indexing.py └── tfidf_vectorizer.py ├── random ├── __init__.py └── middle_square.py ├── svm ├── __init__.py └── svc.py ├── tree_models ├── __init__.py ├── bagging_classifier.py ├── bagging_regressor.py ├── decision_tree_classifier.py ├── decision_tree_regressor.py ├── random_forest_classifier.py └── random_forest_regressor.py └── utilities ├── __init__.py ├── data_splitting.py ├── grid_search.py ├── markov_chain.py ├── normalizer.py ├── randomized_search.py └── standard_scaler.py /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.ipynb_checkpoints* 2 | **~ 3 | lunch_and_learn_notes.md 4 | *.DS_Store* 5 | *.npz 6 | **/__pycache__/* 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning from Scratch in Python 2 | 3 | 4 | ### If you want to understand something, you have to be able to build it. 5 | 6 | This is my attempt to build many of the machine learning algorithms from 7 | scratch, both in an attempt to make sense of them for myself and to write the 8 | algorithms in a way that is pedagogically interesting. At present, SkLearn is 9 | the leading Machine Learning module for Python, but looking through the 10 | open-source code, it's very hard to make sense of because of how heavily abstracted 11 | the code is. These modules will be much simpler in design, such that a student 12 | can read through and understand how the algorithm works. As such, they will 13 | not be as optimized as SkLearn, etc. 14 | 15 | **__Organization__** 16 | 17 | zwml: This contains a fully functioning machine learning library with the ability to import a la sklearn. Want to use a decision tree? Just do `from zwml.tree_models import decision_tree_regressor`. This is still in alpha at the moment, as many inconsistencies need to be cleaned up before it can be fully launched. These will always be the "full version" of the library, whereas some notebooks will have only a simpler form of the class (such as SGD without regularization). 18 | 19 | Notebooks: Each notebook will have the class fully written out, with a test case shown. 20 | All version information for the Python interpreter and modules used (numpy, pandas, etc.) 21 | is shown as well for later comparison. 22 | 23 | 24 | ## _Methodology note:_ 25 | 26 | A lot of these modules are *begging* for inheritance. As an example, the 27 | bagging classifier and the random forest classifier are largely the same code, 28 | with a few modified methods. Since these are designed as pedagogical tools and 29 | not "production code," I've chosen to make the modules as self-contained as 30 | possible. So instead of having an abstracted parent class, which a new 31 | programmer may have to track down, I've chosen to keep the code all together. 32 | I know it's sub-optimal for production, but I think it's better for someone to 33 | learn from. The only exceptions are ensemble methods that call entire other 34 | algorithms. For instance, the random forest module builds a bunch of 35 | decision trees, but with modified data inputs. To illustrate this point, the 36 | decision tree class is imported as a stand-alone module and plugged into the 37 | random forest module where it belongs - instead of recreating the decision 38 | tree in that class. The idea is that a new student will see how random forest 39 | (or other ensemble methodology) is just a wrapper class around 40 | another algorithm. 41 | 42 | 43 | ## _Outdated descriptions of what's available - to be updated soon_ 44 | 45 | # Notebooks/modules 46 | 47 | ## Regression: 48 | 49 | #### linear_regression_closed_form.ipynb 50 | 51 | This module uses the closed-form linear algebra solution (the normal equation) to solve for the 52 | coefficients of linear regression. 53 | 54 | #### stochastic_gradient_descent_regression.ipynb 55 | 56 | This module performs stochastic gradient descent to find the regression 57 | coefficients for linear regression.
There are a few options to set, such as 58 | learning rate, number of iterations, etc. There's also an option for setting 59 | the learning rate to be dynamic. **There are two versions of this notebook - 60 | one with and one without regularization included.** 61 | 62 | #### decision_tree_regressor.ipynb 63 | 64 | This module uses optimization of standard deviation or absolute errors to build decision trees for 65 | regression. It will be the basis for our random 66 | forest regressor. It has a few settings, like max-depth, to control how our 67 | trees are built, and a few options for the optimization method. 68 | 69 | #### random_forest_regressor.ipynb 70 | 71 | This is similar to the random_forest_classifier, but we instead focus on getting a continuous output. 72 | 73 | ## Classification: 74 | 75 | #### decision_tree_classifier.ipynb 76 | 77 | This module uses information gain to build decision trees for 78 | classification. It will be the basis for our bagging classifier and random 79 | forest classifier. It has a few settings, like max-depth, to control how our 80 | trees are built. 81 | 82 | 83 | #### k_nearest_neighbors.ipynb 84 | 85 | This module is based on the wisdom of "points that are close together should 86 | be of the same class." It measures the distances to all points and then finds 87 | the k (the user specifies 'k' by setting 'n_neighbors') closest points. Those points all get to vote on 88 | what class the new point likely is. 89 | 90 | #### bagging_classifier.ipynb 91 | 92 | This ensemble method is an extension of the decision tree that uses 93 | bootstrapping. Bootstrapping is where we sample the dataset (with replacement) 94 | over and over to build new datasets that are "built from" our true data. If we 95 | do this many times, we'll build many slightly different trees on the bootstrapped data, 96 | since no two trees will see the exact same data. Then we let all the trees 97 | predict on any new data, and allow the wisdom of the masses to determine our 98 | final outcome. 99 | 100 | #### random_forest_classifier.ipynb 101 | 102 | This is another ensemble method. It's just like the bagging_classifier, except 103 | we also randomize which features go to each tree. Instead of just 104 | randomizing our datapoints, we also say, "this tree only gets features 1, 3, 105 | and 5." This further randomizes our input to each tree, helping to fight 106 | over-fitting, which puts us in a better spot in the bias-variance trade-off. 107 | 108 | #### bernoulli_naive_bayes.ipynb 109 | 110 | Uses Bayes' rule to calculate the probability that a given observation will belong in each class, 111 | based on what it's learned about probability distributions in the training data. In the Bernoulli 112 | flavor, only "on" or "off" is counted for each feature when determining the probability. 113 | 114 | #### gaussian_naive_bayes.ipynb 115 | 116 | Uses Bayes' rule to calculate the probability that a given observation will belong in each class, 117 | based on what it's learned about probability distributions in the training data. In the Gaussian 118 | flavor, each feature is assumed to have a normal distribution, so the sample mean and standard deviation are used 119 | to approximate the probability distribution, which is sampled to determine the probability. 120 | 121 | ## Clustering: 122 | 123 | #### KMeans 124 | 125 | Full description still to come; a minimal usage sketch is shown below.
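A minimal usage sketch, assuming the `kmeans` class defined later in this repo under `zwml/clustering/kmeans.py` (the `k` and `random_seed` keyword arguments come from that class):

```python
import numpy as np
from zwml.clustering import kmeans

# Two well-separated blobs of 2D points
X = np.vstack([np.random.normal(0, 1, size=(50, 2)),
               np.random.normal(5, 1, size=(50, 2))])

model = kmeans(k=2, random_seed=42)   # kmeans++ initialization by default
model.fit(X)                          # keeps the best of n_init runs by inertia
labels = model.predict(X)             # nearest-cluster id for each row of X
print(model.score())                  # inertia of the clustering that was kept
```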
126 | 127 | ## Non-Algorithm - but useful 128 | 129 | #### train_test_and_cross_validation.ipynb 130 | 131 | We use different methods of splitting the data to measure the model 132 | performance on "unseen" or "out-of-sample" data. The cross-validation method 133 | will report the model behavior across several different folds. Both cross-validation 134 | and train-test split are built from scratch in this notebook. 135 | 136 | #### stats\_regress.py 137 | 138 | This is a suite of statistics calculation functions for regressions. Examples: 139 | mean_squared_error, r2, adjusted r2, etc. 140 | 141 | #### kde_approximator.ipynb 142 | 143 | Kernel Density Estimation. Given a set of points, what surface best 144 | describes the probability of drawing a point from any region of space? This 145 | module approximates that by assuming some probability "kernel" (e.g., what if 146 | every point represents a Gaussian probability distribution?). 147 | 148 | #### markov_chain_text.ipynb 149 | 150 | Given a document, can we learn about it and then generate new writings based 151 | on it? This uses the idea of Markov chains (randomly chaining together allowed 152 | possibilities, via a probabilistic understanding of the document) to 153 | create new text from old documents. 154 | 155 | -------------------------------------------------------------------------------- /TO_DO.txt: -------------------------------------------------------------------------------- 1 | SGD Classifier OVR fix 2 | 3 | Spectral - RBF 4 | 5 | Update ZWML with comments. (Be careful on trees and utilities since they may have slightly different versions) -------------------------------------------------------------------------------- /notebooks/datasets/datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Iris Dataset Loader" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Z. W.
Miller - Copyright 2018" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 34, 20 | "metadata": { 21 | "ExecuteTime": { 22 | "end_time": "2017-12-10T06:13:40.452950Z", 23 | "start_time": "2017-12-10T06:13:40.439056Z" 24 | }, 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd\n", 30 | "import numpy as np\n", 31 | "\n", 32 | "def load_iris(as_dataframe=False):\n", 33 | " data = pd.read_csv(\"iris.data\", header=None)\n", 34 | " data.columns = ['sepal_length','sepal_width','petal_length','petal_width','class']\n", 35 | " if as_dataframe:\n", 36 | " return data\n", 37 | " X = data.iloc[:,:-1].as_matrix()\n", 38 | " y = data.iloc[:,-1]\n", 39 | " y = y.str.replace('Iris-setosa','0').replace('Iris-versicolor','1').replace('Iris-virginica','2')\n", 40 | " y = y.astype(int).as_matrix()\n", 41 | " return X,y" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 35, 47 | "metadata": { 48 | "ExecuteTime": { 49 | "end_time": "2017-12-10T06:13:41.704976Z", 50 | "start_time": "2017-12-10T06:13:41.698290Z" 51 | } 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "X,y = load_iris()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 36, 61 | "metadata": { 62 | "ExecuteTime": { 63 | "end_time": "2017-12-10T06:13:41.857025Z", 64 | "start_time": "2017-12-10T06:13:41.850802Z" 65 | } 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 73 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", 74 | " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2\n", 75 | " 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2\n", 76 | " 2 2]\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "print(y)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python [default]", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.2" 111 | }, 112 | "toc": { 113 | "nav_menu": {}, 114 | "number_sections": true, 115 | "sideBar": true, 116 | "skip_h1_title": false, 117 | "toc_cell": false, 118 | "toc_position": {}, 119 | "toc_section_display": "block", 120 | "toc_window_display": false 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 2 125 | } 126 | -------------------------------------------------------------------------------- /notebooks/datasets/iris.data: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 
5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 
5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica 151 | -------------------------------------------------------------------------------- /zwml/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = 'v0.0.alpha' 2 | -------------------------------------------------------------------------------- /zwml/anomaly_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .box_covariance import box_covariance 2 | from .elliptic_covariance import elliptic_covariance 3 | from .isolation_forest import isolation_tree, isolation_forest 4 | 5 | __all__ = ['box_covariance', 'elliptic_covariance', 'isolation_forest'] -------------------------------------------------------------------------------- /zwml/anomaly_detection/box_covariance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class box_covariance: 5 | 6 | def __init__(self, threshold=1.): 7 | """ 8 | Builds a box envelope around the data using a 9 | standard deviation threshold. Any points within this 10 | box are considered inliers, and points outside of this 11 | box are considered outliers. This is a fairly simplistic 12 | method that is not very robust to highly correlated 13 | data with "close by" outliers. 14 | --- 15 | KWargs: 16 | threhsold: how many standard deviations do you want 17 | to consider an "inlier" 18 | """ 19 | self.threshold = threshold 20 | self.data_stats = {} 21 | self.number_of_columns = None 22 | 23 | def fit(self, X): 24 | """ 25 | Learns about the input data and stores the mean and 26 | standard deviation of each column. 27 | --- 28 | In: X (features); np.array or pandas dataframe/series 29 | """ 30 | X = self.convert_to_array(X) 31 | self.number_of_columns = X.shape[1] 32 | 33 | for ix in range(self.number_of_columns): 34 | col = X.T[ix] 35 | col_mean = np.mean(col) 36 | col_std = np.std(col) 37 | self.data_stats[ix] = (col_mean, col_std) 38 | 39 | def predict(self, X): 40 | """ 41 | For each data point, subtract the mean of the column 42 | and then see if the data point is within 43 | threshold*std_dev of that column of 0. If so, it's an 44 | inlier. 
Otherwise it's an outlier. 45 | """ 46 | X = self.convert_to_array(X) 47 | result = np.ones(X.shape[0]) 48 | for ix in range(self.number_of_columns): 49 | X.T[ix] = X.T[ix] - self.data_stats[ix][0] 50 | result[(result != -1) & (np.abs(X.T[ix]) >= self.data_stats[ix][1]*self.threshold)] = -1 51 | return result 52 | 53 | def fit_predict(self, X): 54 | """ 55 | Learn from X and then return the transformed version 56 | of X for the user to use. 57 | --- 58 | In: X (features); np.array or pandas dataframe/series 59 | """ 60 | self.fit(X) 61 | return self.predict(X) 62 | 63 | def pandas_to_numpy(self, x): 64 | """ 65 | Checks if the input is a Dataframe or series, converts to numpy matrix for 66 | calculation purposes. 67 | --- 68 | Input: X (array, dataframe, or series) 69 | Output: X (array) 70 | """ 71 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 72 | return x.as_matrix() 73 | if type(x) == type(np.array([1,2])): 74 | return x 75 | return np.array(x) 76 | 77 | def handle_1d_data(self,x): 78 | """ 79 | Converts 1 dimensional data into a series of rows with 1 columns 80 | instead of 1 row with many columns. 81 | """ 82 | if x.ndim == 1: 83 | x = x.reshape(-1,1) 84 | return x 85 | 86 | def convert_to_array(self, x): 87 | """ 88 | Takes in an input and converts it to a numpy array 89 | and then checks if it needs to be reshaped for us 90 | to use it properly 91 | """ 92 | x = self.pandas_to_numpy(x) 93 | x = self.handle_1d_data(x) 94 | return x -------------------------------------------------------------------------------- /zwml/anomaly_detection/elliptic_covariance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from copy import copy 4 | 5 | class elliptic_covariance: 6 | 7 | def __init__(self, threshold=5.991): 8 | """ 9 | Uses the covariance matrix to find the eigenvalues 10 | and eigenvectors. Then finds an ellipse that represents 11 | the training data using the standard deviation. 12 | The ellipse is based on the formula: 13 | (x/std_X)^2 + (y/std_y)^2 + (z/std_z)^2 + ... = threshold 14 | The threshold value will define the allowed inliers 15 | and their total deviation by "distance" from the mean. 16 | --- 17 | KWargs: 18 | threshold: how far from the mean do you want the inlier 19 | surface to exist. 5.991 represents a 95% confidence interval 20 | from the Cumulative Chi_2 distribution. 21 | """ 22 | self.threshold = threshold 23 | self.number_of_columns = None 24 | 25 | def fit(self, X): 26 | """ 27 | Learns about the input data and stores the mean and 28 | standard deviation of each column. 29 | --- 30 | In: X (features); np.array or pandas dataframe/series 31 | """ 32 | X = self.convert_to_array(X) 33 | new_X = copy(X) 34 | self.number_of_columns = new_X.shape[1] 35 | 36 | self.means = np.mean(new_X, axis = 0) 37 | new_X -= self.means 38 | cov = np.cov(new_X, rowvar = False) 39 | eigenvals , eigenvecs = np.linalg.eigh(cov) 40 | idx = np.argsort(eigenvals)[::-1] 41 | self.eigenvecs = eigenvecs[:,idx] 42 | self.eigenvals = eigenvals[idx] 43 | 44 | 45 | def predict(self, X): 46 | """ 47 | For each data point, compute whether each point 48 | lies within the ellipsoid created by 49 | (x/std_X)^2 + (y/std_y)^2 + (z/std_z)^2 + ... 
= threshold 50 | This is checked by converting each point to the new reduced 51 | eigen space, where the ellipsoid is centered on 0 52 | and each direction has an axis the size of the sqrt(eigenvalue) 53 | The standard deviation is that sqrt(eigenvalue) since the 54 | eigenvalue captures the variance in along the eigenvector. 55 | """ 56 | X = self.convert_to_array(X) 57 | new_X = copy(X) 58 | new_X -= self.means 59 | new_X = self.convert_to_pca_space(new_X) 60 | new_X /= np.sqrt(self.eigenvals) 61 | new_X = new_X**2 62 | result = np.ones(X.shape[0]) 63 | result[np.sum(new_X, axis=1) >= self.threshold] = -1 64 | return result 65 | 66 | def convert_to_pca_space(self, X): 67 | """ 68 | Converts the points to the new eigenspace 69 | """ 70 | return np.dot(X,self.eigenvecs) 71 | 72 | def fit_predict(self, X): 73 | """ 74 | Learn from X and then return the transformed version 75 | of X for the user to use. 76 | --- 77 | In: X (features); np.array or pandas dataframe/series 78 | """ 79 | self.fit(X) 80 | return self.predict(X) 81 | 82 | def pandas_to_numpy(self, x): 83 | """ 84 | Checks if the input is a Dataframe or series, converts to numpy matrix for 85 | calculation purposes. 86 | --- 87 | Input: X (array, dataframe, or series) 88 | Output: X (array) 89 | """ 90 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 91 | return x.as_matrix() 92 | if type(x) == type(np.array([1,2])): 93 | return x 94 | return np.array(x) 95 | 96 | def handle_1d_data(self,x): 97 | """ 98 | Converts 1 dimensional data into a series of rows with 1 columns 99 | instead of 1 row with many columns. 100 | """ 101 | if x.ndim == 1: 102 | x = x.reshape(-1,1) 103 | return x 104 | 105 | def convert_to_array(self, x): 106 | """ 107 | Takes in an input and converts it to a numpy array 108 | and then checks if it needs to be reshaped for us 109 | to use it properly 110 | """ 111 | x = self.pandas_to_numpy(x) 112 | x = self.handle_1d_data(x) 113 | return x -------------------------------------------------------------------------------- /zwml/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .kmeans import kmeans 3 | from .mean_shift import mean_shift 4 | from .spectral_clustering import spectral_clustering 5 | from .dbscan import dbscan 6 | from .agglomerative_clustering import agglomerative_clustering 7 | 8 | __all__ = ['kmeans','mean_shift','spectral_clustering','dbscan','agglomerative_clustering'] 9 | -------------------------------------------------------------------------------- /zwml/clustering/agglomerative_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from itertools import combinations 4 | 5 | class agglomerative_clustering: 6 | 7 | def __init__(self, linkage="average", n_clusters=5, max_dist=None): 8 | """ 9 | Agglomerative clustering uses a "linkage" function to measure 10 | how close together two current clusters are. It then merges 11 | the two closest clusters into a single bigger cluster. This 12 | process is repeated until there are n_clusters remaining, 13 | or some other cut-off is applied. If no cut-off applied, 14 | will eventually result in a single cluster of all data points. 15 | --- 16 | KWargs: 17 | linkage: how to measure cluster closeness. 
Options 18 | ('average','complete','minimal','ward') 19 | n_clusters: when n_clusters is reached, stop merging 20 | max_dist: if no clusters are closer than max_dist, stop merging 21 | """ 22 | self.link = linkage 23 | self.clusters = {} 24 | self.n_clusters = n_clusters 25 | self.max_dist = max_dist 26 | self.merge_tracker = [] 27 | self.data = None 28 | self.labels = None 29 | 30 | def euclidean_distance(self, pt1, pt2): 31 | """ 32 | Returns the distance. Currently only uses Euclidean distance. 33 | --- 34 | Input: Cluster (cluster object), data point (np array) 35 | Output: Distance (float) 36 | """ 37 | return np.sqrt(np.sum((pt1 - pt2)**2)) 38 | 39 | def compute_distance(self, idx1, idx2): 40 | """ 41 | Chooses how do decide "how close" two clusters are. Applies to 42 | proper measure and returns it. 43 | """ 44 | if self.link == 'average': 45 | return self.average_linkage(idx1, idx2) 46 | elif self.link == 'complete': 47 | return self.complete_linkage(idx1, idx2) 48 | elif self.link == 'minimal': 49 | return self.minimal_linkage(idx1, idx2) 50 | elif self.link == 'ward': 51 | return self.ward_linkage(idx1, idx2) 52 | else: 53 | raise TypeError("Not a proper linkage function selection!") 54 | 55 | def average_linkage(self, idx1, idx2): 56 | """ 57 | Finds the distance between the mean of cluster 1 and the mean 58 | of cluster 2. 59 | """ 60 | return self.euclidean_distance(self.clusters[idx1]['mean'], self.clusters[idx2]['mean']) 61 | 62 | def complete_linkage(self, idx1, idx2): 63 | """ 64 | Finds the maximum possible distance between points in 65 | cluster 1 and cluster 2. Meaning it returns the distance of the 66 | two points in the clusters that are furthest apart. 67 | """ 68 | max_dist = 0. 69 | for pt in self.clusters[idx1]['members']: 70 | for pt2 in self.clusters[idx2]['members']: 71 | dist = self.euclidean_distance(self.data[pt], self.data[pt2]) 72 | if dist > max_dist: 73 | max_dist = dist 74 | return max_dist 75 | 76 | def minimal_linkage(self, idx1, idx2): 77 | """ 78 | Finds the minimum possible distance between points in 79 | cluster 1 and cluster 2. Meaning it returns the distance of the 80 | two points in the clusters that are nearest together. 81 | """ 82 | min_dist = 99999999. 83 | for pt in self.clusters[idx1]['members']: 84 | for pt2 in self.clusters[idx2]['members']: 85 | dist = self.euclidean_distance(self.data[pt], self.data[pt2]) 86 | if dist < min_dist: 87 | min_dist = dist 88 | return min_dist 89 | 90 | def ward_linkage(self, idx1, idx2): 91 | """ 92 | Measures how far every point in each cluster is from its own 93 | cluster mean, called the inertia. Then "pretends to merge" the 94 | points and measures the inertia of the resulting mega-cluster. 95 | Returns the "gained" inertia by the pretend merge. 
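        A small gain means the merged cluster is nearly as compact as the two
        separate clusters were, so Ward merging favors pairs whose union stays tight.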
96 | """ 97 | inertia_1 = 0 98 | inertia_2 = 0 99 | inertia_combined = 0 100 | 101 | for pt in self.clusters[idx1]['members']: 102 | inertia_1 += self.euclidean_distance(self.data[pt], self.clusters[idx1]['mean']) 103 | for pt in self.clusters[idx2]['members']: 104 | inertia_2 += self.euclidean_distance(self.data[pt], self.clusters[idx2]['mean']) 105 | 106 | combined_members = self.clusters[idx1]['members'] + self.clusters[idx2]['members'] 107 | combined_mean = np.mean([self.data[i] for i in combined_members], axis=0) 108 | 109 | for pt in combined_members: 110 | inertia_combined += self.euclidean_distance(self.data[pt], combined_mean) 111 | 112 | return inertia_combined - inertia_1 - inertia_2 113 | 114 | def init_clusters(self, X): 115 | """ 116 | Create a lookup table where each point is its own cluster. 117 | As we merge clusters, we'll remove members and track the progress 118 | with this dictionary. 119 | """ 120 | for idx, pt in enumerate(X): 121 | self.clusters[idx] = {'members': [idx], 'mean': pt} 122 | self.data = X 123 | 124 | def merge_clusters(self, idx1, idx2, distance): 125 | """ 126 | Takes two clusters and makes them into a single, 127 | larger cluster. Also tracks the "distance" that the merge 128 | occurred at for future reference. 129 | """ 130 | self.clusters[idx1]['members'] += self.clusters[idx2]['members'] 131 | self.clusters[idx1]['mean'] = np.mean([self.data[i] for i in self.clusters[idx1]['members']], axis=0) 132 | self.clusters.pop(idx2, None) 133 | self.merge_tracker.append((idx1, idx2, distance)) 134 | 135 | def fit(self, X): 136 | """ 137 | Makes every point into its own cluster. Checks the 138 | linkage distance for all possible merges (using the 139 | combinations to see what merges are possible). Whatever 140 | clusters have the smallest linkage relationship are merged 141 | together into a new cluster which takes the id of the lower 142 | numbered cluster. Tracks the "size" of each merge for 143 | review. Repeat this until down to n_clusters or the distance 144 | is larger than the allowed maximum. Then label the clusters. 145 | --- 146 | Input: X (data, array/dataframe) 147 | """ 148 | X = self.convert_to_array(X) 149 | self.init_clusters(X) 150 | 151 | while len(self.clusters.keys()) > self.n_clusters: 152 | decision_tracker = {} 153 | for combo in combinations(self.clusters.keys(), r=2): 154 | decision_tracker[combo] = self.compute_distance(combo[0], combo[1]) 155 | to_merge = sorted(decision_tracker.items(), key=lambda x: x[1])[0][0] 156 | 157 | if self.max_dist is not None and self.link != 'ward' and decision_tracker[to_merge] > self.max_dist: 158 | break 159 | 160 | self.merge_clusters(to_merge[0], to_merge[1], decision_tracker[to_merge]) 161 | 162 | self.labels = np.zeros(X.shape[0]) 163 | for ix, clst in enumerate(self.clusters.keys()): 164 | members = self.clusters[clst]['members'] 165 | self.labels[members] = ix 166 | 167 | def fit_predict(self,X): 168 | """ 169 | Creates clusters for data X, and returns cluster IDs for each point. 170 | --- 171 | Input: X (data, array) 172 | Output: cluster IDs for X (array) 173 | """ 174 | self.fit(X) 175 | return self.labels 176 | 177 | def pandas_to_numpy(self, x): 178 | """ 179 | Checks if the input is a Dataframe or series, converts to numpy matrix for 180 | calculation purposes.
181 | --- 182 | Input: X (array, dataframe, or series) 183 | Output: X (array) 184 | """ 185 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 186 | return x.as_matrix() 187 | if type(x) == type(np.array([1,2])): 188 | return x 189 | return np.array(x) 190 | 191 | def handle_1d_data(self,x): 192 | """ 193 | Converts 1 dimensional data into a series of rows with 1 columns 194 | instead of 1 row with many columns. 195 | """ 196 | if x.ndim == 1: 197 | x = x.reshape(-1,1) 198 | return x 199 | 200 | def convert_to_array(self, x): 201 | """ 202 | Takes in an input and converts it to a numpy array 203 | and then checks if it needs to be reshaped for us 204 | to use it properly 205 | """ 206 | x = self.pandas_to_numpy(x) 207 | x = self.handle_1d_data(x) 208 | return x 209 | -------------------------------------------------------------------------------- /zwml/clustering/dbscan.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class dbscan: 5 | 6 | def __init__(self, epsilon=0.5, min_points=5): 7 | self.epsilon = epsilon 8 | self.min_points = min_points 9 | self.data_cols = None 10 | self.labels_ = None 11 | self.neighbors = {} 12 | 13 | def fit(self, X): 14 | X = self.pandas_to_numpy(X) 15 | if not self.data_cols: 16 | self.data_cols = X.shape[1] 17 | self.check_feature_shape(X) 18 | self.visited_points = [] 19 | self.clusters = [] 20 | 21 | for ix in range(X.shape[0]): 22 | if ix in self.visited_points: 23 | continue 24 | self.neighbors[ix] = self.get_neighbors(ix, X) 25 | if len(self.neighbors[ix]) >= self.min_points: 26 | self.visited_points.append(ix) 27 | self.clusters.append(self.make_cluster(ix, X)) 28 | 29 | self.labels_ = self.get_labels(X) 30 | 31 | def get_labels(self, X): 32 | labels = [-1]*X.shape[0] 33 | for clst_id, cluster in enumerate(self.clusters): 34 | for pt_id in cluster: 35 | labels[pt_id] = clst_id 36 | return np.array(labels) 37 | 38 | def make_cluster(self, ix, X): 39 | cluster = [ix] 40 | for neighbor in self.neighbors[ix]: 41 | if neighbor not in self.visited_points: 42 | self.visited_points.append(neighbor) 43 | self.neighbors[neighbor]= self.get_neighbors(ix, X) 44 | if len(self.neighbors[neighbor]) >= self.min_points: 45 | cluster_from_neighbor = self.make_cluster(neighbor, X) 46 | cluster = cluster + cluster_from_neighbor 47 | else: 48 | cluster.append(neighbor) 49 | return cluster 50 | 51 | def fit_predict(self,X): 52 | self.fit(X) 53 | return self.labels_ 54 | 55 | def get_neighbors(self, ix, X): 56 | neighbors = [] 57 | pt = X[ix] 58 | for ix2, pt2 in enumerate(X): 59 | dist = np.sqrt(np.sum((pt2 - pt)**2)) 60 | if dist <= self.epsilon: 61 | neighbors.append(ix2) 62 | return neighbors 63 | 64 | def check_feature_shape(self, x): 65 | """ 66 | Helper function to make sure any new data conforms to the fit data shape 67 | --- 68 | In: numpy array, (unknown shape) 69 | Out: numpy array, shape: (rows, self.data_cols)""" 70 | return x.reshape(-1,self.data_cols) 71 | 72 | def rbf_kernel(self, x1, x2, sig=1.): 73 | """ 74 | Returns the rbf affinity between two points (x1 and x2), 75 | for a given bandwidth (standard deviation). 
76 | --- 77 | Inputs: 78 | x1; point 1(array) 79 | x2; point 2(array) 80 | sig; standard deviation (float) 81 | """ 82 | diff = np.sum((x1-x2)**2) 83 | norm = 1/(np.sqrt(2*np.pi*sig**2)) 84 | return norm*np.exp(-diff/(2*sig**2)) 85 | 86 | def pandas_to_numpy(self, x): 87 | """ 88 | Checks if the input is a Dataframe or series, converts to numpy matrix for 89 | calculation purposes. 90 | --- 91 | Input: X (array, dataframe, or series) 92 | 93 | Output: X (array) 94 | """ 95 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 96 | return x.as_matrix() 97 | if type(x) == type(np.array([1,2])): 98 | return x 99 | return np.array(x) 100 | 101 | -------------------------------------------------------------------------------- /zwml/clustering/kmeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class kmeans: 5 | 6 | def __init__(self, k = 5, random_seed=None, iters=1000, n_init=10, init='kmeans++'): 7 | """ 8 | Kmeans is a clustering algorithm which involves randomly initializing a set 9 | of clusters, assigning points by distance metric, then updating the means. 10 | The algorithm terminates if movements stops or after "iters" iterations. 11 | --- 12 | Inputs: 13 | k: the number of clusters to create 14 | random_seed: sets the random seed for reproducibility 15 | iters: how many iterations to attempt before breaking 16 | n_init: Initialize and run the algorithm this many times, keeping the 17 | best clusters, as decided by score. 18 | init: How to initialize the clusters. KMeans++ performs better, but takes more 19 | calculation. It weights the starting points based on distance from one anothers. 20 | Options: 'Random' (randomly select data points to act as seeds), 21 | 'Kmeans++' (randomly select with distance squared weighting) 22 | """ 23 | self._k = int(k) 24 | self._iters = iters 25 | self._n_init = n_init 26 | if init not in ['kmeans++','random']: 27 | print("Not a valid initialization, defaulting to kmeans++") 28 | init = 'kmeans++' 29 | self._init = init 30 | if random_seed: 31 | np.random.seed(random_seed) 32 | 33 | def compute_distance_to_cluster_mean(self, clst, pt): 34 | """ 35 | Returns the distance to the cluster mean. Currently only uses Euclidean distance. 36 | --- 37 | Input: Cluster (cluster object), data point (np array) 38 | Output: Distance (float) 39 | """ 40 | return np.sqrt(np.sum((clst.mean - pt)**2)) 41 | 42 | def classify(self, pt): 43 | """ 44 | Add a data point to the closest cluster. 45 | --- 46 | Input: data point (array) 47 | """ 48 | cluster_num = self.get_clust_id(pt) 49 | self.clusters[cluster_num].add_member(pt) 50 | 51 | def get_clust_id(self,pt): 52 | """ 53 | Given a point, return clusterid for cluster who's mean is the closest. 54 | --- 55 | Input: point (array) 56 | Output: cluster ID (int) 57 | """ 58 | return min(range(self._k), key=lambda i: self.compute_distance_to_cluster_mean(self.clusters[i],pt)) 59 | 60 | def init_clusters(self, X): 61 | """ 62 | Select the initial starting points for the clusters. Two options: "random" which 63 | randomly draws starting points from the data AND "kmeans++" which randomly draws 64 | with distance based weighting. 
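        With kmeans++, each new seed is drawn with probability proportional to its
        summed squared distance from the seeds chosen so far, so later seeds tend
        to land far away from the existing ones.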
65 | --- 66 | Input: X (data, array) 67 | """ 68 | self.clusters = [self.cluster() for _ in range(0,self._k)] 69 | 70 | if self._init == 'random': 71 | rand_points = np.copy(X) 72 | np.random.shuffle(rand_points) 73 | rand_points = rand_points.tolist() 74 | for c in self.clusters: 75 | c.mean = rand_points.pop() 76 | else: # default to kmeans++ 77 | starting_points = [X[np.random.choice(np.arange(0,len(X)))]] 78 | 79 | for _ in range(self._k-1): 80 | dists = [] 81 | datum = [] 82 | for x in X: 83 | if np.sum([np.array_equal(x,row) for row in starting_points]): 84 | continue 85 | 86 | dist2 = 0. 87 | for sp in starting_points: 88 | dist2 += np.sum((x - sp)**2) 89 | dists.append(dist2) 90 | datum.append(x) 91 | dists = dists/np.sum(dists) 92 | starting_points.append(datum[np.random.choice(np.arange(0,len(datum)), p=dists)]) 93 | for c, sp in zip(self.clusters, starting_points): 94 | c.mean = sp 95 | 96 | for p in X: 97 | self.classify(p) 98 | 99 | def fit_predict(self,X): 100 | """ 101 | Creates clusters for data X, and returns cluster ID's for each point. 102 | --- 103 | Input: X (data, array) 104 | Output: cluster IDs for X (array) 105 | """ 106 | self.fit(X) 107 | return self.predict(X) 108 | 109 | def fit(self, X): 110 | """ 111 | Initializes clusters, then moves the mean of the cluster to the center of 112 | all points in the cluster. Reassigns all points to their new 'nearest' cluster 113 | and repeats this process until no more assignments can occur (or too many iterations). 114 | Whole procedure is repeated n_init times, to overcome local minima. Only the best 115 | clustering is kept as part of the model. 116 | --- 117 | Input: X (data, array/dataframe) 118 | """ 119 | X = self.pandas_to_numpy(X) 120 | 121 | best_inertia = None 122 | best_clusters = [] 123 | for _ in range(self._n_init): 124 | self.init_clusters(X) 125 | ischange = True 126 | i = 0 127 | while ischange and i < self._iters: 128 | ischange = False 129 | for c in self.clusters: 130 | c.get_mean() 131 | c.set_prev_members() 132 | c.members = [] 133 | 134 | for p in X: 135 | self.classify(p) 136 | 137 | for c in self.clusters: 138 | if c.is_changed(): 139 | ischange = True 140 | i += 1 141 | current_inertia = 0. 142 | for c in self.clusters: 143 | c.get_mean() 144 | current_inertia += c.get_total_square_distance() 145 | 146 | if not best_inertia or current_inertia < best_inertia: 147 | best_clusters = self.clusters 148 | best_inertia = current_inertia 149 | 150 | self.clusters = best_clusters 151 | self.inertia = best_inertia 152 | 153 | def predict(self, X): 154 | """ 155 | Given a point, the distance to each cluster center is calculated 156 | and the nearest cluster's ID is returned. 157 | --- 158 | Input: X (data, array/dataframe) 159 | """ 160 | clust_ids = [] 161 | for dt in self.pandas_to_numpy(X): 162 | clust_ids.append([self.get_clust_id(dt)]) 163 | return np.array(clust_ids) 164 | 165 | def pandas_to_numpy(self, x): 166 | """ 167 | Checks if the input is a Dataframe or series, converts to numpy matrix for 168 | calculation purposes. 169 | --- 170 | Input: X (array, dataframe, or series) 171 | 172 | Output: X (array) 173 | """ 174 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 175 | return x.as_matrix() 176 | if type(x) == type(np.array([1,2])): 177 | return x 178 | return np.array(x) 179 | 180 | def score(self): 181 | """ 182 | Inertia is a measure of the distance from each point to the cluster center, 183 | summed over all points and clusters. 
It's calculated during the fit 184 | procedure. 185 | --- 186 | Output: inertia (float) 187 | """ 188 | return self.inertia 189 | 190 | class cluster: 191 | def __init__(self): 192 | """ 193 | This sub-class stores all the information related to each cluster. 194 | mean: where is the average location of points in this cluster 195 | members: which data points are in this cluster 196 | prev_members: which data points were in this cluster last optimization step 197 | """ 198 | self.mean = None 199 | self.members = [] 200 | self.prev_members = [] 201 | 202 | def set_prev_members(self): 203 | """ 204 | Transfers current_members to prev_members for later comparison 205 | """ 206 | self.prev_members = self.members 207 | self.members = [] 208 | 209 | def add_member(self,pt): 210 | """ 211 | Helper function to add a point to this cluster. 212 | --- 213 | Input: data point (array) 214 | """ 215 | self.members.append(pt) 216 | 217 | def is_changed(self): 218 | """ 219 | Checks if this cluster has been modified by the most recent 220 | optimizatino step. 221 | --- 222 | Output: 223 | did cluster change (bool) 224 | """ 225 | return not np.array_equal(self.members,self.prev_members) 226 | 227 | def get_mean(self): 228 | means = [] 229 | for dim in np.array(self.members).T: 230 | means.append(np.mean(dim)) 231 | self.mean = means 232 | # if not len(self.members): 233 | # self.mean = [-999,-999] 234 | # return 235 | # x,y = 0.,0. 236 | # for p in self.members: 237 | # x+=p[0] 238 | # y+=p[1] 239 | # self.mean = [x/len(self.members),y/len(self.members)] 240 | 241 | def get_total_square_distance(self): 242 | val = 0. 243 | for p in self.members: 244 | val += np.sqrt(np.sum((self.mean - p)**2)) 245 | return val -------------------------------------------------------------------------------- /zwml/clustering/mean_shift.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from copy import copy 4 | 5 | class mean_shift: 6 | 7 | def __init__(self, bandwidth=1, iters=10, threshold = .1): 8 | self._iters = iters 9 | self.bandwidth = bandwidth 10 | self.data_cols = None 11 | self.threshold = threshold 12 | 13 | def fit(self, X): 14 | X = self.pandas_to_numpy(X) 15 | if not self.data_cols: 16 | self.data_cols = X.shape[1] 17 | self.check_feature_shape(X) 18 | self._original_data = copy(X) 19 | 20 | def transform(self, X): 21 | X = self.pandas_to_numpy(X) 22 | if not self.data_cols: 23 | self.data_cols = X.shape[1] 24 | X = self.check_feature_shape(X) 25 | new_X = [] 26 | for pt in X: 27 | movement = self.threshold+1 28 | it=0 29 | p = copy(pt) 30 | while it < self._iters and movement > self.threshold: 31 | shift = np.zeros(len(p)) 32 | scale = 0. 
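                # Every original point pulls p toward itself with an RBF weight of
                # width `bandwidth`; p is replaced by the weighted average, and the
                # loop stops once the move is below `threshold` or `iters` runs out.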
33 | for orig_pt in self._original_data: 34 | weight = self.rbf_kernel(p, orig_pt, sig=self.bandwidth) 35 | shift += weight*orig_pt 36 | scale += weight 37 | movement = p - shift/scale 38 | p = shift/scale 39 | movement = np.sqrt(np.sum(movement**2)) 40 | it+=1 41 | new_X.append(p) 42 | return new_X 43 | 44 | def fit_transform(self, X): 45 | self.fit(X) 46 | return self.transform(X) 47 | 48 | def check_feature_shape(self, x): 49 | """ 50 | Helper function to make sure any new data conforms to the fit data shape 51 | --- 52 | In: numpy array, (unknown shape) 53 | Out: numpy array, shape: (rows, self.data_cols)""" 54 | return x.reshape(-1,self.data_cols) 55 | 56 | def rbf_kernel(self, x1, x2, sig=1.): 57 | """ 58 | Returns the rbf affinity between two points (x1 and x2), 59 | for a given bandwidth (standard deviation). 60 | --- 61 | Inputs: 62 | x1; point 1(array) 63 | x2; point 2(array) 64 | sig; standard deviation (float) 65 | """ 66 | diff = np.sum((x1-x2)**2) 67 | norm = 1/(np.sqrt(2*np.pi*sig**2)) 68 | return norm*np.exp(-diff/(2*sig**2)) 69 | 70 | def pandas_to_numpy(self, x): 71 | """ 72 | Checks if the input is a Dataframe or series, converts to numpy matrix for 73 | calculation purposes. 74 | --- 75 | Input: X (array, dataframe, or series) 76 | 77 | Output: X (array) 78 | """ 79 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 80 | return x.as_matrix() 81 | if type(x) == type(np.array([1,2])): 82 | return x 83 | return np.array(x) -------------------------------------------------------------------------------- /zwml/clustering/spectral_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from zwml.clustering import kmeans 4 | 5 | class spectral_clustering: 6 | 7 | def __init__(self, k=3, connectivity=20, svd_dims=3, affinity='neighbors', bandwidth=1.): 8 | self.k = k 9 | self.connect = connectivity 10 | self.dims = svd_dims 11 | if affinity in ['neighbors', 'rbf']: 12 | self.affinity_type = affinity 13 | else: 14 | print("Not a valid affinity type, default to 'neighbors'.") 15 | self.affinity_type = 'neighbors' 16 | self.bandwidth = bandwidth 17 | 18 | def rbf_kernel(self, x1, x2, sig=1.): 19 | """ 20 | Returns the rbf affinity between two points (x1 and x2), 21 | for a given bandwidth (standard deviation). 22 | --- 23 | Inputs: 24 | x1; point 1(array) 25 | x2; point 2(array) 26 | sig; standard deviation (float) 27 | """ 28 | diff = np.sqrt(np.sum((x1-x2)**2)) 29 | norm = 1/(np.sqrt(2*np.pi*sig**2)) 30 | return norm*np.exp(-diff**2/(2*sig**2)) 31 | 32 | def compute_distance_between_all_points(self, pt1, pts, connectivity=None): 33 | """ 34 | Returns the distance between points. Currently only uses Euclidean distance. 35 | --- 36 | Input: data point, all data points (np arrays) 37 | Output: Distance (float) 38 | """ 39 | if self.affinity_type == 'neighbors': 40 | x = np.sqrt(np.sum((pt1 - pts)**2, axis=1)) 41 | idxs = x.argsort()[:connectivity] 42 | filt = np.ones(len(x), dtype=bool) 43 | filt[idxs] = False 44 | x[filt] = 0. 45 | x[~filt] = 1. 
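        # The 'neighbors' branch above yields a binary row marking the
        # `connectivity` nearest points; the 'rbf' branch below yields a smooth
        # Gaussian similarity to every point instead.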
46 | elif self.affinity_type == 'rbf': 47 | x = [] 48 | for p in pts: 49 | x.append(self.rbf_kernel(pt1, p, sig=self.bandwidth)) 50 | return x 51 | 52 | def fit(self, X): 53 | X = self.pandas_to_numpy(X) 54 | self.original_data = np.copy(X) 55 | self.similarity = np.array([self.compute_distance_between_all_points(p,X, connectivity=self.connect) for p in X]) 56 | self.similarity /= max(self.similarity.ravel()) 57 | self.U, self.Sigma, self.VT = self.do_svd(self.similarity) 58 | self.kmeans = kmeans(k=self.k) 59 | self.kmeans.fit(self.U) 60 | 61 | def fit_predict(self, X): 62 | self.fit(X) 63 | return self.predict(X) 64 | 65 | def transform_to_svd_space(self,X): 66 | sig_inv = np.linalg.inv(self.Sigma) 67 | return np.dot(np.dot(X,self.U),sig_inv) 68 | 69 | def predict(self, X): 70 | X = self.pandas_to_numpy(X) 71 | sim_space = [self.compute_distance_between_all_points(p,self.original_data, connectivity=self.connect) for p in X] 72 | transformed_X = np.array([self.transform_to_svd_space(x) for x in sim_space]) 73 | return self.kmeans.predict(transformed_X) 74 | 75 | def do_svd(self, similarity): 76 | dims = self.dims 77 | U, Sigma, VT = np.linalg.svd(similarity) 78 | VT = VT[:dims,:] 79 | U = U[:,:dims] 80 | Sigma = np.diag(Sigma[:dims]) 81 | return U, Sigma, VT 82 | 83 | def plot_similarity_matrix(self): 84 | plt.figure(dpi=200) 85 | plt.imshow(self.similarity, cmap=plt.cm.Blues) 86 | plt.xlabel("Point ID", fontsize=16) 87 | plt.ylabel("Point ID", fontsize=16) 88 | plt.title("Similarity Matrix (1 for neighbors, 0 for not)", fontsize=16); 89 | plt.colorbar(cmap=plt.cm.Blues); 90 | 91 | def pandas_to_numpy(self, x): 92 | """ 93 | Checks if the input is a Dataframe or series, converts to numpy matrix for 94 | calculation purposes. 95 | --- 96 | Input: X (array, dataframe, or series) 97 | 98 | Output: X (array) 99 | """ 100 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 101 | return x.as_matrix() 102 | if type(x) == type(np.array([1,2])): 103 | return x 104 | return np.array(x) -------------------------------------------------------------------------------- /zwml/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .make_regression import make_regression 2 | from .make_classification import make_classification 3 | from .datasets import load_iris 4 | 5 | __all__ = ['make_regression','make_classification','load_iris'] -------------------------------------------------------------------------------- /zwml/datasets/datasets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | 5 | def load_iris(as_dataframe=False): 6 | directory, _ = os.path.split(__file__) 7 | DATA_PATH = os.path.join(directory, "iris.data") 8 | data = pd.read_csv(DATA_PATH, header=None) 9 | data.columns = ['sepal_length','sepal_width','petal_length','petal_width','class'] 10 | if as_dataframe: 11 | return data 12 | X = data.iloc[:,:-1].as_matrix() 13 | y = data.iloc[:,-1] 14 | y = y.str.replace('Iris-setosa','0').replace('Iris-versicolor','1').replace('Iris-virginica','2') 15 | y = y.astype(int).as_matrix() 16 | return X,y -------------------------------------------------------------------------------- /zwml/datasets/iris.data: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 
5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 
7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica -------------------------------------------------------------------------------- /zwml/datasets/make_classification.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class make_classification: 4 | 5 | def __init__(self): 6 | self.model_params = None 7 | self.num_feats = None 8 | self.random_state = None 9 | self.is_clean = None 10 | self.noise = None 11 | 12 | def build_dataset(self, num_feat=10, num_rows_per_class=100, num_classes=2, random_state = None, num_important=10, 13 | misclassify=0.01, dirty_data=False): 14 | assert num_feat > 0 and num_rows_per_class > 0, "Must have rows and features > 0." 
15 | 16 | if random_state: 17 | np.random.seed(random_state) 18 | self.random_state = random_state 19 | 20 | if num_important > num_feat: 21 | num_important = num_feat 22 | 23 | self.num_important = num_important 24 | self.num_feats = num_feat 25 | 26 | means = np.random.uniform(-1,1,size=(num_classes, num_important)) 27 | sigmas = np.random.uniform(1e-6,0.5,size=(num_classes, num_important)) 28 | 29 | X = np.empty(num_important) 30 | y = np.zeros(num_rows_per_class) 31 | for i in range(0,num_classes): 32 | new_X = np.random.normal(means[i][0],sigmas[i][0],num_rows_per_class).reshape(-1,1) 33 | for j in range(1,num_important): 34 | col_X = np.random.normal(means[i][j],sigmas[i][j],num_rows_per_class).reshape(-1,1) 35 | new_X = np.hstack((new_X, col_X)) 36 | if not i: 37 | X = np.vstack((X, new_X))[1:] 38 | else: 39 | X = np.vstack((X, new_X)) 40 | y = np.hstack((y,[i]*num_rows_per_class)) 41 | 42 | # fill in the rest of the unimportant columns 43 | means = np.random.uniform(-1,1,num_feat-num_important) 44 | sigmas = np.random.uniform(1e-6,0.5, num_feat-num_important) 45 | for i in range(num_feat-num_important): 46 | X = np.hstack((X,np.random.normal(means[i],sigmas[i],X.shape[0]).reshape(-1,1))) 47 | 48 | #shuffle rows 49 | permute = np.random.permutation(len(X)) 50 | X = X[permute] 51 | y = y[permute] 52 | 53 | #shuffle columns 54 | np.random.shuffle(X.T) 55 | 56 | if dirty_data: 57 | X = self.muck_up_data(X) 58 | 59 | return X, y 60 | 61 | def muck_up_data(self, X, dup_cols=True, add_nan=True, combine_feats=True): 62 | if dup_cols: 63 | X = self._add_duplicate_columns(X, dup_cols) 64 | if combine_feats: 65 | X = self._combine_features(X, combine_feats) 66 | if add_nan: 67 | X = self._add_nans(X, add_nan) 68 | return X 69 | 70 | def _add_duplicate_columns(self,X, dup_cols): 71 | if isinstance(dup_cols, float): 72 | num_to_dupe = int(dup_cols*X.size) 73 | elif isinstance(dup_cols, bool): 74 | max_dupe = int((0.1*self.num_feats)+1.5) 75 | num_to_dupe = np.random.randint(1,max_dupe) 76 | elif isinstance(dup_cols, int): 77 | num_to_dupe = dup_cols 78 | else: 79 | raise TypeError('dup_cols must be type float, int, or bool.') 80 | 81 | cols_to_dup = np.random.choice(np.arange(self.num_feats), num_to_dupe, replace=False) 82 | new_X = np.hstack((X, X.T[cols_to_dup].T.reshape(-1,len(cols_to_dup)))) 83 | return new_X 84 | 85 | def _combine_features(self, X, combine_feats): 86 | if isinstance(combine_feats, float): 87 | num_to_dupe = int(combine_feats*X.size) 88 | elif isinstance(combine_feats, bool): 89 | max_dupe = int((0.1*self.num_feats)+1.5) 90 | num_to_dupe = np.random.randint(1,max_dupe) 91 | elif isinstance(combine_feats, int): 92 | num_to_dupe = combine_feats 93 | else: 94 | raise TypeError('combine_feats must be type float, int, or bool.') 95 | 96 | cols = np.random.choice(np.arange(self.num_feats), size=(num_to_dupe,2), replace=True) 97 | for col_set in cols: 98 | new_X = np.random.uniform(-1,1)*X.T[col_set[0]]+np.random.uniform(-1,1)*X.T[col_set[1]] 99 | X = np.hstack((X, new_X.T.reshape(-1,1))) 100 | return X 101 | 102 | def _add_nans(self, X, add_nan_val): 103 | if isinstance(add_nan_val, float): 104 | num_of_nans = int(add_nan_val*X.size) 105 | elif isinstance(add_nan_val, int): 106 | num_of_nans = add_nan_val 107 | else: 108 | max_nans = int(0.1*X.size) 109 | num_of_nans = np.random.randint(1,max_nans) 110 | 111 | for _ in range(num_of_nans): 112 | i = np.random.randint(0,X.shape[0]) 113 | j = np.random.randint(0,X.shape[1]) 114 | X[i,j] = np.nan 115 | return X 
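A minimal usage sketch for make_classification (illustrative only, not a file in the repository; the variable name and parameter values below are assumptions):

    from zwml.datasets import make_classification

    maker = make_classification()
    # 3 classes with 50 rows each; 3 informative features plus 2 noise features
    X, y = maker.build_dataset(num_feat=5, num_rows_per_class=50, num_classes=3,
                               num_important=3, random_state=42)
    print(X.shape, y.shape)  # (150, 5) (150,)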
-------------------------------------------------------------------------------- /zwml/datasets/make_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class make_regression: 4 | 5 | def __init__(self): 6 | self.model_params = None 7 | self.num_feats = None 8 | self.random_state = None 9 | self.is_clean = None 10 | self.noise = None 11 | self.bias = None 12 | 13 | def build_dataset(self, num_feat=10, num_rows=100, random_state = None, num_important=10, 14 | noise=0.1, bias=None, dirty_data=False): 15 | assert num_feat > 0 and num_rows > 0, "Must have rows and features > 0." 16 | if random_state: 17 | np.random.seed(random_state) 18 | self.random_state = random_state 19 | 20 | means = np.random.uniform(-1,1, size=num_feat) 21 | sigmas = np.random.uniform(1e-6,1, size=num_feat) 22 | X = np.zeros((num_rows, num_feat)) 23 | for i, mu in enumerate(means): 24 | X.T[i] = np.random.normal(mu, sigmas[i], num_rows) 25 | 26 | if bias == True: 27 | bias = np.random.uniform(-1,1) 28 | elif isinstance(bias, float): 29 | pass 30 | else: 31 | bias = 0. 32 | 33 | self.bias = bias 34 | 35 | if num_important > num_feat: 36 | num_important = num_feat 37 | 38 | self.num_important = num_important 39 | self.num_feats = num_feat 40 | 41 | target_builder = np.random.choice(np.arange(num_feat),num_important, replace=False) 42 | X_target = X.T[target_builder].T 43 | betas = np.random.uniform(-1,1,num_important) 44 | params = [] 45 | for i,j in zip(betas, target_builder): 46 | params.append((j,i)) 47 | self.model_params = params 48 | 49 | y = np.sum(X_target*betas, axis=1) + bias + np.random.normal(0, noise, num_rows) 50 | 51 | if dirty_data: 52 | X = self.muck_up_data(X) 53 | 54 | return X, y 55 | 56 | def muck_up_data(self, X, dup_cols=True, add_nan=True, combine_feats=True): 57 | if dup_cols: 58 | X = self._add_duplicate_columns(X, dup_cols) 59 | if combine_feats: 60 | X = self._combine_features(X, combine_feats) 61 | if add_nan: 62 | X = self._add_nans(X, add_nan) 63 | return X 64 | 65 | def _add_duplicate_columns(self,X, dup_cols): 66 | if isinstance(dup_cols, float): 67 | num_to_dupe = int(dup_cols*X.size) 68 | elif isinstance(dup_cols, bool): 69 | max_dupe = int((0.1*self.num_feats)+1.5) 70 | num_to_dupe = np.random.randint(1,max_dupe) 71 | elif isinstance(dup_cols, int): 72 | num_to_dupe = dup_cols 73 | else: 74 | raise TypeError('dup_cols must be type float, int, or bool.') 75 | 76 | cols_to_dup = np.random.choice(np.arange(self.num_feats), num_to_dupe, replace=False) 77 | new_X = np.hstack((X, X.T[cols_to_dup].T.reshape(-1,len(cols_to_dup)))) 78 | return new_X 79 | 80 | def _combine_features(self, X, combine_feats): 81 | if isinstance(combine_feats, float): 82 | num_to_dupe = int(combine_feats*X.size) 83 | elif isinstance(combine_feats, bool): 84 | max_dupe = int((0.1*self.num_feats)+1.5) 85 | num_to_dupe = np.random.randint(1,max_dupe) 86 | elif isinstance(combine_feats, int): 87 | num_to_dupe = combine_feats 88 | else: 89 | raise TypeError('combine_feats must be type float, int, or bool.') 90 | 91 | cols = np.random.choice(np.arange(self.num_feats), size=(num_to_dupe,2), replace=True) 92 | for col_set in cols: 93 | new_X = np.random.uniform(-1,1)*X.T[col_set[0]]+np.random.uniform(-1,1)*X.T[col_set[1]] 94 | X = np.hstack((X, new_X.T.reshape(-1,1))) 95 | return X 96 | 97 | def _add_nans(self, X, add_nan_val): 98 | if isinstance(add_nan_val, float): 99 | num_of_nans = int(add_nan_val*X.size) 100 | elif isinstance(add_nan_val, int): 
101 | num_of_nans = add_nan_val 102 | else: 103 | max_nans = int(0.1*X.size) 104 | num_of_nans = np.random.randint(1,max_nans) 105 | 106 | for _ in range(num_of_nans): 107 | i = np.random.randint(0,X.shape[0]) 108 | j = np.random.randint(0,X.shape[1]) 109 | X[i,j] = np.nan 110 | return X -------------------------------------------------------------------------------- /zwml/linear_models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .sgd_classifier import sgd_classifier 3 | from .sgd_regressor import sgd_regressor 4 | from .elastic_net_regressor import elastic_net_regressor 5 | from .lasso_regressor import lasso_regressor 6 | from .ridge_regressor import ridge_regressor 7 | from .linear_regression import linear_regression 8 | 9 | __all__ = ['linear_regression','ridge_regressor','lasso_regressor','elastic_net_regressor','sgd_regressor','sgd_classifier'] 10 | -------------------------------------------------------------------------------- /zwml/linear_models/elastic_net_regressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import sys 4 | from zwml.linear_models import sgd_regressor 5 | 6 | class elastic_net_regressor(sgd_regressor): 7 | 8 | def __init__(self, n_iter=100, alpha=0.01, verbose=False, return_steps=False, fit_intercept=True, 9 | dynamic=True, loss='ols', epsilon=0.1, lamb=1e-6, l1_perc = 0.5): 10 | """ 11 | Ridge Regressor - This is a wrapper on the SGD class where the regularization is set 12 | to the L2 Norm. All other functionality is the same as the SGD class. 13 | --- 14 | KWargs: 15 | 16 | n_iter: number of epochs to run in while fitting to the data. Total number of steps 17 | will be n_iter*X.shape[0]. 18 | 19 | alpha: The learning rate. Moderates the step size during the gradient descent algorithm. 20 | 21 | verbose: Whether to print out coefficient information during the epochs 22 | 23 | return_steps: If True, fit returns a list of the coefficients at each update step for diagnostics 24 | 25 | fit_intercept: If True, an extra coefficient is added with no associated feature to act as the 26 | base prediction if all X are 0. 27 | 28 | dynamic: If true, an annealing scedule is used to scale the learning rate. 29 | 30 | lamb: Stands for lambda. Sets the strength of the regularization. Large lambda causes large 31 | regression. If regularization is off, this does not apply to anything. 32 | 33 | l1_perc: If using elastic net, this variable sets what portion of the penalty is L1 vs L2. 34 | If regularize='EN' and l1_perc = 1, equivalent to regularize='L1'. If 35 | regularize='EN' and l1_perc = 0, equivalent to regulzarize='L2'. 
36 | """ 37 | self.coef_ = None 38 | self.trained = False 39 | self.n_iter = n_iter 40 | self.alpha_ = alpha 41 | self.verbosity = verbose 42 | self._return_steps = return_steps 43 | self._fit_intercept = fit_intercept 44 | self._next_alpha_shift = 0.1 # Only used if dynamic=True 45 | self._dynamic = dynamic 46 | self._regularize = 'EN' 47 | self._lamb = lamb 48 | self._l1_perc = l1_perc 49 | -------------------------------------------------------------------------------- /zwml/linear_models/lasso_regressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import sys 4 | from zwml.linear_models import sgd_regressor 5 | 6 | class lasso_regressor(sgd_regressor): 7 | 8 | def __init__(self, n_iter=100, alpha=0.01, verbose=False, return_steps=False, fit_intercept=True, 9 | dynamic=True, loss='ols', epsilon=0.1, lamb=1e-6, l1_perc = 0.5): 10 | """ 11 | Lasso Regressor - This is a wrapper on the SGD class where the regularization is set 12 | to the L1 Norm. All other functionality is the same as the SGD class. 13 | --- 14 | KWargs: 15 | 16 | n_iter: number of epochs to run in while fitting to the data. Total number of steps 17 | will be n_iter*X.shape[0]. 18 | 19 | alpha: The learning rate. Moderates the step size during the gradient descent algorithm. 20 | 21 | verbose: Whether to print out coefficient information during the epochs 22 | 23 | return_steps: If True, fit returns a list of the coefficients at each update step for diagnostics 24 | 25 | fit_intercept: If True, an extra coefficient is added with no associated feature to act as the 26 | base prediction if all X are 0. 27 | 28 | dynamic: If true, an annealing scedule is used to scale the learning rate. 29 | 30 | lamb: Stands for lambda. Sets the strength of the regularization. Large lambda causes large 31 | regression. If regularization is off, this does not apply to anything. 32 | 33 | l1_perc: If using elastic net, this variable sets what portion of the penalty is L1 vs L2. 34 | If regularize='EN' and l1_perc = 1, equivalent to regularize='L1'. If 35 | regularize='EN' and l1_perc = 0, equivalent to regulzarize='L2'. 36 | """ 37 | self.coef_ = None 38 | self.trained = False 39 | self.n_iter = n_iter 40 | self.alpha_ = alpha 41 | self.verbosity = verbose 42 | self._return_steps = return_steps 43 | self._fit_intercept = fit_intercept 44 | self._next_alpha_shift = 0.1 # Only used if dynamic=True 45 | self._dynamic = dynamic 46 | self._regularize = 'L1' 47 | self._lamb = lamb 48 | self._l1_perc = l1_perc 49 | -------------------------------------------------------------------------------- /zwml/linear_models/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class linear_regression: 4 | 5 | def __init__(self, w_intercept=True): 6 | """ 7 | Performs linear regression using the direct matrix solution 8 | from linear algebra. Minimizes the sum of squared errors of 9 | all included data points by drawing a best fit line and 10 | extracting the coefficients of that line. 11 | --- 12 | KWargs: 13 | w_intercept: flag to choose whether to include a y-intercept 14 | term in the calculation or not. 
15 | """ 16 | self.coef_ = None 17 | self.intercept = w_intercept 18 | self.is_fit = False 19 | 20 | def add_intercept(self, X): 21 | """ 22 | Adds an 'all 1's' bias term to function as the y-intercept 23 | """ 24 | rows = X.shape[0] 25 | inter = np.ones(rows).reshape(-1,1) 26 | return np.hstack((X,inter)) 27 | 28 | def fit(self, X, y): 29 | """ 30 | Read in X (all features) and y (target) and use the Linear Algebra solution 31 | to extract the coefficients for Linear Regression. 32 | """ 33 | X = self.convert_to_array(X) 34 | y = self.convert_to_array(y) 35 | 36 | if self.intercept: 37 | X = self.add_intercept(X) 38 | 39 | temp_xtx = np.linalg.inv(np.dot(X.T,X)) 40 | temp_xty = np.dot(X.T,y) 41 | self.coef_ = np.dot(temp_xtx,temp_xty) 42 | self.is_fit = True 43 | 44 | def predict(self, X): 45 | """ 46 | Takes in a new X value (that must be the same shape as the original X for fitting) 47 | and returns the predicted y value, using the coefficients from fitting. 48 | """ 49 | if not self.is_fit: 50 | raise ValueError("You have to run the 'fit' method before using predict!") 51 | 52 | X = self.convert_to_array(X) 53 | if self.intercept: 54 | X = self.add_intercept(X) 55 | return np.dot(X,self.coef_) 56 | 57 | def pandas_to_numpy(self, x): 58 | """ 59 | Checks if the input is a Dataframe or series, converts to numpy matrix for 60 | calculation purposes. 61 | --- 62 | Input: X (array, dataframe, or series) 63 | Output: X (array) 64 | """ 65 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 66 | return x.as_matrix() 67 | if type(x) == type(np.array([1,2])): 68 | return x 69 | return np.array(x) 70 | 71 | def handle_1d_data(self,x): 72 | """ 73 | Converts 1 dimensional data into a series of rows with 1 columns 74 | instead of 1 row with many columns. 75 | """ 76 | if x.ndim == 1: 77 | x = x.reshape(-1,1) 78 | return x 79 | 80 | def convert_to_array(self, x): 81 | """ 82 | Takes in an input and converts it to a numpy array 83 | and then checks if it needs to be reshaped for us 84 | to use it properly 85 | """ 86 | x = self.pandas_to_numpy(x) 87 | x = self.handle_1d_data(x) 88 | return x 89 | 90 | def score(self, X, y): 91 | """ 92 | Uses the predict method to measure the (negative) 93 | mean squared error of the model. 94 | --- 95 | In: X (list or array), feature matrix; y (list or array) labels 96 | Out: negative mean squared error (float) 97 | """ 98 | X = self.convert_to_array(X) 99 | y = self.convert_to_array(y) 100 | pred = self.predict(X) 101 | return -1.* np.mean((np.array(pred)-np.array(y))**2) -------------------------------------------------------------------------------- /zwml/linear_models/ridge_regressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import sys 4 | from zwml.linear_models import sgd_regressor 5 | 6 | class ridge_regressor(sgd_regressor): 7 | 8 | def __init__(self, n_iter=100, alpha=0.01, verbose=False, return_steps=False, fit_intercept=True, 9 | dynamic=True, loss='ols', epsilon=0.1, lamb=1e-6, l1_perc = 0.5): 10 | """ 11 | Ridge Regressor - This is a wrapper on the SGD class where the regularization is set 12 | to the L2 Norm. All other functionality is the same as the SGD class. 13 | --- 14 | KWargs: 15 | 16 | n_iter: number of epochs to run in while fitting to the data. Total number of steps 17 | will be n_iter*X.shape[0]. 18 | 19 | alpha: The learning rate. Moderates the step size during the gradient descent algorithm. 
20 | 21 | verbose: Whether to print out coefficient information during the epochs 22 | 23 | return_steps: If True, fit returns a list of the coefficients at each update step for diagnostics 24 | 25 | fit_intercept: If True, an extra coefficient is added with no associated feature to act as the 26 | base prediction if all X are 0. 27 | 28 | dynamic: If true, an annealing scedule is used to scale the learning rate. 29 | 30 | lamb: Stands for lambda. Sets the strength of the regularization. Large lambda causes large 31 | regression. If regularization is off, this does not apply to anything. 32 | 33 | l1_perc: If using elastic net, this variable sets what portion of the penalty is L1 vs L2. 34 | If regularize='EN' and l1_perc = 1, equivalent to regularize='L1'. If 35 | regularize='EN' and l1_perc = 0, equivalent to regulzarize='L2'. 36 | """ 37 | self.coef_ = None 38 | self.trained = False 39 | self.n_iter = n_iter 40 | self.alpha_ = alpha 41 | self.verbosity = verbose 42 | self._return_steps = return_steps 43 | self._fit_intercept = fit_intercept 44 | self._next_alpha_shift = 0.1 # Only used if dynamic=True 45 | self._dynamic = dynamic 46 | self._regularize = 'L2' 47 | self._lamb = lamb 48 | self._l1_perc = l1_perc 49 | -------------------------------------------------------------------------------- /zwml/linear_models/sgd_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class sgd_classifier: 5 | 6 | def __init__(self, n_iter=10, alpha=0.01, verbose=False, return_steps=False, fit_intercept=True, 7 | dynamic=False, loss='ols', epsilon=0.1, random_state=None): 8 | """ 9 | Stochastic Gradient Descent Algorithm, with Logistic Regression 10 | cost function. 11 | --- 12 | KWargs: 13 | 14 | n_iter: number of epochs to run in while fitting to the data. 15 | Total number of steps will be n_iter*X.shape[0]. 16 | alpha: The learning rate. Moderates the step size during the 17 | gradient descent algorithm. 18 | verbose: Whether to print out coefficient information during 19 | the epochs 20 | return_steps: If True, fit returns a list of the coefficients 21 | at each update step for diagnostics 22 | fit_intercept: If True, an extra coefficient is added with no 23 | associated feature to act as the base prediction if all X are 0. 24 | dynamic: If true, an annealing scedule is used to scale the learning rate. 25 | """ 26 | self.coef_ = None 27 | self.trained = False 28 | self.n_iter = n_iter 29 | self.alpha_ = alpha 30 | self.verbosity = verbose 31 | self._return_steps = return_steps 32 | self._fit_intercept = fit_intercept 33 | self._next_alpha_shift = 0.1 # Only used if dynamic=True 34 | self._dynamic = dynamic 35 | if random_state: 36 | np.random.seed(random_state) 37 | self._data_cols = None 38 | 39 | def update(self, x, error): 40 | """ 41 | Calculating the change of the coeficients for SGD. This is the derivative of the cost 42 | function. B_i = B_i - alpha * dJ/dB_i. If fit_intercept=True, a slightly different 43 | value is used to update the intercept coefficient, since the associated feature is "1." 44 | --- 45 | Inputs: 46 | 47 | data_point: A single row of the feature matrix. Since this is Stochastic, batches are not allowed. 48 | 49 | error: The residual for the current data point, given the current coefficients. Prediction - True 50 | for the current datapoint and coefficients. 
51 | """ 52 | step = self.alpha_*error*x 53 | if self._fit_intercept: 54 | self.coef_[1:] = self.coef_[1:] - step 55 | self.coef_[0] = self.coef_[0] - self.alpha_ * error 56 | else: 57 | self.coef_ = self.coef_ - step 58 | 59 | def shuffle_data(self, X, y): 60 | """ 61 | Given X and y, shuffle them together to get a new_X and new_y that maintain feature-target 62 | correlations. 63 | --- 64 | Inputs: 65 | 66 | X: A numpy array of any shape 67 | y: A numpy array of any shape 68 | 69 | Both X and y must have the same first dimension length. 70 | 71 | Returns: 72 | X,y: two numpy arrays 73 | """ 74 | assert len(X) == len(y) 75 | permute = np.random.permutation(len(y)) 76 | return X[permute], y[permute] 77 | 78 | def dynamic_learning_rate_check(self, epoch): 79 | """ 80 | If dynamic=True, shrink the learning rate by a factor of 2 after every 10% of 81 | the total number of epochs. This should cause a more direct path to the global 82 | minimum after the initial large steps. 83 | --- 84 | Inputs: epoch (int,float), the current iteration number. 85 | """ 86 | percent_of_epochs = float(epoch)/float(self.n_iter) 87 | if percent_of_epochs > self._next_alpha_shift: 88 | self._next_alpha_shift += 0.1 89 | self.alpha_ = self.alpha_/2 90 | 91 | def fit(self, X, y): 92 | """ 93 | Actually trains the model. Given feature-target combinations, gradient descent is performed 94 | using the optimization stepping given in the 'update' function. At present, all epochs are 95 | completed, as no tolerance is set. The learning rate is currently fixed. 96 | --- 97 | Inputs: 98 | X (array, dataframe, series), The features to regress on using SGD 99 | y (array, series), Must be a 1D set of targets. 100 | Outputs: 101 | steps (optional): If return_steps=True, a list of the evolution of the coefficients is returned 102 | """ 103 | X = self.convert_to_array(X) 104 | y = self.convert_to_array(y) 105 | self._stdy = np.std(y) 106 | self.coef_ = self.init_coef(X) 107 | if self._return_steps: 108 | steps = [] 109 | steps.append(np.copy(self.coef_)) 110 | for epoch in range(self.n_iter): 111 | shuf_X, shuf_y = self.shuffle_data(X,y) 112 | if self.verbosity: 113 | print("Epoch ", epoch, ", Coeff: ", self.coef_) 114 | for data, true in zip(shuf_X,shuf_y): 115 | pred = self.predict_proba(data, is_array=True) 116 | error = pred - true 117 | self.update(data, error) 118 | if self._return_steps: 119 | steps.append(np.copy(self.coef_)) 120 | if self._dynamic: 121 | self.dynamic_learning_rate_check(epoch) 122 | if self._return_steps: 123 | return steps 124 | 125 | def init_coef(self, X): 126 | """ 127 | Returns the initial starting values for the coefficients. At present, these are randomly 128 | set. If fit_intercept = True, an extra coefficient is generated. 129 | --- 130 | Input: X, Feature matrix. Needed to decide how many coefficients to generate. 131 | """ 132 | if self._fit_intercept: 133 | return np.random.rand(X.shape[1]+1) 134 | return np.random.rand(X.shape[1]) 135 | 136 | def predict_proba(self, X, is_array=False): 137 | """ 138 | Returns a prediction for a new data set, using the model coefficients. 139 | --- 140 | Input: 141 | X (dataframe, array): The new feature set. Must be the same number of columns 142 | as the initial training features. 143 | Output: 144 | prediction (array): The dot product of the input data and the coeficients. 
145 | """ 146 | if not is_array: 147 | X = self.convert_to_array(X) 148 | if not self.coef_.all(): 149 | raise ValueError("Coefficients not defined, must fit() before predict().") 150 | if self._fit_intercept: 151 | return self.logit(np.dot(X,self.coef_[1:]) + self.coef_[0]) 152 | 153 | return self.logit(np.dot(X,self.coef_)) 154 | 155 | def predict(self, X, threshold=0.5): 156 | """ 157 | Takes the output of predict_proba and applies a threshold 158 | to the probability value. If the value is greater than the 159 | threshold, labels the row as class 1. Else class 0. 160 | """ 161 | preds = self.predict_proba(X) 162 | preds[preds >= threshold] = 1 163 | preds[preds < threshold] = 0 164 | return preds.reshape(-1,1) 165 | 166 | def logit(self, beta_x): 167 | """ 168 | Applies the sigmoid or logit function to current 169 | linear prediction from beta * X. 170 | """ 171 | denom = 1. - np.exp(-beta_x) 172 | val = 1./denom 173 | 174 | if type(val) != 'numpy.ndarray': 175 | val = np.array([val]) 176 | 177 | # Handle rounding errors! 178 | val[val>1] = 1 179 | val[val<0] = 0 180 | return val 181 | 182 | def score(self, X, y): 183 | """ 184 | Uses the predict method to measure the accuracy of the model. 185 | --- 186 | In: X (list or array), feature matrix; y (list or array) labels 187 | Out: accuracy (float) 188 | """ 189 | pred = self.predict(X) 190 | correct = 0 191 | for i,j in zip(y,pred): 192 | if i == j: 193 | correct+=1 194 | return float(correct)/float(len(y)) 195 | 196 | def pandas_to_numpy(self, x): 197 | """ 198 | Checks if the input is a Dataframe or series, converts to numpy matrix for 199 | calculation purposes. 200 | --- 201 | Input: X (array, dataframe, or series) 202 | Output: X (array) 203 | """ 204 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 205 | return x.as_matrix() 206 | if type(x) == type(np.array([1,2])): 207 | return x 208 | return np.array(x) 209 | 210 | def handle_1d_data(self,x): 211 | """ 212 | Converts 1 dimensional data into a series of rows with 1 columns 213 | instead of 1 row with many columns. 
214 | """ 215 | if x.ndim == 1: 216 | x = x.reshape(-1,1) 217 | return x 218 | 219 | def convert_to_array(self, x): 220 | """ 221 | Takes in an input and converts it to a numpy array 222 | and then checks if it needs to be reshaped for us 223 | to use it properly 224 | """ 225 | x = self.pandas_to_numpy(x) 226 | x = self.handle_1d_data(x) 227 | return x -------------------------------------------------------------------------------- /zwml/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .regression_metrics import * 2 | from .classification_metrics import * 3 | from .pairwise_distance import * 4 | 5 | __all__ = ['get_error','mean_square_error','root_mean_square_error','mean_absolute_error','sum_square_error','r2_score','adj_r2','assess_model','test_regression_results', 'accuracy','precision','recall','f1_score','average_precision','average_recall','average_f1','confusion_matrix','pretty_confusion_matrix','classification_report', 6 | 'pandas_to_numpy','manhattan_distance','euclidean_distance','cosine_similarity_without_numpy','cosine_similarity','gaussian_kernel','uniform_kernel','rbf_kernel'] 7 | -------------------------------------------------------------------------------- /zwml/metrics/classification_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | def accuracy(true, pred): 6 | true = pandas_to_numpy(true) 7 | pred = pandas_to_numpy(pred) 8 | mask = (true == pred) 9 | numeric_mask = mask.astype(int) 10 | correct = np.sum(numeric_mask) 11 | return correct/true.shape[0] 12 | 13 | def get_precision_score(cls, true, pred): 14 | mask = (pred == cls) 15 | pred_of_cls = pred[mask] 16 | trues = true[mask] 17 | prec = accuracy(trues, pred_of_cls) 18 | return prec 19 | 20 | def precision(true, pred): 21 | true = pandas_to_numpy(true) 22 | pred = pandas_to_numpy(pred) 23 | unique_pred = np.unique(pred) 24 | precision_result = {} 25 | for cls in unique_pred: 26 | precision_result[cls] = get_precision_score(cls, true, pred) 27 | return precision_result 28 | 29 | def average_precision(true,pred): 30 | true = pandas_to_numpy(true) 31 | pred = pandas_to_numpy(pred) 32 | prec = precision(true,pred) 33 | 34 | clses = 0 35 | pr_score = 0. 36 | for pr in prec.values(): 37 | clses += 1 38 | pr_score += pr 39 | return pr_score/clses 40 | 41 | def get_recall_score(cls, true, pred): 42 | mask = (true == cls) 43 | pred_of_cls = pred[mask] 44 | trues = true[mask] 45 | reca = accuracy(trues, pred_of_cls) 46 | return reca 47 | 48 | def recall(true, pred): 49 | true = pandas_to_numpy(true) 50 | pred = pandas_to_numpy(pred) 51 | unique_true = np.unique(true) 52 | recall_result = {} 53 | for cls in unique_true: 54 | recall_result[cls] = get_recall_score(cls, true, pred) 55 | return recall_result 56 | 57 | def average_recall(true,pred): 58 | true = pandas_to_numpy(true) 59 | pred = pandas_to_numpy(pred) 60 | reca = recall(true,pred) 61 | 62 | clses = 0 63 | rc_score = 0. 64 | for rc in reca.values(): 65 | clses += 1 66 | rc_score += rc 67 | return rc_score/clses 68 | 69 | def f1_score(true, pred): 70 | rec = recall(true,pred) 71 | prec = precision(true,pred) 72 | f1 = {} 73 | for key in rec.keys(): 74 | f1[key] = (2*rec[key]*prec[key])/(rec[key]+prec[key]) 75 | return f1 76 | 77 | def average_f1(true,pred): 78 | f1 = f1_score(true,pred) 79 | 80 | clses = 0 81 | f1_scr = 0. 
82 | for f1 in f1.values(): 83 | clses += 1 84 | f1_scr += f1 85 | return f1_scr/clses 86 | 87 | def classification_report(true, pred): 88 | prec = precision(true,pred) 89 | reca = recall(true,pred) 90 | f1 = f1_score(true,pred) 91 | acc = accuracy(true,pred) 92 | clses = np.unique(true) 93 | fill_empty_slots(clses, [prec, reca, f1]) 94 | for cls in clses: 95 | print("--- Label %s ---"%str(cls)) 96 | print("Precision: %.3f"%prec[cls]) 97 | print("Recall: %.3f"%reca[cls]) 98 | print("F1: %.3f\n"%f1[cls]) 99 | print("--- Average ---") 100 | print("Precision: %.3f"%average_precision(true,pred)) 101 | print("Recall: %.3f"%average_recall(true,pred)) 102 | print("F1: %.3f"%average_f1(true,pred)) 103 | print("Accuracy: %.3f"%accuracy(true,pred)) 104 | 105 | def fill_empty_slots(clses, metrics): 106 | for metric in metrics: 107 | for cls in clses: 108 | if cls not in metric: 109 | metric[cls] = 0. 110 | 111 | def confusion_matrix(true,pred): 112 | true = pandas_to_numpy(true) 113 | pred = pandas_to_numpy(pred) 114 | unique_true = np.unique(true) 115 | 116 | cm = np.zeros((unique_true.shape[0], unique_true.shape[0])) 117 | 118 | for cls in unique_true: 119 | mask = (true == cls) 120 | pred_of_cls = pred[mask] 121 | counts = np.unique(pred_of_cls, return_counts=True) 122 | for pred_cls, count in zip(*counts): 123 | cm[cls][pred_cls] = count 124 | return cm 125 | 126 | def pretty_confusion_matrix(true,pred, show_text=True): 127 | cm = confusion_matrix(true,pred) 128 | plt.figure(dpi=250) 129 | plt.imshow(cm, cmap=plt.cm.RdBu) 130 | plt.grid(False) 131 | plt.colorbar() 132 | ax = plt.gca() 133 | if show_text: 134 | for (j,i),label in np.ndenumerate(cm): 135 | ax.text(i,j,label,ha='center',va='center', fontsize=20, color='w') 136 | plt.xticks(list(range(cm.shape[0]))) 137 | plt.yticks(list(range(cm.shape[1]))); 138 | plt.xlabel("True") 139 | plt.ylabel("Predicted"); 140 | plt.show(); 141 | return cm 142 | 143 | def pandas_to_numpy(x): 144 | """ 145 | Checks if the input is a Dataframe or series, converts to numpy matrix for 146 | calculation purposes. 147 | --- 148 | Input: X (array, dataframe, or series) 149 | Output: X (array) 150 | """ 151 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 152 | return x.as_matrix() 153 | if type(x) == type(np.array([1,2])): 154 | return x 155 | return np.array(x) -------------------------------------------------------------------------------- /zwml/metrics/pairwise_distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def pandas_to_numpy(x): 5 | """ 6 | Checks if the input is a Dataframe or series, converts to numpy matrix for 7 | calculation purposes. 8 | --- 9 | Input: X (array, dataframe, or series) 10 | Output: X (array) 11 | """ 12 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 13 | return x.as_matrix() 14 | if type(x) == type(np.array([1,2])): 15 | return x 16 | return np.array(x) 17 | 18 | def manhattan_distance(vec1, vec2): 19 | """ 20 | Manhattan distance measures the distance along 21 | each direction and sums them together. 22 | """ 23 | vec1 = pandas_to_numpy(vec1) 24 | vec2 = pandas_to_numpy(vec2) 25 | return np.sum(np.abs(vec1-vec2)) 26 | 27 | def euclidean_distance(vec1, vec2): 28 | """ 29 | Calculating the Euclidean distance which is 30 | the more traditional method for distance 31 | calculation. sqrt((x1-x2)^2 + (y1-y2)^2 + ...) 
32 | """ 33 | vec1 = pandas_to_numpy(vec1) 34 | vec2 = pandas_to_numpy(vec2) 35 | return np.sqrt(np.sum((vec1-vec2)**2)) 36 | 37 | def cosine_similarity_without_numpy(vec1, vec2): 38 | """ 39 | Calculates the angular similarity of two vectors. 40 | Does so by calculating cos(theta) between the vectors 41 | using the dot product. 42 | 43 | cos_sim = A dot B/(magnitude(A)*magnitude(B)) 44 | """ 45 | dot_product=0 46 | vec1_sum_sq = 0 47 | vec2_sum_sq = 0 48 | for idx, val in enumerate(vec1): 49 | dot_product += val*vec2[idx] 50 | vec1_sum_sq += val*val 51 | vec2_sum_sq += vec2[idx]*vec2[idx] 52 | return dot_product/(vec1_sum_sq**0.5*vec2_sum_sq**0.5) 53 | 54 | def cosine_similarity(vec1,vec2): 55 | """ 56 | Calculates the angular similarity of two vectors. 57 | Does so by calculating cos(theta) between the vectors 58 | using the dot product. 59 | 60 | cos_sim = A dot B/(magnitude(A)*magnitude(B)) 61 | """ 62 | vec1 = pandas_to_numpy(vec1) 63 | vec2 = pandas_to_numpy(vec2) 64 | dot_product = np.dot(vec1, vec2) 65 | vec1_norm = np.linalg.norm(vec1) 66 | vec2_norm = np.linalg.norm(vec2) 67 | return dot_product/(vec1_norm* vec2_norm) 68 | 69 | def gaussian_kernel(vec1, vec2, bandwidth=1.): 70 | """ 71 | Returns the Gaussian kernel relationship between two 72 | vectors. The Gaussian kernel assumes a bandwidth that 73 | defines the "width" of the Gaussian used to determine 74 | the relationship between the two points. 75 | """ 76 | dist = euclidean_distance(vec1, vec2) 77 | norm = 1/(np.sqrt(2*np.pi*bandwidth**2)) 78 | return norm*np.exp(-dist**2/(2*bandwidth**2)) 79 | 80 | def uniform_kernel(vec1, vec2, threshold_range=1, value=0.5): 81 | """ 82 | Returns a value if the two provided vectors are 83 | within threshold range of each other. In normal 84 | implementation, the integration of value over the 85 | whole range should be 1. 86 | """ 87 | distance = euclidean_distance(vec1, vec2) 88 | if distance <= threshold_range: 89 | probs = value 90 | else: 91 | probs = 0. 92 | return probs 93 | 94 | def rbf_kernel(vec1, vec2, gamma=None): 95 | """ 96 | The RBF, or radial basis function, kernel 97 | is similar to the gaussian kernel. However, 98 | it has a different scaling factor, using 99 | gamma instead of the bandwidth for normalization 100 | and width scaling. 
Gamma defaults to 1/dimensions 101 | unless otherwise specified.d 102 | """ 103 | if not gamma: 104 | gamma = 1/len(vec1) 105 | distance = euclidean_distance(vec1, vec2)**2 106 | distance *= -gamma 107 | return np.exp(distance) -------------------------------------------------------------------------------- /zwml/metrics/regression_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def get_error(true,pred): 5 | """ 6 | Returns predicted - true for each entry 7 | """ 8 | true = pandas_to_numpy(true) 9 | pred = pandas_to_numpy(pred) 10 | return pred-true 11 | 12 | def get_square_error(true,pred): 13 | """ 14 | Returns the square of predicted - true for each entry 15 | """ 16 | return np.square(get_error(true,pred)) 17 | 18 | def mean_square_error(true, pred): 19 | """ 20 | Returns the average predicted - true 21 | """ 22 | return np.mean(get_square_error(true,pred)) 23 | 24 | def root_mean_square_error(true,pred): 25 | """ 26 | Returns the sqrt of mean square error 27 | """ 28 | return np.sqrt(mean_square_error(true,pred)) 29 | 30 | def mean_absolute_error(true,pred): 31 | """ 32 | Returns the mean absolute value of error 33 | """ 34 | return np.mean(np.abs(get_error(true,pred))) 35 | 36 | def sum_square_error(true,pred): 37 | """ 38 | Returns the sum of squared errors 39 | """ 40 | true = pandas_to_numpy(true) 41 | pred = pandas_to_numpy(pred) 42 | return np.sum(get_square_error(true,pred)) 43 | 44 | def r2_score(true,pred): 45 | """ 46 | Returns R2 which is computed by 47 | SSE = sum of squared errors from the model 48 | SST = sume of squared errors to the mean of the data (y) 49 | R2 = 1 - SSE/SST 50 | """ 51 | true = pandas_to_numpy(true) 52 | pred = pandas_to_numpy(pred) 53 | SSE = np.sum(get_square_error(true,pred)) 54 | shpe = len(np.array(true)) 55 | SST = np.sum(get_square_error(true,np.mean(true)*shpe)) 56 | return 1.-(SSE/SST) 57 | 58 | def adj_r2(true, pred, X): 59 | """ 60 | Returns a version of R2 that penalizes for having many 61 | features. Fights against false correlations in data 62 | and is generally better than R2. 63 | """ 64 | X = pandas_to_numpy(X) 65 | rsquare = r2_score(true,pred) 66 | num_data = X.shape[0] 67 | num_features = X.shape[1] 68 | temp = (1-rsquare)*(num_data-1) 69 | temp = temp/(num_data-num_features-1) 70 | temp = 1 - temp 71 | return temp 72 | 73 | def assess_model(true, pred): 74 | """ 75 | Computes a suite of metrics all at once 76 | """ 77 | true = pandas_to_numpy(true) 78 | pred = pandas_to_numpy(pred) 79 | return sum_square_error(true,pred), mean_square_error(true,pred), root_mean_square_error(true,pred) 80 | 81 | def test_regression_results(X, true, pred): 82 | """ 83 | A print out of many of the metrics that show model performance 84 | """ 85 | true = pandas_to_numpy(true) 86 | pred = pandas_to_numpy(pred) 87 | print("Mean Square Error: ", mean_square_error(true,pred)) 88 | print("Root Mean Square Error: ", np.sqrt(mean_square_error(true,pred))) 89 | print("Mean Absolute Error: ",mean_absolute_error(true,pred)) 90 | r2 = r2_score(true,pred) 91 | print("R2: ", r2) 92 | print("Adj R2: ", adj_r2(true,pred,X)) 93 | 94 | def pandas_to_numpy(x): 95 | """ 96 | Checks if the input is a Dataframe or series, converts to numpy matrix for 97 | calculation purposes. 
98 | --- 99 | Input: X (array, dataframe, or series) 100 | Output: X (array) 101 | """ 102 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 103 | return x.as_matrix() 104 | if type(x) == type(np.array([1,2])): 105 | return x 106 | return np.array(x) -------------------------------------------------------------------------------- /zwml/naive_bayes/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .bernoulli_naive_bayes import bernoulli_naive_bayes 3 | from .gaussian_naive_bayes import gaussian_naive_bayes 4 | 5 | __all__ = ['bernoulli_naive_bayes','gaussian_naive_bayes'] 6 | 7 | -------------------------------------------------------------------------------- /zwml/naive_bayes/bernoulli_naive_bayes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from collections import defaultdict 4 | 5 | class bernoulli_naive_bayes: 6 | 7 | def __init__(self, smoothing = 1.): 8 | """ 9 | Bernoulli Naive Bayes builds it's understanding of the data by 10 | applying Bayes rule and calculating the conditional probability of 11 | being a class based on a probabilistic understanding of how the 12 | class has behaved before. We only care if a feature is zero or non-zero 13 | in this style of naive bayes and will calculate our conditional probabilities 14 | accordingly. 15 | --- 16 | Inputs: 17 | smoothing: the Laplace smoothing factor overcome the problem of multiplying 18 | a 0 probability, that causes the total probability to be 0. 19 | """ 20 | self._prob_by_class = defaultdict(float) 21 | self._cond_probs = defaultdict(lambda: defaultdict(float)) 22 | self._log_prob_by_class = defaultdict(float) 23 | self._log_cond_probs = defaultdict(lambda: defaultdict(float)) 24 | self._data_cols = None 25 | self._smoothing = smoothing 26 | 27 | def fit(self, X, y): 28 | """ 29 | For each class, we find out what percentage of the data is that class. 30 | We then filter the data so only the rows that are that class remain, 31 | and then go column by column - calculating what percentage of rows are 32 | non-zero, given the class. We store all of these values to be used later 33 | for predictions. We also store the log of these values for later prediction. 
34 | --- 35 | Input: X, data (array/DataFrame) 36 | y, targets (array/Series) 37 | """ 38 | X = self.convert_to_array(X) 39 | y = self.pandas_to_numpy(y) # keep as 1D 40 | self._data_cols = X.shape[1] 41 | 42 | self._classes = np.unique(y) 43 | 44 | for cl in self._classes: 45 | filtered_targets = y[y == cl] 46 | filtered_data = X[y == cl] 47 | self._prob_by_class[cl] = len(filtered_targets)/len(y) 48 | self._log_prob_by_class[cl] = np.log(self._prob_by_class[cl]) 49 | denom = len(filtered_targets) 50 | for col in range(self._data_cols): 51 | binarized_column = filtered_data.T[col] > 0 52 | num_ones = np.sum(binarized_column) 53 | #smoothing applied here so we never get a zero probability 54 | self._cond_probs[cl][col] = (num_ones+self._smoothing)/(denom+self._smoothing) 55 | self._log_cond_probs[cl][col] = np.log(self._cond_probs[cl][col]) 56 | 57 | def predict(self, X): 58 | """ 59 | Wrapper to return only the class of the prediction 60 | --- 61 | Input: X, data (array/dataframe) 62 | """ 63 | return self._predict(X, mode="predict") 64 | 65 | def predict_proba(self, X): 66 | """ 67 | Wrapper to return probability of each class of the prediction 68 | --- 69 | Input: X, data (array/dataframe) 70 | """ 71 | return self._predict(X, mode="predict_proba") 72 | 73 | def predict_log_proba(self, X): 74 | """ 75 | Wrapper to return log of the probability of each class of 76 | the prediction. 77 | --- 78 | Input: X, data (array/dataframe) 79 | """ 80 | return self._predict(X, mode="predict_log_proba") 81 | 82 | def _predict(self, X, mode="predict"): 83 | """ 84 | For each data point, we go through and calculate the probability 85 | of it being each class. We do so by using the probability of 86 | seeing each value per feature, then combining them together with 87 | the class probability. We work in the log space to fight against 88 | combining too many really small or large values and under/over 89 | flowing Python's memory capabilities for a float. Depending on the mode 90 | we return either the prediction, the probabilities for each class, 91 | or the log of the probabilities for each class. 92 | --- 93 | Inputs: X, data (array/DataFrame) 94 | mode: type of prediction to return, defaults to single prediction mode 95 | """ 96 | X = self.convert_to_array(X) 97 | X = (X > 0).astype(int) # convert to 1 or 0 98 | results = [] 99 | for row in X: 100 | beliefs = [] 101 | for cl in self._classes: 102 | prob_for_class = self._log_prob_by_class[cl] 103 | for col in range(self._data_cols): 104 | p = self._log_cond_probs[cl][col] 105 | # The row or (1-row) chooses either the 0 or 1 probability 106 | # based on whether our row is a 0 or 1. 107 | prob_for_class += p*row[col] + (1-p)*(1-row[col]) 108 | beliefs.append([cl, prob_for_class]) 109 | 110 | if mode == "predict_log_proba": 111 | _, log_probs = zip(*beliefs) 112 | results.append(log_probs) 113 | 114 | elif mode == "predict_proba": 115 | _, probs = zip(*beliefs) 116 | unlog_probs = np.exp(probs) 117 | normed_probs = unlog_probs/np.sum(unlog_probs) 118 | results.append(normed_probs) 119 | 120 | else: 121 | sort_beliefs = sorted(beliefs, key=lambda x: x[1], reverse=True) 122 | results.append(sort_beliefs[0][0]) 123 | 124 | return np.array(results).reshape(-1,1) 125 | 126 | def score(self, X, y): 127 | """ 128 | Uses the predict method to measure the accuracy of the model. 
129 | --- 130 | In: X (list or array), feature matrix; y (list or array) labels 131 | Out: accuracy (float) 132 | """ 133 | pred = self.predict(X) 134 | correct = 0 135 | for i,j in zip(y,pred): 136 | if i == j: 137 | correct+=1 138 | return float(correct)/float(len(y)) 139 | 140 | def pandas_to_numpy(self, x): 141 | """ 142 | Checks if the input is a Dataframe or series, converts to numpy matrix for 143 | calculation purposes. 144 | --- 145 | Input: X (array, dataframe, or series) 146 | Output: X (array) 147 | """ 148 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 149 | return x.as_matrix() 150 | if type(x) == type(np.array([1,2])): 151 | return x 152 | return np.array(x) 153 | 154 | def handle_1d_data(self,x): 155 | """ 156 | Converts 1 dimensional data into a series of rows with 1 columns 157 | instead of 1 row with many columns. 158 | """ 159 | if x.ndim == 1: 160 | x = x.reshape(-1,1) 161 | return x 162 | 163 | def convert_to_array(self, x): 164 | """ 165 | Takes in an input and converts it to a numpy array 166 | and then checks if it needs to be reshaped for us 167 | to use it properly 168 | """ 169 | x = self.pandas_to_numpy(x) 170 | x = self.handle_1d_data(x) 171 | return x -------------------------------------------------------------------------------- /zwml/naive_bayes/gaussian_naive_bayes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from collections import defaultdict 4 | 5 | class gaussian_naive_bayes: 6 | 7 | def __init__(self): 8 | """ 9 | Gaussian Naive Bayes builds it's understanding of the data by 10 | applying Bayes rule and calculating the conditional probability of 11 | being a class based on a probabilistic understanding of how the 12 | class has behaved before. We will assume each feature is normally 13 | distributed in its own space, then use a gaussian PDF to calculate 14 | the probability of a class based on behavior. 15 | """ 16 | self._prob_by_class = defaultdict(float) 17 | self._cond_means = defaultdict(lambda: defaultdict(float)) 18 | self._cond_std = defaultdict(lambda: defaultdict(float)) 19 | self._log_prob_by_class = defaultdict(float) 20 | self._data_cols = None 21 | 22 | def gaus(self, x, mu=0, sig=1): 23 | """ 24 | Returns the probability of x given the mean and standard 25 | deviation provided - assuming a Gaussian probability. 26 | --- 27 | Inputs: x (the value to find the probability for, float), 28 | mu (the mean value of the feature in the training data, float), 29 | sig (the standard deviation of the feature in the training data, float) 30 | Outputs: probability (float) 31 | """ 32 | norm = 1/(np.sqrt(2*np.pi*sig**2)) 33 | return norm*np.exp(-(x-mu)**2/(2*sig**2)) 34 | 35 | def fit(self, X, y): 36 | """ 37 | For each class, we find out what percentage of the data is that class. 38 | We then filter the data so only the rows that are that class remain, 39 | and then go column by column - calculating the mean and standard dev 40 | for the values of that column, given the class. We store all of these 41 | values to be used later for predictions. 
42 | --- 43 | Input: X, data (array/DataFrame) 44 | y, targets (array/Series) 45 | """ 46 | X = self.convert_to_array(X) 47 | y = self.pandas_to_numpy(y) 48 | self._data_cols = X.shape[1] 49 | 50 | self._classes = np.unique(y) 51 | 52 | for cl in self._classes: 53 | self._prob_by_class[cl] = len(y[y == cl])/len(y) 54 | self._log_prob_by_class[cl] = np.log(self._prob_by_class[cl]) 55 | filt = (y == cl) 56 | filtered_data = X[filt] 57 | for col in range(self._data_cols): 58 | self._cond_means[cl][col] = np.mean(filtered_data.T[col]) 59 | self._cond_std[cl][col] = np.std(filtered_data.T[col]) 60 | 61 | def predict(self, X): 62 | """ 63 | Wrapper to return only the class of the prediction 64 | --- 65 | Input: X, data (array/dataframe) 66 | """ 67 | return self._predict(X, mode="predict") 68 | 69 | def predict_proba(self, X): 70 | """ 71 | Wrapper to return probability of each class of the prediction 72 | --- 73 | Input: X, data (array/dataframe) 74 | """ 75 | return self._predict(X, mode="predict_proba") 76 | 77 | def predict_log_proba(self, X): 78 | """ 79 | Wrapper to return log of the probability of each class of 80 | the prediction. 81 | --- 82 | Input: X, data (array/dataframe) 83 | """ 84 | return self._predict(X, mode="predict_log_proba") 85 | 86 | def _predict(self, X, mode="predict"): 87 | """ 88 | For each data point, we go through and calculate the probability 89 | of it being each class. We do so by sampling the probability of 90 | seeing each value per feature, then combining them together with 91 | the class probability. We work in the log space to fight against 92 | combining too many really small or large values and under/over 93 | flowing Python's memory capabilities for a float. Depending on the mode 94 | we return either the prediction, the probabilities for each class, 95 | or the log of the probabilities for each class. 96 | --- 97 | Inputs: X, data (array/DataFrame) 98 | mode: type of prediction to return, defaults to single prediction mode 99 | """ 100 | X = self.convert_to_array(X) 101 | results = [] 102 | for row in X: 103 | beliefs = [] 104 | for cl in self._classes: 105 | prob_for_class = self._log_prob_by_class[cl] 106 | for col in range(self._data_cols): 107 | if self._cond_std[cl][col]: 108 | p = self.gaus(row[col],mu=self._cond_means[cl][col],sig=self._cond_std[cl][col]) 109 | logp = np.log(p) 110 | prob_for_class += logp 111 | beliefs.append([cl, prob_for_class]) 112 | 113 | if mode == "predict_log_proba": 114 | _, log_probs = zip(*beliefs) 115 | results.append(log_probs) 116 | 117 | elif mode == "predict_proba": 118 | _, probs = zip(*beliefs) 119 | unlog_probs = np.exp(probs) 120 | normed_probs = unlog_probs/np.sum(unlog_probs) 121 | results.append(normed_probs) 122 | 123 | else: 124 | sort_beliefs = sorted(beliefs, key=lambda x: x[1], reverse=True) 125 | results.append(sort_beliefs[0][0]) 126 | 127 | return results 128 | 129 | def score(self, X, y): 130 | """ 131 | Uses the predict method to measure the accuracy of the model. 132 | --- 133 | In: X (list or array), feature matrix; y (list or array) labels 134 | Out: accuracy (float) 135 | """ 136 | pred = self.predict(X) 137 | correct = 0 138 | for i,j in zip(y,pred): 139 | if i == j: 140 | correct+=1 141 | return float(correct)/float(len(y)) 142 | 143 | def pandas_to_numpy(self, x): 144 | """ 145 | Checks if the input is a Dataframe or series, converts to numpy matrix for 146 | calculation purposes. 
147 | --- 148 | Input: X (array, dataframe, or series) 149 | Output: X (array) 150 | """ 151 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 152 | return x.as_matrix() 153 | if type(x) == type(np.array([1,2])): 154 | return x 155 | return np.array(x) 156 | 157 | def handle_1d_data(self,x): 158 | """ 159 | Converts 1 dimensional data into a series of rows with 1 columns 160 | instead of 1 row with many columns. 161 | """ 162 | if x.ndim == 1: 163 | x = x.reshape(-1,1) 164 | return x 165 | 166 | def convert_to_array(self, x): 167 | """ 168 | Takes in an input and converts it to a numpy array 169 | and then checks if it needs to be reshaped for us 170 | to use it properly 171 | """ 172 | x = self.pandas_to_numpy(x) 173 | x = self.handle_1d_data(x) 174 | return x -------------------------------------------------------------------------------- /zwml/naive_bayes/multinomial_naive_bayes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from collections import defaultdict 4 | 5 | class multinomial_naive_bayes: 6 | 7 | def __init__(self, smoothing = 1.): 8 | """ 9 | Multinomial Naive Bayes builds it's understanding of the data by 10 | applying Bayes rule and calculating the conditional probability of 11 | being a class based on a probabilistic understanding of how the 12 | class has behaved before. We calculate conditional probabilities 13 | . 14 | --- 15 | Inputs: 16 | smoothing: the Laplace smoothing factor overcome the problem of multiplying 17 | a 0 probability, that causes the total probability to be 0. 18 | """ 19 | self._prob_by_class = defaultdict(float) 20 | self._cond_probs = defaultdict(lambda: defaultdict(float)) 21 | self._log_prob_by_class = defaultdict(float) 22 | self._log_cond_probs = defaultdict(lambda: defaultdict(float)) 23 | self._data_cols = None 24 | self._smoothing = smoothing 25 | 26 | def fit(self, X, y): 27 | """ 28 | For each class, we find out what percentage of the data is that class. 29 | We then filter the data so only the rows that are that class remain, 30 | and then go column by column - calculating what of total counts in the 31 | class come from that feature. We store all of these values to be used later 32 | for predictions. We also store the log of these values for later prediction. 
33 | --- 34 | Input: X, data (array/DataFrame) 35 | y, targets (array/Series) 36 | """ 37 | X = self.convert_to_array(X) 38 | y = self.pandas_to_numpy(y) 39 | self._data_cols = X.shape[1] 40 | 41 | self._classes = np.unique(y) 42 | 43 | for cl in self._classes: 44 | filtered_targets = y[y == cl] 45 | filtered_data = X[y == cl] 46 | self._prob_by_class[cl] = len(filtered_targets)/len(y) 47 | self._log_prob_by_class[cl] = np.log(self._prob_by_class[cl]) 48 | denom = np.sum(filtered_data) 49 | for col in range(self._data_cols): 50 | sum_of_column = np.sum(filtered_data.T[col]) 51 | #smoothing applied here so we never get a zero probability 52 | self._cond_probs[cl][col] = (sum_of_column+self._smoothing)/(denom+self._smoothing) 53 | self._log_cond_probs[cl][col] = np.log(self._cond_probs[cl][col]) 54 | 55 | def predict(self, X): 56 | """ 57 | Wrapper to return only the class of the prediction 58 | --- 59 | Input: X, data (array/dataframe) 60 | """ 61 | return self._predict(X, mode="predict") 62 | 63 | def predict_proba(self, X): 64 | """ 65 | Wrapper to return probability of each class of the prediction 66 | --- 67 | Input: X, data (array/dataframe) 68 | """ 69 | return self._predict(X, mode="predict_proba") 70 | 71 | def predict_log_proba(self, X): 72 | """ 73 | Wrapper to return log of the probability of each class of 74 | the prediction. 75 | --- 76 | Input: X, data (array/dataframe) 77 | """ 78 | return self._predict(X, mode="predict_log_proba") 79 | 80 | def _predict(self, X, mode="predict"): 81 | """ 82 | For each data point, we go through and calculate the probability 83 | of it being each class. We do so by using the probability of 84 | seeing each feature/class and multiplying that by the number 85 | of times we see that feature, then combining them together with 86 | the class probability. We work in the log space to fight against 87 | combining too many really small or large values and under/over 88 | flowing Python's memory capabilities for a float. Depending on the mode 89 | we return either the prediction, the probabilities for each class, 90 | or the log of the probabilities for each class. 91 | --- 92 | Inputs: X, data (array/DataFrame) 93 | mode: type of prediction to return, defaults to single prediction mode 94 | """ 95 | X = self.convert_to_array(X) 96 | results = [] 97 | for row in X: 98 | beliefs = [] 99 | for cl in self._classes: 100 | prob_for_class = self._log_prob_by_class[cl] 101 | for col in range(self._data_cols): 102 | val = row[col] 103 | p = self._log_cond_probs[cl][col] 104 | prob_for_class += val*p 105 | beliefs.append([cl, prob_for_class]) 106 | 107 | if mode == "predict_log_proba": 108 | _, log_probs = zip(*beliefs) 109 | results.append(log_probs) 110 | 111 | elif mode == "predict_proba": 112 | _, probs = zip(*beliefs) 113 | unlog_probs = np.exp(probs) 114 | normed_probs = unlog_probs/np.sum(unlog_probs) 115 | results.append(normed_probs) 116 | 117 | else: 118 | sort_beliefs = sorted(beliefs, key=lambda x: x[1], reverse=True) 119 | results.append(sort_beliefs[0][0]) 120 | 121 | return np.array(results).reshape(-1,1) 122 | 123 | def score(self, X, y): 124 | """ 125 | Uses the predict method to measure the accuracy of the model. 
126 | --- 127 | In: X (list or array), feature matrix; y (list or array) labels 128 | Out: accuracy (float) 129 | """ 130 | pred = self.predict(X) 131 | correct = 0 132 | for i,j in zip(y,pred): 133 | if i == j: 134 | correct+=1 135 | return float(correct)/float(len(y)) 136 | 137 | def pandas_to_numpy(self, x): 138 | """ 139 | Checks if the input is a Dataframe or series, converts to numpy matrix for 140 | calculation purposes. 141 | --- 142 | Input: X (array, dataframe, or series) 143 | Output: X (array) 144 | """ 145 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 146 | return x.as_matrix() 147 | if type(x) == type(np.array([1,2])): 148 | return x 149 | return np.array(x) 150 | 151 | def handle_1d_data(self,x): 152 | """ 153 | Converts 1 dimensional data into a series of rows with 1 columns 154 | instead of 1 row with many columns. 155 | """ 156 | if x.ndim == 1: 157 | x = x.reshape(-1,1) 158 | return x 159 | 160 | def convert_to_array(self, x): 161 | """ 162 | Takes in an input and converts it to a numpy array 163 | and then checks if it needs to be reshaped for us 164 | to use it properly 165 | """ 166 | x = self.pandas_to_numpy(x) 167 | x = self.handle_1d_data(x) 168 | return x -------------------------------------------------------------------------------- /zwml/neighbors/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .knn_classifier import knn_classifier 3 | from .knn_regressor import knn_regressor 4 | from .kde_approximator import kde_approximator 5 | 6 | __all__ = ['knn_classifier','knn_regressor','kde_approximator'] 7 | -------------------------------------------------------------------------------- /zwml/neighbors/k_neighbors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | 5 | class k_neighbors: 6 | 7 | def __init__(self, n_neighbors=5, return_dist=False): 8 | """ 9 | KNearestNeighbors finds the nearest points in the feature space. 10 | --- 11 | In: n_neighbors (int) - how many closest neighbors do we consider 12 | """ 13 | if n_neighbors > 0: 14 | self.k = int(n_neighbors) 15 | else: 16 | print("n_neighbors must be >0. Set to 5!") 17 | self.k = 5 18 | self.X = None 19 | self._return_dist = return_dist 20 | 21 | def fit(self, X): 22 | """ 23 | Makes a copy of the training data that can live within the class. 24 | Thus, the model can be serialized and used away from the original 25 | training data. 26 | --- 27 | In: X (features); np.array or pandas dataframe/series 28 | """ 29 | self.X = copy.copy(self.convert_to_array(X)) 30 | 31 | def find_neighbors(self, X): 32 | """ 33 | Iterates through all points to predict, calculating the distance 34 | to all of the training points. It then finds the closest points. 35 | ___ 36 | In: new data to predict (np.array, pandas series/dataframe) 37 | Out: predictions (np.array) 38 | """ 39 | X = self.convert_to_array(X) 40 | results = [] 41 | for x in X: 42 | local_results = [] 43 | for x2 in self.X: 44 | local_results.append([self.dist_between_points(x,x2),x2]) 45 | neighbors = sorted(local_results, key=lambda x: x[0])[:self.k] 46 | if self._return_dist: 47 | results.append(neighbors) 48 | else: 49 | for x in neighbors: 50 | results.append(x[1]) 51 | #results.append([x[1] for x in neighbors]) 52 | return np.array(results) 53 | 54 | def dist_between_points(self, a, b): 55 | """ 56 | Calculates the distance between two vectors. 
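# Illustrative usage sketch, not part of the original module: finding the stored points
# closest to a query with the k_neighbors class above. Caveat: pandas_to_numpy further
# down references pd, so the module also needs an `import pandas as pd` to run as written.
import numpy as np
from zwml.neighbors.k_neighbors import k_neighbors

pts = np.array([[0., 0.], [1., 0.], [5., 5.], [5., 6.]])
finder = k_neighbors(n_neighbors=2)
finder.fit(pts)
print(finder.find_neighbors([[4.5, 5.]]))   # the two training points nearest the query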
57 | --- 58 | Inputs: a,b (np.arrays) 59 | Outputs: distance (float)""" 60 | assert np.array(a).shape == np.array(b).shape, 'Vectors must be of same size' 61 | return np.sqrt(np.sum((a-b)**2)) 62 | 63 | def pandas_to_numpy(self, x): 64 | """ 65 | Checks if the input is a Dataframe or series, converts to numpy matrix for 66 | calculation purposes. 67 | --- 68 | Input: X (array, dataframe, or series) 69 | Output: X (array) 70 | """ 71 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 72 | return x.as_matrix() 73 | if type(x) == type(np.array([1,2])): 74 | return x 75 | return np.array(x) 76 | 77 | def handle_1d_data(self,x): 78 | """ 79 | Converts 1 dimensional data into a series of rows with 1 columns 80 | instead of 1 row with many columns. 81 | """ 82 | if x.ndim == 1: 83 | x = x.reshape(-1,1) 84 | return x 85 | 86 | def convert_to_array(self, x): 87 | """ 88 | Takes in an input and converts it to a numpy array 89 | and then checks if it needs to be reshaped for us 90 | to use it properly 91 | """ 92 | x = self.pandas_to_numpy(x) 93 | x = self.handle_1d_data(x) 94 | return x -------------------------------------------------------------------------------- /zwml/neighbors/kde_approximator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from mpl_toolkits.mplot3d import Axes3D 5 | from copy import copy 6 | plt.style.use('seaborn') 7 | 8 | class kde_approximator: 9 | 10 | def __init__(self, kernel='gaus', bandwidth=1., grid_fineness=10.): 11 | """ 12 | KDE allows us a method of drawing samples from an 13 | already known set of data, with the same distribution 14 | of data. This is done by assuming a probability dist 15 | for each point and using that as a probabilistic 16 | interpretation of the data. 17 | --- 18 | KWargs: 19 | kernel: type of probability dist to assume. Options 20 | "gaus", "uniform". (string) 21 | bandwidth: Used with gaussian, sets the width of the 22 | assumed gaussian distribution. (float) 23 | grid_fineness: Sets how many points to use in each 24 | dimension when building a probability surface for 25 | plotting purposes. 26 | 27 | """ 28 | self.kernel = kernel 29 | self.bandwidth = bandwidth 30 | if kernel == "gaus": 31 | self.estim = self.gaus 32 | elif kernel == "uniform": 33 | self.estim = self.uniform 34 | else: 35 | raise TypeError("Invalid Kernel Selection") 36 | self.grid_fineness = grid_fineness 37 | self.data_cols = None 38 | 39 | def gaus(self, x, mu): 40 | """ 41 | Returns the probability of x given the mean and standard 42 | deviation provided - assuming a Gaussian probability. 43 | --- 44 | Inputs: x (the value to find the probability for, float), 45 | mu (the mean value of the feature in the training data, float), 46 | sig (the standard deviation of the feature in the training data, float) 47 | Outputs: probability (float) 48 | """ 49 | sig = self.bandwidth 50 | diff = np.sqrt(np.sum((x-mu)**2)) 51 | norm = 1/(np.sqrt(2*np.pi*sig**2)) 52 | return norm*np.exp(-diff**2/(2*sig**2)) 53 | 54 | def uniform(self, x, pt): 55 | """ 56 | Returns probability of x, assuming uniform distribution 57 | in each direction around pt2 in range (-1, 1). 
58 | """ 59 | diff = np.sqrt(np.sum((x-pt)**2)) 60 | probs = np.zeros_like(diff) 61 | probs[diff <= 1] = 0.5 62 | return probs 63 | 64 | def get_grid(self, X): 65 | """ 66 | Given a dataset, figure out how many dimensions there are 67 | then create a series of sampling points based on the 68 | user's requested grid_fineness. Create this sampling region 69 | over the span of the data +/- 10% in each dimension. 70 | --- 71 | Input: X, Data matrix 72 | """ 73 | mins, maxes = [], [] 74 | 75 | for col in range(self.data_cols): 76 | data = X.T[col] 77 | mins.append(np.min(data)-abs(np.min(data)*0.10)) 78 | maxes.append(np.max(data)+abs(np.max(data)*0.10)) 79 | grid = np.stack(np.meshgrid(*[np.linspace(i,j,self.grid_fineness) for i,j in zip(mins, maxes)], indexing='ij'),self.data_cols) 80 | return grid 81 | 82 | def fit(self, X): 83 | """ 84 | Copies the data for later use. 85 | --- 86 | In: X (features), np.array or pandas dataframe/series 87 | """ 88 | X = self.convert_to_array(X) 89 | self.data_cols = X.shape[1] 90 | self.X = copy(X) 91 | 92 | def make_surface(self): 93 | """ 94 | Using a sampling grid, goes point by point along the 95 | grid to determine the probability of data existing 96 | at that point, given all of the known data. 97 | --- 98 | In: X (features), np.array or pandas dataframe/series 99 | """ 100 | X = self.X 101 | span = self.get_grid(X) 102 | 103 | probs = [] 104 | points = [] 105 | for dim in span: 106 | for p in dim: 107 | prob = 0. 108 | for d in X: 109 | prob += self.estim(p,d) 110 | if np.isnan(prob): 111 | prob = 0. 112 | points.append(p) 113 | probs.append(prob) 114 | self.region = points 115 | self.probs = probs 116 | 117 | def sample(self, num_samples=1, random_state=None): 118 | """ 119 | Given the data we trained on, sample new points 120 | based on the density of the data. Use the kernel 121 | to sample not just the available points, but the 122 | whole region of possiblities given the kernel. 123 | --- 124 | Inputs: 125 | num_samples: how many samples to draw (int) 126 | random_state: seed to make the random draws 127 | reproducible (int) 128 | """ 129 | if random_state: 130 | np.random.seed(random_state) 131 | 132 | samples = [] 133 | for i in range(num_samples): 134 | pt = self.X[np.random.randint(self.X.shape[0])] 135 | sample_pt = [] 136 | for dim in pt: 137 | if self.kernel == "gaus": 138 | sample_pt.append(np.random.normal(dim, self.bandwidth)) 139 | elif self.kernel == "uniform": 140 | sample_pt.append(np.random.uniform(dim-1,dim+1)) 141 | samples.append(sample_pt) 142 | return np.array(samples) 143 | 144 | def pandas_to_numpy(self, x): 145 | """ 146 | Checks if the input is a Dataframe or series, converts to numpy matrix for 147 | calculation purposes. 148 | --- 149 | Input: X (array, dataframe, or series) 150 | Output: X (array) 151 | """ 152 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 153 | return x.as_matrix() 154 | if type(x) == type(np.array([1,2])): 155 | return x 156 | return np.array(x) 157 | 158 | def handle_1d_data(self,x): 159 | """ 160 | Converts 1 dimensional data into a series of rows with 1 columns 161 | instead of 1 row with many columns. 
162 | """ 163 | if x.ndim == 1: 164 | x = x.reshape(-1,1) 165 | return x 166 | 167 | def convert_to_array(self, x): 168 | """ 169 | Takes in an input and converts it to a numpy array 170 | and then checks if it needs to be reshaped for us 171 | to use it properly 172 | """ 173 | x = self.pandas_to_numpy(x) 174 | x = self.handle_1d_data(x) 175 | return x 176 | 177 | def make_plot(self): 178 | """ 179 | Creates a plot of the surface created by make_surface 180 | using 2D or 1D, depending on request. 181 | """ 182 | if self.data_cols == 2: 183 | Xpl, Ypl = zip(*self.region) 184 | Zpl = kde2.probs/max(self.probs) 185 | fig = plt.figure(dpi=200, figsize=(18,14)) 186 | ax = fig.gca(projection='3d') 187 | ax.plot_trisurf(Xpl,Ypl,Zpl, cmap=plt.cm.rainbow, linewidth=1) 188 | 189 | Xsc, Ysc = zip(*X) 190 | ax.scatter(Xsc,Ysc,[max(Zpl)]*len(Xsc),c='k',s=20, label="Data", alpha=0.5); 191 | proxy = plt.Circle((0,0), fc="k") 192 | ax.legend([proxy],['Data (z = 1)'], fontsize=18, loc='upper right', frameon=True, facecolor='#FFFFFF', edgecolor='#333333'); 193 | ax.set_zlabel("Norm. Prob.",fontsize=16, labelpad=10) 194 | ax.set_xlabel("X",fontsize=16, labelpad=10) 195 | ax.set_ylabel("Y",fontsize=16, labelpad=10); 196 | 197 | elif self.data_cols == 1: 198 | plt.figure(figsize=(10,6)) 199 | plt.hist(X, label="Binned data", bins=18, alpha=0.8, zorder=1) 200 | plt.plot(self.region, self.probs, c='k', lw=3, label="KDE", zorder=2); 201 | plt.scatter(X, [5]*len(X), marker='o', c='r', s=30, alpha=0.3,label='Actual Data', zorder=3) 202 | plt.legend(fontsize=20, loc='upper left', frameon=True, facecolor='#FFFFFF', edgecolor='#333333'); 203 | ax = plt.gca() 204 | else: 205 | print("Can only draw if KDE is done on 2 or fewer columns.") 206 | return None 207 | return ax 208 | 209 | -------------------------------------------------------------------------------- /zwml/neighbors/knn_classifier.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import copy 4 | import collections 5 | 6 | class knn_classifier: 7 | 8 | def __init__(self, n_neighbors=5): 9 | """ 10 | KNearestNeighbors is a distance based classifier that returns 11 | predictions based on the nearest points in the feature space. 12 | --- 13 | In: n_neighbors (int) - how many closest neighbors do we consider 14 | """ 15 | if n_neighbors > 0: 16 | self.k = int(n_neighbors) 17 | else: 18 | print("n_neighbors must be >0. Set to 5!") 19 | self.k = 5 20 | self.X = None 21 | self.y = None 22 | 23 | def fit(self, X, y): 24 | """ 25 | Makes a copy of the training data that can live within the class. 26 | Thus, the model can be serialized and used away from the original 27 | training data. 28 | --- 29 | In: X (features), y (labels); both np.array or pandas dataframe/series 30 | """ 31 | self.X = copy.copy(self.convert_to_array(X)) 32 | self.y = copy.copy(self.pandas_to_numpy(y)) 33 | 34 | def predict(self, X): 35 | """ 36 | Iterates through all points to predict, calculating the distance 37 | to all of the training points. It then passes that to a sorting function 38 | which returns the most common vote of the n_neighbors (k) closest training 39 | points. 
40 | ___ 41 | In: new data to predict (np.array, pandas series/dataframe) 42 | Out: predictions (np.array) 43 | """ 44 | X = self.pandas_to_numpy(X) 45 | results = [] 46 | for x in X: 47 | local_results = [] 48 | for (x2,y) in zip(self.X,self.y): 49 | local_results.append([self.dist_between_points(x,x2),y]) 50 | results.append(self.get_final_predict(local_results)) 51 | return np.array(results).reshape(-1,1) 52 | 53 | def get_final_predict(self,results): 54 | """ 55 | Takes a list of [distance, label] pairs and sorts by distance, 56 | returning the mode vote for the n_neighbors (k) closest votes. 57 | --- 58 | In: [[distance, label]] list of lists 59 | Output: class label (int) 60 | """ 61 | results = sorted(results, key=lambda x: x[0]) 62 | dists, votes = zip(*results) 63 | return collections.Counter(votes[:self.k]).most_common(1)[0][0] 64 | 65 | def dist_between_points(self, a, b): 66 | """ 67 | Calculates the distance between two vectors. 68 | --- 69 | Inputs: a,b (np.arrays) 70 | Outputs: distance (float)""" 71 | assert np.array(a).shape == np.array(b).shape 72 | return np.sqrt(np.sum((a-b)**2)) 73 | 74 | def score(self, X, y): 75 | """ 76 | Uses the predict method to measure the accuracy of the model. 77 | --- 78 | In: X (list or array), feature matrix; y (list or array) labels 79 | Out: accuracy (float) 80 | """ 81 | pred = self.predict(X) 82 | correct = 0 83 | for i,j in zip(y,pred): 84 | if i == j: 85 | correct+=1 86 | return float(correct)/float(len(y)) 87 | 88 | def pandas_to_numpy(self, x): 89 | """ 90 | Checks if the input is a Dataframe or series, converts to numpy matrix for 91 | calculation purposes. 92 | --- 93 | Input: X (array, dataframe, or series) 94 | Output: X (array) 95 | """ 96 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 97 | return x.as_matrix() 98 | if type(x) == type(np.array([1,2])): 99 | return x 100 | return np.array(x) 101 | 102 | def handle_1d_data(self,x): 103 | """ 104 | Converts 1 dimensional data into a series of rows with 1 columns 105 | instead of 1 row with many columns. 106 | """ 107 | if x.ndim == 1: 108 | x = x.reshape(-1,1) 109 | return x 110 | 111 | def convert_to_array(self, x): 112 | """ 113 | Takes in an input and converts it to a numpy array 114 | and then checks if it needs to be reshaped for us 115 | to use it properly 116 | """ 117 | x = self.pandas_to_numpy(x) 118 | x = self.handle_1d_data(x) 119 | return x -------------------------------------------------------------------------------- /zwml/neighbors/knn_regressor.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import copy 4 | import collections 5 | 6 | class knn_regressor: 7 | 8 | def __init__(self, n_neighbors=5): 9 | """ 10 | KNearestNeighbors is a distance based regressors that returns 11 | predictions based on the nearest points in the feature space. 12 | --- 13 | In: n_neighbors (int) - how many closest neighbors do we consider 14 | """ 15 | if n_neighbors > 0: 16 | self.k = int(n_neighbors) 17 | else: 18 | print("n_neighbors must be >0. Set to 5!") 19 | self.k = 5 20 | self.X = None 21 | self.y = None 22 | 23 | def fit(self, X, y): 24 | """ 25 | Makes a copy of the training data that can live within the class. 26 | Thus, the model can be serialized and used away from the original 27 | training data. 
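# Illustrative usage sketch, not part of the original module: with n_neighbors=3 the
# predicted label is the majority vote of the three closest training points, per
# get_final_predict above.
import numpy as np
from zwml.neighbors import knn_classifier

X = np.array([[0., 0.], [0., 1.], [1., 0.], [5., 5.], [5., 6.], [6., 5.]])
y = np.array([0, 0, 0, 1, 1, 1])
knn = knn_classifier(n_neighbors=3)
knn.fit(X, y)
print(knn.predict([[0.5, 0.5], [5.5, 5.5]]))   # expected classes: 0, then 1
print(knn.score(X, y))                          # 1.0 on this separable training set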
28 | --- 29 | In: X (features), y (labels); both np.array or pandas dataframe/series 30 | """ 31 | self.X = copy.copy(self.convert_to_array(X)) 32 | self.y = copy.copy(self.convert_to_array(y)) 33 | 34 | def pandas_to_numpy(self, x): 35 | """ 36 | Checks if the input is a Dataframe or series, converts to numpy matrix for 37 | calculation purposes. 38 | --- 39 | Input: X (array, dataframe, or series) 40 | Output: X (array) 41 | """ 42 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 43 | return x.as_matrix() 44 | if type(x) == type(np.array([1,2])): 45 | return x 46 | return np.array(x) 47 | 48 | def handle_1d_data(self,x): 49 | """ 50 | Converts 1 dimensional data into a series of rows with 1 columns 51 | instead of 1 row with many columns. 52 | """ 53 | if x.ndim == 1: 54 | x = x.reshape(-1,1) 55 | return x 56 | 57 | def convert_to_array(self, x): 58 | """ 59 | Takes in an input and converts it to a numpy array 60 | and then checks if it needs to be reshaped for us 61 | to use it properly 62 | """ 63 | x = self.pandas_to_numpy(x) 64 | x = self.handle_1d_data(x) 65 | return x 66 | 67 | def predict(self, X): 68 | """ 69 | Iterates through all points to predict, calculating the distance 70 | to all of the training points. It then passes that to a sorting function 71 | which returns the most common vote of the n_neighbors (k) closest training 72 | points. 73 | ___ 74 | In: new data to predict (np.array, pandas series/dataframe) 75 | Out: predictions (np.array) 76 | """ 77 | X = self.convert_to_array(X) 78 | results = [] 79 | for x in X: 80 | local_results = [] 81 | for (x2,y) in zip(self.X,self.y): 82 | local_results.append([self.dist_between_points(x,x2),y]) 83 | results.append(self.get_final_predict(local_results)) 84 | return np.array(results).reshape(-1,1) 85 | 86 | def get_final_predict(self,results): 87 | """ 88 | Takes a list of [distance, label] pairs and sorts by distance, 89 | returning themean of the n_neighbors (k) closest points. 90 | --- 91 | In: [[distance, label]] list of lists 92 | Output: class label (int) 93 | """ 94 | results = sorted(results, key=lambda x: x[0]) 95 | dists, votes = zip(*results) 96 | return np.mean(votes[:self.k]) 97 | 98 | def dist_between_points(self, a, b): 99 | """ 100 | Calculates the distance between two vectors. 101 | --- 102 | Inputs: a,b (np.arrays) 103 | Outputs: distance (float)""" 104 | assert np.array(a).shape == np.array(b).shape 105 | return np.sqrt(np.sum((a-b)**2)) 106 | 107 | def score(self, X, y): 108 | """ 109 | Uses the predict method to measure the (negative) 110 | mean squared error of the model. 
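# Illustrative usage sketch, not part of the original module: the regressor predicts the
# mean target of the k closest training points, and score() is documented to return a
# negative mean squared error so that larger values are better. Caveat: the module needs
# an `import pandas as pd` added for its pandas_to_numpy helper to run as written.
import numpy as np
from zwml.neighbors import knn_regressor

X = np.array([[0.], [1.], [2.], [10.], [11.], [12.]])
y = np.array([0., 1., 2., 10., 11., 12.])
knr = knn_regressor(n_neighbors=3)
knr.fit(X, y)
print(knr.predict([[1.], [11.]]))   # roughly [[1.], [11.]]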
111 | --- 112 | In: X (list or array), feature matrix; y (list or array) labels 113 | Out: negative mean squared error (float) 114 | """ 115 | pred = self.predict(X) 116 | return -1.* np.mean((np.array(pred)-np.array(y))**2) -------------------------------------------------------------------------------- /zwml/nlp/__init__.py: -------------------------------------------------------------------------------- 1 | from .count_vectorizer import count_vectorizer 2 | from .tfidf_vectorizer import tfidf_vectorizer 3 | from .latent_semantic_indexing import latent_semantic_indexing 4 | 5 | __all__ = ['count_vectorizer','latent_semantic_indexing','tfidf_vectorizer'] 6 | -------------------------------------------------------------------------------- /zwml/nlp/count_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Counter 3 | from string import punctuation 4 | 5 | class count_vectorizer: 6 | 7 | def __init__(self, max_features=None, ngrams = (1,1), tokenizer=None, remove_stopwords=False): 8 | """ 9 | Count vectorizer reads the text provided, tokenizes it 10 | with the provided tokenizer (or the default), then generates 11 | ngrams keeping track of all ngrams as the vocabulary. 12 | Then it takes provided texts and converts them into vectors 13 | by counting the appearance of each ngram and tracking that 14 | for every document. 15 | --- 16 | KWargs: 17 | max_features: how many ngrams to allow in the vector, using the 18 | most common features first. If None, defaults to using all 19 | ngrams (int) 20 | ngrams: how many tokens to combine to form features. First element 21 | of tuple is starting point, second is ending point. 22 | tokenizer: what function to use to create tokens (must return 23 | list of tokens) 24 | remove_stopwords: whether to include very common english words that 25 | do not add much value due to their commonness. 26 | """ 27 | self.max_features = max_features 28 | self.vocabulary = {} 29 | self.ngrams = ngrams 30 | if tokenizer == None: 31 | self.tokenizer = self.tokenize 32 | else: 33 | self.tokenizer = tokenizer 34 | self.remove_stopwords = remove_stopwords 35 | self.stopwords = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 36 | 'there', 'about', 'once', 'during', 'out', 'very', 'having', 37 | 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 38 | 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 39 | 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 40 | 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 41 | 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 42 | 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 43 | 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 44 | 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 45 | 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 46 | 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 47 | 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 48 | 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 49 | 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 50 | 'was', 'here', 'than'} 51 | 52 | def token_generator(self, X): 53 | """ 54 | Generator that returns joined tokens as a single 55 | string to act as a feature. 
It generates the tokens 56 | by iterating through the allowed ngrams and combining 57 | the appropriate number of tokens into a string. 58 | """ 59 | for i in range(self.ngrams[0],self.ngrams[1]+1): 60 | for ix, _ in enumerate(X): 61 | if ix+i < len(X)+1: 62 | yield ' '.join(X[ix:ix+i]) 63 | 64 | def tokenize(self, X): 65 | """ 66 | Simple tokenizer that removes punctuation, 67 | lowercases the text, and breaks on spaces. 68 | Also removes stopwords and numeric values 69 | from being treated as words. 70 | """ 71 | for symbol in punctuation: 72 | X = X.replace(symbol,'') 73 | final_token_list = [] 74 | for token in X.lower().split(): 75 | if self.remove_stopwords: 76 | if not self.check_stopwords(token): 77 | try: 78 | int(token) 79 | float(token) 80 | except: 81 | final_token_list.append(token) 82 | else: 83 | final_token_list.append(token) 84 | return final_token_list 85 | 86 | def check_stopwords(self, token): 87 | """ 88 | Checks if the token is in our list of common 89 | stopwords, and returns a boolean. 90 | """ 91 | return token in self.stopwords 92 | 93 | def fit(self, X): 94 | """ 95 | Go through all provided training documents and 96 | create the list of vocabulary for known documents 97 | by looking at all ngrams and tracking how often 98 | those ngrams appear. If max_features is defined, 99 | only keep the most common tokens. Afterward, 100 | generate a token_to_id mapper and an id_to_token 101 | mapper. 102 | """ 103 | for document in X: 104 | tokens = self.tokenizer(document) 105 | for token in self.token_generator(tokens): 106 | if token in self.vocabulary.keys(): 107 | self.vocabulary[token] += 1 108 | else: 109 | self.vocabulary[token] = 1 110 | 111 | if self.max_features != None: 112 | temp_vocab = {} 113 | for key, value in Counter(self.vocabulary).most_common(self.max_features): 114 | temp_vocab[key] = value 115 | self.vocabulary = temp_vocab 116 | del temp_vocab 117 | 118 | self.token_to_id = {ky: ix for ix, ky in enumerate(sorted(self.vocabulary.keys()))} 119 | self.id_to_token = {ix: ky for ix, ky in enumerate(sorted(self.vocabulary.keys()))} 120 | 121 | 122 | def transform(self, X): 123 | """ 124 | Go through all provided documents and use the known 125 | vocabulary to track how often each ngram appears in 126 | the document. At the end, stack all of the generated 127 | document vectors together. Skip the initial vector that 128 | all 0's, which is just there to act as a template. 129 | """ 130 | vectorized_docs = np.zeros(len(self.vocabulary.keys())) 131 | for document in X: 132 | tokens = self.tokenizer(document) 133 | vectorized_doc = np.zeros(len(self.vocabulary.keys())) 134 | for token in self.token_generator(tokens): 135 | if token in self.vocabulary: 136 | word_id = self.token_to_id[token] 137 | vectorized_doc[word_id] += 1 138 | vectorized_docs = np.vstack((vectorized_docs,vectorized_doc)) 139 | return vectorized_docs[1:] 140 | 141 | def fit_transform(self, X): 142 | """ 143 | Fit on X and then transform X and return it as vectors. 
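# Illustrative usage sketch, not part of the original module: fit() builds the ngram
# vocabulary from raw strings and transform() returns one count vector per document,
# with columns ordered by token_to_id.
from zwml.nlp import count_vectorizer

docs = ["the cat sat on the mat", "the dog sat on the log"]
cv = count_vectorizer(ngrams=(1, 2))
vectors = cv.fit_transform(docs)
print(sorted(cv.token_to_id.keys()))   # every unigram and bigram seen during fit
print(vectors.shape)                   # (2, size of the ngram vocabulary)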
144 | """ 145 | self.fit(X) 146 | return self.transform(X) 147 | -------------------------------------------------------------------------------- /zwml/nlp/latent_semantic_indexing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class latent_semantic_indexing: 4 | 5 | def __init__(self, num_topics=5): 6 | """ 7 | Latent semantic indexing uses matrix decomposition 8 | techniques to reduce the large feature space associated 9 | with text analysis into a smaller "topic" space which 10 | by exploiting SVD's ability to find correlations in 11 | features and combine them into super-dimensions made 12 | of the correlated columns. In the text analysis, that 13 | means if the original features are word, LSI will 14 | find words that tend to be in the same document together 15 | and group them as unique topics. 16 | """ 17 | self.num_topics = num_topics 18 | 19 | def fit(self, X): 20 | """ 21 | Using SVD as the base of the algorithm (we use numpy since 22 | it's faster than our method), we do a dimensionality 23 | reduction. Remember that V is an expression of the new 24 | dimensions in terms of the old columns. If we do count 25 | vectorizer, this is an expression of topics in terms of 26 | ngrams. We'll use this to extract our topics. We can also 27 | cast new documents into topic space using the V matrix. 28 | """ 29 | X = self.convert_to_array(X) 30 | self.U, self.sigma, self.V = np.linalg.svd(X) 31 | self.V = self.V[:self.num_topics,:] 32 | self.sigma = self.sigma[:self.num_topics] 33 | self.U = self.U[:,:self.num_topics] 34 | 35 | def transform(self, X): 36 | """ 37 | Since V is a conversion of columns to the lower 38 | dimensional space, we can just use matrix 39 | multiplication to cast any new data into that 40 | space. 41 | --- 42 | Input: X, data matrix (dataframe, array, list of lists) 43 | """ 44 | X = self.convert_to_array(X) 45 | return np.dot(X, self.V.T) 46 | 47 | def fit_transform(self, X): 48 | """ 49 | Fit on X and then transform X and return it as vectors. 50 | """ 51 | self.fit(X) 52 | return self.transform(X) 53 | 54 | def print_topics(self, X, id_to_word=None, num_words_per_topics=10): 55 | """ 56 | For each topic created in the SVD decomposition, 57 | iterate through the strongest contributors (positive 58 | or negative), and print out those words. Requires a 59 | column number to word dictionary, otherwise just prints 60 | the column number for the strong correlations. 61 | """ 62 | for idx, row in enumerate(self.V): 63 | sorted_word_ids = np.argsort(row)[-num_words_per_topics:] 64 | print("--- Topic ", idx, " ---") 65 | words_to_print = "" 66 | for word_id in sorted_word_ids: 67 | if id_to_word != None: 68 | words_to_print += id_to_word[word_id] 69 | words_to_print += ', ' 70 | else: 71 | words_to_print += "Column " 72 | words_to_print += str(word_id) 73 | words_to_print += ', ' 74 | print(words_to_print[:-2]) 75 | 76 | def pandas_to_numpy(self, x): 77 | """ 78 | Checks if the input is a Dataframe or series, converts to numpy matrix for 79 | calculation purposes. 80 | --- 81 | Input: X (array, dataframe, or series) 82 | Output: X (array) 83 | """ 84 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 85 | return x.as_matrix() 86 | if type(x) == type(np.array([1,2])): 87 | return x 88 | return np.array(x) 89 | 90 | def handle_1d_data(self,x): 91 | """ 92 | Converts 1 dimensional data into a series of rows with 1 columns 93 | instead of 1 row with many columns. 
94 | """ 95 | if x.ndim == 1: 96 | x = x.reshape(-1,1) 97 | return x 98 | 99 | def convert_to_array(self, x): 100 | """ 101 | Takes in an input and converts it to a numpy array 102 | and then checks if it needs to be reshaped for us 103 | to use it properly 104 | """ 105 | x = self.pandas_to_numpy(x) 106 | x = self.handle_1d_data(x) 107 | return x 108 | -------------------------------------------------------------------------------- /zwml/nlp/tfidf_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Counter 3 | from string import punctuation 4 | 5 | class tfidf_vectorizer: 6 | 7 | def __init__(self, max_features=None, ngrams = (1,1), tokenizer=None, remove_stopwords=False): 8 | """ 9 | Term frequency, inverse document frequency vectorizer 10 | reads the text provided, tokenizes it with the provided 11 | tokenizer (or the default), then generates ngrams keeping 12 | track of all ngrams as the vocabulary. Then it takes provided 13 | texts and converts them into vectors by counting the 14 | appearance of each ngram and tracking that for every document. 15 | The counts are then scaled by the max term frequency and the 16 | inverse document frequency (see converter method). This new 17 | result is better than counts at picking out how important 18 | words are based on both usage and uniqueness. 19 | --- 20 | KWargs: 21 | max_features: how many ngrams to allow in the vector, using the 22 | most common features first. If None, defaults to using all 23 | ngrams (int) 24 | ngrams: how many tokens to combine to form features. First element 25 | of tuple is starting point, second is ending point. 26 | tokenizer: what function to use to create tokens (must return 27 | list of tokens) 28 | remove_stopwords: whether to include very common english words that 29 | do not add much value due to their commonness. 30 | """ 31 | self.max_features = max_features 32 | self.vocabulary = {} 33 | self.ngrams = ngrams 34 | if tokenizer == None: 35 | self.tokenizer = self.tokenize 36 | else: 37 | self.tokenizer = tokenizer 38 | self.remove_stopwords = remove_stopwords 39 | self.stopwords = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 40 | 'there', 'about', 'once', 'during', 'out', 'very', 'having', 41 | 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 42 | 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 43 | 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 44 | 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 45 | 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 46 | 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 47 | 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 48 | 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 49 | 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 50 | 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 51 | 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 52 | 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 53 | 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 54 | 'was', 'here', 'than'} 55 | 56 | def token_generator(self, X): 57 | """ 58 | Generator that returns joined tokens as a single 59 | string to act as a feature. It generates the tokens 60 | by iterating through the allowed ngrams and combining 61 | the appropriate number of tokens into a string. 
62 | """ 63 | for i in range(self.ngrams[0],self.ngrams[1]+1): 64 | for ix, _ in enumerate(X): 65 | if ix+i < len(X)+1: 66 | yield ' '.join(X[ix:ix+i]) 67 | 68 | def tokenize(self, X): 69 | """ 70 | Simple tokenizer that removes punctuation, 71 | lowercases the text, and breaks on spaces. 72 | Also removes stopwords and numeric values 73 | from being treated as words. 74 | """ 75 | for symbol in punctuation: 76 | X = X.replace(symbol,'') 77 | final_token_list = [] 78 | for token in X.lower().split(): 79 | if self.remove_stopwords: 80 | if not self.check_stopwords(token): 81 | try: 82 | int(token) 83 | float(token) 84 | except: 85 | final_token_list.append(token) 86 | else: 87 | final_token_list.append(token) 88 | return final_token_list 89 | 90 | def check_stopwords(self, token): 91 | """ 92 | Checks if the token is in our list of common 93 | stopwords, and returns a boolean. 94 | """ 95 | return token in self.stopwords 96 | 97 | def fit(self, X): 98 | """ 99 | Go through all provided training documents and 100 | create the list of vocabulary for known documents 101 | by looking at all ngrams and tracking how often 102 | those ngrams appear. If max_features is defined, 103 | only keep the most common tokens. Afterward, 104 | generate a token_to_id mapper and an id_to_token 105 | mapper. 106 | """ 107 | for document in X: 108 | tokens = self.tokenizer(document) 109 | for token in self.token_generator(tokens): 110 | if token in self.vocabulary.keys(): 111 | self.vocabulary[token] += 1 112 | else: 113 | self.vocabulary[token] = 1 114 | 115 | if self.max_features != None: 116 | temp_vocab = {} 117 | for key, value in Counter(self.vocabulary).most_common(self.max_features): 118 | temp_vocab[key] = value 119 | self.vocabulary = temp_vocab 120 | del temp_vocab 121 | 122 | self.token_to_id = {ky: ix for ix, ky in enumerate(sorted(self.vocabulary.keys()))} 123 | self.id_to_token = {ix: ky for ix, ky in enumerate(sorted(self.vocabulary.keys()))} 124 | 125 | 126 | def transform(self, X): 127 | """ 128 | Go through all provided documents and use the known 129 | vocabulary to track how often each ngram appears in 130 | the document. At the end, stack all of the generated 131 | document vectors together. Convert them to tf-idf 132 | and skip the initial vector that's all 0's, which 133 | is just there to act as a template. 134 | """ 135 | vectorized_docs = np.zeros(len(self.vocabulary.keys())) 136 | for document in X: 137 | tokens = self.tokenizer(document) 138 | vectorized_doc = np.zeros(len(self.vocabulary.keys())) 139 | for token in self.token_generator(tokens): 140 | if token in self.vocabulary: 141 | word_id = self.token_to_id[token] 142 | vectorized_doc[word_id] += 1 143 | vectorized_docs = np.vstack((vectorized_docs,vectorized_doc)) 144 | return self.convert_counts_to_tf_idf(vectorized_docs)[1:] 145 | 146 | def convert_counts_to_tf_idf(self, docs): 147 | """ 148 | To convert from counts to TF-IDF, we first scale 149 | each value by the maximum in it's own column. This 150 | lowers dependence on document length. Then we calculate 151 | log(number of documents/(1+documents containing this ngram)). 152 | This is the inverse document frequency (the one is to make 153 | combat division by 0). Each value is scaled as: 154 | term_frequency*inverse_document_frequency. 
155 | """ 156 | number_of_columns = docs.shape[1] 157 | number_of_docs = docs.shape[0] 158 | frequency_scalers = np.ones(number_of_columns) 159 | idf_terms = np.ones(number_of_columns) 160 | for col in range(number_of_columns): 161 | column_vals = docs.T[col] 162 | frequency_scalers[col] = np.max(column_vals) 163 | number_of_docs_containing = np.sum((column_vals > 0).astype(int)) 164 | idf_terms[col] = np.log(number_of_docs/(1+number_of_docs_containing)) 165 | docs = docs/frequency_scalers 166 | docs = docs*idf_terms 167 | 168 | return docs 169 | 170 | def fit_transform(self, X): 171 | """ 172 | Fit on X and then transform X and return it as vectors. 173 | """ 174 | self.fit(X) 175 | return self.transform(X) 176 | -------------------------------------------------------------------------------- /zwml/random/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .middle_square import middle_square 3 | 4 | __all__ = ['middle_square'] 5 | -------------------------------------------------------------------------------- /zwml/random/middle_square.py: -------------------------------------------------------------------------------- 1 | class middle_square: 2 | 3 | def __init__(self): 4 | """ 5 | Generates random numbers using a middle square method. 6 | Squares the seed, pads the left side of the number with 7 | zeroes, then takes the middle values as the next random 8 | number in the sequence. Note: do not use in production, 9 | very easy to crack. 10 | """ 11 | pass 12 | 13 | def middle_square_list(self, seed, count, width=4, seeds=[]): 14 | """ 15 | Creates a list of length "count" of random numbers 16 | given a seed, by squaring the seed and taking the middle 17 | digits. If the seed becomes 0000, stops early. 18 | Works recursively by creating one value at a time and 19 | sending that value to the next call as the new seed. 20 | --- 21 | KWargs: 22 | seed: starting value for the RNG 23 | count: how many numbers to generate 24 | width: how many digits is the generated number 25 | seeds: stores the results so far, can be used to force 26 | a certain number to be in the result. 27 | """ 28 | if not seeds: 29 | assert len(str(seed)) == width, "Seed must have a length equal to request width!" 30 | x = str(seed**2) 31 | while len(x) 4 columns used) 19 | "sqrt" (square root of the number of cols in input data) 20 | "div3" (number of input cols divided by 3) 21 | mode: If mode='rfnode' the column randomization happens at each node. Otherwise 22 | Each tree gets one randomized set of columns for all nodes in that tree. 23 | seed: Random seed to allow for reproducibility. 24 | """ 25 | self.n_trees = n_trees 26 | self.max_depth = max_depth 27 | self.n_features = n_features 28 | self.tree_filter_pairs = [] 29 | self.mode = mode 30 | if seed: 31 | self._seed = seed 32 | np.random.seed(seed) 33 | 34 | def find_number_of_columns(self, X): 35 | """ 36 | Uses the user input for n_features to decide how many columns should 37 | be included in each model. Uses the shape of X to decide the final number 38 | if 'sqrt' is called. 
39 | --- 40 | Input: X (array, dataframe, or series) 41 | """ 42 | if isinstance(self.n_features, int): 43 | return self.n_features 44 | if self.n_features == 'sqrt': 45 | return int(np.sqrt(X.shape[1])+0.5) 46 | if self.n_features == 'div3': 47 | return int(X.shape[1]/3+0.5) 48 | else: 49 | raise ValueError("Invalid n_features selection") 50 | 51 | def get_bagged_data(self, X, y): 52 | """ 53 | Chooses random rows to populate a bootstrapped dataset, with replacement. 54 | Maintains the correlation between X and y 55 | --- 56 | Input: X, y (arrays) 57 | Outputs: randomized X,y (arrays) 58 | """ 59 | index = np.random.choice(np.arange(len(X)),len(X)) 60 | return X[index], y[index] 61 | 62 | def randomize_columns(self,X): 63 | """ 64 | Chooses a set of columns to keep from the input data. These are 65 | randomly drawn, according the number requested by the user. The data 66 | is filtered and only the allowed columns are returned, along with the 67 | filter. 68 | --- 69 | Input: X (array) 70 | Output: filtered_X (array), filter (array) 71 | """ 72 | num_col = self.find_number_of_columns(X) 73 | filt = np.random.choice(np.arange(0,X.shape[1]),num_col,replace=False) 74 | filtered_X = self.apply_filter(X, filt) 75 | return filtered_X, filt 76 | 77 | def apply_filter(self, X, filt): 78 | """ 79 | Given X and a filter, only the columns matching the index values 80 | in filter are returned. 81 | --- 82 | Input: X (array), filter (array of column IDs) 83 | Output: filtered_X (array) 84 | """ 85 | filtered_X = X.T[filt] 86 | return filtered_X.T 87 | 88 | def fit(self, X, y): 89 | """ 90 | Generates the bootstrapped data, decides which column to keep, 91 | and then uses the decision tree class to build a model on each 92 | bootstrapped and column-randomized dataset. Each tree is stored 93 | as part of the model for later use, along with the appropriate 94 | filter - which is needed to filter new data for use with the model. 95 | --- 96 | Input: X, y (arrays, dataframe, or series) 97 | """ 98 | X = self.convert_to_array(X) 99 | y = self.pandas_to_numpy(y) 100 | try: 101 | self.base_filt = [x for x in range(X.shape[1])] 102 | except IndexError: 103 | self.base_filt = [0] 104 | for _ in range(self.n_trees): 105 | filt = self.base_filt 106 | bagX, bagy = self.get_bagged_data(X,y) 107 | if self.mode == 'rftree': 108 | bagX, filt = self.randomize_columns(bagX) 109 | new_tree = decision_tree_classifier(self.max_depth, mode=self.mode, n_features=self.n_features) 110 | new_tree.fit(bagX, bagy) 111 | self.tree_filter_pairs.append((new_tree, filt)) 112 | 113 | def predict(self, X): 114 | """ 115 | Uses the list of tree models built in the fit, doing a predict with each 116 | model. The associated filter is applied to X, so the model sees the columns 117 | it has learned about. The final prediction uses the mode of all the trees 118 | predictions. 119 | --- 120 | Input: X (array, dataframe, or series) 121 | Output: Class ID (int) 122 | """ 123 | X = self.convert_to_array(X) 124 | self.predicts = [] 125 | for tree, filt in self.tree_filter_pairs: 126 | filtered_X = self.apply_filter(X, filt) 127 | self.predicts.append(tree.predict(filtered_X)) 128 | self.pred_by_row = np.array(self.predicts).T 129 | 130 | ensemble_predict = [] 131 | for row in self.pred_by_row: 132 | ensemble_predict.append(collections.Counter(row).most_common(1)[0][0]) 133 | return ensemble_predict 134 | 135 | def score(self, X, y): 136 | """ 137 | Uses the predict method to measure the accuracy of the model. 
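# Illustrative usage sketch, not part of the original module: each tree is fit on a
# bootstrapped sample and, with the default mode='rfnode', column randomization is
# delegated to the decision tree at every split; predict() returns the majority vote
# across the trees.
import numpy as np
from zwml.tree_models.random_forest_classifier import random_forest_classifier

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (20, 3)), rng.normal(4, 1, (20, 3))])
y = np.array([0] * 20 + [1] * 20)
rf = random_forest_classifier(n_trees=5, max_depth=3, seed=42)
rf.fit(X, y)
print(rf.predict(X[:3]))   # ensemble votes for the first three rows
print(rf.score(X, y))      # accuracy on the training rows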
138 | --- 139 | In: X (list or array), feature matrix; y (list or array) labels 140 | Out: accuracy (float) 141 | """ 142 | pred = self.predict(X) 143 | correct = 0 144 | for i,j in zip(y,pred): 145 | if i == j: 146 | correct+=1 147 | return float(correct)/float(len(y)) 148 | 149 | def pandas_to_numpy(self, x): 150 | """ 151 | Checks if the input is a Dataframe or series, converts to numpy matrix for 152 | calculation purposes. 153 | --- 154 | Input: X (array, dataframe, or series) 155 | Output: X (array) 156 | """ 157 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 158 | return x.as_matrix() 159 | if type(x) == type(np.array([1,2])): 160 | return x 161 | return np.array(x) 162 | 163 | def handle_1d_data(self,x): 164 | """ 165 | Converts 1 dimensional data into a series of rows with 1 columns 166 | instead of 1 row with many columns. 167 | """ 168 | if x.ndim == 1: 169 | x = x.reshape(-1,1) 170 | return x 171 | 172 | def convert_to_array(self, x): 173 | """ 174 | Takes in an input and converts it to a numpy array 175 | and then checks if it needs to be reshaped for us 176 | to use it properly 177 | """ 178 | x = self.pandas_to_numpy(x) 179 | x = self.handle_1d_data(x) 180 | return x -------------------------------------------------------------------------------- /zwml/tree_models/random_forest_regressor.py: -------------------------------------------------------------------------------- 1 | from zwml.tree_models import decision_tree_regressor 2 | import collections 3 | import pandas as pd 4 | import numpy as np 5 | 6 | class random_forest_regressor: 7 | 8 | def __init__(self, n_trees = 10, max_depth=None, n_features='sqrt', mode='rfnode', seed=None, criteria='std'): 9 | """ 10 | Random Forest Regressor uses bootstrapping and column randomization 11 | to generate n_trees different datasets and then applies a decision 12 | tree to each dataset. The final prediction is an ensemble of all created trees. 13 | --- 14 | Params: 15 | n_trees (int): number of bootstrapped trees to grow for ensembling 16 | max_depth (int): maximum number of splits to make in the tree 17 | mode: If mode='rfnode' the column randomization happens at each node. Otherwise 18 | the tree will assume all input columns are valid choices and randomize at 19 | a "per tree" level. 20 | n_features: The number of columns to include in the models. Only applies if 21 | mode='rfnode.' Otherwise n_features = number of columns in data. 22 | Options: numeric value (e.g. 4 => 4 columns used) 23 | "sqrt" (square root of the number of cols in input data) 24 | "div3" (number of input cols divided by 3) 25 | criteria: Options are "std" (standard deviation) and "mae" (absolute error from mean). 26 | This choice decides how the tree will be optimized. Default: "std" 27 | seed: Random seed to allow for reproducibility. 28 | """ 29 | self.n_trees = n_trees 30 | self.max_depth = max_depth 31 | self.n_features = n_features 32 | self.tree_filter_pairs = [] 33 | self.mode = mode 34 | self.criteria = criteria 35 | if seed: 36 | self._seed = seed 37 | np.random.seed(seed) 38 | 39 | def find_number_of_columns(self, X): 40 | """ 41 | Uses the user input for n_features to decide how many columns should 42 | be included in each model. Uses the shape of X to decide the final number 43 | if 'sqrt' is called. 
44 | --- 45 | Input: X (array, dataframe, or series) 46 | """ 47 | if isinstance(self.n_features, int): 48 | return self.n_features 49 | if self.n_features == 'sqrt': 50 | return int(np.sqrt(X.shape[1])+0.5) 51 | if self.n_features == 'div3': 52 | return int(X.shape[1]/3+0.5) 53 | else: 54 | raise ValueError("Invalid n_features selection") 55 | 56 | def get_bagged_data(self, X, y): 57 | """ 58 | Chooses random rows to populate a bootstrapped dataset, with replacement. 59 | Maintains the correlation between X and y 60 | --- 61 | Input: X, y (arrays) 62 | Outputs: randomized X,y (arrays) 63 | """ 64 | index = np.random.choice(np.arange(len(X)),len(X)) 65 | return X[index], y[index] 66 | 67 | def randomize_columns(self,X): 68 | """ 69 | Chooses a set of columns to keep from the input data. These are 70 | randomly drawn, according the number requested by the user. The data 71 | is filtered and only the allowed columns are returned, along with the 72 | filter. 73 | --- 74 | Input: X (array) 75 | Output: filtered_X (array), filter (array) 76 | """ 77 | num_col = self.find_number_of_columns(X) 78 | filt = np.random.choice(np.arange(0,X.shape[1]),num_col,replace=False) 79 | filtered_X = self.apply_filter(X, filt) 80 | return filtered_X, filt 81 | 82 | def apply_filter(self, X, filt): 83 | """ 84 | Given X and a filter, only the columns matching the index values 85 | in filter are returned. 86 | --- 87 | Input: X (array), filter (array of column IDs) 88 | Output: filtered_X (array) 89 | """ 90 | filtered_X = X.T[filt] 91 | return filtered_X.T 92 | 93 | def pandas_to_numpy(self, x): 94 | """ 95 | Checks if the input is a Dataframe or series, converts to numpy matrix for 96 | calculation purposes. 97 | --- 98 | Input: X (array, dataframe, or series) 99 | Output: X (array) 100 | """ 101 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 102 | return x.as_matrix() 103 | if type(x) == type(np.array([1,2])): 104 | return x 105 | return np.array(x) 106 | 107 | def handle_1d_data(self,x): 108 | """ 109 | Converts 1 dimensional data into a series of rows with 1 columns 110 | instead of 1 row with many columns. 111 | """ 112 | if x.ndim == 1: 113 | x = x.reshape(-1,1) 114 | return x 115 | 116 | def convert_to_array(self, x): 117 | """ 118 | Takes in an input and converts it to a numpy array 119 | and then checks if it needs to be reshaped for us 120 | to use it properly 121 | """ 122 | x = self.pandas_to_numpy(x) 123 | x = self.handle_1d_data(x) 124 | return x 125 | 126 | def fit(self, X, y): 127 | """ 128 | Generates the bootstrapped data, decides which column to keep, 129 | and then uses the decision tree class to build a model on each 130 | bootstrapped and column-randomized dataset. Each tree is stored 131 | as part of the model for later use, along with the appropriate 132 | filter - which is needed to filter new data for use with the model. 
133 | --- 134 | Input: X, y (arrays, dataframe, or series) 135 | """ 136 | X = self.convert_to_array(X) 137 | y = self.convert_to_array(y) 138 | try: 139 | self.base_filt = [x for x in range(X.shape[1])] 140 | except IndexError: 141 | self.base_filt = [0] 142 | for _ in range(self.n_trees): 143 | filt = self.base_filt 144 | bagX, bagy = self.get_bagged_data(X,y) 145 | if self.mode == 'rftree': 146 | bagX, filt = self.randomize_columns(bagX) 147 | new_tree = decision_tree_regressor(self.max_depth, mode=self.mode, 148 | n_features=self.n_features, criteria=self.criteria) 149 | new_tree.fit(bagX, bagy) 150 | self.tree_filter_pairs.append((new_tree, filt)) 151 | 152 | def predict(self, X): 153 | """ 154 | Uses the list of tree models built in the fit, doing a predict with each 155 | model. The associated filter is applied to X, so the model sees the columns 156 | it has learned about. The final prediction uses the mode of all the trees 157 | predictions. 158 | --- 159 | Input: X (array, dataframe, or series) 160 | Output: Class ID (int) 161 | """ 162 | X = self.convert_to_array(X) 163 | self.predicts = [] 164 | for tree, filt in self.tree_filter_pairs: 165 | filtered_X = self.apply_filter(X, filt) 166 | self.predicts.append(tree.predict(filtered_X)) 167 | self.pred_by_row = np.array(self.predicts).T 168 | 169 | ensemble_predict = [] 170 | for row in self.pred_by_row: 171 | ensemble_predict.append(np.mean(row)) 172 | return ensemble_predict 173 | 174 | def score(self, X, y): 175 | """ 176 | Uses the predict method to measure the (negative) 177 | mean squared error of the model. 178 | --- 179 | In: X (list or array), feature matrix; y (list or array) labels 180 | Out: negative mean squared error (float) 181 | """ 182 | pred = self.predict(X) 183 | return -1.* np.mean((np.array(pred)-np.array(y))**2) -------------------------------------------------------------------------------- /zwml/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .data_splitting import * 3 | from .grid_search import * 4 | from .randomized_search import * 5 | from .markov_chain import markov_chain 6 | from .standard_scaler import standard_scaler 7 | from .normalizer import normalizer 8 | 9 | __all__ = ['train_test_split','cross_val','grid_search','grid_search_cv','randomized_search','randomized_search_cv','markov_chain', 'standard_scaler','normalizer'] 10 | -------------------------------------------------------------------------------- /zwml/utilities/grid_search.py: -------------------------------------------------------------------------------- 1 | from itertools import product 2 | 3 | class grid_search(): 4 | 5 | def __init__(self, model_name, param_grid): 6 | """ 7 | Given a base model and a parameter grid of params 8 | for that model, iterates through all the combinations 9 | of parameters, builds a model with each combo, 10 | and returns the score of the model. 11 | --- 12 | Inputs: 13 | model_name : the name of the model with parenthesis 14 | and as a string. Any parameters you wish to set for all 15 | models can be set in the parameter name. 16 | param_grid: dictionary with parameter names as keys, 17 | and list of param values to test as value for each key 18 | """ 19 | self._base_model = str(model_name).replace(')','') 20 | self._param_grid = param_grid 21 | self.models = self.get_models() 22 | 23 | def get_models(self): 24 | """ 25 | Finds every combination of parameters from the param grid. 
26 | Uses the string basename for to create a list of model 27 | names with the proper parameters. This command_list is 28 | still in string form until we're ready to test the models. 29 | """ 30 | params = [] 31 | order = [] 32 | for key, value in self._param_grid.items(): 33 | order.append(key) 34 | params.append(value) 35 | options = list(product(*params)) 36 | 37 | command_list = [] 38 | for option in options: 39 | cmd = self._base_model 40 | if cmd[-1] != '(': 41 | cmd+=', ' 42 | for i,j in zip(order, option): 43 | if type(j) == type('string'): 44 | cmd += str(i)+"='"+str(j)+"', " 45 | else: 46 | cmd += str(i)+"="+str(j)+", " 47 | command_list.append(cmd[:-2]+')') 48 | return command_list 49 | 50 | def fit(self, X, y): 51 | """ 52 | Uses the "eval" function in Python to convert the model 53 | name from string to an actual model. Fits each model 54 | and scores it. Creates a lists of models and scores. 55 | Sets the best possible model and score to be easily 56 | retrievable and usable. 57 | """ 58 | results = [] 59 | for model_name in self.models: 60 | model = eval(model_name) 61 | model.fit(X,y) 62 | s = model.score(X,y) 63 | results.append([model, s, model_name]) 64 | self.all_results = sorted(results, key=lambda x: x[1], reverse=True) 65 | self.best_model = self.all_results[0][0] 66 | self.best_score = self.all_results[0][1] 67 | 68 | def print_results(self): 69 | """ 70 | Method to print the results in a nice readable format. 71 | """ 72 | if self.all_results: 73 | print("Model | Score\n--------------------\n") 74 | for result in self.all_results: 75 | print(result[2], " | ", result[1],"\n") 76 | 77 | 78 | from itertools import product 79 | from zwml.utilities import cross_val 80 | 81 | class grid_search_cv(): 82 | 83 | def __init__(self, model_name, param_grid={}, k=5): 84 | """ 85 | Given a base model and a parameter grid of params 86 | for that model, iterates through all the combinations 87 | of parameters, builds a model with each combo, 88 | and does kFold cross validation on them model 89 | --- 90 | Inputs: 91 | model_name : the name of the model with parenthesis 92 | and as a string. Any parameters you wish to set for all 93 | models can be set in the parameter name. 94 | param_grid: dictionary with parameter names as keys, 95 | and list of param values to test as value for each key 96 | k: number of folds for cross val 97 | """ 98 | self._base_model = str(model_name).replace(')','') 99 | self._param_grid = param_grid 100 | self.models = self.get_models() 101 | self.k = k 102 | 103 | def get_models(self): 104 | """ 105 | Finds every combination of parameters from the param grid. 106 | Uses the string basename for to create a list of model 107 | names with the proper parameters. This command_list is 108 | still in string form until we're ready to test the models. 109 | """ 110 | params = [] 111 | order = [] 112 | for key, value in self._param_grid.items(): 113 | order.append(key) 114 | params.append(value) 115 | options = list(product(*params)) 116 | 117 | command_list = [] 118 | for option in options: 119 | cmd = self._base_model 120 | if cmd[-1] != '(': 121 | cmd+=', ' 122 | for i,j in zip(order, option): 123 | if type(j) == type('string'): 124 | cmd += str(i)+"='"+str(j)+"', " 125 | else: 126 | cmd += str(i)+"="+str(j)+", " 127 | command_list.append(cmd[:-2]+')') 128 | return command_list 129 | 130 | def fit(self, X, y): 131 | """ 132 | Uses the "eval" function in Python to convert the model 133 | name from string to an actual model. 
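# Illustrative usage sketch, not part of the original module, shown commented out because
# of a namespace caveat: fit() rebuilds each model with eval(), so the named class has to
# be resolvable in the namespace where grid_search executes, and grid_search.py itself
# only imports product from itertools. The strings and param_grid layout below follow the
# docstrings above.
# from zwml.utilities import grid_search
# gs = grid_search("knn_classifier()", {"n_neighbors": [3, 5, 7]})
# gs.fit(X_train, y_train)     # builds and scores one model per parameter combination
# gs.print_results()
# best = gs.best_model         # highest-scoring fitted model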
Fits each model 134 | and scores it with kfold cross_val. 135 | Creates a lists of models and scores. 136 | Sets the best possible model and score to be easily 137 | retrievable and usable. 138 | """ 139 | results = [] 140 | for model_name in self.models: 141 | model = eval(model_name) 142 | cv = cross_val() 143 | cv.cross_validation_scores(model, X, y, self.k) 144 | results.append([model, cv.score_folds, model_name]) 145 | self.all_results = sorted(results, key=lambda x: np.mean(x[1]), reverse=True) 146 | self.best_model = self.all_results[0][0] 147 | self.best_score = self.all_results[0][1] 148 | 149 | def print_results(self, coefs=False, mean=False): 150 | """ 151 | Method to print the results in a nice readable format. 152 | If the user asks for mean, only show the average score 153 | across all folds. If the user asks for coefficients 154 | show coefficients if the model has them. 155 | """ 156 | if self.all_results: 157 | print("Model | Scores\n--------------------") 158 | for result in self.all_results: 159 | if mean: 160 | print(result[2], " | ", np.mean(result[1])) 161 | else: 162 | print(result[2], " | ", result[1]) 163 | if coefs: 164 | try: 165 | print("Coefs: ", result[0].coefs_) 166 | except AttributeError: 167 | print("No Coefficients in model!") 168 | print() -------------------------------------------------------------------------------- /zwml/utilities/markov_chain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class markov_chain: 4 | 5 | def __init__(self, text, from_file=True, ngram=2, random_state=None): 6 | """ 7 | Markov Chains are great for generating text based on previously seen text. 8 | Here we'll either read from file or from one big string, then generate a 9 | probabilistic understanding of the document by using ngrams as keys and 10 | storing all possible following words. We can then generate sentences 11 | using random dice and this object. 12 | --- 13 | Inputs 14 | text: either the path to a file containing the text or the text (string) 15 | from_file: whether the text is in a file or note (bool) 16 | ngram: how many words to use as a key for the text generation 17 | random_state: used to set the random state for reproducibility 18 | """ 19 | self.ngram = int(ngram) 20 | self.markov_keys = dict() 21 | self._from_file = from_file 22 | if type(text) != type("string"): 23 | raise TypeError("'text' must be a PATH or string object") 24 | if from_file: 25 | self.path = text 26 | else: 27 | self.raw = text 28 | self.text_as_list = None 29 | if random_state: 30 | np.random.seed(random_state) 31 | self.create_probability_object() 32 | 33 | def preprocess(self): 34 | """ 35 | Opens and cleans the text to be learned. If self.from_file, it reads 36 | from the path provided. The cleaning is very minor, just lowercasing 37 | and getting rid of quotes. Creates a list of words from the text. 38 | """ 39 | if self._from_file: 40 | with open(self.path,'r') as f: 41 | self.raw = f.read() 42 | self.text_as_list = self.raw.lower().replace('"','').replace("'","").split() 43 | 44 | def markov_group_generator(self,text_as_list): 45 | """ 46 | Generator that creates the ngram groupings to act as keys. 47 | Just grabs ngram number of words and puts them into a tuple 48 | and yields that upon iteration request. 
--------------------------------------------------------------------------------
/zwml/utilities/markov_chain.py:
--------------------------------------------------------------------------------
import numpy as np


class markov_chain:

    def __init__(self, text, from_file=True, ngram=2, random_state=None):
        """
        Markov Chains are great for generating text based on previously seen text.
        Here we'll either read from file or from one big string, then generate a
        probabilistic understanding of the document by using ngrams as keys and
        storing all possible following words. We can then generate sentences
        using random dice and this object.
        ---
        Inputs
        text: either the path to a file containing the text or the text itself (string)
        from_file: whether the text is in a file or not (bool)
        ngram: how many words to use as a key for the text generation
        random_state: used to set the random state for reproducibility
        """
        self.ngram = int(ngram)
        self.markov_keys = dict()
        self._from_file = from_file
        if type(text) != type("string"):
            raise TypeError("'text' must be a PATH or string object")
        if from_file:
            self.path = text
        else:
            self.raw = text
        self.text_as_list = None
        if random_state:
            np.random.seed(random_state)
        self.create_probability_object()

    def preprocess(self):
        """
        Opens and cleans the text to be learned. If self._from_file, it reads
        from the path provided. The cleaning is very minor, just lowercasing
        and getting rid of quotes. Creates a list of words from the text.
        """
        if self._from_file:
            with open(self.path,'r') as f:
                self.raw = f.read()
        self.text_as_list = self.raw.lower().replace('"','').replace("'","").split()

    def markov_group_generator(self,text_as_list):
        """
        Generator that creates the ngram groupings to act as keys.
        Grabs ngram+1 consecutive words and puts them into a tuple,
        yielding one tuple per iteration request: the first ngram words
        act as the key and the last word is the observed follower.
        ---
        Inputs
        text_as_list: the text after preprocessing (list)
        Outputs
        groups: word groupings of length self.ngram+1 (tuple)
        """
        if len(text_as_list) < self.ngram+1:
            raise ValueError("NOT A LONG ENOUGH TEXT!")

        for i in range(self.ngram,len(text_as_list)):
            yield tuple(text_as_list[i-self.ngram:i+1])

    def create_probability_object(self):
        """
        Steps through the text, pulling keys out and keeping track
        of which words follow the keys. Duplication is allowed for
        values for each key - but all keys are unique.
        """
        if self.markov_keys:
            print("Probability Object already built!")
            return
        if not self.text_as_list:
            self.preprocess()
        for group in self.markov_group_generator(self.text_as_list):
            word_key = tuple(group[:-1])
            if word_key in self.markov_keys:
                self.markov_keys[word_key].append(group[-1])
            else:
                self.markov_keys[word_key] = [group[-1]]

    def generate_sentence(self, length=25, starting_word_id=None):
        """
        Given a seed word, pulls the key associated with that word and
        samples from the values available. Then moves to the newly generated
        word and gets the key associated with it, and generates again.
        Repeats until the sentence is 'length' words long.
        ---
        Inputs
        length: how many words to generate (int)
        starting_word_id: what word to use as seed, by location (int)
        Outputs
        gen_words: the generated sentence, including seed words (string)
        """
        if not self.markov_keys:
            raise ValueError("No probability object built. Check initialization!")

        if (starting_word_id is None or type(starting_word_id) != type(int(1))
            or starting_word_id < 0 or starting_word_id > len(self.text_as_list)-self.ngram):
            starting_word_id = np.random.randint(0,len(self.text_as_list)-self.ngram)

        gen_words = self.text_as_list[starting_word_id:starting_word_id+self.ngram]

        while len(gen_words) < length:
            seed = tuple(gen_words[-self.ngram:])
            gen_words.append(np.random.choice(self.markov_keys[seed]))
        return ' '.join(gen_words)

    def print_key_value_pairs(self, num_keys=20):
        """
        Iterates through the probability object, printing key-value
        pairs until num_keys pairs have been shown.
        ---
        Input
        num_keys: how many pairs to show (int)
        """
        i = 1
        for key,value in self.markov_keys.items():
            print(key,value)
            print()
            i+=1
            if i>int(num_keys):
                break
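# A minimal, self-contained usage sketch. The inline sample text is made up
# purely for illustration; in this repo the natural corpus would be a file such
# as data/lovecraft.txt passed with from_file=True (the relative path depends on
# where you run from). Note that generation can raise a KeyError if it walks
# onto the very last word group of the corpus, so this sample deliberately ends
# on a word pair that also appears earlier in the text.
if __name__ == "__main__":
    sample = ("the old house stood on the hill and the old house "
              "watched the town and the town watched the old house")
    mc = markov_chain(sample, from_file=False, ngram=2, random_state=42)
    mc.print_key_value_pairs(num_keys=3)
    print(mc.generate_sentence(length=12))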
--------------------------------------------------------------------------------
/zwml/utilities/normalizer.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from copy import copy


class normalizer:

    def __init__(self, axis='col'):
        """
        Normalizer has two behaviors. If the axis is 'col', it divides
        each column by the maximum magnitude in that column. If the axis
        is 'row', it rescales each row so that it sums to 1.
        ---
        KWargs:
        axis: mode of behavior. See description for details.
        """
        self.axis = axis
        self.data_stats = {}
        self.number_of_columns = None
        if self.axis not in ['col', 'row']:
            raise ValueError("axis must be either 'row' or 'col'")

    def fit(self, X):
        """
        If axis='col', learns about the input data and stores
        the maximum magnitude (absolute value) of each column.
        If set for 'row', does nothing.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        X = self.convert_to_array(X)
        self.number_of_columns = X.shape[1]

        if self.axis == 'col':
            for ix in range(self.number_of_columns):
                self.data_stats[ix] = np.amax(np.abs(X.T[ix]))

    def transform(self,X):
        """
        Given the information learned about the training data,
        scale new data by the stored column maximums (axis='col')
        or rescale each row so that it sums to 1 (axis='row').
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        X = self.convert_to_array(X)
        new_X = copy(X)

        if self.axis == 'col':
            for ix in range(self.number_of_columns):
                new_X.T[ix] = new_X.T[ix]/self.data_stats[ix]

        if self.axis == 'row':
            new_X = new_X/np.sum(new_X, axis=1).reshape(-1,1)

        return new_X

    def fit_transform(self, X):
        """
        Learn from X and then return the transformed version
        of X for the user to use.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        self.fit(X)
        return self.transform(X)

    def pandas_to_numpy(self, x):
        """
        Checks if the input is a DataFrame or Series and converts it
        to a numpy array for calculation purposes.
        ---
        Input: X (array, dataframe, or series)
        Output: X (array)
        """
        if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()):
            return x.as_matrix()
        if type(x) == type(np.array([1,2])):
            return x
        return np.array(x)

    def handle_1d_data(self,x):
        """
        Converts 1-dimensional data into a column vector (many rows,
        one column) instead of a single row with many columns.
        """
        if x.ndim == 1:
            x = x.reshape(-1,1)
        return x

    def convert_to_array(self, x):
        """
        Takes in an input, converts it to a numpy array,
        and then checks if it needs to be reshaped for us
        to use it properly.
        """
        x = self.pandas_to_numpy(x)
        x = self.handle_1d_data(x)
        return x
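# A minimal, self-contained usage sketch with made-up float data, showing the
# column-scaling mode; pass axis='row' instead to rescale each row to sum to 1.
if __name__ == "__main__":
    data = np.array([[1., -4., 2.],
                     [3., 2., -8.]])
    norm = normalizer(axis='col')
    print(norm.fit_transform(data))   # each column divided by its max magnitude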
--------------------------------------------------------------------------------
/zwml/utilities/standard_scaler.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from copy import copy


class standard_scaler:

    def __init__(self, demean=True, dev_scale=True):
        """
        Standard Scaler demeans each column and converts
        each column to have a standard deviation of 1.
        ---
        KWargs:
        demean: whether to subtract the mean from each column
        dev_scale: whether to convert each column to unit variance
        """
        self.demean = demean
        self.dev_scale = dev_scale
        self.data_stats = {}
        self.number_of_columns = None

    def fit(self, X):
        """
        Learns about the input data and stores the mean and
        standard deviation of each column.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        X = self.convert_to_array(X)
        self.number_of_columns = X.shape[1]

        for ix in range(self.number_of_columns):
            col = X.T[ix]
            col_mean = np.mean(col)
            col_std = np.std(col)
            self.data_stats[ix] = (col_mean, col_std)

    def transform(self,X):
        """
        Given the information learned about the training data,
        remove the mean and scale the new data as requested by
        the user.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        X = self.convert_to_array(X)
        new_X = copy(X)

        for ix in range(self.number_of_columns):
            if self.demean:
                new_X.T[ix] = new_X.T[ix] - self.data_stats[ix][0]
            if self.dev_scale:
                new_X.T[ix] = new_X.T[ix]/self.data_stats[ix][1]

        return new_X

    def fit_transform(self, X):
        """
        Learn from X and then return the transformed version
        of X for the user to use.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        self.fit(X)
        return self.transform(X)

    def pandas_to_numpy(self, x):
        """
        Checks if the input is a DataFrame or Series and converts it
        to a numpy array for calculation purposes.
        ---
        Input: X (array, dataframe, or series)
        Output: X (array)
        """
        if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()):
            return x.as_matrix()
        if type(x) == type(np.array([1,2])):
            return x
        return np.array(x)

    def handle_1d_data(self,x):
        """
        Converts 1-dimensional data into a column vector (many rows,
        one column) instead of a single row with many columns.
        """
        if x.ndim == 1:
            x = x.reshape(-1,1)
        return x

    def convert_to_array(self, x):
        """
        Takes in an input, converts it to a numpy array,
        and then checks if it needs to be reshaped for us
        to use it properly.
        """
        x = self.pandas_to_numpy(x)
        x = self.handle_1d_data(x)
        return x
--------------------------------------------------------------------------------
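# A minimal, self-contained usage sketch for zwml/utilities/standard_scaler.py
# with made-up float data; it assumes the zwml package is importable from the
# current path. With the defaults, each column of the output has mean ~0 and
# standard deviation ~1, and data_stats holds the per-column (mean, std) pairs
# learned during fit().
if __name__ == "__main__":
    import numpy as np
    from zwml.utilities.standard_scaler import standard_scaler

    data = np.array([[1., 10.],
                     [3., 20.],
                     [5., 30.]])
    scaler = standard_scaler()
    print(scaler.fit_transform(data))
    print(scaler.data_stats)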