├── .gitignore ├── LICENSE ├── README.md ├── TO_DO.txt ├── data └── lovecraft.txt ├── example_use.ipynb ├── notebooks ├── anomaly_detection │ ├── box_covariance.ipynb │ ├── elliptic_covariance.ipynb │ └── isolation_forest.ipynb ├── assorted_algorithms │ ├── k_nearest_neighbors.ipynb │ ├── kde_approximator.ipynb │ └── markov_chain_text.ipynb ├── classifiers │ ├── bagging_classifier.ipynb │ ├── bernoulli_naive_bayes.ipynb │ ├── decision_tree_classifier.ipynb │ ├── gaussian_naive_bayes.ipynb │ ├── k_nearest_neighbors_classifier.ipynb │ ├── multinomial_naive_bayes.ipynb │ ├── random_forest_classifier.ipynb │ ├── stochastic_gradient_descent_classifier.ipynb │ ├── stochastic_gradient_descent_classifier_binary.ipynb │ ├── support_vector_machine.ipynb │ └── support_vector_machine_binary.ipynb ├── clustering │ ├── agglomerative_clustering.ipynb │ ├── dbscan.ipynb │ ├── dbscan_secondary_method.ipynb │ ├── kmeans.ipynb │ ├── mean_shift.ipynb │ └── spectral_clustering.ipynb ├── datasets │ ├── datasets.ipynb │ ├── iris.data │ ├── make_classification.ipynb │ └── make_regression.ipynb ├── dimensionality_reduction │ ├── PCA.ipynb │ └── SVD.ipynb ├── metrics │ ├── classification_metrics.ipynb │ ├── pairwise_distance.ipynb │ └── regression_metrics.ipynb ├── natural_language_processing │ ├── count_vectorizer.ipynb │ ├── latent_dirichlet_allocation.ipynb │ ├── latent_semantic_indexing.ipynb │ └── tfidf_vectorizer.ipynb ├── neural_net │ ├── nn_classifier.ipynb │ └── nn_regressor.ipynb ├── random_number_generators │ └── middle_square.ipynb ├── regressors │ ├── bagging_regressor.ipynb │ ├── decision_tree_regressor.ipynb │ ├── k_nearest_neighbors_regressor.ipynb │ ├── lasso_regressor.ipynb │ ├── linear_regression_closed_form.ipynb │ ├── random_forest_regressor.ipynb │ ├── ridge_regressor.ipynb │ ├── stochastic_gradient_descent_regression.ipynb │ └── stochastic_gradient_descent_regression_with_regularization.ipynb └── utilities │ ├── grid_search.ipynb │ ├── normalizer.ipynb │ ├── randomized_search.ipynb │ ├── standard_scaler.ipynb │ └── train_test_and_cross_validation.ipynb └── zwml ├── __init__.py ├── anomaly_detection ├── __init__.py ├── box_covariance.py ├── elliptic_covariance.py └── isolation_forest.py ├── clustering ├── __init__.py ├── agglomerative_clustering.py ├── dbscan.py ├── kmeans.py ├── mean_shift.py └── spectral_clustering.py ├── datasets ├── __init__.py ├── datasets.py ├── iris.data ├── make_classification.py └── make_regression.py ├── linear_models ├── __init__.py ├── elastic_net_regressor.py ├── lasso_regressor.py ├── linear_regression.py ├── ridge_regressor.py ├── sgd_classifier.py └── sgd_regressor.py ├── metrics ├── __init__.py ├── classification_metrics.py ├── pairwise_distance.py └── regression_metrics.py ├── naive_bayes ├── __init__.py ├── bernoulli_naive_bayes.py ├── gaussian_naive_bayes.py └── multinomial_naive_bayes.py ├── neighbors ├── __init__.py ├── k_neighbors.py ├── kde_approximator.py ├── knn_classifier.py └── knn_regressor.py ├── nlp ├── __init__.py ├── count_vectorizer.py ├── latent_semantic_indexing.py └── tfidf_vectorizer.py ├── random ├── __init__.py └── middle_square.py ├── svm ├── __init__.py └── svc.py ├── tree_models ├── __init__.py ├── bagging_classifier.py ├── bagging_regressor.py ├── decision_tree_classifier.py ├── decision_tree_regressor.py ├── random_forest_classifier.py └── random_forest_regressor.py └── utilities ├── __init__.py ├── data_splitting.py ├── grid_search.py ├── markov_chain.py ├── normalizer.py ├── randomized_search.py └── standard_scaler.py /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.ipynb_checkpoints* 2 | **~ 3 | lunch_and_learn_notes.md 4 | *.DS_Store* 5 | *.npz 6 | **/__pycache__/* 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning from Scratch in Python 2 | 3 | 4 | ### If you want to understand something, you have to be able to build it. 5 | 6 | This is my attempt to build many of the machine learning algorithms from 7 | scratch, both in an attempt to make sense of them for myself and to write the 8 | algorithms in a way that is pedagogically interesting. At present, SkLearn is 9 | the leading Machine Learning module for Python, but looking through the 10 | open-source code, it's very hard to make sense of because of how heavily abstracted 11 | the code is. These modules will be much simpler in design, such that a student 12 | can read through and understand how the algorithm works. As such, they will 13 | not be as optimized as SkLearn, etc. 14 | 15 | **__Organization__** 16 | 17 | zwml: This contains a fully functioning machine learning library with the ability to import a la sklearn. Want to use a decision tree? Just do `from zwml.tree_models import decision_tree_regressor`. This is still in alpha at the moment, as many inconsistencies need to be cleaned up before it can be fully launched. These will always be the "full version" of the library, whereas some notebooks will have only a simpler form of the class (such as SGD without regularization). 18 | 19 | Notebooks: Each notebook will have the class fully written out, with a test case shown. 20 | All version information for the Python interpreter and modules used (numpy, pandas, etc.) 21 | is shown as well for later comparison. 22 | 23 | 24 | ## _Methodology note:_ 25 | 26 | A lot of these modules are *begging* for inheritance. As an example, the 27 | bagging classifier and the random forest classifier are largely the same code, 28 | with a few modified methods. Since these are designed as pedagogical tools and 29 | not "production code," I've chosen to make the modules as self-contained as 30 | possible. So instead of having an abstracted parent class, which a new 31 | programmer may have to track down, I've chosen to keep the code all together. 32 | I know it's sub-optimal for production, but I think it's better for someone to 33 | learn from. The only exceptions are ensemble methods that call entire other 34 | algorithms. For instance, the random forest module builds a bunch of 35 | decision trees, but with modified data inputs. To illustrate this point, the 36 | decision tree class is imported as a stand-alone module and plugged into the 37 | random forest module where it belongs - instead of recreating the decision 38 | tree in that class. The idea is that a new student will see how random forest 39 | (or other ensemble methodology) is just a wrapper class around 40 | another algorithm. 41 | 42 | 43 | ## _Outdated descriptions of what's available - to be updated soon_ 44 | 45 | # Notebooks/modules 46 | 47 | ## Regression: 48 | 49 | #### linear_regression_closed_form.ipynb 50 | 51 | This module uses the closed-form linear algebra solution (the normal equation) to solve for the 52 | coefficients of linear regression. 53 | 54 | #### stochastic_gradient_descent_regression.ipynb 55 | 56 | This module performs stochastic gradient descent to find the regression 57 | coefficients for linear regression.
There are a few options to set, such as 58 | learning rate, number of iterations, etc. There's also an option for setting 59 | the learning rate to be dynamic. **There are two versions of this notebook - 60 | one with and one without regularization included.** 61 | 62 | #### decision_tree_regressor.ipynb 63 | 64 | This module uses optimization of standard deviation or absolute errors to build decision trees for 65 | regression. It will be the basis for our random 66 | forest regressor. It has a few settings, like max-depth, to control how our 67 | trees are built, and a few options for the optimization method. 68 | 69 | #### random_forest_regressor.ipynb 70 | 71 | This is similar to the random_forest_classifier, but we instead focus on getting a continuous output. 72 | 73 | ## Classification: 74 | 75 | #### decision_tree_classifier.ipynb 76 | 77 | This module uses information gain to build decision trees for 78 | classification. It will be the basis for our bagging classifier and random 79 | forest classifier. It has a few settings, like max-depth, to control how our 80 | trees are built. 81 | 82 | 83 | #### k_nearest_neighbors.ipynb 84 | 85 | This module is based on the wisdom of "points that are close together should 86 | be of the same class." It measures the distances to all points and then finds 87 | the k (the user specifies 'k' by setting 'n_neighbors') closest points. Those points all get to vote on 88 | what class the new point likely is. 89 | 90 | #### bagging_classifier.ipynb 91 | 92 | This ensemble method is an extension of the decision tree that uses 93 | bootstrapping. Bootstrapping is where we sample the dataset (with replacement) 94 | over and over to build new datasets that are "built from" our true data. If we 95 | do this many times, we'll build many slightly different trees on the bootstrapped data, 96 | since no two trees will see the exact same data. Then we let all the trees 97 | predict on any new data, and allow the wisdom of the masses to determine our 98 | final outcome. 99 | 100 | #### random_forest_classifier.ipynb 101 | 102 | This is another ensemble method. It's just like the bagging_classifier, except 103 | we also randomize which features go to each tree. Instead of just 104 | randomizing our datapoints, we also say, "this tree only gets features 1, 3, 105 | and 5." This further randomizes our input to each tree, helping to fight 106 | over-fitting, which puts us in a better spot in the bias-variance trade-off. 107 | 108 | #### bernoulli_naive_bayes.ipynb 109 | 110 | Uses Bayes' rule to calculate the probability that a given observation will belong in each class, 111 | based on what it's learned about probability distributions in the training data. In the Bernoulli 112 | flavor, only "on" or "off" is counted for each feature when determining the probability. 113 | 114 | #### gaussian_naive_bayes.ipynb 115 | 116 | Uses Bayes' rule to calculate the probability that a given observation will belong in each class, 117 | based on what it's learned about probability distributions in the training data. In the Gaussian 118 | flavor, each feature is assumed to have a normal distribution, so the sample mean and standard deviation are used 119 | to approximate the probability distribution, which is sampled to determine the probability. 120 | 121 | ## Clustering: 122 | 123 | #### KMeans 124 | 125 | Full description still to come; a minimal usage sketch is shown below.
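A minimal usage sketch, assuming the `kmeans` class defined later in this repo under `zwml/clustering/kmeans.py` (the `k` and `random_seed` keyword arguments come from that class):

```python
import numpy as np
from zwml.clustering import kmeans

# Two well-separated blobs of 2D points
X = np.vstack([np.random.normal(0, 1, size=(50, 2)),
               np.random.normal(5, 1, size=(50, 2))])

model = kmeans(k=2, random_seed=42)   # kmeans++ initialization by default
model.fit(X)                          # keeps the best of n_init runs by inertia
labels = model.predict(X)             # nearest-cluster id for each row of X
print(model.score())                  # inertia of the clustering that was kept
```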
126 | 127 | ## Non-Algorithm - but useful 128 | 129 | #### train_test_and_cross_validation.ipynb 130 | 131 | We use different methods of splitting the data to measure the model 132 | performance on "unseen" or "out-of-sample" data. The cross-validation method 133 | will report the model behavior across several different folds. Both cross-validation 134 | and train-test split are built from scratch in this notebook. 135 | 136 | #### stats\_regress.py 137 | 138 | This is a suite of statistics calculation functions for regressions. Examples: 139 | mean_squared_error, r2, adjusted r2, etc. 140 | 141 | #### kde_approximator.ipynb 142 | 143 | Kernel Density Estimation. Given a set of points, what surface best 144 | describes the probability of drawing a point from any region of space? This 145 | module approximates that by assuming some probability "kernel" (e.g., what if 146 | every point represents a Gaussian probability distribution?). 147 | 148 | #### markov_chain_text.ipynb 149 | 150 | Given a document, can we learn about it and then generate new writings based 151 | on it? This uses the idea of Markov chains (randomly chaining together allowed 152 | possibilities, via a probabilistic understanding of the document) to 153 | create new text from old documents. 154 | 155 | -------------------------------------------------------------------------------- /TO_DO.txt: -------------------------------------------------------------------------------- 1 | SGD Classifier OVR fix 2 | 3 | Spectral - RBF 4 | 5 | Update ZWML with comments. (Be careful on trees and utilities since they may have slightly different versions) -------------------------------------------------------------------------------- /notebooks/datasets/datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Iris Dataset Loader" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Z. W.
Miller - Copyright 2018" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 34, 20 | "metadata": { 21 | "ExecuteTime": { 22 | "end_time": "2017-12-10T06:13:40.452950Z", 23 | "start_time": "2017-12-10T06:13:40.439056Z" 24 | }, 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd\n", 30 | "import numpy as np\n", 31 | "\n", 32 | "def load_iris(as_dataframe=False):\n", 33 | " data = pd.read_csv(\"iris.data\", header=None)\n", 34 | " data.columns = ['sepal_length','sepal_width','petal_length','petal_width','class']\n", 35 | " if as_dataframe:\n", 36 | " return data\n", 37 | " X = data.iloc[:,:-1].as_matrix()\n", 38 | " y = data.iloc[:,-1]\n", 39 | " y = y.str.replace('Iris-setosa','0').replace('Iris-versicolor','1').replace('Iris-virginica','2')\n", 40 | " y = y.astype(int).as_matrix()\n", 41 | " return X,y" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 35, 47 | "metadata": { 48 | "ExecuteTime": { 49 | "end_time": "2017-12-10T06:13:41.704976Z", 50 | "start_time": "2017-12-10T06:13:41.698290Z" 51 | } 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "X,y = load_iris()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 36, 61 | "metadata": { 62 | "ExecuteTime": { 63 | "end_time": "2017-12-10T06:13:41.857025Z", 64 | "start_time": "2017-12-10T06:13:41.850802Z" 65 | } 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 73 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", 74 | " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2\n", 75 | " 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2\n", 76 | " 2 2]\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "print(y)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python [default]", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.2" 111 | }, 112 | "toc": { 113 | "nav_menu": {}, 114 | "number_sections": true, 115 | "sideBar": true, 116 | "skip_h1_title": false, 117 | "toc_cell": false, 118 | "toc_position": {}, 119 | "toc_section_display": "block", 120 | "toc_window_display": false 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 2 125 | } 126 | -------------------------------------------------------------------------------- /notebooks/datasets/iris.data: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 
5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 
5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica 151 | -------------------------------------------------------------------------------- /zwml/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = 'v0.0.alpha' 2 | -------------------------------------------------------------------------------- /zwml/anomaly_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .box_covariance import box_covariance 2 | from .elliptic_covariance import elliptic_covariance 3 | from .isolation_forest import isolation_tree, isolation_forest 4 | 5 | __all__ = ['box_covariance', 'elliptic_covariance', 'isolation_forest'] -------------------------------------------------------------------------------- /zwml/anomaly_detection/box_covariance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class box_covariance: 5 | 6 | def __init__(self, threshold=1.): 7 | """ 8 | Builds a box envelope around the data using a 9 | standard deviation threshold. Any points within this 10 | box are considered inliers, and points outside of this 11 | box are considered outliers. This is a fairly simplistic 12 | method that is not very robust to highly correlated 13 | data with "close by" outliers. 14 | --- 15 | KWargs: 16 | threhsold: how many standard deviations do you want 17 | to consider an "inlier" 18 | """ 19 | self.threshold = threshold 20 | self.data_stats = {} 21 | self.number_of_columns = None 22 | 23 | def fit(self, X): 24 | """ 25 | Learns about the input data and stores the mean and 26 | standard deviation of each column. 27 | --- 28 | In: X (features); np.array or pandas dataframe/series 29 | """ 30 | X = self.convert_to_array(X) 31 | self.number_of_columns = X.shape[1] 32 | 33 | for ix in range(self.number_of_columns): 34 | col = X.T[ix] 35 | col_mean = np.mean(col) 36 | col_std = np.std(col) 37 | self.data_stats[ix] = (col_mean, col_std) 38 | 39 | def predict(self, X): 40 | """ 41 | For each data point, subtract the mean of the column 42 | and then see if the data point is within 43 | threshold*std_dev of that column of 0. If so, it's an 44 | inlier. 
Otherwise it's an outlier. 45 | """ 46 | X = self.convert_to_array(X) 47 | result = np.ones(X.shape[0]) 48 | for ix in range(self.number_of_columns): 49 | X.T[ix] = X.T[ix] - self.data_stats[ix][0] 50 | result[(result != -1) & (np.abs(X.T[ix]) >= self.data_stats[ix][1]*self.threshold)] = -1 51 | return result 52 | 53 | def fit_predict(self, X): 54 | """ 55 | Learn from X and then return the transformed version 56 | of X for the user to use. 57 | --- 58 | In: X (features); np.array or pandas dataframe/series 59 | """ 60 | self.fit(X) 61 | return self.predict(X) 62 | 63 | def pandas_to_numpy(self, x): 64 | """ 65 | Checks if the input is a Dataframe or series, converts to numpy matrix for 66 | calculation purposes. 67 | --- 68 | Input: X (array, dataframe, or series) 69 | Output: X (array) 70 | """ 71 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 72 | return x.as_matrix() 73 | if type(x) == type(np.array([1,2])): 74 | return x 75 | return np.array(x) 76 | 77 | def handle_1d_data(self,x): 78 | """ 79 | Converts 1 dimensional data into a series of rows with 1 columns 80 | instead of 1 row with many columns. 81 | """ 82 | if x.ndim == 1: 83 | x = x.reshape(-1,1) 84 | return x 85 | 86 | def convert_to_array(self, x): 87 | """ 88 | Takes in an input and converts it to a numpy array 89 | and then checks if it needs to be reshaped for us 90 | to use it properly 91 | """ 92 | x = self.pandas_to_numpy(x) 93 | x = self.handle_1d_data(x) 94 | return x -------------------------------------------------------------------------------- /zwml/anomaly_detection/elliptic_covariance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from copy import copy 4 | 5 | class elliptic_covariance: 6 | 7 | def __init__(self, threshold=5.991): 8 | """ 9 | Uses the covariance matrix to find the eigenvalues 10 | and eigenvectors. Then finds an ellipse that represents 11 | the training data using the standard deviation. 12 | The ellipse is based on the formula: 13 | (x/std_X)^2 + (y/std_y)^2 + (z/std_z)^2 + ... = threshold 14 | The threshold value will define the allowed inliers 15 | and their total deviation by "distance" from the mean. 16 | --- 17 | KWargs: 18 | threshold: how far from the mean do you want the inlier 19 | surface to exist. 5.991 represents a 95% confidence interval 20 | from the Cumulative Chi_2 distribution. 21 | """ 22 | self.threshold = threshold 23 | self.number_of_columns = None 24 | 25 | def fit(self, X): 26 | """ 27 | Learns about the input data and stores the mean and 28 | standard deviation of each column. 29 | --- 30 | In: X (features); np.array or pandas dataframe/series 31 | """ 32 | X = self.convert_to_array(X) 33 | new_X = copy(X) 34 | self.number_of_columns = new_X.shape[1] 35 | 36 | self.means = np.mean(new_X, axis = 0) 37 | new_X -= self.means 38 | cov = np.cov(new_X, rowvar = False) 39 | eigenvals , eigenvecs = np.linalg.eigh(cov) 40 | idx = np.argsort(eigenvals)[::-1] 41 | self.eigenvecs = eigenvecs[:,idx] 42 | self.eigenvals = eigenvals[idx] 43 | 44 | 45 | def predict(self, X): 46 | """ 47 | For each data point, compute whether each point 48 | lies within the ellipsoid created by 49 | (x/std_X)^2 + (y/std_y)^2 + (z/std_z)^2 + ... 
= threshold 50 | This is checked by converting each point to the new reduced 51 | eigen space, where the ellipsoid is centered on 0 52 | and each direction has an axis the size of the sqrt(eigenvalue) 53 | The standard deviation is that sqrt(eigenvalue) since the 54 | eigenvalue captures the variance in along the eigenvector. 55 | """ 56 | X = self.convert_to_array(X) 57 | new_X = copy(X) 58 | new_X -= self.means 59 | new_X = self.convert_to_pca_space(new_X) 60 | new_X /= np.sqrt(self.eigenvals) 61 | new_X = new_X**2 62 | result = np.ones(X.shape[0]) 63 | result[np.sum(new_X, axis=1) >= self.threshold] = -1 64 | return result 65 | 66 | def convert_to_pca_space(self, X): 67 | """ 68 | Converts the points to the new eigenspace 69 | """ 70 | return np.dot(X,self.eigenvecs) 71 | 72 | def fit_predict(self, X): 73 | """ 74 | Learn from X and then return the transformed version 75 | of X for the user to use. 76 | --- 77 | In: X (features); np.array or pandas dataframe/series 78 | """ 79 | self.fit(X) 80 | return self.predict(X) 81 | 82 | def pandas_to_numpy(self, x): 83 | """ 84 | Checks if the input is a Dataframe or series, converts to numpy matrix for 85 | calculation purposes. 86 | --- 87 | Input: X (array, dataframe, or series) 88 | Output: X (array) 89 | """ 90 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 91 | return x.as_matrix() 92 | if type(x) == type(np.array([1,2])): 93 | return x 94 | return np.array(x) 95 | 96 | def handle_1d_data(self,x): 97 | """ 98 | Converts 1 dimensional data into a series of rows with 1 columns 99 | instead of 1 row with many columns. 100 | """ 101 | if x.ndim == 1: 102 | x = x.reshape(-1,1) 103 | return x 104 | 105 | def convert_to_array(self, x): 106 | """ 107 | Takes in an input and converts it to a numpy array 108 | and then checks if it needs to be reshaped for us 109 | to use it properly 110 | """ 111 | x = self.pandas_to_numpy(x) 112 | x = self.handle_1d_data(x) 113 | return x -------------------------------------------------------------------------------- /zwml/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .kmeans import kmeans 3 | from .mean_shift import mean_shift 4 | from .spectral_clustering import spectral_clustering 5 | from .dbscan import dbscan 6 | from .agglomerative_clustering import agglomerative_clustering 7 | 8 | __all__ = ['kmeans','mean_shift','spectral_clustering','dbscan','agglomerative_clustering'] 9 | -------------------------------------------------------------------------------- /zwml/clustering/agglomerative_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from itertools import combinations 4 | 5 | class agglomerative_clustering: 6 | 7 | def __init__(self, linkage="average", n_clusters=5, max_dist=None): 8 | """ 9 | Agglomerative clustering uses a "linkage" function to measure 10 | how close together two current clusters are. It then merges 11 | the two closest clusters into a single bigger cluster. This 12 | process is repeated until there are n_clusters remaining, 13 | or some other cut-off is applied. If no cut-off applied, 14 | will eventually result in a single cluster of all data points. 15 | --- 16 | KWargs: 17 | linkage: how to measure cluster closeness. 
Options 18 | ('average','complete','minimal','ward') 19 | n_clusters: when n_clusters is reached, stop merging 20 | max_dist: if no clusters are closer than max_dist, stop merging 21 | """ 22 | self.link = linkage 23 | self.clusters = {} 24 | self.n_clusters = n_clusters 25 | self.max_dist = max_dist 26 | self.merge_tracker = [] 27 | self.data = None 28 | self.labels = None 29 | 30 | def euclidean_distance(self, pt1, pt2): 31 | """ 32 | Returns the distance. Currently only uses Euclidean distance. 33 | --- 34 | Input: Cluster (cluster object), data point (np array) 35 | Output: Distance (float) 36 | """ 37 | return np.sqrt(np.sum((pt1 - pt2)**2)) 38 | 39 | def compute_distance(self, idx1, idx2): 40 | """ 41 | Chooses how do decide "how close" two clusters are. Applies to 42 | proper measure and returns it. 43 | """ 44 | if self.link == 'average': 45 | return self.average_linkage(idx1, idx2) 46 | elif self.link == 'complete': 47 | return self.complete_linkage(idx1, idx2) 48 | elif self.link == 'minimal': 49 | return self.minimal_linkage(idx1, idx2) 50 | elif self.link == 'ward': 51 | return self.ward_linkage(idx1, idx2) 52 | else: 53 | raise TypeError("Not a proper linkage function selection!") 54 | 55 | def average_linkage(self, idx1, idx2): 56 | """ 57 | Finds the distance between the mean of cluster 1 and the mean 58 | of cluster 2. 59 | """ 60 | return self.euclidean_distance(self.clusters[idx1]['mean'], self.clusters[idx2]['mean']) 61 | 62 | def complete_linkage(self, idx1, idx2): 63 | """ 64 | Finds the maximum possible distance between points in 65 | cluster 1 and cluster 2. Meaning it returns the distance of the 66 | two points in the clusters that are furthest apart. 67 | """ 68 | max_dist = 0. 69 | for pt in self.clusters[idx1]['members']: 70 | for pt2 in self.clusters[idx2]['members']: 71 | dist = self.euclidean_distance(self.data[pt], self.data[pt2]) 72 | if dist > max_dist: 73 | max_dist = dist 74 | return max_dist 75 | 76 | def minimal_linkage(self, idx1, idx2): 77 | """ 78 | Finds the minimum possible distance between points in 79 | cluster 1 and cluster 2. Meaning it returns the distance of the 80 | two points in the clusters that are nearest together. 81 | """ 82 | min_dist = 99999999. 83 | for pt in self.clusters[idx1]['members']: 84 | for pt2 in self.clusters[idx2]['members']: 85 | dist = self.euclidean_distance(self.data[pt], self.data[pt2]) 86 | if dist < min_dist: 87 | min_dist = dist 88 | return min_dist 89 | 90 | def ward_linkage(self, idx1, idx2): 91 | """ 92 | Measures how far every point in each cluster is from its own 93 | cluster mean, called the inertia. Then "pretends to merge" the 94 | points and measures the inertia of the resulting mega-cluster. 95 | Returns the "gained" inertia by the pretend merge. 
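        A small gain means the merged cluster is nearly as compact as the two
        separate clusters were, so Ward merging favors pairs whose union stays tight.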
96 | """ 97 | inertia_1 = 0 98 | inertia_2 = 0 99 | inertia_combined = 0 100 | 101 | for pt in self.clusters[idx1]['members']: 102 | inertia_1 += self.euclidean_distance(self.data[pt], self.clusters[idx1]['mean']) 103 | for pt in self.clusters[idx2]['members']: 104 | inertia_2 += self.euclidean_distance(self.data[pt], self.clusters[idx2]['mean']) 105 | 106 | combined_members = self.clusters[idx1]['members'] + self.clusters[idx2]['members'] 107 | combined_mean = np.mean([self.data[i] for i in combined_members], axis=0) 108 | 109 | for pt in combined_members: 110 | inertia_combined += self.euclidean_distance(self.data[pt], combined_mean) 111 | 112 | return inertia_combined - inertia_1 - inertia_2 113 | 114 | def init_clusters(self, X): 115 | """ 116 | Create a lookup table where each point is its own cluster. 117 | As we merge clusters, we'll remove members and track the progress 118 | with this dictionary. 119 | """ 120 | for idx, pt in enumerate(X): 121 | self.clusters[idx] = {'members': [idx], 'mean': pt} 122 | self.data = X 123 | 124 | def merge_clusters(self, idx1, idx2, distance): 125 | """ 126 | Takes two clusters and makes them into a single, 127 | larger cluster. Also tracks the "distance" that the merge 128 | occurred at for future reference. 129 | """ 130 | self.clusters[idx1]['members'] += self.clusters[idx2]['members'] 131 | self.clusters[idx1]['mean'] = np.mean([self.data[i] for i in self.clusters[idx1]['members']], axis=0) 132 | self.clusters.pop(idx2, None) 133 | self.merge_tracker.append((idx1, idx2, distance)) 134 | 135 | def fit(self, X): 136 | """ 137 | Makes every point into its own cluster. Checks the 138 | linkage distance for all possible merges (using the 139 | combinations to see what merges are possible). Whatever 140 | clusters have the smallest linkage relationship are merged 141 | together into a new cluster which takes the id of the lower 142 | numbered cluster. Tracks the "size" of each merge for 143 | review. Repeat this until down to n_clusters or the distance 144 | is larger than the allowed maximum. Then label the clusters. 145 | --- 146 | Input: X (data, array/dataframe) 147 | """ 148 | X = self.convert_to_array(X) 149 | self.init_clusters(X) 150 | 151 | while len(self.clusters.keys()) > self.n_clusters: 152 | decision_tracker = {} 153 | for combo in combinations(self.clusters.keys(), r=2): 154 | decision_tracker[combo] = self.compute_distance(combo[0], combo[1]) 155 | to_merge = sorted(decision_tracker.items(), key=lambda x: x[1])[0][0] 156 | 157 | if self.max_dist is not None and self.link != 'ward' and decision_tracker[to_merge] > self.max_dist: 158 | break 159 | 160 | self.merge_clusters(to_merge[0], to_merge[1], decision_tracker[to_merge]) 161 | 162 | self.labels = np.zeros(X.shape[0]) 163 | for ix, clst in enumerate(self.clusters.keys()): 164 | members = self.clusters[clst]['members'] 165 | self.labels[members] = ix 166 | 167 | def fit_predict(self,X): 168 | """ 169 | Creates clusters for data X, and returns cluster IDs for each point. 170 | --- 171 | Input: X (data, array) 172 | Output: cluster IDs for X (array) 173 | """ 174 | self.fit(X) 175 | return self.labels 176 | 177 | def pandas_to_numpy(self, x): 178 | """ 179 | Checks if the input is a Dataframe or series, converts to numpy matrix for 180 | calculation purposes.
181 | --- 182 | Input: X (array, dataframe, or series) 183 | Output: X (array) 184 | """ 185 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 186 | return x.as_matrix() 187 | if type(x) == type(np.array([1,2])): 188 | return x 189 | return np.array(x) 190 | 191 | def handle_1d_data(self,x): 192 | """ 193 | Converts 1 dimensional data into a series of rows with 1 columns 194 | instead of 1 row with many columns. 195 | """ 196 | if x.ndim == 1: 197 | x = x.reshape(-1,1) 198 | return x 199 | 200 | def convert_to_array(self, x): 201 | """ 202 | Takes in an input and converts it to a numpy array 203 | and then checks if it needs to be reshaped for us 204 | to use it properly 205 | """ 206 | x = self.pandas_to_numpy(x) 207 | x = self.handle_1d_data(x) 208 | return x 209 | -------------------------------------------------------------------------------- /zwml/clustering/dbscan.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class dbscan: 5 | 6 | def __init__(self, epsilon=0.5, min_points=5): 7 | self.epsilon = epsilon 8 | self.min_points = min_points 9 | self.data_cols = None 10 | self.labels_ = None 11 | self.neighbors = {} 12 | 13 | def fit(self, X): 14 | X = self.pandas_to_numpy(X) 15 | if not self.data_cols: 16 | self.data_cols = X.shape[1] 17 | self.check_feature_shape(X) 18 | self.visited_points = [] 19 | self.clusters = [] 20 | 21 | for ix in range(X.shape[0]): 22 | if ix in self.visited_points: 23 | continue 24 | self.neighbors[ix] = self.get_neighbors(ix, X) 25 | if len(self.neighbors[ix]) >= self.min_points: 26 | self.visited_points.append(ix) 27 | self.clusters.append(self.make_cluster(ix, X)) 28 | 29 | self.labels_ = self.get_labels(X) 30 | 31 | def get_labels(self, X): 32 | labels = [-1]*X.shape[0] 33 | for clst_id, cluster in enumerate(self.clusters): 34 | for pt_id in cluster: 35 | labels[pt_id] = clst_id 36 | return np.array(labels) 37 | 38 | def make_cluster(self, ix, X): 39 | cluster = [ix] 40 | for neighbor in self.neighbors[ix]: 41 | if neighbor not in self.visited_points: 42 | self.visited_points.append(neighbor) 43 | self.neighbors[neighbor]= self.get_neighbors(ix, X) 44 | if len(self.neighbors[neighbor]) >= self.min_points: 45 | cluster_from_neighbor = self.make_cluster(neighbor, X) 46 | cluster = cluster + cluster_from_neighbor 47 | else: 48 | cluster.append(neighbor) 49 | return cluster 50 | 51 | def fit_predict(self,X): 52 | self.fit(X) 53 | return self.labels_ 54 | 55 | def get_neighbors(self, ix, X): 56 | neighbors = [] 57 | pt = X[ix] 58 | for ix2, pt2 in enumerate(X): 59 | dist = np.sqrt(np.sum((pt2 - pt)**2)) 60 | if dist <= self.epsilon: 61 | neighbors.append(ix2) 62 | return neighbors 63 | 64 | def check_feature_shape(self, x): 65 | """ 66 | Helper function to make sure any new data conforms to the fit data shape 67 | --- 68 | In: numpy array, (unknown shape) 69 | Out: numpy array, shape: (rows, self.data_cols)""" 70 | return x.reshape(-1,self.data_cols) 71 | 72 | def rbf_kernel(self, x1, x2, sig=1.): 73 | """ 74 | Returns the rbf affinity between two points (x1 and x2), 75 | for a given bandwidth (standard deviation). 
76 | --- 77 | Inputs: 78 | x1; point 1(array) 79 | x2; point 2(array) 80 | sig; standard deviation (float) 81 | """ 82 | diff = np.sum((x1-x2)**2) 83 | norm = 1/(np.sqrt(2*np.pi*sig**2)) 84 | return norm*np.exp(-diff/(2*sig**2)) 85 | 86 | def pandas_to_numpy(self, x): 87 | """ 88 | Checks if the input is a Dataframe or series, converts to numpy matrix for 89 | calculation purposes. 90 | --- 91 | Input: X (array, dataframe, or series) 92 | 93 | Output: X (array) 94 | """ 95 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 96 | return x.as_matrix() 97 | if type(x) == type(np.array([1,2])): 98 | return x 99 | return np.array(x) 100 | 101 | -------------------------------------------------------------------------------- /zwml/clustering/kmeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class kmeans: 5 | 6 | def __init__(self, k = 5, random_seed=None, iters=1000, n_init=10, init='kmeans++'): 7 | """ 8 | Kmeans is a clustering algorithm which involves randomly initializing a set 9 | of clusters, assigning points by distance metric, then updating the means. 10 | The algorithm terminates if movements stops or after "iters" iterations. 11 | --- 12 | Inputs: 13 | k: the number of clusters to create 14 | random_seed: sets the random seed for reproducibility 15 | iters: how many iterations to attempt before breaking 16 | n_init: Initialize and run the algorithm this many times, keeping the 17 | best clusters, as decided by score. 18 | init: How to initialize the clusters. KMeans++ performs better, but takes more 19 | calculation. It weights the starting points based on distance from one anothers. 20 | Options: 'Random' (randomly select data points to act as seeds), 21 | 'Kmeans++' (randomly select with distance squared weighting) 22 | """ 23 | self._k = int(k) 24 | self._iters = iters 25 | self._n_init = n_init 26 | if init not in ['kmeans++','random']: 27 | print("Not a valid initialization, defaulting to kmeans++") 28 | init = 'kmeans++' 29 | self._init = init 30 | if random_seed: 31 | np.random.seed(random_seed) 32 | 33 | def compute_distance_to_cluster_mean(self, clst, pt): 34 | """ 35 | Returns the distance to the cluster mean. Currently only uses Euclidean distance. 36 | --- 37 | Input: Cluster (cluster object), data point (np array) 38 | Output: Distance (float) 39 | """ 40 | return np.sqrt(np.sum((clst.mean - pt)**2)) 41 | 42 | def classify(self, pt): 43 | """ 44 | Add a data point to the closest cluster. 45 | --- 46 | Input: data point (array) 47 | """ 48 | cluster_num = self.get_clust_id(pt) 49 | self.clusters[cluster_num].add_member(pt) 50 | 51 | def get_clust_id(self,pt): 52 | """ 53 | Given a point, return clusterid for cluster who's mean is the closest. 54 | --- 55 | Input: point (array) 56 | Output: cluster ID (int) 57 | """ 58 | return min(range(self._k), key=lambda i: self.compute_distance_to_cluster_mean(self.clusters[i],pt)) 59 | 60 | def init_clusters(self, X): 61 | """ 62 | Select the initial starting points for the clusters. Two options: "random" which 63 | randomly draws starting points from the data AND "kmeans++" which randomly draws 64 | with distance based weighting. 
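        With kmeans++, each new seed is drawn with probability proportional to its
        summed squared distance from the seeds chosen so far, so later seeds tend
        to land far away from the existing ones.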
65 | --- 66 | Input: X (data, array) 67 | """ 68 | self.clusters = [self.cluster() for _ in range(0,self._k)] 69 | 70 | if self._init == 'random': 71 | rand_points = np.copy(X) 72 | np.random.shuffle(rand_points) 73 | rand_points = rand_points.tolist() 74 | for c in self.clusters: 75 | c.mean = rand_points.pop() 76 | else: # default to kmeans++ 77 | starting_points = [X[np.random.choice(np.arange(0,len(X)))]] 78 | 79 | for _ in range(self._k-1): 80 | dists = [] 81 | datum = [] 82 | for x in X: 83 | if np.sum([np.array_equal(x,row) for row in starting_points]): 84 | continue 85 | 86 | dist2 = 0. 87 | for sp in starting_points: 88 | dist2 += np.sum((x - sp)**2) 89 | dists.append(dist2) 90 | datum.append(x) 91 | dists = dists/np.sum(dists) 92 | starting_points.append(datum[np.random.choice(np.arange(0,len(datum)), p=dists)]) 93 | for c, sp in zip(self.clusters, starting_points): 94 | c.mean = sp 95 | 96 | for p in X: 97 | self.classify(p) 98 | 99 | def fit_predict(self,X): 100 | """ 101 | Creates clusters for data X, and returns cluster ID's for each point. 102 | --- 103 | Input: X (data, array) 104 | Output: cluster IDs for X (array) 105 | """ 106 | self.fit(X) 107 | return self.predict(X) 108 | 109 | def fit(self, X): 110 | """ 111 | Initializes clusters, then moves the mean of the cluster to the center of 112 | all points in the cluster. Reassigns all points to their new 'nearest' cluster 113 | and repeats this process until no more assignments can occur (or too many iterations). 114 | Whole procedure is repeated n_init times, to overcome local minima. Only the best 115 | clustering is kept as part of the model. 116 | --- 117 | Input: X (data, array/dataframe) 118 | """ 119 | X = self.pandas_to_numpy(X) 120 | 121 | best_inertia = None 122 | best_clusters = [] 123 | for _ in range(self._n_init): 124 | self.init_clusters(X) 125 | ischange = True 126 | i = 0 127 | while ischange and i < self._iters: 128 | ischange = False 129 | for c in self.clusters: 130 | c.get_mean() 131 | c.set_prev_members() 132 | c.members = [] 133 | 134 | for p in X: 135 | self.classify(p) 136 | 137 | for c in self.clusters: 138 | if c.is_changed(): 139 | ischange = True 140 | i += 1 141 | current_inertia = 0. 142 | for c in self.clusters: 143 | c.get_mean() 144 | current_inertia += c.get_total_square_distance() 145 | 146 | if not best_inertia or current_inertia < best_inertia: 147 | best_clusters = self.clusters 148 | best_inertia = current_inertia 149 | 150 | self.clusters = best_clusters 151 | self.inertia = best_inertia 152 | 153 | def predict(self, X): 154 | """ 155 | Given a point, the distance to each cluster center is calculated 156 | and the nearest cluster's ID is returned. 157 | --- 158 | Input: X (data, array/dataframe) 159 | """ 160 | clust_ids = [] 161 | for dt in self.pandas_to_numpy(X): 162 | clust_ids.append([self.get_clust_id(dt)]) 163 | return np.array(clust_ids) 164 | 165 | def pandas_to_numpy(self, x): 166 | """ 167 | Checks if the input is a Dataframe or series, converts to numpy matrix for 168 | calculation purposes. 169 | --- 170 | Input: X (array, dataframe, or series) 171 | 172 | Output: X (array) 173 | """ 174 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 175 | return x.as_matrix() 176 | if type(x) == type(np.array([1,2])): 177 | return x 178 | return np.array(x) 179 | 180 | def score(self): 181 | """ 182 | Inertia is a measure of the distance from each point to the cluster center, 183 | summed over all points and clusters. 
It's calculated during the fit 184 | procedure. 185 | --- 186 | Output: inertia (float) 187 | """ 188 | return self.inertia 189 | 190 | class cluster: 191 | def __init__(self): 192 | """ 193 | This sub-class stores all the information related to each cluster. 194 | mean: where is the average location of points in this cluster 195 | members: which data points are in this cluster 196 | prev_members: which data points were in this cluster last optimization step 197 | """ 198 | self.mean = None 199 | self.members = [] 200 | self.prev_members = [] 201 | 202 | def set_prev_members(self): 203 | """ 204 | Transfers current_members to prev_members for later comparison 205 | """ 206 | self.prev_members = self.members 207 | self.members = [] 208 | 209 | def add_member(self,pt): 210 | """ 211 | Helper function to add a point to this cluster. 212 | --- 213 | Input: data point (array) 214 | """ 215 | self.members.append(pt) 216 | 217 | def is_changed(self): 218 | """ 219 | Checks if this cluster has been modified by the most recent 220 | optimizatino step. 221 | --- 222 | Output: 223 | did cluster change (bool) 224 | """ 225 | return not np.array_equal(self.members,self.prev_members) 226 | 227 | def get_mean(self): 228 | means = [] 229 | for dim in np.array(self.members).T: 230 | means.append(np.mean(dim)) 231 | self.mean = means 232 | # if not len(self.members): 233 | # self.mean = [-999,-999] 234 | # return 235 | # x,y = 0.,0. 236 | # for p in self.members: 237 | # x+=p[0] 238 | # y+=p[1] 239 | # self.mean = [x/len(self.members),y/len(self.members)] 240 | 241 | def get_total_square_distance(self): 242 | val = 0. 243 | for p in self.members: 244 | val += np.sqrt(np.sum((self.mean - p)**2)) 245 | return val -------------------------------------------------------------------------------- /zwml/clustering/mean_shift.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from copy import copy 4 | 5 | class mean_shift: 6 | 7 | def __init__(self, bandwidth=1, iters=10, threshold = .1): 8 | self._iters = iters 9 | self.bandwidth = bandwidth 10 | self.data_cols = None 11 | self.threshold = threshold 12 | 13 | def fit(self, X): 14 | X = self.pandas_to_numpy(X) 15 | if not self.data_cols: 16 | self.data_cols = X.shape[1] 17 | self.check_feature_shape(X) 18 | self._original_data = copy(X) 19 | 20 | def transform(self, X): 21 | X = self.pandas_to_numpy(X) 22 | if not self.data_cols: 23 | self.data_cols = X.shape[1] 24 | X = self.check_feature_shape(X) 25 | new_X = [] 26 | for pt in X: 27 | movement = self.threshold+1 28 | it=0 29 | p = copy(pt) 30 | while it < self._iters and movement > self.threshold: 31 | shift = np.zeros(len(p)) 32 | scale = 0. 
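                # Every original point pulls p toward itself with an RBF weight of
                # width `bandwidth`; p is replaced by the weighted average, and the
                # loop stops once the move is below `threshold` or `iters` runs out.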
33 | for orig_pt in self._original_data: 34 | weight = self.rbf_kernel(p, orig_pt, sig=self.bandwidth) 35 | shift += weight*orig_pt 36 | scale += weight 37 | movement = p - shift/scale 38 | p = shift/scale 39 | movement = np.sqrt(np.sum(movement**2)) 40 | it+=1 41 | new_X.append(p) 42 | return new_X 43 | 44 | def fit_transform(self, X): 45 | self.fit(X) 46 | return self.transform(X) 47 | 48 | def check_feature_shape(self, x): 49 | """ 50 | Helper function to make sure any new data conforms to the fit data shape 51 | --- 52 | In: numpy array, (unknown shape) 53 | Out: numpy array, shape: (rows, self.data_cols)""" 54 | return x.reshape(-1,self.data_cols) 55 | 56 | def rbf_kernel(self, x1, x2, sig=1.): 57 | """ 58 | Returns the rbf affinity between two points (x1 and x2), 59 | for a given bandwidth (standard deviation). 60 | --- 61 | Inputs: 62 | x1; point 1(array) 63 | x2; point 2(array) 64 | sig; standard deviation (float) 65 | """ 66 | diff = np.sum((x1-x2)**2) 67 | norm = 1/(np.sqrt(2*np.pi*sig**2)) 68 | return norm*np.exp(-diff/(2*sig**2)) 69 | 70 | def pandas_to_numpy(self, x): 71 | """ 72 | Checks if the input is a Dataframe or series, converts to numpy matrix for 73 | calculation purposes. 74 | --- 75 | Input: X (array, dataframe, or series) 76 | 77 | Output: X (array) 78 | """ 79 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 80 | return x.as_matrix() 81 | if type(x) == type(np.array([1,2])): 82 | return x 83 | return np.array(x) -------------------------------------------------------------------------------- /zwml/clustering/spectral_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from zwml.clustering import kmeans 4 | 5 | class spectral_clustering: 6 | 7 | def __init__(self, k=3, connectivity=20, svd_dims=3, affinity='neighbors', bandwidth=1.): 8 | self.k = k 9 | self.connect = connectivity 10 | self.dims = svd_dims 11 | if affinity in ['neighbors', 'rbf']: 12 | self.affinity_type = affinity 13 | else: 14 | print("Not a valid affinity type, default to 'neighbors'.") 15 | self.affinity_type = 'neighbors' 16 | self.bandwidth = bandwidth 17 | 18 | def rbf_kernel(self, x1, x2, sig=1.): 19 | """ 20 | Returns the rbf affinity between two points (x1 and x2), 21 | for a given bandwidth (standard deviation). 22 | --- 23 | Inputs: 24 | x1; point 1(array) 25 | x2; point 2(array) 26 | sig; standard deviation (float) 27 | """ 28 | diff = np.sqrt(np.sum((x1-x2)**2)) 29 | norm = 1/(np.sqrt(2*np.pi*sig**2)) 30 | return norm*np.exp(-diff**2/(2*sig**2)) 31 | 32 | def compute_distance_between_all_points(self, pt1, pts, connectivity=None): 33 | """ 34 | Returns the distance between points. Currently only uses Euclidean distance. 35 | --- 36 | Input: data point, all data points (np arrays) 37 | Output: Distance (float) 38 | """ 39 | if self.affinity_type == 'neighbors': 40 | x = np.sqrt(np.sum((pt1 - pts)**2, axis=1)) 41 | idxs = x.argsort()[:connectivity] 42 | filt = np.ones(len(x), dtype=bool) 43 | filt[idxs] = False 44 | x[filt] = 0. 45 | x[~filt] = 1. 
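        # The 'neighbors' branch above yields a binary row marking the
        # `connectivity` nearest points; the 'rbf' branch below yields a smooth
        # Gaussian similarity to every point instead.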
46 | elif self.affinity_type == 'rbf': 47 | x = [] 48 | for p in pts: 49 | x.append(self.rbf_kernel(pt1, p, sig=self.bandwidth)) 50 | return x 51 | 52 | def fit(self, X): 53 | X = self.pandas_to_numpy(X) 54 | self.original_data = np.copy(X) 55 | self.similarity = np.array([self.compute_distance_between_all_points(p,X, connectivity=self.connect) for p in X]) 56 | self.similarity /= max(self.similarity.ravel()) 57 | self.U, self.Sigma, self.VT = self.do_svd(self.similarity) 58 | self.kmeans = kmeans(k=self.k) 59 | self.kmeans.fit(self.U) 60 | 61 | def fit_predict(self, X): 62 | self.fit(X) 63 | return self.predict(X) 64 | 65 | def transform_to_svd_space(self,X): 66 | sig_inv = np.linalg.inv(self.Sigma) 67 | return np.dot(np.dot(X,self.U),sig_inv) 68 | 69 | def predict(self, X): 70 | X = self.pandas_to_numpy(X) 71 | sim_space = [self.compute_distance_between_all_points(p,self.original_data, connectivity=self.connect) for p in X] 72 | transformed_X = np.array([self.transform_to_svd_space(x) for x in sim_space]) 73 | return self.kmeans.predict(transformed_X) 74 | 75 | def do_svd(self, similarity): 76 | dims = self.dims 77 | U, Sigma, VT = np.linalg.svd(similarity) 78 | VT = VT[:dims,:] 79 | U = U[:,:dims] 80 | Sigma = np.diag(Sigma[:dims]) 81 | return U, Sigma, VT 82 | 83 | def plot_similarity_matrix(self): 84 | plt.figure(dpi=200) 85 | plt.imshow(self.similarity, cmap=plt.cm.Blues) 86 | plt.xlabel("Point ID", fontsize=16) 87 | plt.ylabel("Point ID", fontsize=16) 88 | plt.title("Similarity Matrix (1 for neighbors, 0 for not)", fontsize=16); 89 | plt.colorbar(cmap=plt.cm.Blues); 90 | 91 | def pandas_to_numpy(self, x): 92 | """ 93 | Checks if the input is a Dataframe or series, converts to numpy matrix for 94 | calculation purposes. 95 | --- 96 | Input: X (array, dataframe, or series) 97 | 98 | Output: X (array) 99 | """ 100 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 101 | return x.as_matrix() 102 | if type(x) == type(np.array([1,2])): 103 | return x 104 | return np.array(x) -------------------------------------------------------------------------------- /zwml/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .make_regression import make_regression 2 | from .make_classification import make_classification 3 | from .datasets import load_iris 4 | 5 | __all__ = ['make_regression','make_classification','load_iris'] -------------------------------------------------------------------------------- /zwml/datasets/datasets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | 5 | def load_iris(as_dataframe=False): 6 | directory, _ = os.path.split(__file__) 7 | DATA_PATH = os.path.join(directory, "iris.data") 8 | data = pd.read_csv(DATA_PATH, header=None) 9 | data.columns = ['sepal_length','sepal_width','petal_length','petal_width','class'] 10 | if as_dataframe: 11 | return data 12 | X = data.iloc[:,:-1].as_matrix() 13 | y = data.iloc[:,-1] 14 | y = y.str.replace('Iris-setosa','0').replace('Iris-versicolor','1').replace('Iris-virginica','2') 15 | y = y.astype(int).as_matrix() 16 | return X,y -------------------------------------------------------------------------------- /zwml/datasets/iris.data: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 
5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 
7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica -------------------------------------------------------------------------------- /zwml/datasets/make_classification.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class make_classification: 4 | 5 | def __init__(self): 6 | self.model_params = None 7 | self.num_feats = None 8 | self.random_state = None 9 | self.is_clean = None 10 | self.noise = None 11 | 12 | def build_dataset(self, num_feat=10, num_rows_per_class=100, num_classes=2, random_state = None, num_important=10, 13 | misclassify=0.01, dirty_data=False): 14 | assert num_feat > 0 and num_rows_per_class > 0, "Must have rows and features > 0." 
15 | 16 | if random_state: 17 | np.random.seed(random_state) 18 | self.random_state = random_state 19 | 20 | if num_important > num_feat: 21 | num_important = num_feat 22 | 23 | self.num_important = num_important 24 | self.num_feats = num_feat 25 | 26 | means = np.random.uniform(-1,1,size=(num_classes, num_important)) 27 | sigmas = np.random.uniform(1e-6,0.5,size=(num_classes, num_important)) 28 | 29 | X = np.empty(num_important) 30 | y = np.zeros(num_rows_per_class) 31 | for i in range(0,num_classes): 32 | new_X = np.random.normal(means[i][0],sigmas[i][0],num_rows_per_class).reshape(-1,1) 33 | for j in range(1,num_important): 34 | col_X = np.random.normal(means[i][j],sigmas[i][j],num_rows_per_class).reshape(-1,1) 35 | new_X = np.hstack((new_X, col_X)) 36 | if not i: 37 | X = np.vstack((X, new_X))[1:] 38 | else: 39 | X = np.vstack((X, new_X)) 40 | y = np.hstack((y,[i]*num_rows_per_class)) 41 | 42 | # fill in the rest of the unimportant columns 43 | means = np.random.uniform(-1,1,num_feat-num_important) 44 | sigmas = np.random.uniform(1e-6,0.5, num_feat-num_important) 45 | for i in range(num_feat-num_important): 46 | X = np.hstack((X,np.random.normal(means[i],sigmas[i],X.shape[0]).reshape(-1,1))) 47 | 48 | #shuffle rows 49 | permute = np.random.permutation(len(X)) 50 | X = X[permute] 51 | y = y[permute] 52 | 53 | #shuffle columns 54 | np.random.shuffle(X.T) 55 | 56 | if dirty_data: 57 | X = self.muck_up_data(X) 58 | 59 | return X, y 60 | 61 | def muck_up_data(self, X, dup_cols=True, add_nan=True, combine_feats=True): 62 | if dup_cols: 63 | X = self._add_duplicate_columns(X, dup_cols) 64 | if combine_feats: 65 | X = self._combine_features(X, combine_feats) 66 | if add_nan: 67 | X = self._add_nans(X, add_nan) 68 | return X 69 | 70 | def _add_duplicate_columns(self,X, dup_cols): 71 | if isinstance(dup_cols, float): 72 | num_to_dupe = int(dup_cols*X.size) 73 | elif isinstance(dup_cols, bool): 74 | max_dupe = int((0.1*self.num_feats)+1.5) 75 | num_to_dupe = np.random.randint(1,max_dupe) 76 | elif isinstance(dup_cols, int): 77 | num_to_dupe = dup_cols 78 | else: 79 | raise TypeError('dup_cols must be type float, int, or bool.') 80 | 81 | cols_to_dup = np.random.choice(np.arange(self.num_feats), num_to_dupe, replace=False) 82 | new_X = np.hstack((X, X.T[cols_to_dup].T.reshape(-1,len(cols_to_dup)))) 83 | return new_X 84 | 85 | def _combine_features(self, X, combine_feats): 86 | if isinstance(combine_feats, float): 87 | num_to_dupe = int(combine_feats*X.size) 88 | elif isinstance(combine_feats, bool): 89 | max_dupe = int((0.1*self.num_feats)+1.5) 90 | num_to_dupe = np.random.randint(1,max_dupe) 91 | elif isinstance(combine_feats, int): 92 | num_to_dupe = combine_feats 93 | else: 94 | raise TypeError('combine_feats must be type float, int, or bool.') 95 | 96 | cols = np.random.choice(np.arange(self.num_feats), size=(num_to_dupe,2), replace=True) 97 | for col_set in cols: 98 | new_X = np.random.uniform(-1,1)*X.T[col_set[0]]+np.random.uniform(-1,1)*X.T[col_set[1]] 99 | X = np.hstack((X, new_X.T.reshape(-1,1))) 100 | return X 101 | 102 | def _add_nans(self, X, add_nan_val): 103 | if isinstance(add_nan_val, float): 104 | num_of_nans = int(add_nan_val*X.size) 105 | elif isinstance(add_nan_val, int): 106 | num_of_nans = add_nan_val 107 | else: 108 | max_nans = int(0.1*X.size) 109 | num_of_nans = np.random.randint(1,max_nans) 110 | 111 | for _ in range(num_of_nans): 112 | i = np.random.randint(0,X.shape[0]) 113 | j = np.random.randint(0,X.shape[1]) 114 | X[i,j] = np.nan 115 | return X 
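A minimal usage sketch for make_classification (illustrative only, not a file in the repository; the variable name and parameter values below are assumptions):

    from zwml.datasets import make_classification

    maker = make_classification()
    # 3 classes with 50 rows each; 3 informative features plus 2 noise features
    X, y = maker.build_dataset(num_feat=5, num_rows_per_class=50, num_classes=3,
                               num_important=3, random_state=42)
    print(X.shape, y.shape)  # (150, 5) (150,)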
-------------------------------------------------------------------------------- /zwml/datasets/make_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class make_regression: 4 | 5 | def __init__(self): 6 | self.model_params = None 7 | self.num_feats = None 8 | self.random_state = None 9 | self.is_clean = None 10 | self.noise = None 11 | self.bias = None 12 | 13 | def build_dataset(self, num_feat=10, num_rows=100, random_state = None, num_important=10, 14 | noise=0.1, bias=None, dirty_data=False): 15 | assert num_feat > 0 and num_rows > 0, "Must have rows and features > 0." 16 | if random_state: 17 | np.random.seed(random_state) 18 | self.random_state = random_state 19 | 20 | means = np.random.uniform(-1,1, size=num_feat) 21 | sigmas = np.random.uniform(1e-6,1, size=num_feat) 22 | X = np.zeros((num_rows, num_feat)) 23 | for i, mu in enumerate(means): 24 | X.T[i] = np.random.normal(mu, sigmas[i], num_rows) 25 | 26 | if bias == True: 27 | bias = np.random.uniform(-1,1) 28 | elif isinstance(bias, float): 29 | pass 30 | else: 31 | bias = 0. 32 | 33 | self.bias = bias 34 | 35 | if num_important > num_feat: 36 | num_important = num_feat 37 | 38 | self.num_important = num_important 39 | self.num_feats = num_feat 40 | 41 | target_builder = np.random.choice(np.arange(num_feat),num_important, replace=False) 42 | X_target = X.T[target_builder].T 43 | betas = np.random.uniform(-1,1,num_important) 44 | params = [] 45 | for i,j in zip(betas, target_builder): 46 | params.append((j,i)) 47 | self.model_params = params 48 | 49 | y = np.sum(X_target*betas, axis=1) + bias + np.random.normal(0, noise, num_rows) 50 | 51 | if dirty_data: 52 | X = self.muck_up_data(X) 53 | 54 | return X, y 55 | 56 | def muck_up_data(self, X, dup_cols=True, add_nan=True, combine_feats=True): 57 | if dup_cols: 58 | X = self._add_duplicate_columns(X, dup_cols) 59 | if combine_feats: 60 | X = self._combine_features(X, combine_feats) 61 | if add_nan: 62 | X = self._add_nans(X, add_nan) 63 | return X 64 | 65 | def _add_duplicate_columns(self,X, dup_cols): 66 | if isinstance(dup_cols, float): 67 | num_to_dupe = int(dup_cols*X.size) 68 | elif isinstance(dup_cols, bool): 69 | max_dupe = int((0.1*self.num_feats)+1.5) 70 | num_to_dupe = np.random.randint(1,max_dupe) 71 | elif isinstance(dup_cols, int): 72 | num_to_dupe = dup_cols 73 | else: 74 | raise TypeError('dup_cols must be type float, int, or bool.') 75 | 76 | cols_to_dup = np.random.choice(np.arange(self.num_feats), num_to_dupe, replace=False) 77 | new_X = np.hstack((X, X.T[cols_to_dup].T.reshape(-1,len(cols_to_dup)))) 78 | return new_X 79 | 80 | def _combine_features(self, X, combine_feats): 81 | if isinstance(combine_feats, float): 82 | num_to_dupe = int(combine_feats*X.size) 83 | elif isinstance(combine_feats, bool): 84 | max_dupe = int((0.1*self.num_feats)+1.5) 85 | num_to_dupe = np.random.randint(1,max_dupe) 86 | elif isinstance(combine_feats, int): 87 | num_to_dupe = combine_feats 88 | else: 89 | raise TypeError('combine_feats must be type float, int, or bool.') 90 | 91 | cols = np.random.choice(np.arange(self.num_feats), size=(num_to_dupe,2), replace=True) 92 | for col_set in cols: 93 | new_X = np.random.uniform(-1,1)*X.T[col_set[0]]+np.random.uniform(-1,1)*X.T[col_set[1]] 94 | X = np.hstack((X, new_X.T.reshape(-1,1))) 95 | return X 96 | 97 | def _add_nans(self, X, add_nan_val): 98 | if isinstance(add_nan_val, float): 99 | num_of_nans = int(add_nan_val*X.size) 100 | elif isinstance(add_nan_val, int): 
101 | num_of_nans = add_nan_val 102 | else: 103 | max_nans = int(0.1*X.size) 104 | num_of_nans = np.random.randint(1,max_nans) 105 | 106 | for _ in range(num_of_nans): 107 | i = np.random.randint(0,X.shape[0]) 108 | j = np.random.randint(0,X.shape[1]) 109 | X[i,j] = np.nan 110 | return X -------------------------------------------------------------------------------- /zwml/linear_models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .sgd_classifier import sgd_classifier 3 | from .sgd_regressor import sgd_regressor 4 | from .elastic_net_regressor import elastic_net_regressor 5 | from .lasso_regressor import lasso_regressor 6 | from .ridge_regressor import ridge_regressor 7 | from .linear_regression import linear_regression 8 | 9 | __all__ = ['linear_regression','ridge_regressor','lasso_regressor','elastic_net_regressor','sgd_regressor','sgd_classifier'] 10 | -------------------------------------------------------------------------------- /zwml/linear_models/elastic_net_regressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import sys 4 | from zwml.linear_models import sgd_regressor 5 | 6 | class elastic_net_regressor(sgd_regressor): 7 | 8 | def __init__(self, n_iter=100, alpha=0.01, verbose=False, return_steps=False, fit_intercept=True, 9 | dynamic=True, loss='ols', epsilon=0.1, lamb=1e-6, l1_perc = 0.5): 10 | """ 11 | Ridge Regressor - This is a wrapper on the SGD class where the regularization is set 12 | to the L2 Norm. All other functionality is the same as the SGD class. 13 | --- 14 | KWargs: 15 | 16 | n_iter: number of epochs to run in while fitting to the data. Total number of steps 17 | will be n_iter*X.shape[0]. 18 | 19 | alpha: The learning rate. Moderates the step size during the gradient descent algorithm. 20 | 21 | verbose: Whether to print out coefficient information during the epochs 22 | 23 | return_steps: If True, fit returns a list of the coefficients at each update step for diagnostics 24 | 25 | fit_intercept: If True, an extra coefficient is added with no associated feature to act as the 26 | base prediction if all X are 0. 27 | 28 | dynamic: If true, an annealing scedule is used to scale the learning rate. 29 | 30 | lamb: Stands for lambda. Sets the strength of the regularization. Large lambda causes large 31 | regression. If regularization is off, this does not apply to anything. 32 | 33 | l1_perc: If using elastic net, this variable sets what portion of the penalty is L1 vs L2. 34 | If regularize='EN' and l1_perc = 1, equivalent to regularize='L1'. If 35 | regularize='EN' and l1_perc = 0, equivalent to regulzarize='L2'. 
36 | """ 37 | self.coef_ = None 38 | self.trained = False 39 | self.n_iter = n_iter 40 | self.alpha_ = alpha 41 | self.verbosity = verbose 42 | self._return_steps = return_steps 43 | self._fit_intercept = fit_intercept 44 | self._next_alpha_shift = 0.1 # Only used if dynamic=True 45 | self._dynamic = dynamic 46 | self._regularize = 'EN' 47 | self._lamb = lamb 48 | self._l1_perc = l1_perc 49 | -------------------------------------------------------------------------------- /zwml/linear_models/lasso_regressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import sys 4 | from zwml.linear_models import sgd_regressor 5 | 6 | class lasso_regressor(sgd_regressor): 7 | 8 | def __init__(self, n_iter=100, alpha=0.01, verbose=False, return_steps=False, fit_intercept=True, 9 | dynamic=True, loss='ols', epsilon=0.1, lamb=1e-6, l1_perc = 0.5): 10 | """ 11 | Lasso Regressor - This is a wrapper on the SGD class where the regularization is set 12 | to the L1 Norm. All other functionality is the same as the SGD class. 13 | --- 14 | KWargs: 15 | 16 | n_iter: number of epochs to run in while fitting to the data. Total number of steps 17 | will be n_iter*X.shape[0]. 18 | 19 | alpha: The learning rate. Moderates the step size during the gradient descent algorithm. 20 | 21 | verbose: Whether to print out coefficient information during the epochs 22 | 23 | return_steps: If True, fit returns a list of the coefficients at each update step for diagnostics 24 | 25 | fit_intercept: If True, an extra coefficient is added with no associated feature to act as the 26 | base prediction if all X are 0. 27 | 28 | dynamic: If true, an annealing scedule is used to scale the learning rate. 29 | 30 | lamb: Stands for lambda. Sets the strength of the regularization. Large lambda causes large 31 | regression. If regularization is off, this does not apply to anything. 32 | 33 | l1_perc: If using elastic net, this variable sets what portion of the penalty is L1 vs L2. 34 | If regularize='EN' and l1_perc = 1, equivalent to regularize='L1'. If 35 | regularize='EN' and l1_perc = 0, equivalent to regulzarize='L2'. 36 | """ 37 | self.coef_ = None 38 | self.trained = False 39 | self.n_iter = n_iter 40 | self.alpha_ = alpha 41 | self.verbosity = verbose 42 | self._return_steps = return_steps 43 | self._fit_intercept = fit_intercept 44 | self._next_alpha_shift = 0.1 # Only used if dynamic=True 45 | self._dynamic = dynamic 46 | self._regularize = 'L1' 47 | self._lamb = lamb 48 | self._l1_perc = l1_perc 49 | -------------------------------------------------------------------------------- /zwml/linear_models/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class linear_regression: 4 | 5 | def __init__(self, w_intercept=True): 6 | """ 7 | Performs linear regression using the direct matrix solution 8 | from linear algebra. Minimizes the sum of squared errors of 9 | all included data points by drawing a best fit line and 10 | extracting the coefficients of that line. 11 | --- 12 | KWargs: 13 | w_intercept: flag to choose whether to include a y-intercept 14 | term in the calculation or not. 
15 | """ 16 | self.coef_ = None 17 | self.intercept = w_intercept 18 | self.is_fit = False 19 | 20 | def add_intercept(self, X): 21 | """ 22 | Adds an 'all 1's' bias term to function as the y-intercept 23 | """ 24 | rows = X.shape[0] 25 | inter = np.ones(rows).reshape(-1,1) 26 | return np.hstack((X,inter)) 27 | 28 | def fit(self, X, y): 29 | """ 30 | Read in X (all features) and y (target) and use the Linear Algebra solution 31 | to extract the coefficients for Linear Regression. 32 | """ 33 | X = self.convert_to_array(X) 34 | y = self.convert_to_array(y) 35 | 36 | if self.intercept: 37 | X = self.add_intercept(X) 38 | 39 | temp_xtx = np.linalg.inv(np.dot(X.T,X)) 40 | temp_xty = np.dot(X.T,y) 41 | self.coef_ = np.dot(temp_xtx,temp_xty) 42 | self.is_fit = True 43 | 44 | def predict(self, X): 45 | """ 46 | Takes in a new X value (that must be the same shape as the original X for fitting) 47 | and returns the predicted y value, using the coefficients from fitting. 48 | """ 49 | if not self.is_fit: 50 | raise ValueError("You have to run the 'fit' method before using predict!") 51 | 52 | X = self.convert_to_array(X) 53 | if self.intercept: 54 | X = self.add_intercept(X) 55 | return np.dot(X,self.coef_) 56 | 57 | def pandas_to_numpy(self, x): 58 | """ 59 | Checks if the input is a Dataframe or series, converts to numpy matrix for 60 | calculation purposes. 61 | --- 62 | Input: X (array, dataframe, or series) 63 | Output: X (array) 64 | """ 65 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 66 | return x.as_matrix() 67 | if type(x) == type(np.array([1,2])): 68 | return x 69 | return np.array(x) 70 | 71 | def handle_1d_data(self,x): 72 | """ 73 | Converts 1 dimensional data into a series of rows with 1 columns 74 | instead of 1 row with many columns. 75 | """ 76 | if x.ndim == 1: 77 | x = x.reshape(-1,1) 78 | return x 79 | 80 | def convert_to_array(self, x): 81 | """ 82 | Takes in an input and converts it to a numpy array 83 | and then checks if it needs to be reshaped for us 84 | to use it properly 85 | """ 86 | x = self.pandas_to_numpy(x) 87 | x = self.handle_1d_data(x) 88 | return x 89 | 90 | def score(self, X, y): 91 | """ 92 | Uses the predict method to measure the (negative) 93 | mean squared error of the model. 94 | --- 95 | In: X (list or array), feature matrix; y (list or array) labels 96 | Out: negative mean squared error (float) 97 | """ 98 | X = self.convert_to_array(X) 99 | y = self.convert_to_array(y) 100 | pred = self.predict(X) 101 | return -1.* np.mean((np.array(pred)-np.array(y))**2) -------------------------------------------------------------------------------- /zwml/linear_models/ridge_regressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import sys 4 | from zwml.linear_models import sgd_regressor 5 | 6 | class ridge_regressor(sgd_regressor): 7 | 8 | def __init__(self, n_iter=100, alpha=0.01, verbose=False, return_steps=False, fit_intercept=True, 9 | dynamic=True, loss='ols', epsilon=0.1, lamb=1e-6, l1_perc = 0.5): 10 | """ 11 | Ridge Regressor - This is a wrapper on the SGD class where the regularization is set 12 | to the L2 Norm. All other functionality is the same as the SGD class. 13 | --- 14 | KWargs: 15 | 16 | n_iter: number of epochs to run in while fitting to the data. Total number of steps 17 | will be n_iter*X.shape[0]. 18 | 19 | alpha: The learning rate. Moderates the step size during the gradient descent algorithm. 
20 | 21 | verbose: Whether to print out coefficient information during the epochs 22 | 23 | return_steps: If True, fit returns a list of the coefficients at each update step for diagnostics 24 | 25 | fit_intercept: If True, an extra coefficient is added with no associated feature to act as the 26 | base prediction if all X are 0. 27 | 28 | dynamic: If true, an annealing scedule is used to scale the learning rate. 29 | 30 | lamb: Stands for lambda. Sets the strength of the regularization. Large lambda causes large 31 | regression. If regularization is off, this does not apply to anything. 32 | 33 | l1_perc: If using elastic net, this variable sets what portion of the penalty is L1 vs L2. 34 | If regularize='EN' and l1_perc = 1, equivalent to regularize='L1'. If 35 | regularize='EN' and l1_perc = 0, equivalent to regulzarize='L2'. 36 | """ 37 | self.coef_ = None 38 | self.trained = False 39 | self.n_iter = n_iter 40 | self.alpha_ = alpha 41 | self.verbosity = verbose 42 | self._return_steps = return_steps 43 | self._fit_intercept = fit_intercept 44 | self._next_alpha_shift = 0.1 # Only used if dynamic=True 45 | self._dynamic = dynamic 46 | self._regularize = 'L2' 47 | self._lamb = lamb 48 | self._l1_perc = l1_perc 49 | -------------------------------------------------------------------------------- /zwml/linear_models/sgd_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class sgd_classifier: 5 | 6 | def __init__(self, n_iter=10, alpha=0.01, verbose=False, return_steps=False, fit_intercept=True, 7 | dynamic=False, loss='ols', epsilon=0.1, random_state=None): 8 | """ 9 | Stochastic Gradient Descent Algorithm, with Logistic Regression 10 | cost function. 11 | --- 12 | KWargs: 13 | 14 | n_iter: number of epochs to run in while fitting to the data. 15 | Total number of steps will be n_iter*X.shape[0]. 16 | alpha: The learning rate. Moderates the step size during the 17 | gradient descent algorithm. 18 | verbose: Whether to print out coefficient information during 19 | the epochs 20 | return_steps: If True, fit returns a list of the coefficients 21 | at each update step for diagnostics 22 | fit_intercept: If True, an extra coefficient is added with no 23 | associated feature to act as the base prediction if all X are 0. 24 | dynamic: If true, an annealing scedule is used to scale the learning rate. 25 | """ 26 | self.coef_ = None 27 | self.trained = False 28 | self.n_iter = n_iter 29 | self.alpha_ = alpha 30 | self.verbosity = verbose 31 | self._return_steps = return_steps 32 | self._fit_intercept = fit_intercept 33 | self._next_alpha_shift = 0.1 # Only used if dynamic=True 34 | self._dynamic = dynamic 35 | if random_state: 36 | np.random.seed(random_state) 37 | self._data_cols = None 38 | 39 | def update(self, x, error): 40 | """ 41 | Calculating the change of the coeficients for SGD. This is the derivative of the cost 42 | function. B_i = B_i - alpha * dJ/dB_i. If fit_intercept=True, a slightly different 43 | value is used to update the intercept coefficient, since the associated feature is "1." 44 | --- 45 | Inputs: 46 | 47 | data_point: A single row of the feature matrix. Since this is Stochastic, batches are not allowed. 48 | 49 | error: The residual for the current data point, given the current coefficients. Prediction - True 50 | for the current datapoint and coefficients. 
51 | """ 52 | step = self.alpha_*error*x 53 | if self._fit_intercept: 54 | self.coef_[1:] = self.coef_[1:] - step 55 | self.coef_[0] = self.coef_[0] - self.alpha_ * error 56 | else: 57 | self.coef_ = self.coef_ - step 58 | 59 | def shuffle_data(self, X, y): 60 | """ 61 | Given X and y, shuffle them together to get a new_X and new_y that maintain feature-target 62 | correlations. 63 | --- 64 | Inputs: 65 | 66 | X: A numpy array of any shape 67 | y: A numpy array of any shape 68 | 69 | Both X and y must have the same first dimension length. 70 | 71 | Returns: 72 | X,y: two numpy arrays 73 | """ 74 | assert len(X) == len(y) 75 | permute = np.random.permutation(len(y)) 76 | return X[permute], y[permute] 77 | 78 | def dynamic_learning_rate_check(self, epoch): 79 | """ 80 | If dynamic=True, shrink the learning rate by a factor of 2 after every 10% of 81 | the total number of epochs. This should cause a more direct path to the global 82 | minimum after the initial large steps. 83 | --- 84 | Inputs: epoch (int,float), the current iteration number. 85 | """ 86 | percent_of_epochs = float(epoch)/float(self.n_iter) 87 | if percent_of_epochs > self._next_alpha_shift: 88 | self._next_alpha_shift += 0.1 89 | self.alpha_ = self.alpha_/2 90 | 91 | def fit(self, X, y): 92 | """ 93 | Actually trains the model. Given feature-target combinations, gradient descent is performed 94 | using the optimization stepping given in the 'update' function. At present, all epochs are 95 | completed, as no tolerance is set. The learning rate is currently fixed. 96 | --- 97 | Inputs: 98 | X (array, dataframe, series), The features to regress on using SGD 99 | y (array, series), Must be a 1D set of targets. 100 | Outputs: 101 | steps (optional): If return_steps=True, a list of the evolution of the coefficients is returned 102 | """ 103 | X = self.convert_to_array(X) 104 | y = self.convert_to_array(y) 105 | self._stdy = np.std(y) 106 | self.coef_ = self.init_coef(X) 107 | if self._return_steps: 108 | steps = [] 109 | steps.append(np.copy(self.coef_)) 110 | for epoch in range(self.n_iter): 111 | shuf_X, shuf_y = self.shuffle_data(X,y) 112 | if self.verbosity: 113 | print("Epoch ", epoch, ", Coeff: ", self.coef_) 114 | for data, true in zip(shuf_X,shuf_y): 115 | pred = self.predict_proba(data, is_array=True) 116 | error = pred - true 117 | self.update(data, error) 118 | if self._return_steps: 119 | steps.append(np.copy(self.coef_)) 120 | if self._dynamic: 121 | self.dynamic_learning_rate_check(epoch) 122 | if self._return_steps: 123 | return steps 124 | 125 | def init_coef(self, X): 126 | """ 127 | Returns the initial starting values for the coefficients. At present, these are randomly 128 | set. If fit_intercept = True, an extra coefficient is generated. 129 | --- 130 | Input: X, Feature matrix. Needed to decide how many coefficients to generate. 131 | """ 132 | if self._fit_intercept: 133 | return np.random.rand(X.shape[1]+1) 134 | return np.random.rand(X.shape[1]) 135 | 136 | def predict_proba(self, X, is_array=False): 137 | """ 138 | Returns a prediction for a new data set, using the model coefficients. 139 | --- 140 | Input: 141 | X (dataframe, array): The new feature set. Must be the same number of columns 142 | as the initial training features. 143 | Output: 144 | prediction (array): The dot product of the input data and the coeficients. 
145 | """ 146 | if not is_array: 147 | X = self.convert_to_array(X) 148 | if not self.coef_.all(): 149 | raise ValueError("Coefficients not defined, must fit() before predict().") 150 | if self._fit_intercept: 151 | return self.logit(np.dot(X,self.coef_[1:]) + self.coef_[0]) 152 | 153 | return self.logit(np.dot(X,self.coef_)) 154 | 155 | def predict(self, X, threshold=0.5): 156 | """ 157 | Takes the output of predict_proba and applies a threshold 158 | to the probability value. If the value is greater than the 159 | threshold, labels the row as class 1. Else class 0. 160 | """ 161 | preds = self.predict_proba(X) 162 | preds[preds >= threshold] = 1 163 | preds[preds < threshold] = 0 164 | return preds.reshape(-1,1) 165 | 166 | def logit(self, beta_x): 167 | """ 168 | Applies the sigmoid or logit function to current 169 | linear prediction from beta * X. 170 | """ 171 | denom = 1. - np.exp(-beta_x) 172 | val = 1./denom 173 | 174 | if type(val) != 'numpy.ndarray': 175 | val = np.array([val]) 176 | 177 | # Handle rounding errors! 178 | val[val>1] = 1 179 | val[val<0] = 0 180 | return val 181 | 182 | def score(self, X, y): 183 | """ 184 | Uses the predict method to measure the accuracy of the model. 185 | --- 186 | In: X (list or array), feature matrix; y (list or array) labels 187 | Out: accuracy (float) 188 | """ 189 | pred = self.predict(X) 190 | correct = 0 191 | for i,j in zip(y,pred): 192 | if i == j: 193 | correct+=1 194 | return float(correct)/float(len(y)) 195 | 196 | def pandas_to_numpy(self, x): 197 | """ 198 | Checks if the input is a Dataframe or series, converts to numpy matrix for 199 | calculation purposes. 200 | --- 201 | Input: X (array, dataframe, or series) 202 | Output: X (array) 203 | """ 204 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 205 | return x.as_matrix() 206 | if type(x) == type(np.array([1,2])): 207 | return x 208 | return np.array(x) 209 | 210 | def handle_1d_data(self,x): 211 | """ 212 | Converts 1 dimensional data into a series of rows with 1 columns 213 | instead of 1 row with many columns. 
214 | """ 215 | if x.ndim == 1: 216 | x = x.reshape(-1,1) 217 | return x 218 | 219 | def convert_to_array(self, x): 220 | """ 221 | Takes in an input and converts it to a numpy array 222 | and then checks if it needs to be reshaped for us 223 | to use it properly 224 | """ 225 | x = self.pandas_to_numpy(x) 226 | x = self.handle_1d_data(x) 227 | return x -------------------------------------------------------------------------------- /zwml/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .regression_metrics import * 2 | from .classification_metrics import * 3 | from .pairwise_distance import * 4 | 5 | __all__ = ['get_error','mean_square_error','root_mean_square_error','mean_absolute_error','sum_square_error','r2_score','adj_r2','assess_model','test_regression_results', 'accuracy','precision','recall','f1_score','average_precision','average_recall','average_f1','confusion_matrix','pretty_confusion_matrix','classification_report', 6 | 'pandas_to_numpy','manhattan_distance','euclidean_distance','cosine_similarity_without_numpy','cosine_similarity','gaussian_kernel','uniform_kernel','rbf_kernel'] 7 | -------------------------------------------------------------------------------- /zwml/metrics/classification_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | def accuracy(true, pred): 6 | true = pandas_to_numpy(true) 7 | pred = pandas_to_numpy(pred) 8 | mask = (true == pred) 9 | numeric_mask = mask.astype(int) 10 | correct = np.sum(numeric_mask) 11 | return correct/true.shape[0] 12 | 13 | def get_precision_score(cls, true, pred): 14 | mask = (pred == cls) 15 | pred_of_cls = pred[mask] 16 | trues = true[mask] 17 | prec = accuracy(trues, pred_of_cls) 18 | return prec 19 | 20 | def precision(true, pred): 21 | true = pandas_to_numpy(true) 22 | pred = pandas_to_numpy(pred) 23 | unique_pred = np.unique(pred) 24 | precision_result = {} 25 | for cls in unique_pred: 26 | precision_result[cls] = get_precision_score(cls, true, pred) 27 | return precision_result 28 | 29 | def average_precision(true,pred): 30 | true = pandas_to_numpy(true) 31 | pred = pandas_to_numpy(pred) 32 | prec = precision(true,pred) 33 | 34 | clses = 0 35 | pr_score = 0. 36 | for pr in prec.values(): 37 | clses += 1 38 | pr_score += pr 39 | return pr_score/clses 40 | 41 | def get_recall_score(cls, true, pred): 42 | mask = (true == cls) 43 | pred_of_cls = pred[mask] 44 | trues = true[mask] 45 | reca = accuracy(trues, pred_of_cls) 46 | return reca 47 | 48 | def recall(true, pred): 49 | true = pandas_to_numpy(true) 50 | pred = pandas_to_numpy(pred) 51 | unique_true = np.unique(true) 52 | recall_result = {} 53 | for cls in unique_true: 54 | recall_result[cls] = get_recall_score(cls, true, pred) 55 | return recall_result 56 | 57 | def average_recall(true,pred): 58 | true = pandas_to_numpy(true) 59 | pred = pandas_to_numpy(pred) 60 | reca = recall(true,pred) 61 | 62 | clses = 0 63 | rc_score = 0. 64 | for rc in reca.values(): 65 | clses += 1 66 | rc_score += rc 67 | return rc_score/clses 68 | 69 | def f1_score(true, pred): 70 | rec = recall(true,pred) 71 | prec = precision(true,pred) 72 | f1 = {} 73 | for key in rec.keys(): 74 | f1[key] = (2*rec[key]*prec[key])/(rec[key]+prec[key]) 75 | return f1 76 | 77 | def average_f1(true,pred): 78 | f1 = f1_score(true,pred) 79 | 80 | clses = 0 81 | f1_scr = 0. 
82 | for f1 in f1.values(): 83 | clses += 1 84 | f1_scr += f1 85 | return f1_scr/clses 86 | 87 | def classification_report(true, pred): 88 | prec = precision(true,pred) 89 | reca = recall(true,pred) 90 | f1 = f1_score(true,pred) 91 | acc = accuracy(true,pred) 92 | clses = np.unique(true) 93 | fill_empty_slots(clses, [prec, reca, f1]) 94 | for cls in clses: 95 | print("--- Label %s ---"%str(cls)) 96 | print("Precision: %.3f"%prec[cls]) 97 | print("Recall: %.3f"%reca[cls]) 98 | print("F1: %.3f\n"%f1[cls]) 99 | print("--- Average ---") 100 | print("Precision: %.3f"%average_precision(true,pred)) 101 | print("Recall: %.3f"%average_recall(true,pred)) 102 | print("F1: %.3f"%average_f1(true,pred)) 103 | print("Accuracy: %.3f"%accuracy(true,pred)) 104 | 105 | def fill_empty_slots(clses, metrics): 106 | for metric in metrics: 107 | for cls in clses: 108 | if cls not in metric: 109 | metric[cls] = 0. 110 | 111 | def confusion_matrix(true,pred): 112 | true = pandas_to_numpy(true) 113 | pred = pandas_to_numpy(pred) 114 | unique_true = np.unique(true) 115 | 116 | cm = np.zeros((unique_true.shape[0], unique_true.shape[0])) 117 | 118 | for cls in unique_true: 119 | mask = (true == cls) 120 | pred_of_cls = pred[mask] 121 | counts = np.unique(pred_of_cls, return_counts=True) 122 | for pred_cls, count in zip(*counts): 123 | cm[cls][pred_cls] = count 124 | return cm 125 | 126 | def pretty_confusion_matrix(true,pred, show_text=True): 127 | cm = confusion_matrix(true,pred) 128 | plt.figure(dpi=250) 129 | plt.imshow(cm, cmap=plt.cm.RdBu) 130 | plt.grid(False) 131 | plt.colorbar() 132 | ax = plt.gca() 133 | if show_text: 134 | for (j,i),label in np.ndenumerate(cm): 135 | ax.text(i,j,label,ha='center',va='center', fontsize=20, color='w') 136 | plt.xticks(list(range(cm.shape[0]))) 137 | plt.yticks(list(range(cm.shape[1]))); 138 | plt.xlabel("True") 139 | plt.ylabel("Predicted"); 140 | plt.show(); 141 | return cm 142 | 143 | def pandas_to_numpy(x): 144 | """ 145 | Checks if the input is a Dataframe or series, converts to numpy matrix for 146 | calculation purposes. 147 | --- 148 | Input: X (array, dataframe, or series) 149 | Output: X (array) 150 | """ 151 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 152 | return x.as_matrix() 153 | if type(x) == type(np.array([1,2])): 154 | return x 155 | return np.array(x) -------------------------------------------------------------------------------- /zwml/metrics/pairwise_distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def pandas_to_numpy(x): 5 | """ 6 | Checks if the input is a Dataframe or series, converts to numpy matrix for 7 | calculation purposes. 8 | --- 9 | Input: X (array, dataframe, or series) 10 | Output: X (array) 11 | """ 12 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 13 | return x.as_matrix() 14 | if type(x) == type(np.array([1,2])): 15 | return x 16 | return np.array(x) 17 | 18 | def manhattan_distance(vec1, vec2): 19 | """ 20 | Manhattan distance measures the distance along 21 | each direction and sums them together. 22 | """ 23 | vec1 = pandas_to_numpy(vec1) 24 | vec2 = pandas_to_numpy(vec2) 25 | return np.sum(np.abs(vec1-vec2)) 26 | 27 | def euclidean_distance(vec1, vec2): 28 | """ 29 | Calculating the Euclidean distance which is 30 | the more traditional method for distance 31 | calculation. sqrt((x1-x2)^2 + (y1-y2)^2 + ...) 
32 | """ 33 | vec1 = pandas_to_numpy(vec1) 34 | vec2 = pandas_to_numpy(vec2) 35 | return np.sqrt(np.sum((vec1-vec2)**2)) 36 | 37 | def cosine_similarity_without_numpy(vec1, vec2): 38 | """ 39 | Calculates the angular similarity of two vectors. 40 | Does so by calculating cos(theta) between the vectors 41 | using the dot product. 42 | 43 | cos_sim = A dot B/(magnitude(A)*magnitude(B)) 44 | """ 45 | dot_product=0 46 | vec1_sum_sq = 0 47 | vec2_sum_sq = 0 48 | for idx, val in enumerate(vec1): 49 | dot_product += val*vec2[idx] 50 | vec1_sum_sq += val*val 51 | vec2_sum_sq += vec2[idx]*vec2[idx] 52 | return dot_product/(vec1_sum_sq**0.5*vec2_sum_sq**0.5) 53 | 54 | def cosine_similarity(vec1,vec2): 55 | """ 56 | Calculates the angular similarity of two vectors. 57 | Does so by calculating cos(theta) between the vectors 58 | using the dot product. 59 | 60 | cos_sim = A dot B/(magnitude(A)*magnitude(B)) 61 | """ 62 | vec1 = pandas_to_numpy(vec1) 63 | vec2 = pandas_to_numpy(vec2) 64 | dot_product = np.dot(vec1, vec2) 65 | vec1_norm = np.linalg.norm(vec1) 66 | vec2_norm = np.linalg.norm(vec2) 67 | return dot_product/(vec1_norm* vec2_norm) 68 | 69 | def gaussian_kernel(vec1, vec2, bandwidth=1.): 70 | """ 71 | Returns the Gaussian kernel relationship between two 72 | vectors. The Gaussian kernel assumes a bandwidth that 73 | defines the "width" of the Gaussian used to determine 74 | the relationship between the two points. 75 | """ 76 | dist = euclidean_distance(vec1, vec2) 77 | norm = 1/(np.sqrt(2*np.pi*bandwidth**2)) 78 | return norm*np.exp(-dist**2/(2*bandwidth**2)) 79 | 80 | def uniform_kernel(vec1, vec2, threshold_range=1, value=0.5): 81 | """ 82 | Returns a value if the two provided vectors are 83 | within threshold range of each other. In normal 84 | implementation, the integration of value over the 85 | whole range should be 1. 86 | """ 87 | distance = euclidean_distance(vec1, vec2) 88 | if distance <= threshold_range: 89 | probs = value 90 | else: 91 | probs = 0. 92 | return probs 93 | 94 | def rbf_kernel(vec1, vec2, gamma=None): 95 | """ 96 | The RBF, or radial basis function, kernel 97 | is similar to the gaussian kernel. However, 98 | it has a different scaling factor, using 99 | gamma instead of the bandwidth for normalization 100 | and width scaling. 
Gamma defaults to 1/dimensions 101 | unless otherwise specified.d 102 | """ 103 | if not gamma: 104 | gamma = 1/len(vec1) 105 | distance = euclidean_distance(vec1, vec2)**2 106 | distance *= -gamma 107 | return np.exp(distance) -------------------------------------------------------------------------------- /zwml/metrics/regression_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def get_error(true,pred): 5 | """ 6 | Returns predicted - true for each entry 7 | """ 8 | true = pandas_to_numpy(true) 9 | pred = pandas_to_numpy(pred) 10 | return pred-true 11 | 12 | def get_square_error(true,pred): 13 | """ 14 | Returns the square of predicted - true for each entry 15 | """ 16 | return np.square(get_error(true,pred)) 17 | 18 | def mean_square_error(true, pred): 19 | """ 20 | Returns the average predicted - true 21 | """ 22 | return np.mean(get_square_error(true,pred)) 23 | 24 | def root_mean_square_error(true,pred): 25 | """ 26 | Returns the sqrt of mean square error 27 | """ 28 | return np.sqrt(mean_square_error(true,pred)) 29 | 30 | def mean_absolute_error(true,pred): 31 | """ 32 | Returns the mean absolute value of error 33 | """ 34 | return np.mean(np.abs(get_error(true,pred))) 35 | 36 | def sum_square_error(true,pred): 37 | """ 38 | Returns the sum of squared errors 39 | """ 40 | true = pandas_to_numpy(true) 41 | pred = pandas_to_numpy(pred) 42 | return np.sum(get_square_error(true,pred)) 43 | 44 | def r2_score(true,pred): 45 | """ 46 | Returns R2 which is computed by 47 | SSE = sum of squared errors from the model 48 | SST = sume of squared errors to the mean of the data (y) 49 | R2 = 1 - SSE/SST 50 | """ 51 | true = pandas_to_numpy(true) 52 | pred = pandas_to_numpy(pred) 53 | SSE = np.sum(get_square_error(true,pred)) 54 | shpe = len(np.array(true)) 55 | SST = np.sum(get_square_error(true,np.mean(true)*shpe)) 56 | return 1.-(SSE/SST) 57 | 58 | def adj_r2(true, pred, X): 59 | """ 60 | Returns a version of R2 that penalizes for having many 61 | features. Fights against false correlations in data 62 | and is generally better than R2. 63 | """ 64 | X = pandas_to_numpy(X) 65 | rsquare = r2_score(true,pred) 66 | num_data = X.shape[0] 67 | num_features = X.shape[1] 68 | temp = (1-rsquare)*(num_data-1) 69 | temp = temp/(num_data-num_features-1) 70 | temp = 1 - temp 71 | return temp 72 | 73 | def assess_model(true, pred): 74 | """ 75 | Computes a suite of metrics all at once 76 | """ 77 | true = pandas_to_numpy(true) 78 | pred = pandas_to_numpy(pred) 79 | return sum_square_error(true,pred), mean_square_error(true,pred), root_mean_square_error(true,pred) 80 | 81 | def test_regression_results(X, true, pred): 82 | """ 83 | A print out of many of the metrics that show model performance 84 | """ 85 | true = pandas_to_numpy(true) 86 | pred = pandas_to_numpy(pred) 87 | print("Mean Square Error: ", mean_square_error(true,pred)) 88 | print("Root Mean Square Error: ", np.sqrt(mean_square_error(true,pred))) 89 | print("Mean Absolute Error: ",mean_absolute_error(true,pred)) 90 | r2 = r2_score(true,pred) 91 | print("R2: ", r2) 92 | print("Adj R2: ", adj_r2(true,pred,X)) 93 | 94 | def pandas_to_numpy(x): 95 | """ 96 | Checks if the input is a Dataframe or series, converts to numpy matrix for 97 | calculation purposes. 
98 | --- 99 | Input: X (array, dataframe, or series) 100 | Output: X (array) 101 | """ 102 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 103 | return x.as_matrix() 104 | if type(x) == type(np.array([1,2])): 105 | return x 106 | return np.array(x) -------------------------------------------------------------------------------- /zwml/naive_bayes/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .bernoulli_naive_bayes import bernoulli_naive_bayes 3 | from .gaussian_naive_bayes import gaussian_naive_bayes 4 | 5 | __all__ = ['bernoulli_naive_bayes','gaussian_naive_bayes'] 6 | 7 | -------------------------------------------------------------------------------- /zwml/naive_bayes/bernoulli_naive_bayes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from collections import defaultdict 4 | 5 | class bernoulli_naive_bayes: 6 | 7 | def __init__(self, smoothing = 1.): 8 | """ 9 | Bernoulli Naive Bayes builds it's understanding of the data by 10 | applying Bayes rule and calculating the conditional probability of 11 | being a class based on a probabilistic understanding of how the 12 | class has behaved before. We only care if a feature is zero or non-zero 13 | in this style of naive bayes and will calculate our conditional probabilities 14 | accordingly. 15 | --- 16 | Inputs: 17 | smoothing: the Laplace smoothing factor overcome the problem of multiplying 18 | a 0 probability, that causes the total probability to be 0. 19 | """ 20 | self._prob_by_class = defaultdict(float) 21 | self._cond_probs = defaultdict(lambda: defaultdict(float)) 22 | self._log_prob_by_class = defaultdict(float) 23 | self._log_cond_probs = defaultdict(lambda: defaultdict(float)) 24 | self._data_cols = None 25 | self._smoothing = smoothing 26 | 27 | def fit(self, X, y): 28 | """ 29 | For each class, we find out what percentage of the data is that class. 30 | We then filter the data so only the rows that are that class remain, 31 | and then go column by column - calculating what percentage of rows are 32 | non-zero, given the class. We store all of these values to be used later 33 | for predictions. We also store the log of these values for later prediction. 
34 | --- 35 | Input: X, data (array/DataFrame) 36 | y, targets (array/Series) 37 | """ 38 | X = self.convert_to_array(X) 39 | y = self.pandas_to_numpy(y) # keep as 1D 40 | self._data_cols = X.shape[1] 41 | 42 | self._classes = np.unique(y) 43 | 44 | for cl in self._classes: 45 | filtered_targets = y[y == cl] 46 | filtered_data = X[y == cl] 47 | self._prob_by_class[cl] = len(filtered_targets)/len(y) 48 | self._log_prob_by_class[cl] = np.log(self._prob_by_class[cl]) 49 | denom = len(filtered_targets) 50 | for col in range(self._data_cols): 51 | binarized_column = filtered_data.T[col] > 0 52 | num_ones = np.sum(binarized_column) 53 | #smoothing applied here so we never get a zero probability 54 | self._cond_probs[cl][col] = (num_ones+self._smoothing)/(denom+self._smoothing) 55 | self._log_cond_probs[cl][col] = np.log(self._cond_probs[cl][col]) 56 | 57 | def predict(self, X): 58 | """ 59 | Wrapper to return only the class of the prediction 60 | --- 61 | Input: X, data (array/dataframe) 62 | """ 63 | return self._predict(X, mode="predict") 64 | 65 | def predict_proba(self, X): 66 | """ 67 | Wrapper to return probability of each class of the prediction 68 | --- 69 | Input: X, data (array/dataframe) 70 | """ 71 | return self._predict(X, mode="predict_proba") 72 | 73 | def predict_log_proba(self, X): 74 | """ 75 | Wrapper to return log of the probability of each class of 76 | the prediction. 77 | --- 78 | Input: X, data (array/dataframe) 79 | """ 80 | return self._predict(X, mode="predict_log_proba") 81 | 82 | def _predict(self, X, mode="predict"): 83 | """ 84 | For each data point, we go through and calculate the probability 85 | of it being each class. We do so by using the probability of 86 | seeing each value per feature, then combining them together with 87 | the class probability. We work in the log space to fight against 88 | combining too many really small or large values and under/over 89 | flowing Python's memory capabilities for a float. Depending on the mode 90 | we return either the prediction, the probabilities for each class, 91 | or the log of the probabilities for each class. 92 | --- 93 | Inputs: X, data (array/DataFrame) 94 | mode: type of prediction to return, defaults to single prediction mode 95 | """ 96 | X = self.convert_to_array(X) 97 | X = (X > 0).astype(int) # convert to 1 or 0 98 | results = [] 99 | for row in X: 100 | beliefs = [] 101 | for cl in self._classes: 102 | prob_for_class = self._log_prob_by_class[cl] 103 | for col in range(self._data_cols): 104 | p = self._log_cond_probs[cl][col] 105 | # The row or (1-row) chooses either the 0 or 1 probability 106 | # based on whether our row is a 0 or 1. 107 | prob_for_class += p*row[col] + (1-p)*(1-row[col]) 108 | beliefs.append([cl, prob_for_class]) 109 | 110 | if mode == "predict_log_proba": 111 | _, log_probs = zip(*beliefs) 112 | results.append(log_probs) 113 | 114 | elif mode == "predict_proba": 115 | _, probs = zip(*beliefs) 116 | unlog_probs = np.exp(probs) 117 | normed_probs = unlog_probs/np.sum(unlog_probs) 118 | results.append(normed_probs) 119 | 120 | else: 121 | sort_beliefs = sorted(beliefs, key=lambda x: x[1], reverse=True) 122 | results.append(sort_beliefs[0][0]) 123 | 124 | return np.array(results).reshape(-1,1) 125 | 126 | def score(self, X, y): 127 | """ 128 | Uses the predict method to measure the accuracy of the model. 
129 | --- 130 | In: X (list or array), feature matrix; y (list or array) labels 131 | Out: accuracy (float) 132 | """ 133 | pred = self.predict(X) 134 | correct = 0 135 | for i,j in zip(y,pred): 136 | if i == j: 137 | correct+=1 138 | return float(correct)/float(len(y)) 139 | 140 | def pandas_to_numpy(self, x): 141 | """ 142 | Checks if the input is a Dataframe or series, converts to numpy matrix for 143 | calculation purposes. 144 | --- 145 | Input: X (array, dataframe, or series) 146 | Output: X (array) 147 | """ 148 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 149 | return x.as_matrix() 150 | if type(x) == type(np.array([1,2])): 151 | return x 152 | return np.array(x) 153 | 154 | def handle_1d_data(self,x): 155 | """ 156 | Converts 1 dimensional data into a series of rows with 1 columns 157 | instead of 1 row with many columns. 158 | """ 159 | if x.ndim == 1: 160 | x = x.reshape(-1,1) 161 | return x 162 | 163 | def convert_to_array(self, x): 164 | """ 165 | Takes in an input and converts it to a numpy array 166 | and then checks if it needs to be reshaped for us 167 | to use it properly 168 | """ 169 | x = self.pandas_to_numpy(x) 170 | x = self.handle_1d_data(x) 171 | return x -------------------------------------------------------------------------------- /zwml/naive_bayes/gaussian_naive_bayes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from collections import defaultdict 4 | 5 | class gaussian_naive_bayes: 6 | 7 | def __init__(self): 8 | """ 9 | Gaussian Naive Bayes builds it's understanding of the data by 10 | applying Bayes rule and calculating the conditional probability of 11 | being a class based on a probabilistic understanding of how the 12 | class has behaved before. We will assume each feature is normally 13 | distributed in its own space, then use a gaussian PDF to calculate 14 | the probability of a class based on behavior. 15 | """ 16 | self._prob_by_class = defaultdict(float) 17 | self._cond_means = defaultdict(lambda: defaultdict(float)) 18 | self._cond_std = defaultdict(lambda: defaultdict(float)) 19 | self._log_prob_by_class = defaultdict(float) 20 | self._data_cols = None 21 | 22 | def gaus(self, x, mu=0, sig=1): 23 | """ 24 | Returns the probability of x given the mean and standard 25 | deviation provided - assuming a Gaussian probability. 26 | --- 27 | Inputs: x (the value to find the probability for, float), 28 | mu (the mean value of the feature in the training data, float), 29 | sig (the standard deviation of the feature in the training data, float) 30 | Outputs: probability (float) 31 | """ 32 | norm = 1/(np.sqrt(2*np.pi*sig**2)) 33 | return norm*np.exp(-(x-mu)**2/(2*sig**2)) 34 | 35 | def fit(self, X, y): 36 | """ 37 | For each class, we find out what percentage of the data is that class. 38 | We then filter the data so only the rows that are that class remain, 39 | and then go column by column - calculating the mean and standard dev 40 | for the values of that column, given the class. We store all of these 41 | values to be used later for predictions. 
42 | --- 43 | Input: X, data (array/DataFrame) 44 | y, targets (array/Series) 45 | """ 46 | X = self.convert_to_array(X) 47 | y = self.pandas_to_numpy(y) 48 | self._data_cols = X.shape[1] 49 | 50 | self._classes = np.unique(y) 51 | 52 | for cl in self._classes: 53 | self._prob_by_class[cl] = len(y[y == cl])/len(y) 54 | self._log_prob_by_class[cl] = np.log(self._prob_by_class[cl]) 55 | filt = (y == cl) 56 | filtered_data = X[filt] 57 | for col in range(self._data_cols): 58 | self._cond_means[cl][col] = np.mean(filtered_data.T[col]) 59 | self._cond_std[cl][col] = np.std(filtered_data.T[col]) 60 | 61 | def predict(self, X): 62 | """ 63 | Wrapper to return only the class of the prediction 64 | --- 65 | Input: X, data (array/dataframe) 66 | """ 67 | return self._predict(X, mode="predict") 68 | 69 | def predict_proba(self, X): 70 | """ 71 | Wrapper to return probability of each class of the prediction 72 | --- 73 | Input: X, data (array/dataframe) 74 | """ 75 | return self._predict(X, mode="predict_proba") 76 | 77 | def predict_log_proba(self, X): 78 | """ 79 | Wrapper to return log of the probability of each class of 80 | the prediction. 81 | --- 82 | Input: X, data (array/dataframe) 83 | """ 84 | return self._predict(X, mode="predict_log_proba") 85 | 86 | def _predict(self, X, mode="predict"): 87 | """ 88 | For each data point, we go through and calculate the probability 89 | of it being each class. We do so by sampling the probability of 90 | seeing each value per feature, then combining them together with 91 | the class probability. We work in the log space to fight against 92 | combining too many really small or large values and under/over 93 | flowing Python's memory capabilities for a float. Depending on the mode 94 | we return either the prediction, the probabilities for each class, 95 | or the log of the probabilities for each class. 96 | --- 97 | Inputs: X, data (array/DataFrame) 98 | mode: type of prediction to return, defaults to single prediction mode 99 | """ 100 | X = self.convert_to_array(X) 101 | results = [] 102 | for row in X: 103 | beliefs = [] 104 | for cl in self._classes: 105 | prob_for_class = self._log_prob_by_class[cl] 106 | for col in range(self._data_cols): 107 | if self._cond_std[cl][col]: 108 | p = self.gaus(row[col],mu=self._cond_means[cl][col],sig=self._cond_std[cl][col]) 109 | logp = np.log(p) 110 | prob_for_class += logp 111 | beliefs.append([cl, prob_for_class]) 112 | 113 | if mode == "predict_log_proba": 114 | _, log_probs = zip(*beliefs) 115 | results.append(log_probs) 116 | 117 | elif mode == "predict_proba": 118 | _, probs = zip(*beliefs) 119 | unlog_probs = np.exp(probs) 120 | normed_probs = unlog_probs/np.sum(unlog_probs) 121 | results.append(normed_probs) 122 | 123 | else: 124 | sort_beliefs = sorted(beliefs, key=lambda x: x[1], reverse=True) 125 | results.append(sort_beliefs[0][0]) 126 | 127 | return results 128 | 129 | def score(self, X, y): 130 | """ 131 | Uses the predict method to measure the accuracy of the model. 132 | --- 133 | In: X (list or array), feature matrix; y (list or array) labels 134 | Out: accuracy (float) 135 | """ 136 | pred = self.predict(X) 137 | correct = 0 138 | for i,j in zip(y,pred): 139 | if i == j: 140 | correct+=1 141 | return float(correct)/float(len(y)) 142 | 143 | def pandas_to_numpy(self, x): 144 | """ 145 | Checks if the input is a Dataframe or series, converts to numpy matrix for 146 | calculation purposes. 
147 | --- 148 | Input: X (array, dataframe, or series) 149 | Output: X (array) 150 | """ 151 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 152 | return x.as_matrix() 153 | if type(x) == type(np.array([1,2])): 154 | return x 155 | return np.array(x) 156 | 157 | def handle_1d_data(self,x): 158 | """ 159 | Converts 1 dimensional data into a series of rows with 1 columns 160 | instead of 1 row with many columns. 161 | """ 162 | if x.ndim == 1: 163 | x = x.reshape(-1,1) 164 | return x 165 | 166 | def convert_to_array(self, x): 167 | """ 168 | Takes in an input and converts it to a numpy array 169 | and then checks if it needs to be reshaped for us 170 | to use it properly 171 | """ 172 | x = self.pandas_to_numpy(x) 173 | x = self.handle_1d_data(x) 174 | return x -------------------------------------------------------------------------------- /zwml/naive_bayes/multinomial_naive_bayes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from collections import defaultdict 4 | 5 | class multinomial_naive_bayes: 6 | 7 | def __init__(self, smoothing = 1.): 8 | """ 9 | Multinomial Naive Bayes builds it's understanding of the data by 10 | applying Bayes rule and calculating the conditional probability of 11 | being a class based on a probabilistic understanding of how the 12 | class has behaved before. We calculate conditional probabilities 13 | . 14 | --- 15 | Inputs: 16 | smoothing: the Laplace smoothing factor overcome the problem of multiplying 17 | a 0 probability, that causes the total probability to be 0. 18 | """ 19 | self._prob_by_class = defaultdict(float) 20 | self._cond_probs = defaultdict(lambda: defaultdict(float)) 21 | self._log_prob_by_class = defaultdict(float) 22 | self._log_cond_probs = defaultdict(lambda: defaultdict(float)) 23 | self._data_cols = None 24 | self._smoothing = smoothing 25 | 26 | def fit(self, X, y): 27 | """ 28 | For each class, we find out what percentage of the data is that class. 29 | We then filter the data so only the rows that are that class remain, 30 | and then go column by column - calculating what of total counts in the 31 | class come from that feature. We store all of these values to be used later 32 | for predictions. We also store the log of these values for later prediction. 
33 | --- 34 | Input: X, data (array/DataFrame) 35 | y, targets (array/Series) 36 | """ 37 | X = self.convert_to_array(X) 38 | y = self.pandas_to_numpy(y) 39 | self._data_cols = X.shape[1] 40 | 41 | self._classes = np.unique(y) 42 | 43 | for cl in self._classes: 44 | filtered_targets = y[y == cl] 45 | filtered_data = X[y == cl] 46 | self._prob_by_class[cl] = len(filtered_targets)/len(y) 47 | self._log_prob_by_class[cl] = np.log(self._prob_by_class[cl]) 48 | denom = np.sum(filtered_data) 49 | for col in range(self._data_cols): 50 | sum_of_column = np.sum(filtered_data.T[col]) 51 | #smoothing applied here so we never get a zero probability 52 | self._cond_probs[cl][col] = (sum_of_column+self._smoothing)/(denom+self._smoothing) 53 | self._log_cond_probs[cl][col] = np.log(self._cond_probs[cl][col]) 54 | 55 | def predict(self, X): 56 | """ 57 | Wrapper to return only the class of the prediction 58 | --- 59 | Input: X, data (array/dataframe) 60 | """ 61 | return self._predict(X, mode="predict") 62 | 63 | def predict_proba(self, X): 64 | """ 65 | Wrapper to return probability of each class of the prediction 66 | --- 67 | Input: X, data (array/dataframe) 68 | """ 69 | return self._predict(X, mode="predict_proba") 70 | 71 | def predict_log_proba(self, X): 72 | """ 73 | Wrapper to return log of the probability of each class of 74 | the prediction. 75 | --- 76 | Input: X, data (array/dataframe) 77 | """ 78 | return self._predict(X, mode="predict_log_proba") 79 | 80 | def _predict(self, X, mode="predict"): 81 | """ 82 | For each data point, we go through and calculate the probability 83 | of it being each class. We do so by using the probability of 84 | seeing each feature/class and multiplying that by the number 85 | of times we see that feature, then combining them together with 86 | the class probability. We work in the log space to fight against 87 | combining too many really small or large values and under/over 88 | flowing Python's memory capabilities for a float. Depending on the mode 89 | we return either the prediction, the probabilities for each class, 90 | or the log of the probabilities for each class. 91 | --- 92 | Inputs: X, data (array/DataFrame) 93 | mode: type of prediction to return, defaults to single prediction mode 94 | """ 95 | X = self.convert_to_array(X) 96 | results = [] 97 | for row in X: 98 | beliefs = [] 99 | for cl in self._classes: 100 | prob_for_class = self._log_prob_by_class[cl] 101 | for col in range(self._data_cols): 102 | val = row[col] 103 | p = self._log_cond_probs[cl][col] 104 | prob_for_class += val*p 105 | beliefs.append([cl, prob_for_class]) 106 | 107 | if mode == "predict_log_proba": 108 | _, log_probs = zip(*beliefs) 109 | results.append(log_probs) 110 | 111 | elif mode == "predict_proba": 112 | _, probs = zip(*beliefs) 113 | unlog_probs = np.exp(probs) 114 | normed_probs = unlog_probs/np.sum(unlog_probs) 115 | results.append(normed_probs) 116 | 117 | else: 118 | sort_beliefs = sorted(beliefs, key=lambda x: x[1], reverse=True) 119 | results.append(sort_beliefs[0][0]) 120 | 121 | return np.array(results).reshape(-1,1) 122 | 123 | def score(self, X, y): 124 | """ 125 | Uses the predict method to measure the accuracy of the model. 
126 | --- 127 | In: X (list or array), feature matrix; y (list or array) labels 128 | Out: accuracy (float) 129 | """ 130 | pred = self.predict(X) 131 | correct = 0 132 | for i,j in zip(y,pred): 133 | if i == j: 134 | correct+=1 135 | return float(correct)/float(len(y)) 136 | 137 | def pandas_to_numpy(self, x): 138 | """ 139 | Checks if the input is a Dataframe or series, converts to numpy matrix for 140 | calculation purposes. 141 | --- 142 | Input: X (array, dataframe, or series) 143 | Output: X (array) 144 | """ 145 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 146 | return x.as_matrix() 147 | if type(x) == type(np.array([1,2])): 148 | return x 149 | return np.array(x) 150 | 151 | def handle_1d_data(self,x): 152 | """ 153 | Converts 1 dimensional data into a series of rows with 1 columns 154 | instead of 1 row with many columns. 155 | """ 156 | if x.ndim == 1: 157 | x = x.reshape(-1,1) 158 | return x 159 | 160 | def convert_to_array(self, x): 161 | """ 162 | Takes in an input and converts it to a numpy array 163 | and then checks if it needs to be reshaped for us 164 | to use it properly 165 | """ 166 | x = self.pandas_to_numpy(x) 167 | x = self.handle_1d_data(x) 168 | return x -------------------------------------------------------------------------------- /zwml/neighbors/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .knn_classifier import knn_classifier 3 | from .knn_regressor import knn_regressor 4 | from .kde_approximator import kde_approximator 5 | 6 | __all__ = ['knn_classifier','knn_regressor','kde_approximator'] 7 | -------------------------------------------------------------------------------- /zwml/neighbors/k_neighbors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | 5 | class k_neighbors: 6 | 7 | def __init__(self, n_neighbors=5, return_dist=False): 8 | """ 9 | KNearestNeighbors finds the nearest points in the feature space. 10 | --- 11 | In: n_neighbors (int) - how many closest neighbors do we consider 12 | """ 13 | if n_neighbors > 0: 14 | self.k = int(n_neighbors) 15 | else: 16 | print("n_neighbors must be >0. Set to 5!") 17 | self.k = 5 18 | self.X = None 19 | self._return_dist = return_dist 20 | 21 | def fit(self, X): 22 | """ 23 | Makes a copy of the training data that can live within the class. 24 | Thus, the model can be serialized and used away from the original 25 | training data. 26 | --- 27 | In: X (features); np.array or pandas dataframe/series 28 | """ 29 | self.X = copy.copy(self.convert_to_array(X)) 30 | 31 | def find_neighbors(self, X): 32 | """ 33 | Iterates through all points to predict, calculating the distance 34 | to all of the training points. It then finds the closest points. 35 | ___ 36 | In: new data to predict (np.array, pandas series/dataframe) 37 | Out: predictions (np.array) 38 | """ 39 | X = self.convert_to_array(X) 40 | results = [] 41 | for x in X: 42 | local_results = [] 43 | for x2 in self.X: 44 | local_results.append([self.dist_between_points(x,x2),x2]) 45 | neighbors = sorted(local_results, key=lambda x: x[0])[:self.k] 46 | if self._return_dist: 47 | results.append(neighbors) 48 | else: 49 | for x in neighbors: 50 | results.append(x[1]) 51 | #results.append([x[1] for x in neighbors]) 52 | return np.array(results) 53 | 54 | def dist_between_points(self, a, b): 55 | """ 56 | Calculates the distance between two vectors. 
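# Illustrative usage sketch, not part of the original module: finding the stored points
# closest to a query with the k_neighbors class above. Caveat: pandas_to_numpy further
# down references pd, so the module also needs an `import pandas as pd` to run as written.
import numpy as np
from zwml.neighbors.k_neighbors import k_neighbors

pts = np.array([[0., 0.], [1., 0.], [5., 5.], [5., 6.]])
finder = k_neighbors(n_neighbors=2)
finder.fit(pts)
print(finder.find_neighbors([[4.5, 5.]]))   # the two training points nearest the query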
57 | --- 58 | Inputs: a,b (np.arrays) 59 | Outputs: distance (float)""" 60 | assert np.array(a).shape == np.array(b).shape, 'Vectors must be of same size' 61 | return np.sqrt(np.sum((a-b)**2)) 62 | 63 | def pandas_to_numpy(self, x): 64 | """ 65 | Checks if the input is a Dataframe or series, converts to numpy matrix for 66 | calculation purposes. 67 | --- 68 | Input: X (array, dataframe, or series) 69 | Output: X (array) 70 | """ 71 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 72 | return x.as_matrix() 73 | if type(x) == type(np.array([1,2])): 74 | return x 75 | return np.array(x) 76 | 77 | def handle_1d_data(self,x): 78 | """ 79 | Converts 1 dimensional data into a series of rows with 1 columns 80 | instead of 1 row with many columns. 81 | """ 82 | if x.ndim == 1: 83 | x = x.reshape(-1,1) 84 | return x 85 | 86 | def convert_to_array(self, x): 87 | """ 88 | Takes in an input and converts it to a numpy array 89 | and then checks if it needs to be reshaped for us 90 | to use it properly 91 | """ 92 | x = self.pandas_to_numpy(x) 93 | x = self.handle_1d_data(x) 94 | return x -------------------------------------------------------------------------------- /zwml/neighbors/kde_approximator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from mpl_toolkits.mplot3d import Axes3D 5 | from copy import copy 6 | plt.style.use('seaborn') 7 | 8 | class kde_approximator: 9 | 10 | def __init__(self, kernel='gaus', bandwidth=1., grid_fineness=10.): 11 | """ 12 | KDE allows us a method of drawing samples from an 13 | already known set of data, with the same distribution 14 | of data. This is done by assuming a probability dist 15 | for each point and using that as a probabilistic 16 | interpretation of the data. 17 | --- 18 | KWargs: 19 | kernel: type of probability dist to assume. Options 20 | "gaus", "uniform". (string) 21 | bandwidth: Used with gaussian, sets the width of the 22 | assumed gaussian distribution. (float) 23 | grid_fineness: Sets how many points to use in each 24 | dimension when building a probability surface for 25 | plotting purposes. 26 | 27 | """ 28 | self.kernel = kernel 29 | self.bandwidth = bandwidth 30 | if kernel == "gaus": 31 | self.estim = self.gaus 32 | elif kernel == "uniform": 33 | self.estim = self.uniform 34 | else: 35 | raise TypeError("Invalid Kernel Selection") 36 | self.grid_fineness = grid_fineness 37 | self.data_cols = None 38 | 39 | def gaus(self, x, mu): 40 | """ 41 | Returns the probability of x given the mean and standard 42 | deviation provided - assuming a Gaussian probability. 43 | --- 44 | Inputs: x (the value to find the probability for, float), 45 | mu (the mean value of the feature in the training data, float), 46 | sig (the standard deviation of the feature in the training data, float) 47 | Outputs: probability (float) 48 | """ 49 | sig = self.bandwidth 50 | diff = np.sqrt(np.sum((x-mu)**2)) 51 | norm = 1/(np.sqrt(2*np.pi*sig**2)) 52 | return norm*np.exp(-diff**2/(2*sig**2)) 53 | 54 | def uniform(self, x, pt): 55 | """ 56 | Returns probability of x, assuming uniform distribution 57 | in each direction around pt2 in range (-1, 1). 
58 | """ 59 | diff = np.sqrt(np.sum((x-pt)**2)) 60 | probs = np.zeros_like(diff) 61 | probs[diff <= 1] = 0.5 62 | return probs 63 | 64 | def get_grid(self, X): 65 | """ 66 | Given a dataset, figure out how many dimensions there are 67 | then create a series of sampling points based on the 68 | user's requested grid_fineness. Create this sampling region 69 | over the span of the data +/- 10% in each dimension. 70 | --- 71 | Input: X, Data matrix 72 | """ 73 | mins, maxes = [], [] 74 | 75 | for col in range(self.data_cols): 76 | data = X.T[col] 77 | mins.append(np.min(data)-abs(np.min(data)*0.10)) 78 | maxes.append(np.max(data)+abs(np.max(data)*0.10)) 79 | grid = np.stack(np.meshgrid(*[np.linspace(i,j,self.grid_fineness) for i,j in zip(mins, maxes)], indexing='ij'),self.data_cols) 80 | return grid 81 | 82 | def fit(self, X): 83 | """ 84 | Copies the data for later use. 85 | --- 86 | In: X (features), np.array or pandas dataframe/series 87 | """ 88 | X = self.convert_to_array(X) 89 | self.data_cols = X.shape[1] 90 | self.X = copy(X) 91 | 92 | def make_surface(self): 93 | """ 94 | Using a sampling grid, goes point by point along the 95 | grid to determine the probability of data existing 96 | at that point, given all of the known data. 97 | --- 98 | In: X (features), np.array or pandas dataframe/series 99 | """ 100 | X = self.X 101 | span = self.get_grid(X) 102 | 103 | probs = [] 104 | points = [] 105 | for dim in span: 106 | for p in dim: 107 | prob = 0. 108 | for d in X: 109 | prob += self.estim(p,d) 110 | if np.isnan(prob): 111 | prob = 0. 112 | points.append(p) 113 | probs.append(prob) 114 | self.region = points 115 | self.probs = probs 116 | 117 | def sample(self, num_samples=1, random_state=None): 118 | """ 119 | Given the data we trained on, sample new points 120 | based on the density of the data. Use the kernel 121 | to sample not just the available points, but the 122 | whole region of possiblities given the kernel. 123 | --- 124 | Inputs: 125 | num_samples: how many samples to draw (int) 126 | random_state: seed to make the random draws 127 | reproducible (int) 128 | """ 129 | if random_state: 130 | np.random.seed(random_state) 131 | 132 | samples = [] 133 | for i in range(num_samples): 134 | pt = self.X[np.random.randint(self.X.shape[0])] 135 | sample_pt = [] 136 | for dim in pt: 137 | if self.kernel == "gaus": 138 | sample_pt.append(np.random.normal(dim, self.bandwidth)) 139 | elif self.kernel == "uniform": 140 | sample_pt.append(np.random.uniform(dim-1,dim+1)) 141 | samples.append(sample_pt) 142 | return np.array(samples) 143 | 144 | def pandas_to_numpy(self, x): 145 | """ 146 | Checks if the input is a Dataframe or series, converts to numpy matrix for 147 | calculation purposes. 148 | --- 149 | Input: X (array, dataframe, or series) 150 | Output: X (array) 151 | """ 152 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 153 | return x.as_matrix() 154 | if type(x) == type(np.array([1,2])): 155 | return x 156 | return np.array(x) 157 | 158 | def handle_1d_data(self,x): 159 | """ 160 | Converts 1 dimensional data into a series of rows with 1 columns 161 | instead of 1 row with many columns. 
162 | """ 163 | if x.ndim == 1: 164 | x = x.reshape(-1,1) 165 | return x 166 | 167 | def convert_to_array(self, x): 168 | """ 169 | Takes in an input and converts it to a numpy array 170 | and then checks if it needs to be reshaped for us 171 | to use it properly 172 | """ 173 | x = self.pandas_to_numpy(x) 174 | x = self.handle_1d_data(x) 175 | return x 176 | 177 | def make_plot(self): 178 | """ 179 | Creates a plot of the surface created by make_surface 180 | using 2D or 1D, depending on request. 181 | """ 182 | if self.data_cols == 2: 183 | Xpl, Ypl = zip(*self.region) 184 | Zpl = kde2.probs/max(self.probs) 185 | fig = plt.figure(dpi=200, figsize=(18,14)) 186 | ax = fig.gca(projection='3d') 187 | ax.plot_trisurf(Xpl,Ypl,Zpl, cmap=plt.cm.rainbow, linewidth=1) 188 | 189 | Xsc, Ysc = zip(*X) 190 | ax.scatter(Xsc,Ysc,[max(Zpl)]*len(Xsc),c='k',s=20, label="Data", alpha=0.5); 191 | proxy = plt.Circle((0,0), fc="k") 192 | ax.legend([proxy],['Data (z = 1)'], fontsize=18, loc='upper right', frameon=True, facecolor='#FFFFFF', edgecolor='#333333'); 193 | ax.set_zlabel("Norm. Prob.",fontsize=16, labelpad=10) 194 | ax.set_xlabel("X",fontsize=16, labelpad=10) 195 | ax.set_ylabel("Y",fontsize=16, labelpad=10); 196 | 197 | elif self.data_cols == 1: 198 | plt.figure(figsize=(10,6)) 199 | plt.hist(X, label="Binned data", bins=18, alpha=0.8, zorder=1) 200 | plt.plot(self.region, self.probs, c='k', lw=3, label="KDE", zorder=2); 201 | plt.scatter(X, [5]*len(X), marker='o', c='r', s=30, alpha=0.3,label='Actual Data', zorder=3) 202 | plt.legend(fontsize=20, loc='upper left', frameon=True, facecolor='#FFFFFF', edgecolor='#333333'); 203 | ax = plt.gca() 204 | else: 205 | print("Can only draw if KDE is done on 2 or fewer columns.") 206 | return None 207 | return ax 208 | 209 | -------------------------------------------------------------------------------- /zwml/neighbors/knn_classifier.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import copy 4 | import collections 5 | 6 | class knn_classifier: 7 | 8 | def __init__(self, n_neighbors=5): 9 | """ 10 | KNearestNeighbors is a distance based classifier that returns 11 | predictions based on the nearest points in the feature space. 12 | --- 13 | In: n_neighbors (int) - how many closest neighbors do we consider 14 | """ 15 | if n_neighbors > 0: 16 | self.k = int(n_neighbors) 17 | else: 18 | print("n_neighbors must be >0. Set to 5!") 19 | self.k = 5 20 | self.X = None 21 | self.y = None 22 | 23 | def fit(self, X, y): 24 | """ 25 | Makes a copy of the training data that can live within the class. 26 | Thus, the model can be serialized and used away from the original 27 | training data. 28 | --- 29 | In: X (features), y (labels); both np.array or pandas dataframe/series 30 | """ 31 | self.X = copy.copy(self.convert_to_array(X)) 32 | self.y = copy.copy(self.pandas_to_numpy(y)) 33 | 34 | def predict(self, X): 35 | """ 36 | Iterates through all points to predict, calculating the distance 37 | to all of the training points. It then passes that to a sorting function 38 | which returns the most common vote of the n_neighbors (k) closest training 39 | points. 
40 | ___ 41 | In: new data to predict (np.array, pandas series/dataframe) 42 | Out: predictions (np.array) 43 | """ 44 | X = self.pandas_to_numpy(X) 45 | results = [] 46 | for x in X: 47 | local_results = [] 48 | for (x2,y) in zip(self.X,self.y): 49 | local_results.append([self.dist_between_points(x,x2),y]) 50 | results.append(self.get_final_predict(local_results)) 51 | return np.array(results).reshape(-1,1) 52 | 53 | def get_final_predict(self,results): 54 | """ 55 | Takes a list of [distance, label] pairs and sorts by distance, 56 | returning the mode vote for the n_neighbors (k) closest votes. 57 | --- 58 | In: [[distance, label]] list of lists 59 | Output: class label (int) 60 | """ 61 | results = sorted(results, key=lambda x: x[0]) 62 | dists, votes = zip(*results) 63 | return collections.Counter(votes[:self.k]).most_common(1)[0][0] 64 | 65 | def dist_between_points(self, a, b): 66 | """ 67 | Calculates the distance between two vectors. 68 | --- 69 | Inputs: a,b (np.arrays) 70 | Outputs: distance (float)""" 71 | assert np.array(a).shape == np.array(b).shape 72 | return np.sqrt(np.sum((a-b)**2)) 73 | 74 | def score(self, X, y): 75 | """ 76 | Uses the predict method to measure the accuracy of the model. 77 | --- 78 | In: X (list or array), feature matrix; y (list or array) labels 79 | Out: accuracy (float) 80 | """ 81 | pred = self.predict(X) 82 | correct = 0 83 | for i,j in zip(y,pred): 84 | if i == j: 85 | correct+=1 86 | return float(correct)/float(len(y)) 87 | 88 | def pandas_to_numpy(self, x): 89 | """ 90 | Checks if the input is a Dataframe or series, converts to numpy matrix for 91 | calculation purposes. 92 | --- 93 | Input: X (array, dataframe, or series) 94 | Output: X (array) 95 | """ 96 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 97 | return x.as_matrix() 98 | if type(x) == type(np.array([1,2])): 99 | return x 100 | return np.array(x) 101 | 102 | def handle_1d_data(self,x): 103 | """ 104 | Converts 1 dimensional data into a series of rows with 1 columns 105 | instead of 1 row with many columns. 106 | """ 107 | if x.ndim == 1: 108 | x = x.reshape(-1,1) 109 | return x 110 | 111 | def convert_to_array(self, x): 112 | """ 113 | Takes in an input and converts it to a numpy array 114 | and then checks if it needs to be reshaped for us 115 | to use it properly 116 | """ 117 | x = self.pandas_to_numpy(x) 118 | x = self.handle_1d_data(x) 119 | return x -------------------------------------------------------------------------------- /zwml/neighbors/knn_regressor.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import copy 4 | import collections 5 | 6 | class knn_regressor: 7 | 8 | def __init__(self, n_neighbors=5): 9 | """ 10 | KNearestNeighbors is a distance based regressors that returns 11 | predictions based on the nearest points in the feature space. 12 | --- 13 | In: n_neighbors (int) - how many closest neighbors do we consider 14 | """ 15 | if n_neighbors > 0: 16 | self.k = int(n_neighbors) 17 | else: 18 | print("n_neighbors must be >0. Set to 5!") 19 | self.k = 5 20 | self.X = None 21 | self.y = None 22 | 23 | def fit(self, X, y): 24 | """ 25 | Makes a copy of the training data that can live within the class. 26 | Thus, the model can be serialized and used away from the original 27 | training data. 
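# Illustrative usage sketch, not part of the original module: with n_neighbors=3 the
# predicted label is the majority vote of the three closest training points, per
# get_final_predict above.
import numpy as np
from zwml.neighbors import knn_classifier

X = np.array([[0., 0.], [0., 1.], [1., 0.], [5., 5.], [5., 6.], [6., 5.]])
y = np.array([0, 0, 0, 1, 1, 1])
knn = knn_classifier(n_neighbors=3)
knn.fit(X, y)
print(knn.predict([[0.5, 0.5], [5.5, 5.5]]))   # expected classes: 0, then 1
print(knn.score(X, y))                          # 1.0 on this separable training set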
28 | --- 29 | In: X (features), y (labels); both np.array or pandas dataframe/series 30 | """ 31 | self.X = copy.copy(self.convert_to_array(X)) 32 | self.y = copy.copy(self.convert_to_array(y)) 33 | 34 | def pandas_to_numpy(self, x): 35 | """ 36 | Checks if the input is a Dataframe or series, converts to numpy matrix for 37 | calculation purposes. 38 | --- 39 | Input: X (array, dataframe, or series) 40 | Output: X (array) 41 | """ 42 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 43 | return x.as_matrix() 44 | if type(x) == type(np.array([1,2])): 45 | return x 46 | return np.array(x) 47 | 48 | def handle_1d_data(self,x): 49 | """ 50 | Converts 1 dimensional data into a series of rows with 1 columns 51 | instead of 1 row with many columns. 52 | """ 53 | if x.ndim == 1: 54 | x = x.reshape(-1,1) 55 | return x 56 | 57 | def convert_to_array(self, x): 58 | """ 59 | Takes in an input and converts it to a numpy array 60 | and then checks if it needs to be reshaped for us 61 | to use it properly 62 | """ 63 | x = self.pandas_to_numpy(x) 64 | x = self.handle_1d_data(x) 65 | return x 66 | 67 | def predict(self, X): 68 | """ 69 | Iterates through all points to predict, calculating the distance 70 | to all of the training points. It then passes that to a sorting function 71 | which returns the most common vote of the n_neighbors (k) closest training 72 | points. 73 | ___ 74 | In: new data to predict (np.array, pandas series/dataframe) 75 | Out: predictions (np.array) 76 | """ 77 | X = self.convert_to_array(X) 78 | results = [] 79 | for x in X: 80 | local_results = [] 81 | for (x2,y) in zip(self.X,self.y): 82 | local_results.append([self.dist_between_points(x,x2),y]) 83 | results.append(self.get_final_predict(local_results)) 84 | return np.array(results).reshape(-1,1) 85 | 86 | def get_final_predict(self,results): 87 | """ 88 | Takes a list of [distance, label] pairs and sorts by distance, 89 | returning themean of the n_neighbors (k) closest points. 90 | --- 91 | In: [[distance, label]] list of lists 92 | Output: class label (int) 93 | """ 94 | results = sorted(results, key=lambda x: x[0]) 95 | dists, votes = zip(*results) 96 | return np.mean(votes[:self.k]) 97 | 98 | def dist_between_points(self, a, b): 99 | """ 100 | Calculates the distance between two vectors. 101 | --- 102 | Inputs: a,b (np.arrays) 103 | Outputs: distance (float)""" 104 | assert np.array(a).shape == np.array(b).shape 105 | return np.sqrt(np.sum((a-b)**2)) 106 | 107 | def score(self, X, y): 108 | """ 109 | Uses the predict method to measure the (negative) 110 | mean squared error of the model. 
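# Illustrative usage sketch, not part of the original module: the regressor predicts the
# mean target of the k closest training points, and score() is documented to return a
# negative mean squared error so that larger values are better. Caveat: the module needs
# an `import pandas as pd` added for its pandas_to_numpy helper to run as written.
import numpy as np
from zwml.neighbors import knn_regressor

X = np.array([[0.], [1.], [2.], [10.], [11.], [12.]])
y = np.array([0., 1., 2., 10., 11., 12.])
knr = knn_regressor(n_neighbors=3)
knr.fit(X, y)
print(knr.predict([[1.], [11.]]))   # roughly [[1.], [11.]]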
111 | --- 112 | In: X (list or array), feature matrix; y (list or array) labels 113 | Out: negative mean squared error (float) 114 | """ 115 | pred = self.predict(X) 116 | return -1.* np.mean((np.array(pred)-np.array(y))**2) -------------------------------------------------------------------------------- /zwml/nlp/__init__.py: -------------------------------------------------------------------------------- 1 | from .count_vectorizer import count_vectorizer 2 | from .tfidf_vectorizer import tfidf_vectorizer 3 | from .latent_semantic_indexing import latent_semantic_indexing 4 | 5 | __all__ = ['count_vectorizer','latent_semantic_indexing','tfidf_vectorizer'] 6 | -------------------------------------------------------------------------------- /zwml/nlp/count_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Counter 3 | from string import punctuation 4 | 5 | class count_vectorizer: 6 | 7 | def __init__(self, max_features=None, ngrams = (1,1), tokenizer=None, remove_stopwords=False): 8 | """ 9 | Count vectorizer reads the text provided, tokenizes it 10 | with the provided tokenizer (or the default), then generates 11 | ngrams keeping track of all ngrams as the vocabulary. 12 | Then it takes provided texts and converts them into vectors 13 | by counting the appearance of each ngram and tracking that 14 | for every document. 15 | --- 16 | KWargs: 17 | max_features: how many ngrams to allow in the vector, using the 18 | most common features first. If None, defaults to using all 19 | ngrams (int) 20 | ngrams: how many tokens to combine to form features. First element 21 | of tuple is starting point, second is ending point. 22 | tokenizer: what function to use to create tokens (must return 23 | list of tokens) 24 | remove_stopwords: whether to include very common english words that 25 | do not add much value due to their commonness. 26 | """ 27 | self.max_features = max_features 28 | self.vocabulary = {} 29 | self.ngrams = ngrams 30 | if tokenizer == None: 31 | self.tokenizer = self.tokenize 32 | else: 33 | self.tokenizer = tokenizer 34 | self.remove_stopwords = remove_stopwords 35 | self.stopwords = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 36 | 'there', 'about', 'once', 'during', 'out', 'very', 'having', 37 | 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 38 | 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 39 | 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 40 | 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 41 | 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 42 | 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 43 | 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 44 | 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 45 | 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 46 | 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 47 | 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 48 | 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 49 | 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 50 | 'was', 'here', 'than'} 51 | 52 | def token_generator(self, X): 53 | """ 54 | Generator that returns joined tokens as a single 55 | string to act as a feature. 
It generates the tokens 56 | by iterating through the allowed ngrams and combining 57 | the appropriate number of tokens into a string. 58 | """ 59 | for i in range(self.ngrams[0],self.ngrams[1]+1): 60 | for ix, _ in enumerate(X): 61 | if ix+i < len(X)+1: 62 | yield ' '.join(X[ix:ix+i]) 63 | 64 | def tokenize(self, X): 65 | """ 66 | Simple tokenizer that removes punctuation, 67 | lowercases the text, and breaks on spaces. 68 | Also removes stopwords and numeric values 69 | from being treated as words. 70 | """ 71 | for symbol in punctuation: 72 | X = X.replace(symbol,'') 73 | final_token_list = [] 74 | for token in X.lower().split(): 75 | if self.remove_stopwords: 76 | if not self.check_stopwords(token): 77 | try: 78 | int(token) 79 | float(token) 80 | except: 81 | final_token_list.append(token) 82 | else: 83 | final_token_list.append(token) 84 | return final_token_list 85 | 86 | def check_stopwords(self, token): 87 | """ 88 | Checks if the token is in our list of common 89 | stopwords, and returns a boolean. 90 | """ 91 | return token in self.stopwords 92 | 93 | def fit(self, X): 94 | """ 95 | Go through all provided training documents and 96 | create the list of vocabulary for known documents 97 | by looking at all ngrams and tracking how often 98 | those ngrams appear. If max_features is defined, 99 | only keep the most common tokens. Afterward, 100 | generate a token_to_id mapper and an id_to_token 101 | mapper. 102 | """ 103 | for document in X: 104 | tokens = self.tokenizer(document) 105 | for token in self.token_generator(tokens): 106 | if token in self.vocabulary.keys(): 107 | self.vocabulary[token] += 1 108 | else: 109 | self.vocabulary[token] = 1 110 | 111 | if self.max_features != None: 112 | temp_vocab = {} 113 | for key, value in Counter(self.vocabulary).most_common(self.max_features): 114 | temp_vocab[key] = value 115 | self.vocabulary = temp_vocab 116 | del temp_vocab 117 | 118 | self.token_to_id = {ky: ix for ix, ky in enumerate(sorted(self.vocabulary.keys()))} 119 | self.id_to_token = {ix: ky for ix, ky in enumerate(sorted(self.vocabulary.keys()))} 120 | 121 | 122 | def transform(self, X): 123 | """ 124 | Go through all provided documents and use the known 125 | vocabulary to track how often each ngram appears in 126 | the document. At the end, stack all of the generated 127 | document vectors together. Skip the initial vector that 128 | all 0's, which is just there to act as a template. 129 | """ 130 | vectorized_docs = np.zeros(len(self.vocabulary.keys())) 131 | for document in X: 132 | tokens = self.tokenizer(document) 133 | vectorized_doc = np.zeros(len(self.vocabulary.keys())) 134 | for token in self.token_generator(tokens): 135 | if token in self.vocabulary: 136 | word_id = self.token_to_id[token] 137 | vectorized_doc[word_id] += 1 138 | vectorized_docs = np.vstack((vectorized_docs,vectorized_doc)) 139 | return vectorized_docs[1:] 140 | 141 | def fit_transform(self, X): 142 | """ 143 | Fit on X and then transform X and return it as vectors. 
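# Illustrative usage sketch, not part of the original module: fit() builds the ngram
# vocabulary from raw strings and transform() returns one count vector per document,
# with columns ordered by token_to_id.
from zwml.nlp import count_vectorizer

docs = ["the cat sat on the mat", "the dog sat on the log"]
cv = count_vectorizer(ngrams=(1, 2))
vectors = cv.fit_transform(docs)
print(sorted(cv.token_to_id.keys()))   # every unigram and bigram seen during fit
print(vectors.shape)                   # (2, size of the ngram vocabulary)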
144 | """ 145 | self.fit(X) 146 | return self.transform(X) 147 | -------------------------------------------------------------------------------- /zwml/nlp/latent_semantic_indexing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class latent_semantic_indexing: 4 | 5 | def __init__(self, num_topics=5): 6 | """ 7 | Latent semantic indexing uses matrix decomposition 8 | techniques to reduce the large feature space associated 9 | with text analysis into a smaller "topic" space which 10 | by exploiting SVD's ability to find correlations in 11 | features and combine them into super-dimensions made 12 | of the correlated columns. In the text analysis, that 13 | means if the original features are word, LSI will 14 | find words that tend to be in the same document together 15 | and group them as unique topics. 16 | """ 17 | self.num_topics = num_topics 18 | 19 | def fit(self, X): 20 | """ 21 | Using SVD as the base of the algorithm (we use numpy since 22 | it's faster than our method), we do a dimensionality 23 | reduction. Remember that V is an expression of the new 24 | dimensions in terms of the old columns. If we do count 25 | vectorizer, this is an expression of topics in terms of 26 | ngrams. We'll use this to extract our topics. We can also 27 | cast new documents into topic space using the V matrix. 28 | """ 29 | X = self.convert_to_array(X) 30 | self.U, self.sigma, self.V = np.linalg.svd(X) 31 | self.V = self.V[:self.num_topics,:] 32 | self.sigma = self.sigma[:self.num_topics] 33 | self.U = self.U[:,:self.num_topics] 34 | 35 | def transform(self, X): 36 | """ 37 | Since V is a conversion of columns to the lower 38 | dimensional space, we can just use matrix 39 | multiplication to cast any new data into that 40 | space. 41 | --- 42 | Input: X, data matrix (dataframe, array, list of lists) 43 | """ 44 | X = self.convert_to_array(X) 45 | return np.dot(X, self.V.T) 46 | 47 | def fit_transform(self, X): 48 | """ 49 | Fit on X and then transform X and return it as vectors. 50 | """ 51 | self.fit(X) 52 | return self.transform(X) 53 | 54 | def print_topics(self, X, id_to_word=None, num_words_per_topics=10): 55 | """ 56 | For each topic created in the SVD decomposition, 57 | iterate through the strongest contributors (positive 58 | or negative), and print out those words. Requires a 59 | column number to word dictionary, otherwise just prints 60 | the column number for the strong correlations. 61 | """ 62 | for idx, row in enumerate(self.V): 63 | sorted_word_ids = np.argsort(row)[-num_words_per_topics:] 64 | print("--- Topic ", idx, " ---") 65 | words_to_print = "" 66 | for word_id in sorted_word_ids: 67 | if id_to_word != None: 68 | words_to_print += id_to_word[word_id] 69 | words_to_print += ', ' 70 | else: 71 | words_to_print += "Column " 72 | words_to_print += str(word_id) 73 | words_to_print += ', ' 74 | print(words_to_print[:-2]) 75 | 76 | def pandas_to_numpy(self, x): 77 | """ 78 | Checks if the input is a Dataframe or series, converts to numpy matrix for 79 | calculation purposes. 80 | --- 81 | Input: X (array, dataframe, or series) 82 | Output: X (array) 83 | """ 84 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 85 | return x.as_matrix() 86 | if type(x) == type(np.array([1,2])): 87 | return x 88 | return np.array(x) 89 | 90 | def handle_1d_data(self,x): 91 | """ 92 | Converts 1 dimensional data into a series of rows with 1 columns 93 | instead of 1 row with many columns. 
94 | """ 95 | if x.ndim == 1: 96 | x = x.reshape(-1,1) 97 | return x 98 | 99 | def convert_to_array(self, x): 100 | """ 101 | Takes in an input and converts it to a numpy array 102 | and then checks if it needs to be reshaped for us 103 | to use it properly 104 | """ 105 | x = self.pandas_to_numpy(x) 106 | x = self.handle_1d_data(x) 107 | return x 108 | -------------------------------------------------------------------------------- /zwml/nlp/tfidf_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import Counter 3 | from string import punctuation 4 | 5 | class tfidf_vectorizer: 6 | 7 | def __init__(self, max_features=None, ngrams = (1,1), tokenizer=None, remove_stopwords=False): 8 | """ 9 | Term frequency, inverse document frequency vectorizer 10 | reads the text provided, tokenizes it with the provided 11 | tokenizer (or the default), then generates ngrams keeping 12 | track of all ngrams as the vocabulary. Then it takes provided 13 | texts and converts them into vectors by counting the 14 | appearance of each ngram and tracking that for every document. 15 | The counts are then scaled by the max term frequency and the 16 | inverse document frequency (see converter method). This new 17 | result is better than counts at picking out how important 18 | words are based on both usage and uniqueness. 19 | --- 20 | KWargs: 21 | max_features: how many ngrams to allow in the vector, using the 22 | most common features first. If None, defaults to using all 23 | ngrams (int) 24 | ngrams: how many tokens to combine to form features. First element 25 | of tuple is starting point, second is ending point. 26 | tokenizer: what function to use to create tokens (must return 27 | list of tokens) 28 | remove_stopwords: whether to include very common english words that 29 | do not add much value due to their commonness. 30 | """ 31 | self.max_features = max_features 32 | self.vocabulary = {} 33 | self.ngrams = ngrams 34 | if tokenizer == None: 35 | self.tokenizer = self.tokenize 36 | else: 37 | self.tokenizer = tokenizer 38 | self.remove_stopwords = remove_stopwords 39 | self.stopwords = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 40 | 'there', 'about', 'once', 'during', 'out', 'very', 'having', 41 | 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 42 | 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 43 | 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 44 | 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 45 | 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 46 | 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 47 | 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 48 | 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 49 | 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 50 | 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 51 | 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 52 | 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 53 | 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 54 | 'was', 'here', 'than'} 55 | 56 | def token_generator(self, X): 57 | """ 58 | Generator that returns joined tokens as a single 59 | string to act as a feature. It generates the tokens 60 | by iterating through the allowed ngrams and combining 61 | the appropriate number of tokens into a string. 
62 | """ 63 | for i in range(self.ngrams[0],self.ngrams[1]+1): 64 | for ix, _ in enumerate(X): 65 | if ix+i < len(X)+1: 66 | yield ' '.join(X[ix:ix+i]) 67 | 68 | def tokenize(self, X): 69 | """ 70 | Simple tokenizer that removes punctuation, 71 | lowercases the text, and breaks on spaces. 72 | Also removes stopwords and numeric values 73 | from being treated as words. 74 | """ 75 | for symbol in punctuation: 76 | X = X.replace(symbol,'') 77 | final_token_list = [] 78 | for token in X.lower().split(): 79 | if self.remove_stopwords: 80 | if not self.check_stopwords(token): 81 | try: 82 | int(token) 83 | float(token) 84 | except: 85 | final_token_list.append(token) 86 | else: 87 | final_token_list.append(token) 88 | return final_token_list 89 | 90 | def check_stopwords(self, token): 91 | """ 92 | Checks if the token is in our list of common 93 | stopwords, and returns a boolean. 94 | """ 95 | return token in self.stopwords 96 | 97 | def fit(self, X): 98 | """ 99 | Go through all provided training documents and 100 | create the list of vocabulary for known documents 101 | by looking at all ngrams and tracking how often 102 | those ngrams appear. If max_features is defined, 103 | only keep the most common tokens. Afterward, 104 | generate a token_to_id mapper and an id_to_token 105 | mapper. 106 | """ 107 | for document in X: 108 | tokens = self.tokenizer(document) 109 | for token in self.token_generator(tokens): 110 | if token in self.vocabulary.keys(): 111 | self.vocabulary[token] += 1 112 | else: 113 | self.vocabulary[token] = 1 114 | 115 | if self.max_features != None: 116 | temp_vocab = {} 117 | for key, value in Counter(self.vocabulary).most_common(self.max_features): 118 | temp_vocab[key] = value 119 | self.vocabulary = temp_vocab 120 | del temp_vocab 121 | 122 | self.token_to_id = {ky: ix for ix, ky in enumerate(sorted(self.vocabulary.keys()))} 123 | self.id_to_token = {ix: ky for ix, ky in enumerate(sorted(self.vocabulary.keys()))} 124 | 125 | 126 | def transform(self, X): 127 | """ 128 | Go through all provided documents and use the known 129 | vocabulary to track how often each ngram appears in 130 | the document. At the end, stack all of the generated 131 | document vectors together. Convert them to tf-idf 132 | and skip the initial vector that's all 0's, which 133 | is just there to act as a template. 134 | """ 135 | vectorized_docs = np.zeros(len(self.vocabulary.keys())) 136 | for document in X: 137 | tokens = self.tokenizer(document) 138 | vectorized_doc = np.zeros(len(self.vocabulary.keys())) 139 | for token in self.token_generator(tokens): 140 | if token in self.vocabulary: 141 | word_id = self.token_to_id[token] 142 | vectorized_doc[word_id] += 1 143 | vectorized_docs = np.vstack((vectorized_docs,vectorized_doc)) 144 | return self.convert_counts_to_tf_idf(vectorized_docs)[1:] 145 | 146 | def convert_counts_to_tf_idf(self, docs): 147 | """ 148 | To convert from counts to TF-IDF, we first scale 149 | each value by the maximum in it's own column. This 150 | lowers dependence on document length. Then we calculate 151 | log(number of documents/(1+documents containing this ngram)). 152 | This is the inverse document frequency (the one is to make 153 | combat division by 0). Each value is scaled as: 154 | term_frequency*inverse_document_frequency. 
155 | """ 156 | number_of_columns = docs.shape[1] 157 | number_of_docs = docs.shape[0] 158 | frequency_scalers = np.ones(number_of_columns) 159 | idf_terms = np.ones(number_of_columns) 160 | for col in range(number_of_columns): 161 | column_vals = docs.T[col] 162 | frequency_scalers[col] = np.max(column_vals) 163 | number_of_docs_containing = np.sum((column_vals > 0).astype(int)) 164 | idf_terms[col] = np.log(number_of_docs/(1+number_of_docs_containing)) 165 | docs = docs/frequency_scalers 166 | docs = docs*idf_terms 167 | 168 | return docs 169 | 170 | def fit_transform(self, X): 171 | """ 172 | Fit on X and then transform X and return it as vectors. 173 | """ 174 | self.fit(X) 175 | return self.transform(X) 176 | -------------------------------------------------------------------------------- /zwml/random/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .middle_square import middle_square 3 | 4 | __all__ = ['middle_square'] 5 | -------------------------------------------------------------------------------- /zwml/random/middle_square.py: -------------------------------------------------------------------------------- 1 | class middle_square: 2 | 3 | def __init__(self): 4 | """ 5 | Generates random numbers using a middle square method. 6 | Squares the seed, pads the left side of the number with 7 | zeroes, then takes the middle values as the next random 8 | number in the sequence. Note: do not use in production, 9 | very easy to crack. 10 | """ 11 | pass 12 | 13 | def middle_square_list(self, seed, count, width=4, seeds=[]): 14 | """ 15 | Creates a list of length "count" of random numbers 16 | given a seed, by squaring the seed and taking the middle 17 | digits. If the seed becomes 0000, stops early. 18 | Works recursively by creating one value at a time and 19 | sending that value to the next call as the new seed. 20 | --- 21 | KWargs: 22 | seed: starting value for the RNG 23 | count: how many numbers to generate 24 | width: how many digits is the generated number 25 | seeds: stores the results so far, can be used to force 26 | a certain number to be in the result. 27 | """ 28 | if not seeds: 29 | assert len(str(seed)) == width, "Seed must have a length equal to request width!" 30 | x = str(seed**2) 31 | while len(x) 4 columns used) 19 | "sqrt" (square root of the number of cols in input data) 20 | "div3" (number of input cols divided by 3) 21 | mode: If mode='rfnode' the column randomization happens at each node. Otherwise 22 | Each tree gets one randomized set of columns for all nodes in that tree. 23 | seed: Random seed to allow for reproducibility. 24 | """ 25 | self.n_trees = n_trees 26 | self.max_depth = max_depth 27 | self.n_features = n_features 28 | self.tree_filter_pairs = [] 29 | self.mode = mode 30 | if seed: 31 | self._seed = seed 32 | np.random.seed(seed) 33 | 34 | def find_number_of_columns(self, X): 35 | """ 36 | Uses the user input for n_features to decide how many columns should 37 | be included in each model. Uses the shape of X to decide the final number 38 | if 'sqrt' is called. 
39 | --- 40 | Input: X (array, dataframe, or series) 41 | """ 42 | if isinstance(self.n_features, int): 43 | return self.n_features 44 | if self.n_features == 'sqrt': 45 | return int(np.sqrt(X.shape[1])+0.5) 46 | if self.n_features == 'div3': 47 | return int(X.shape[1]/3+0.5) 48 | else: 49 | raise ValueError("Invalid n_features selection") 50 | 51 | def get_bagged_data(self, X, y): 52 | """ 53 | Chooses random rows to populate a bootstrapped dataset, with replacement. 54 | Maintains the correlation between X and y 55 | --- 56 | Input: X, y (arrays) 57 | Outputs: randomized X,y (arrays) 58 | """ 59 | index = np.random.choice(np.arange(len(X)),len(X)) 60 | return X[index], y[index] 61 | 62 | def randomize_columns(self,X): 63 | """ 64 | Chooses a set of columns to keep from the input data. These are 65 | randomly drawn, according the number requested by the user. The data 66 | is filtered and only the allowed columns are returned, along with the 67 | filter. 68 | --- 69 | Input: X (array) 70 | Output: filtered_X (array), filter (array) 71 | """ 72 | num_col = self.find_number_of_columns(X) 73 | filt = np.random.choice(np.arange(0,X.shape[1]),num_col,replace=False) 74 | filtered_X = self.apply_filter(X, filt) 75 | return filtered_X, filt 76 | 77 | def apply_filter(self, X, filt): 78 | """ 79 | Given X and a filter, only the columns matching the index values 80 | in filter are returned. 81 | --- 82 | Input: X (array), filter (array of column IDs) 83 | Output: filtered_X (array) 84 | """ 85 | filtered_X = X.T[filt] 86 | return filtered_X.T 87 | 88 | def fit(self, X, y): 89 | """ 90 | Generates the bootstrapped data, decides which column to keep, 91 | and then uses the decision tree class to build a model on each 92 | bootstrapped and column-randomized dataset. Each tree is stored 93 | as part of the model for later use, along with the appropriate 94 | filter - which is needed to filter new data for use with the model. 95 | --- 96 | Input: X, y (arrays, dataframe, or series) 97 | """ 98 | X = self.convert_to_array(X) 99 | y = self.pandas_to_numpy(y) 100 | try: 101 | self.base_filt = [x for x in range(X.shape[1])] 102 | except IndexError: 103 | self.base_filt = [0] 104 | for _ in range(self.n_trees): 105 | filt = self.base_filt 106 | bagX, bagy = self.get_bagged_data(X,y) 107 | if self.mode == 'rftree': 108 | bagX, filt = self.randomize_columns(bagX) 109 | new_tree = decision_tree_classifier(self.max_depth, mode=self.mode, n_features=self.n_features) 110 | new_tree.fit(bagX, bagy) 111 | self.tree_filter_pairs.append((new_tree, filt)) 112 | 113 | def predict(self, X): 114 | """ 115 | Uses the list of tree models built in the fit, doing a predict with each 116 | model. The associated filter is applied to X, so the model sees the columns 117 | it has learned about. The final prediction uses the mode of all the trees 118 | predictions. 119 | --- 120 | Input: X (array, dataframe, or series) 121 | Output: Class ID (int) 122 | """ 123 | X = self.convert_to_array(X) 124 | self.predicts = [] 125 | for tree, filt in self.tree_filter_pairs: 126 | filtered_X = self.apply_filter(X, filt) 127 | self.predicts.append(tree.predict(filtered_X)) 128 | self.pred_by_row = np.array(self.predicts).T 129 | 130 | ensemble_predict = [] 131 | for row in self.pred_by_row: 132 | ensemble_predict.append(collections.Counter(row).most_common(1)[0][0]) 133 | return ensemble_predict 134 | 135 | def score(self, X, y): 136 | """ 137 | Uses the predict method to measure the accuracy of the model. 
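# Illustrative usage sketch, not part of the original module: each tree is fit on a
# bootstrapped sample and, with the default mode='rfnode', column randomization is
# delegated to the decision tree at every split; predict() returns the majority vote
# across the trees.
import numpy as np
from zwml.tree_models.random_forest_classifier import random_forest_classifier

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (20, 3)), rng.normal(4, 1, (20, 3))])
y = np.array([0] * 20 + [1] * 20)
rf = random_forest_classifier(n_trees=5, max_depth=3, seed=42)
rf.fit(X, y)
print(rf.predict(X[:3]))   # ensemble votes for the first three rows
print(rf.score(X, y))      # accuracy on the training rows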
138 | --- 139 | In: X (list or array), feature matrix; y (list or array) labels 140 | Out: accuracy (float) 141 | """ 142 | pred = self.predict(X) 143 | correct = 0 144 | for i,j in zip(y,pred): 145 | if i == j: 146 | correct+=1 147 | return float(correct)/float(len(y)) 148 | 149 | def pandas_to_numpy(self, x): 150 | """ 151 | Checks if the input is a Dataframe or series, converts to numpy matrix for 152 | calculation purposes. 153 | --- 154 | Input: X (array, dataframe, or series) 155 | Output: X (array) 156 | """ 157 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 158 | return x.as_matrix() 159 | if type(x) == type(np.array([1,2])): 160 | return x 161 | return np.array(x) 162 | 163 | def handle_1d_data(self,x): 164 | """ 165 | Converts 1 dimensional data into a series of rows with 1 columns 166 | instead of 1 row with many columns. 167 | """ 168 | if x.ndim == 1: 169 | x = x.reshape(-1,1) 170 | return x 171 | 172 | def convert_to_array(self, x): 173 | """ 174 | Takes in an input and converts it to a numpy array 175 | and then checks if it needs to be reshaped for us 176 | to use it properly 177 | """ 178 | x = self.pandas_to_numpy(x) 179 | x = self.handle_1d_data(x) 180 | return x -------------------------------------------------------------------------------- /zwml/tree_models/random_forest_regressor.py: -------------------------------------------------------------------------------- 1 | from zwml.tree_models import decision_tree_regressor 2 | import collections 3 | import pandas as pd 4 | import numpy as np 5 | 6 | class random_forest_regressor: 7 | 8 | def __init__(self, n_trees = 10, max_depth=None, n_features='sqrt', mode='rfnode', seed=None, criteria='std'): 9 | """ 10 | Random Forest Regressor uses bootstrapping and column randomization 11 | to generate n_trees different datasets and then applies a decision 12 | tree to each dataset. The final prediction is an ensemble of all created trees. 13 | --- 14 | Params: 15 | n_trees (int): number of bootstrapped trees to grow for ensembling 16 | max_depth (int): maximum number of splits to make in the tree 17 | mode: If mode='rfnode' the column randomization happens at each node. Otherwise 18 | the tree will assume all input columns are valid choices and randomize at 19 | a "per tree" level. 20 | n_features: The number of columns to include in the models. Only applies if 21 | mode='rfnode.' Otherwise n_features = number of columns in data. 22 | Options: numeric value (e.g. 4 => 4 columns used) 23 | "sqrt" (square root of the number of cols in input data) 24 | "div3" (number of input cols divided by 3) 25 | criteria: Options are "std" (standard deviation) and "mae" (absolute error from mean). 26 | This choice decides how the tree will be optimized. Default: "std" 27 | seed: Random seed to allow for reproducibility. 28 | """ 29 | self.n_trees = n_trees 30 | self.max_depth = max_depth 31 | self.n_features = n_features 32 | self.tree_filter_pairs = [] 33 | self.mode = mode 34 | self.criteria = criteria 35 | if seed: 36 | self._seed = seed 37 | np.random.seed(seed) 38 | 39 | def find_number_of_columns(self, X): 40 | """ 41 | Uses the user input for n_features to decide how many columns should 42 | be included in each model. Uses the shape of X to decide the final number 43 | if 'sqrt' is called. 
44 | --- 45 | Input: X (array, dataframe, or series) 46 | """ 47 | if isinstance(self.n_features, int): 48 | return self.n_features 49 | if self.n_features == 'sqrt': 50 | return int(np.sqrt(X.shape[1])+0.5) 51 | if self.n_features == 'div3': 52 | return int(X.shape[1]/3+0.5) 53 | else: 54 | raise ValueError("Invalid n_features selection") 55 | 56 | def get_bagged_data(self, X, y): 57 | """ 58 | Chooses random rows to populate a bootstrapped dataset, with replacement. 59 | Maintains the correlation between X and y 60 | --- 61 | Input: X, y (arrays) 62 | Outputs: randomized X,y (arrays) 63 | """ 64 | index = np.random.choice(np.arange(len(X)),len(X)) 65 | return X[index], y[index] 66 | 67 | def randomize_columns(self,X): 68 | """ 69 | Chooses a set of columns to keep from the input data. These are 70 | randomly drawn, according the number requested by the user. The data 71 | is filtered and only the allowed columns are returned, along with the 72 | filter. 73 | --- 74 | Input: X (array) 75 | Output: filtered_X (array), filter (array) 76 | """ 77 | num_col = self.find_number_of_columns(X) 78 | filt = np.random.choice(np.arange(0,X.shape[1]),num_col,replace=False) 79 | filtered_X = self.apply_filter(X, filt) 80 | return filtered_X, filt 81 | 82 | def apply_filter(self, X, filt): 83 | """ 84 | Given X and a filter, only the columns matching the index values 85 | in filter are returned. 86 | --- 87 | Input: X (array), filter (array of column IDs) 88 | Output: filtered_X (array) 89 | """ 90 | filtered_X = X.T[filt] 91 | return filtered_X.T 92 | 93 | def pandas_to_numpy(self, x): 94 | """ 95 | Checks if the input is a Dataframe or series, converts to numpy matrix for 96 | calculation purposes. 97 | --- 98 | Input: X (array, dataframe, or series) 99 | Output: X (array) 100 | """ 101 | if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()): 102 | return x.as_matrix() 103 | if type(x) == type(np.array([1,2])): 104 | return x 105 | return np.array(x) 106 | 107 | def handle_1d_data(self,x): 108 | """ 109 | Converts 1 dimensional data into a series of rows with 1 columns 110 | instead of 1 row with many columns. 111 | """ 112 | if x.ndim == 1: 113 | x = x.reshape(-1,1) 114 | return x 115 | 116 | def convert_to_array(self, x): 117 | """ 118 | Takes in an input and converts it to a numpy array 119 | and then checks if it needs to be reshaped for us 120 | to use it properly 121 | """ 122 | x = self.pandas_to_numpy(x) 123 | x = self.handle_1d_data(x) 124 | return x 125 | 126 | def fit(self, X, y): 127 | """ 128 | Generates the bootstrapped data, decides which column to keep, 129 | and then uses the decision tree class to build a model on each 130 | bootstrapped and column-randomized dataset. Each tree is stored 131 | as part of the model for later use, along with the appropriate 132 | filter - which is needed to filter new data for use with the model. 
133 | --- 134 | Input: X, y (arrays, dataframe, or series) 135 | """ 136 | X = self.convert_to_array(X) 137 | y = self.convert_to_array(y) 138 | try: 139 | self.base_filt = [x for x in range(X.shape[1])] 140 | except IndexError: 141 | self.base_filt = [0] 142 | for _ in range(self.n_trees): 143 | filt = self.base_filt 144 | bagX, bagy = self.get_bagged_data(X,y) 145 | if self.mode == 'rftree': 146 | bagX, filt = self.randomize_columns(bagX) 147 | new_tree = decision_tree_regressor(self.max_depth, mode=self.mode, 148 | n_features=self.n_features, criteria=self.criteria) 149 | new_tree.fit(bagX, bagy) 150 | self.tree_filter_pairs.append((new_tree, filt)) 151 | 152 | def predict(self, X): 153 | """ 154 | Uses the list of tree models built in the fit, doing a predict with each 155 | model. The associated filter is applied to X, so the model sees the columns 156 | it has learned about. The final prediction uses the mode of all the trees 157 | predictions. 158 | --- 159 | Input: X (array, dataframe, or series) 160 | Output: Class ID (int) 161 | """ 162 | X = self.convert_to_array(X) 163 | self.predicts = [] 164 | for tree, filt in self.tree_filter_pairs: 165 | filtered_X = self.apply_filter(X, filt) 166 | self.predicts.append(tree.predict(filtered_X)) 167 | self.pred_by_row = np.array(self.predicts).T 168 | 169 | ensemble_predict = [] 170 | for row in self.pred_by_row: 171 | ensemble_predict.append(np.mean(row)) 172 | return ensemble_predict 173 | 174 | def score(self, X, y): 175 | """ 176 | Uses the predict method to measure the (negative) 177 | mean squared error of the model. 178 | --- 179 | In: X (list or array), feature matrix; y (list or array) labels 180 | Out: negative mean squared error (float) 181 | """ 182 | pred = self.predict(X) 183 | return -1.* np.mean((np.array(pred)-np.array(y))**2) -------------------------------------------------------------------------------- /zwml/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .data_splitting import * 3 | from .grid_search import * 4 | from .randomized_search import * 5 | from .markov_chain import markov_chain 6 | from .standard_scaler import standard_scaler 7 | from .normalizer import normalizer 8 | 9 | __all__ = ['train_test_split','cross_val','grid_search','grid_search_cv','randomized_search','randomized_search_cv','markov_chain', 'standard_scaler','normalizer'] 10 | -------------------------------------------------------------------------------- /zwml/utilities/grid_search.py: -------------------------------------------------------------------------------- 1 | from itertools import product 2 | 3 | class grid_search(): 4 | 5 | def __init__(self, model_name, param_grid): 6 | """ 7 | Given a base model and a parameter grid of params 8 | for that model, iterates through all the combinations 9 | of parameters, builds a model with each combo, 10 | and returns the score of the model. 11 | --- 12 | Inputs: 13 | model_name : the name of the model with parenthesis 14 | and as a string. Any parameters you wish to set for all 15 | models can be set in the parameter name. 16 | param_grid: dictionary with parameter names as keys, 17 | and list of param values to test as value for each key 18 | """ 19 | self._base_model = str(model_name).replace(')','') 20 | self._param_grid = param_grid 21 | self.models = self.get_models() 22 | 23 | def get_models(self): 24 | """ 25 | Finds every combination of parameters from the param grid. 
26 | Uses the string basename for to create a list of model 27 | names with the proper parameters. This command_list is 28 | still in string form until we're ready to test the models. 29 | """ 30 | params = [] 31 | order = [] 32 | for key, value in self._param_grid.items(): 33 | order.append(key) 34 | params.append(value) 35 | options = list(product(*params)) 36 | 37 | command_list = [] 38 | for option in options: 39 | cmd = self._base_model 40 | if cmd[-1] != '(': 41 | cmd+=', ' 42 | for i,j in zip(order, option): 43 | if type(j) == type('string'): 44 | cmd += str(i)+"='"+str(j)+"', " 45 | else: 46 | cmd += str(i)+"="+str(j)+", " 47 | command_list.append(cmd[:-2]+')') 48 | return command_list 49 | 50 | def fit(self, X, y): 51 | """ 52 | Uses the "eval" function in Python to convert the model 53 | name from string to an actual model. Fits each model 54 | and scores it. Creates a lists of models and scores. 55 | Sets the best possible model and score to be easily 56 | retrievable and usable. 57 | """ 58 | results = [] 59 | for model_name in self.models: 60 | model = eval(model_name) 61 | model.fit(X,y) 62 | s = model.score(X,y) 63 | results.append([model, s, model_name]) 64 | self.all_results = sorted(results, key=lambda x: x[1], reverse=True) 65 | self.best_model = self.all_results[0][0] 66 | self.best_score = self.all_results[0][1] 67 | 68 | def print_results(self): 69 | """ 70 | Method to print the results in a nice readable format. 71 | """ 72 | if self.all_results: 73 | print("Model | Score\n--------------------\n") 74 | for result in self.all_results: 75 | print(result[2], " | ", result[1],"\n") 76 | 77 | 78 | from itertools import product 79 | from zwml.utilities import cross_val 80 | 81 | class grid_search_cv(): 82 | 83 | def __init__(self, model_name, param_grid={}, k=5): 84 | """ 85 | Given a base model and a parameter grid of params 86 | for that model, iterates through all the combinations 87 | of parameters, builds a model with each combo, 88 | and does kFold cross validation on them model 89 | --- 90 | Inputs: 91 | model_name : the name of the model with parenthesis 92 | and as a string. Any parameters you wish to set for all 93 | models can be set in the parameter name. 94 | param_grid: dictionary with parameter names as keys, 95 | and list of param values to test as value for each key 96 | k: number of folds for cross val 97 | """ 98 | self._base_model = str(model_name).replace(')','') 99 | self._param_grid = param_grid 100 | self.models = self.get_models() 101 | self.k = k 102 | 103 | def get_models(self): 104 | """ 105 | Finds every combination of parameters from the param grid. 106 | Uses the string basename for to create a list of model 107 | names with the proper parameters. This command_list is 108 | still in string form until we're ready to test the models. 109 | """ 110 | params = [] 111 | order = [] 112 | for key, value in self._param_grid.items(): 113 | order.append(key) 114 | params.append(value) 115 | options = list(product(*params)) 116 | 117 | command_list = [] 118 | for option in options: 119 | cmd = self._base_model 120 | if cmd[-1] != '(': 121 | cmd+=', ' 122 | for i,j in zip(order, option): 123 | if type(j) == type('string'): 124 | cmd += str(i)+"='"+str(j)+"', " 125 | else: 126 | cmd += str(i)+"="+str(j)+", " 127 | command_list.append(cmd[:-2]+')') 128 | return command_list 129 | 130 | def fit(self, X, y): 131 | """ 132 | Uses the "eval" function in Python to convert the model 133 | name from string to an actual model. 
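# Illustrative usage sketch, not part of the original module, shown commented out because
# of a namespace caveat: fit() rebuilds each model with eval(), so the named class has to
# be resolvable in the namespace where grid_search executes, and grid_search.py itself
# only imports product from itertools. The strings and param_grid layout below follow the
# docstrings above.
# from zwml.utilities import grid_search
# gs = grid_search("knn_classifier()", {"n_neighbors": [3, 5, 7]})
# gs.fit(X_train, y_train)     # builds and scores one model per parameter combination
# gs.print_results()
# best = gs.best_model         # highest-scoring fitted model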
Fits each model 134 | and scores it with kfold cross_val. 135 | Creates a lists of models and scores. 136 | Sets the best possible model and score to be easily 137 | retrievable and usable. 138 | """ 139 | results = [] 140 | for model_name in self.models: 141 | model = eval(model_name) 142 | cv = cross_val() 143 | cv.cross_validation_scores(model, X, y, self.k) 144 | results.append([model, cv.score_folds, model_name]) 145 | self.all_results = sorted(results, key=lambda x: np.mean(x[1]), reverse=True) 146 | self.best_model = self.all_results[0][0] 147 | self.best_score = self.all_results[0][1] 148 | 149 | def print_results(self, coefs=False, mean=False): 150 | """ 151 | Method to print the results in a nice readable format. 152 | If the user asks for mean, only show the average score 153 | across all folds. If the user asks for coefficients 154 | show coefficients if the model has them. 155 | """ 156 | if self.all_results: 157 | print("Model | Scores\n--------------------") 158 | for result in self.all_results: 159 | if mean: 160 | print(result[2], " | ", np.mean(result[1])) 161 | else: 162 | print(result[2], " | ", result[1]) 163 | if coefs: 164 | try: 165 | print("Coefs: ", result[0].coefs_) 166 | except AttributeError: 167 | print("No Coefficients in model!") 168 | print() -------------------------------------------------------------------------------- /zwml/utilities/markov_chain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class markov_chain: 4 | 5 | def __init__(self, text, from_file=True, ngram=2, random_state=None): 6 | """ 7 | Markov Chains are great for generating text based on previously seen text. 8 | Here we'll either read from file or from one big string, then generate a 9 | probabilistic understanding of the document by using ngrams as keys and 10 | storing all possible following words. We can then generate sentences 11 | using random dice and this object. 12 | --- 13 | Inputs 14 | text: either the path to a file containing the text or the text (string) 15 | from_file: whether the text is in a file or note (bool) 16 | ngram: how many words to use as a key for the text generation 17 | random_state: used to set the random state for reproducibility 18 | """ 19 | self.ngram = int(ngram) 20 | self.markov_keys = dict() 21 | self._from_file = from_file 22 | if type(text) != type("string"): 23 | raise TypeError("'text' must be a PATH or string object") 24 | if from_file: 25 | self.path = text 26 | else: 27 | self.raw = text 28 | self.text_as_list = None 29 | if random_state: 30 | np.random.seed(random_state) 31 | self.create_probability_object() 32 | 33 | def preprocess(self): 34 | """ 35 | Opens and cleans the text to be learned. If self.from_file, it reads 36 | from the path provided. The cleaning is very minor, just lowercasing 37 | and getting rid of quotes. Creates a list of words from the text. 38 | """ 39 | if self._from_file: 40 | with open(self.path,'r') as f: 41 | self.raw = f.read() 42 | self.text_as_list = self.raw.lower().replace('"','').replace("'","").split() 43 | 44 | def markov_group_generator(self,text_as_list): 45 | """ 46 | Generator that creates the ngram groupings to act as keys. 47 | Just grabs ngram number of words and puts them into a tuple 48 | and yields that upon iteration request. 
--------------------------------------------------------------------------------
/zwml/utilities/markov_chain.py:
--------------------------------------------------------------------------------
import numpy as np


class markov_chain:

    def __init__(self, text, from_file=True, ngram=2, random_state=None):
        """
        Markov Chains are great for generating text based on previously seen text.
        Here we'll either read from file or from one big string, then generate a
        probabilistic understanding of the document by using ngrams as keys and
        storing all possible following words. We can then generate sentences
        using random dice and this object.
        ---
        Inputs
        text: either the path to a file containing the text or the text itself (string)
        from_file: whether the text is in a file or not (bool)
        ngram: how many words to use as a key for the text generation
        random_state: used to set the random state for reproducibility
        """
        self.ngram = int(ngram)
        self.markov_keys = dict()
        self._from_file = from_file
        if type(text) != type("string"):
            raise TypeError("'text' must be a PATH or string object")
        if from_file:
            self.path = text
        else:
            self.raw = text
        self.text_as_list = None
        if random_state:
            np.random.seed(random_state)
        self.create_probability_object()

    def preprocess(self):
        """
        Opens and cleans the text to be learned. If self._from_file, it reads
        from the path provided. The cleaning is very minor, just lowercasing
        and getting rid of quotes. Creates a list of words from the text.
        """
        if self._from_file:
            with open(self.path,'r') as f:
                self.raw = f.read()
        self.text_as_list = self.raw.lower().replace('"','').replace("'","").split()

    def markov_group_generator(self,text_as_list):
        """
        Generator that creates the ngram groupings to act as keys.
        Grabs ngram+1 consecutive words and puts them into a tuple,
        yielding one tuple per iteration request: the first ngram words
        act as the key and the last word is the observed follower.
        ---
        Inputs
        text_as_list: the text after preprocessing (list)
        Outputs
        groups: word groupings of length self.ngram+1 (tuple)
        """
        if len(text_as_list) < self.ngram+1:
            raise ValueError("NOT A LONG ENOUGH TEXT!")

        for i in range(self.ngram,len(text_as_list)):
            yield tuple(text_as_list[i-self.ngram:i+1])

    def create_probability_object(self):
        """
        Steps through the text, pulling keys out and keeping track
        of which words follow the keys. Duplication is allowed for
        values for each key - but all keys are unique.
        """
        if self.markov_keys:
            print("Probability Object already built!")
            return
        if not self.text_as_list:
            self.preprocess()
        for group in self.markov_group_generator(self.text_as_list):
            word_key = tuple(group[:-1])
            if word_key in self.markov_keys:
                self.markov_keys[word_key].append(group[-1])
            else:
                self.markov_keys[word_key] = [group[-1]]

    def generate_sentence(self, length=25, starting_word_id=None):
        """
        Given a seed word, pulls the key associated with that word and
        samples from the values available. Then moves to the newly generated
        word and gets the key associated with it, and generates again.
        Repeats until the sentence is 'length' words long.
        ---
        Inputs
        length: how many words to generate (int)
        starting_word_id: what word to use as seed, by location (int)
        Outputs
        gen_words: the generated sentence, including seed words (string)
        """
        if not self.markov_keys:
            raise ValueError("No probability object built. Check initialization!")

        if (starting_word_id is None or type(starting_word_id) != type(int(1))
            or starting_word_id < 0 or starting_word_id > len(self.text_as_list)-self.ngram):
            starting_word_id = np.random.randint(0,len(self.text_as_list)-self.ngram)

        gen_words = self.text_as_list[starting_word_id:starting_word_id+self.ngram]

        while len(gen_words) < length:
            seed = tuple(gen_words[-self.ngram:])
            gen_words.append(np.random.choice(self.markov_keys[seed]))
        return ' '.join(gen_words)

    def print_key_value_pairs(self, num_keys=20):
        """
        Iterates through the probability object, printing key-value
        pairs until num_keys pairs have been shown.
        ---
        Input
        num_keys: how many pairs to show (int)
        """
        i = 1
        for key,value in self.markov_keys.items():
            print(key,value)
            print()
            i+=1
            if i>int(num_keys):
                break
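# A minimal, self-contained usage sketch. The inline sample text is made up
# purely for illustration; in this repo the natural corpus would be a file such
# as data/lovecraft.txt passed with from_file=True (the relative path depends on
# where you run from). Note that generation can raise a KeyError if it walks
# onto the very last word group of the corpus, so this sample deliberately ends
# on a word pair that also appears earlier in the text.
if __name__ == "__main__":
    sample = ("the old house stood on the hill and the old house "
              "watched the town and the town watched the old house")
    mc = markov_chain(sample, from_file=False, ngram=2, random_state=42)
    mc.print_key_value_pairs(num_keys=3)
    print(mc.generate_sentence(length=12))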
--------------------------------------------------------------------------------
/zwml/utilities/normalizer.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from copy import copy


class normalizer:

    def __init__(self, axis='col'):
        """
        Normalizer has two behaviors. If the axis is 'col', it divides
        each column by the maximum magnitude in that column. If the axis
        is 'row', it rescales each row so that it sums to 1.
        ---
        KWargs:
        axis: mode of behavior. See description for details.
        """
        self.axis = axis
        self.data_stats = {}
        self.number_of_columns = None
        if self.axis not in ['col', 'row']:
            raise ValueError("axis must be either 'row' or 'col'")

    def fit(self, X):
        """
        If axis='col', learns about the input data and stores
        the maximum magnitude (absolute value) of each column.
        If set for 'row', does nothing.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        X = self.convert_to_array(X)
        self.number_of_columns = X.shape[1]

        if self.axis == 'col':
            for ix in range(self.number_of_columns):
                self.data_stats[ix] = np.amax(np.abs(X.T[ix]))

    def transform(self,X):
        """
        Given the information learned about the training data,
        scale new data by the stored column maximums (axis='col')
        or rescale each row so that it sums to 1 (axis='row').
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        X = self.convert_to_array(X)
        new_X = copy(X)

        if self.axis == 'col':
            for ix in range(self.number_of_columns):
                new_X.T[ix] = new_X.T[ix]/self.data_stats[ix]

        if self.axis == 'row':
            new_X = new_X/np.sum(new_X, axis=1).reshape(-1,1)

        return new_X

    def fit_transform(self, X):
        """
        Learn from X and then return the transformed version
        of X for the user to use.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        self.fit(X)
        return self.transform(X)

    def pandas_to_numpy(self, x):
        """
        Checks if the input is a DataFrame or Series and converts it
        to a numpy array for calculation purposes.
        ---
        Input: X (array, dataframe, or series)
        Output: X (array)
        """
        if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()):
            return x.as_matrix()
        if type(x) == type(np.array([1,2])):
            return x
        return np.array(x)

    def handle_1d_data(self,x):
        """
        Converts 1-dimensional data into a column vector (many rows,
        one column) instead of a single row with many columns.
        """
        if x.ndim == 1:
            x = x.reshape(-1,1)
        return x

    def convert_to_array(self, x):
        """
        Takes in an input, converts it to a numpy array,
        and then checks if it needs to be reshaped for us
        to use it properly.
        """
        x = self.pandas_to_numpy(x)
        x = self.handle_1d_data(x)
        return x
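# A minimal, self-contained usage sketch with made-up float data, showing the
# column-scaling mode; pass axis='row' instead to rescale each row to sum to 1.
if __name__ == "__main__":
    data = np.array([[1., -4., 2.],
                     [3., 2., -8.]])
    norm = normalizer(axis='col')
    print(norm.fit_transform(data))   # each column divided by its max magnitude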
--------------------------------------------------------------------------------
/zwml/utilities/standard_scaler.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from copy import copy


class standard_scaler:

    def __init__(self, demean=True, dev_scale=True):
        """
        Standard Scaler demeans each column and converts
        each column to have a standard deviation of 1.
        ---
        KWargs:
        demean: whether to subtract the mean from each column
        dev_scale: whether to convert each column to unit variance
        """
        self.demean = demean
        self.dev_scale = dev_scale
        self.data_stats = {}
        self.number_of_columns = None

    def fit(self, X):
        """
        Learns about the input data and stores the mean and
        standard deviation of each column.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        X = self.convert_to_array(X)
        self.number_of_columns = X.shape[1]

        for ix in range(self.number_of_columns):
            col = X.T[ix]
            col_mean = np.mean(col)
            col_std = np.std(col)
            self.data_stats[ix] = (col_mean, col_std)

    def transform(self,X):
        """
        Given the information learned about the training data,
        remove the mean and scale the new data as requested by
        the user.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        X = self.convert_to_array(X)
        new_X = copy(X)

        for ix in range(self.number_of_columns):
            if self.demean:
                new_X.T[ix] = new_X.T[ix] - self.data_stats[ix][0]
            if self.dev_scale:
                new_X.T[ix] = new_X.T[ix]/self.data_stats[ix][1]

        return new_X

    def fit_transform(self, X):
        """
        Learn from X and then return the transformed version
        of X for the user to use.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        self.fit(X)
        return self.transform(X)

    def pandas_to_numpy(self, x):
        """
        Checks if the input is a DataFrame or Series and converts it
        to a numpy array for calculation purposes.
        ---
        Input: X (array, dataframe, or series)
        Output: X (array)
        """
        if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()):
            return x.as_matrix()
        if type(x) == type(np.array([1,2])):
            return x
        return np.array(x)

    def handle_1d_data(self,x):
        """
        Converts 1-dimensional data into a column vector (many rows,
        one column) instead of a single row with many columns.
        """
        if x.ndim == 1:
            x = x.reshape(-1,1)
        return x

    def convert_to_array(self, x):
        """
        Takes in an input, converts it to a numpy array,
        and then checks if it needs to be reshaped for us
        to use it properly.
        """
        x = self.pandas_to_numpy(x)
        x = self.handle_1d_data(x)
        return x
--------------------------------------------------------------------------------
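# A minimal, self-contained usage sketch for zwml/utilities/standard_scaler.py
# with made-up float data; it assumes the zwml package is importable from the
# current path. With the defaults, each column of the output has mean ~0 and
# standard deviation ~1, and data_stats holds the per-column (mean, std) pairs
# learned during fit().
if __name__ == "__main__":
    import numpy as np
    from zwml.utilities.standard_scaler import standard_scaler

    data = np.array([[1., 10.],
                     [3., 20.],
                     [5., 30.]])
    scaler = standard_scaler()
    print(scaler.fit_transform(data))
    print(scaler.data_stats)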