├── .gitignore ├── .travis.yml ├── Makefile ├── README.md ├── mozsci ├── __init__.py ├── _c_utils.pyx ├── cross_validate.py ├── cspearmanr_by_fast.cc ├── ems.py ├── evaluation.py ├── glm │ ├── __init__.py │ ├── prob_distributions.py │ ├── regularization.py │ └── simplified_glm.py ├── histogram.py ├── inputs.py ├── map_train.py ├── models │ ├── __init__.py │ ├── linear_regression.py │ └── logistic_regression.py ├── numpy_util.py ├── pca.py ├── spearmanr_by_fast.pyx └── variables.py ├── requirements.txt ├── setup.py └── test ├── data └── poissonreg.csv ├── test_PCA.py ├── test_cross_validate.py ├── test_evaluation.py ├── test_glm.py ├── test_histogram.py ├── test_inputs.py ├── test_linear_regression.py ├── test_logistic_regression.py ├── test_map_train.py └── test_variables.py /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | mozsci.egg-info/ 4 | 5 | mozsci/spearmanr_by_fast.cpp 6 | mozsci/_c_utils.cpp 7 | 8 | 9 | *.pyc 10 | 11 | .coverage 12 | mozsci/*.so 13 | 14 | # vim files 15 | *.swp 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | - 3.3 5 | - 3.4 6 | script: make test 7 | cache: 8 | - apt 9 | - pip 10 | install: 11 | - sudo apt-get -y install libatlas-base-dev libatlas-dev lib{blas,lapack}-dev 12 | # some conda magic to install numpy, scipy 13 | # see http://sburns.org/2014/03/28/faster-travis-builds.html 14 | - sudo pip install conda 15 | - conda_deps='pip numpy scipy matplotlib cython scikit-learn' 16 | - conda create -p $HOME/py --yes $conda_deps "python=$TRAVIS_PYTHON_VERSION" 17 | - export PATH=$HOME/py/bin:$PATH 18 | - pip install -r requirements.txt 19 | - python setup.py build_ext --inplace 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | clean: 2 | # Remove the build 3 | rm -rf build dist 4 | # And all of our pyc files 5 | rm -f mozsci/*.pyc test/*.pyc 6 | # All compiled files 7 | rm -f mozsci/*.so mozsci/spearmanr_by_fast.cpp mozsci/_c_utils.cpp 8 | # And lastly, .coverage files 9 | rm -f .coverage 10 | 11 | test: nose 12 | 13 | nose: 14 | rm -rf .coverage 15 | nosetests --exe --cover-package=mozsci --with-coverage --cover-branches -v --cover-erase 16 | 17 | unittest: 18 | python -m unittest discover -s test 19 | 20 | # build inplace for unit tests to pass (since they are run from this 21 | # top level directory we need the .so files to be in the src tree 22 | # when they run. 23 | build: clean 24 | python setup.py build_ext --inplace 25 | 26 | install: build 27 | python setup.py install 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mozsci 2 | ====== 3 | 4 | [![Build Status](https://api.travis-ci.org/seomoz/mozsci.png)](https://api.travis-ci.org/seomoz/mozsci.png) 5 | 6 | A grab bag of assorted Data science tools from Moz. 
7 | 8 | Currently includes: 9 | 10 | * Utilities for training/evaluating machine learning models: 11 | * Cross validation 12 | * Evaluation metrics (AUC, F1, etc) 13 | * Training models in parallel 14 | * Ensemble model selection 15 | * PCA 16 | * A generic way to specify model inputs 17 | * Some linear models: 18 | * Linear Regression 19 | * Logistic Regression 20 | * GLM 21 | 22 | ## Installing 23 | 24 | ``` 25 | pip install mozsci 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /mozsci/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | #from .evaluation import pearsonr_weighted, auc_wmw_fast, auc_wmw_error, classification_error, precision_recall_f1 3 | #from .cross_validate import 4 | 5 | 6 | -------------------------------------------------------------------------------- /mozsci/_c_utils.pyx: -------------------------------------------------------------------------------- 1 | 2 | cimport cython 3 | cimport numpy as np 4 | np.import_array() 5 | 6 | import numpy as np 7 | 8 | @cython.boundscheck(False) 9 | @cython.cdivision(True) 10 | def histogram1d_update( 11 | np.ndarray[np.float64_t, ndim=1] data, 12 | np.ndarray[np.int64_t, ndim=1] bin_count, 13 | double bin_width, 14 | int bins1, 15 | float mn): 16 | cdef int ndata = len(data) 17 | cdef int i 18 | cdef int bin_index 19 | 20 | for i in range(ndata): 21 | bin_index = int((data[i] - mn) / bin_width) 22 | bin_index = min(max(bin_index, 0), bins1) 23 | bin_count[bin_index] += 1 24 | 25 | 26 | @cython.boundscheck(False) 27 | @cython.cdivision(True) 28 | def histogram1d_update_counts( 29 | np.ndarray[np.float64_t, ndim=1] data, 30 | np.ndarray[np.int64_t, ndim=1] bin_count, 31 | double bin_width, 32 | int bins1, 33 | float mn, 34 | np.ndarray[np.float64_t, ndim=1] counts): 35 | cdef int ndata = len(data) 36 | cdef int i 37 | cdef int bin_index 38 | 39 | for i in range(ndata): 40 | bin_index = int((data[i] - mn) / bin_width) 41 | bin_index = min(max(bin_index, 0), bins1) 42 | bin_count[bin_index] += counts[i] 43 | 44 | 45 | @cython.boundscheck(False) 46 | @cython.cdivision(True) 47 | def histogram1d_compute_indices( 48 | np.ndarray[np.float64_t, ndim=1] data, 49 | double bin_width, 50 | int bins1, 51 | float mn, 52 | np.ndarray[np.int64_t, ndim=1] bin_index): 53 | cdef int ndata = len(data) 54 | cdef int i 55 | cdef int this_index 56 | 57 | for i in range(ndata): 58 | this_index = int((data[i] - mn) / bin_width) 59 | bin_index[i] = min(max(this_index, 0), bins1) 60 | 61 | 62 | @cython.boundscheck(False) 63 | @cython.cdivision(True) 64 | def c_auc_wmw( 65 | np.ndarray[np.int64_t, ndim=1] idxp, 66 | np.ndarray[np.int64_t, ndim=1] idxn, 67 | np.ndarray[np.float64_t, ndim=1] parr, 68 | np.ndarray[np.float64_t, ndim=1] warr): 69 | 70 | cdef int i, j 71 | cdef double auc = 0.0 72 | cdef double sum_weights = 0.0 73 | cdef int nidxp = len(idxp) 74 | cdef int nidxn = len(idxn) 75 | cdef double this_weight 76 | for i in range(nidxp): 77 | for j in range(nidxn): 78 | this_weight = warr[idxp[i]] + warr[idxn[j]] 79 | sum_weights += this_weight 80 | if parr[idxp[i]] - parr[idxn[j]] > 0.0: 81 | auc += this_weight 82 | 83 | return auc / sum_weights 84 | 85 | -------------------------------------------------------------------------------- /mozsci/cross_validate.py: -------------------------------------------------------------------------------- 1 | """Things to do cross validation""" 2 | from __future__ import absolute_import 3 | 4 | import numpy as np 5 
| from .map_train import TrainModelCV 6 | import six 7 | from six.moves import range 8 | 9 | def cv_kfold(ntrain, nk, seed=None): 10 | """k-fold cross validation 11 | 12 | ntrain = the integer number of training data points to sample 13 | nk = the number of splits of the training data 14 | optionally sets seed 15 | 16 | returns a list length nk. Each element is a tuple: 17 | (train_indices, test_indices) 18 | 19 | NOTE: this is an approximate sampler, so the test set size 20 | isn't guaranteed to be 1 / nk, especially for small values of 21 | ntrain. 22 | """ 23 | # need k probability splits 0-1 24 | 25 | # optionally set seed 26 | if seed is not None: 27 | np.random.seed(seed) 28 | 29 | # need k probability splits 0-1 30 | # the end points to sample 31 | fold_edges = np.linspace(0, 1, nk + 1) 32 | 33 | r = np.random.rand(ntrain) 34 | indices = np.arange(ntrain) 35 | folds = [] 36 | for k in range(nk): 37 | folds.append(indices[np.logical_and(fold_edges[k] <= r, r < fold_edges[k + 1])]) 38 | 39 | # make training + test arrays 40 | training_test = [] 41 | for k in range(nk): 42 | training = [] 43 | test = [] 44 | for i in range(nk): 45 | if i != k: 46 | training.extend(folds[i]) 47 | else: 48 | test.extend(folds[i]) 49 | training_test.append([training, test]) 50 | 51 | return training_test 52 | 53 | 54 | def plot_cv_errors(errors, model, regparm, fignum): 55 | """Plots test vs training error for cross validation, as return from run_train_models 56 | 57 | errors = as returned from run_train_models 58 | model = a string with model name, e.g. "LogisticRegression" 59 | regparm = the name of regularization parameter, e.g. "lam" 60 | """ 61 | import pylab as plt 62 | import re 63 | 64 | # accumulate the erorrs + the regularization parameters 65 | # errors_plot = [train, test] list 66 | errors_plot = [] 67 | reg = [] 68 | 69 | for desc, err in six.iteritems(errors): 70 | if re.search(model, desc): 71 | # it corresponds to this model 72 | # get the regularization parameter 73 | c = float(re.search("'%s':\s+([\.0-9-e]+)(}|,)" % regparm, desc).group(1)) 74 | reg.append(c) 75 | errors_plot.append([err['train'], err['test']]) 76 | 77 | errors_plot = np.asarray(errors_plot) 78 | reg = np.asarray(reg) 79 | plot_order = reg.argsort() 80 | 81 | fig = plt.figure(fignum) 82 | fig.clf() 83 | plt.plot(np.log(reg[plot_order]), errors_plot[plot_order, 0], label='train') 84 | plt.plot(np.log(reg[plot_order]), errors_plot[plot_order, 1], label='test') 85 | plt.legend() 86 | plt.grid(True) 87 | fig.show() 88 | 89 | 90 | def learning_curves(model_description, X, y, kfolds=5, fignum=1): 91 | """Plot learning curves 92 | 93 | uses k-fold cross validation for 25, 50, 75, 100% of data 94 | 95 | model_description as input to TrainModelCV the model. This defines 96 | the model to check. 
97 | kfolds = use this many folds 98 | fignum = plot in this figure number 99 | """ 100 | # use TrainModelCV to do so 101 | import pylab as plt 102 | 103 | indices = np.arange(X.shape[0]) 104 | np.random.shuffle(indices) 105 | 106 | pct_data = np.array([0.25, 0.5, 0.75, 1.0]) 107 | npct = len(pct_data) 108 | ndata = (pct_data * X.shape[0]).astype(np.int) 109 | test_errors = [] 110 | train_errors = [] 111 | for N in ndata: 112 | folds = cv_kfold(N, kfolds) 113 | trainer = TrainModelCV(model_description, 114 | X=X[indices][:N, :], y=y[indices][:N], 115 | folds=folds) 116 | errors = trainer.run() 117 | test_errors.append(errors[list(errors.keys())[0]]['test']) 118 | train_errors.append(errors[list(errors.keys())[0]]['train']) 119 | 120 | fig = plt.figure(fignum) 121 | fig.clf() 122 | plt.plot(pct_data, train_errors, label='train') 123 | plt.plot(pct_data, test_errors, label='test') 124 | plt.xlabel("Percent of data") 125 | plt.legend() 126 | plt.ylabel(model_description[1].__name__) 127 | fig.show() 128 | 129 | 130 | -------------------------------------------------------------------------------- /mozsci/cspearmanr_by_fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | template 9 | struct indexed_compare 10 | { 11 | iter_t begin; 12 | indexed_compare(iter_t begin) : begin(begin) {} 13 | bool operator()(std::size_t a, std::size_t b) const { 14 | // sort in ascending order 15 | return *(begin+a) < *(begin+b); 16 | } 17 | }; 18 | 19 | typedef indexed_compare::iterator> index_compare_double_vector; 20 | 21 | std::vector to_ranked (std::vector a) 22 | { 23 | int n = a.size(); 24 | std::vector ret(n); 25 | for (std::size_t i = 0; i < n; i++) ret[i] = i; 26 | std::sort::iterator, index_compare_double_vector>(ret.begin(), ret.end(), index_compare_double_vector(a.begin())); 27 | 28 | // need to take ties into account and assign the average rank 29 | std::vector ret2(a.size()); 30 | int sumranks = 0.0; 31 | int dupcount = 0.0; 32 | double eps = 1.0e-8; 33 | for (std::size_t i = 0; i < n; i++) 34 | { 35 | sumranks = sumranks + i; 36 | dupcount++; 37 | if (i == (n - 1) || abs(a[ret[i]] != a[ret[i+1]]) > eps) 38 | { 39 | double avgrank = double(sumranks) / double(dupcount) + 1; 40 | for (int j = i - dupcount + 1; j < i + 1; j++) 41 | { 42 | ret2[ret[j]] = avgrank; 43 | } 44 | sumranks = 0; 45 | dupcount = 0; 46 | } 47 | } 48 | 49 | return ret2; 50 | } 51 | 52 | template 53 | double pearson_correlation(std::vector a, std::vector b) 54 | { 55 | // for (int i = 0; i < a.size(); i ++) { 56 | // std::cout << a[i] << " " << b[i] << " " << std::endl; 57 | // } 58 | if (a.size() != b.size()) abort(); 59 | 60 | double sum_a_b = inner_product(a.begin(), a.end(), b.begin(), 0.0); 61 | double sum_a = accumulate(a.begin(), a.end(), 0.0); 62 | double sum_b = accumulate(b.begin(), b.end(), 0.0); 63 | double sum_a_a = inner_product(a.begin(), a.end(), a.begin(), 0.0); 64 | double sum_b_b = inner_product(b.begin(), b.end(), b.begin(), 0.0); 65 | double n = a.size(); 66 | 67 | double r = (sum_a_b - sum_a*sum_b/n) / sqrt((sum_a_a- sum_a*sum_a/n)*(sum_b_b-sum_b*sum_b/n)); 68 | return r; 69 | } 70 | 71 | double spearman_correlation(std::vector a, std::vector b) 72 | { 73 | // for (int i = 0; i < a.size(); i ++) { 74 | // std::cout << a[i] << " " << b[i] << " " << std::endl; 75 | // } 76 | return pearson_correlation(to_ranked(a), to_ranked(b)); 77 | } 78 | 79 | double spearman_by(std::vector a, 
std::vector b, std::vector byvar) 80 | { 81 | // data must be sorted byvar in ascending order 82 | double ret = 0.0; 83 | int ngroups = 0; 84 | 85 | // the minimum number of elements in a by group to add into the overall result 86 | int min_by = 25; 87 | 88 | std::size_t last_by = byvar[0]; 89 | int nby = 0; 90 | int start_index = 0; 91 | for (std::size_t k = 0; k < a.size(); k++) 92 | { 93 | if (byvar[k] == last_by) 94 | { 95 | nby += 1; 96 | } 97 | else 98 | { 99 | // we are at a new group 100 | if (nby >= min_by) 101 | { 102 | // compute stuff 103 | std::vector a_by_group(&a[start_index], &a[start_index + nby]); 104 | std::vector b_by_group(&b[start_index], &b[start_index + nby]); 105 | double sc = spearman_correlation(a_by_group, b_by_group); 106 | if (!isnan(sc)) 107 | { 108 | ret = ret + sc; 109 | ngroups++; 110 | } 111 | } 112 | 113 | // reset 114 | nby = 1; 115 | start_index = k; 116 | last_by = byvar[k]; 117 | } 118 | } 119 | 120 | // last group 121 | if (nby >= min_by) 122 | { 123 | // compute stuff 124 | std::vector a_by_group(&a[start_index], &a[start_index + nby]); 125 | std::vector b_by_group(&b[start_index], &b[start_index + nby]); 126 | double sc = spearman_correlation(a_by_group, b_by_group); 127 | if (!isnan(sc)) 128 | { 129 | ret = ret + sc; 130 | ngroups++; 131 | } 132 | } 133 | 134 | return ret / ngroups; 135 | } 136 | 137 | extern "C" double c_spearman_for_python(double* a, double* b, std::size_t* byvar, std::size_t n) 138 | { 139 | // wrapper function for python 140 | std::vector avec (a, a + n); 141 | std::vector bvec (b, b + n); 142 | std::vector byvarvec (byvar, byvar + n); 143 | return spearman_by(avec, bvec, byvarvec); 144 | 145 | } 146 | 147 | 148 | int main(void) 149 | { 150 | // initialize vectors 151 | //static const double arr[] = {1, 2, 3, 4, 5}; 152 | //std::vector position (arr, arr + sizeof(arr) / sizeof(arr[0]) ); 153 | 154 | //static const double arr_pa[] = {0.4, 0.1, 0.22, -0.88, 0.55}; 155 | //std::vector pa (arr_pa, arr_pa + sizeof(arr_pa) / sizeof(arr_pa[0]) ); 156 | 157 | static const double arr[] = { 0.33117374, 0.80947619, 3. , 0.25457016, 158 | 0.52897721, 3. , 0.51733111, 0.60862871, 159 | 0.21389315, 0.35368557, 10. , 10. , 160 | 0.72061731, 0.23078359, 0.38791586, 0.43954613, 161 | 0.91398124, 0.29594647, 10. , 0.78991894}; 162 | std::vector position (arr, arr + sizeof(arr) / sizeof(arr[0]) ); 163 | 164 | static const double arr_pa[] = { 0.10526316, 1.15789474, 1.94736842, 2.21052632, -1.73684211, 165 | -1.47368421, -0.68421053, 1.68421053, 0.63157895, 0.36842105, 166 | -0.94736842, 1.42105263, 3. , -0.42105263, 0.89473684, 167 | 2.47368421, -1.21052632, -0.15789474, 2.73684211, -2. 
}; 168 | std::vector pa (arr_pa, arr_pa + sizeof(arr_pa) / sizeof(arr_pa[0]) ); 169 | 170 | 171 | std::cout << spearman_correlation(position, pa) << std::endl; 172 | 173 | static const double by_arr_pa[] = { 51.73402682, 52.19589972, 44.97281905, 54.73404694, 174 | 47.6719409 , 45.96619825, 50.36193419, 46.27607543, 175 | 48.18824048, 54.88529706, 42.67667074, 41.80373588, 176 | 37.29934119, 57.98812747, 45.04782628, 38.10858417, 177 | 46.44031713, 40.59823939, 26.29936944, 23.96820474, 178 | 47.98343799, 36.4455311 , 43.92931621, 55.19172514, 179 | 33.44633285, 37.38381116, 39.03392758, 41.43285553, 180 | 28.63082987, 31.86069758, 41.19551474, 29.04928565, 181 | 39.09690404, 36.75441683, 29.66390582, 70.4035713 , 182 | 63.53532854, 49.78916058, 64.39911984, 65.41353192, 183 | 48.42353021, 60.38572122, 42.44357922, 42.86378695, 184 | 58.93821467, 61.93862217, 36.23459784, 64.57533596, 185 | 40.09399141, 45.57233379, 44.7748158 , 50.88705955, 186 | 47.24016865, 51.75866967, 36.17935042, 46.73933887, 187 | 52.7136634 , 47.0337377 , 34.19077012, 18.5836512 , 188 | 41.63257011, 9.8698871 , 37.63277795, 47.71676464, 189 | 34.89667886, 35.10845963, 44.56638481, 36.70884056, 190 | 57.9185177 , 50.65260932, 58.53307806, 43.25154747, 191 | 40.59802125, 38.97005406, 35.19682907, 51.94755877, 192 | 44.04430199, 35.84048228, 36.25006727, 46.35317423, 193 | 37.44668618, 16.90596421, 38.87970562, 47.33515849, 194 | 27.41230181, 29.47142008 } ; 195 | std::vector by_pa (by_arr_pa, by_arr_pa + sizeof(by_arr_pa) / sizeof(by_arr_pa[0]) ); 196 | 197 | static const double by_arr_position[] = { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 12., 198 | 13., 15., 16., 17., 19., 23., 24., 25., 26., 27., 28., 199 | 29., 1., 2., 3., 6., 8., 9., 11., 12., 13., 17., 200 | 19., 21., 1., 2., 3., 4., 5., 6., 7., 8., 9., 201 | 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 202 | 22., 23., 24., 25., 26., 27., 1., 2., 4., 5., 6., 203 | 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 204 | 18., 20., 21., 22., 23., 24., 25., 26., 27. 
}; 205 | std::vector by_position (by_arr_position, by_arr_position + sizeof(by_arr_position) / sizeof(by_arr_position[0]) ); 206 | 207 | static const std::size_t by_arr_queryid[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 208 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 209 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 210 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 211 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 212 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 213 | 3, 3, 3, 3, 3, 3, 3, 3}; 214 | std::vector by_queryid (by_arr_queryid, by_arr_queryid + sizeof(by_arr_queryid) / sizeof(by_arr_queryid[0]) ); 215 | 216 | std::cout << spearman_by(by_pa, by_position, by_queryid) << std::endl; 217 | 218 | 219 | 220 | } 221 | 222 | -------------------------------------------------------------------------------- /mozsci/ems.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | 5 | # ensemble model selection 6 | # 7 | # based on "Ensemble Selection from Libraries of Models", 8 | # Caruana, Niculescu-Mizil, Crew, Ksikes 9 | # Proceedings of the 21st International Conference on ML, Banff Canada 2004 10 | # 11 | 12 | import numpy as np 13 | import json 14 | import six 15 | from six.moves import range 16 | 17 | class EnsembleModelSelector(object): 18 | """Implements 19 | "Ensemble Selection from Libraries of Models", 20 | Caruana, Niculescu-Mizil, Crew, Ksikes 21 | Proceedings of the 21st International Conference on ML, Banff Canada 2004 22 | 23 | Holds data 24 | .error = error function 25 | .ensemble = numpy array of the model weights 26 | .nmodels = the number of models added (=sum(ensemble) 27 | .ensemble_indices = the indices of models in ensemble included in final model 28 | 29 | .niter = number of iterations in an ensemble selction 30 | .nsort = the number of models to add at the beginning of the iteration 31 | 32 | For bagged selection: 33 | .nbags = number of bags to use for bagged 34 | .pbags = the percent of models to use in each bag 35 | """ 36 | 37 | def __init__(self, error=None, niter=10, nsort=5, nbags=20, pbags=0.5): 38 | """error = a callable thing (y, ypred) that computes the error 39 | it is minimized by the ensemble. 
40 | needs to accept ypred that are averages of the individual model predictions 41 | 42 | niter = the number of iterations to use to add models 43 | nsort = the number of models added to start the ensemble (section 2.2) 44 | nbags = the number of bags to use for bagged selection 45 | pbags = the percentage of each models to include in each bag""" 46 | self.error = error 47 | self.ensemble = None 48 | self.ensemble_indices = None 49 | self.nmodels = 0 50 | self.niter = niter 51 | self.nsort = nsort 52 | self.nbags = nbags 53 | self.pbags = pbags 54 | 55 | 56 | def select_ensemble_bagged(self, y, ymodels, verbose=False): 57 | """Ensemble selection using bagged selection 58 | (Section 2.3 of the paper)""" 59 | ensemble = np.zeros(len(ymodels)) 60 | indices = np.arange(len(ymodels)) 61 | max_keep = int(self.pbags * len(ymodels)) 62 | nmodels = 0 63 | for k in range(self.nbags): 64 | if verbose: 65 | print("Bagging number %s" % str(k+1)) 66 | np.random.shuffle(indices) 67 | ymodels_bagged = np.array(ymodels)[indices[:max_keep]] 68 | self.select_ensemble(y, ymodels_bagged) 69 | # NOW self.ensemble is the selection of models in the bag 70 | # need to unroll these selected indices to those in the original ymodels 71 | ensemble[indices[:max_keep]] += self.ensemble 72 | nmodels += self.nmodels 73 | 74 | # set final ensemble 75 | self.ensemble = ensemble 76 | self.nmodels = nmodels 77 | self.ensemble_indices = np.arange(len(ymodels))[self.ensemble > 0.5] 78 | 79 | def select_ensemble(self, y, ymodels, early_termination = False): 80 | """Y = actual y = (N, ) numpy array 81 | ymodels = a list of predictions from different models. 82 | len(ymodels) = nmodels 83 | ymodels[k] = prediction for model k (N, ) numpy array 84 | DOESN'T do any bagging (section 2.3). use select_ensemble_bagged""" 85 | # process: 86 | # (1) set the initial ensemble 87 | # (2) for each iteration, choose the model that decrease the error the most 88 | # and update the current ensemble 89 | 90 | # (1) 91 | self.ensemble = np.zeros(len(ymodels)) 92 | 93 | # do initial sort and insert these models into the ensemble 94 | # errors = a vector of errors corresponing to each model 95 | # it will be updated for each iteration corresponding to the 96 | # error for adding each model to the current ensemble 97 | errors = np.array([self.error(y, ypred) for ypred in ymodels]) 98 | initial_models_to_add = errors.argsort()[0:self.nsort] 99 | self.ensemble[initial_models_to_add] = 1 100 | current_prediction = ymodels[initial_models_to_add[0]].astype(np.float) 101 | for i in initial_models_to_add[1:]: 102 | current_prediction += ymodels[i] 103 | current_prediction /= float(self.nsort) 104 | nmodels = self.nsort 105 | 106 | if early_termination: last_error = np.finfo(np.float).max 107 | # (2) 108 | for k in range(self.niter): 109 | # find the model that reduces error the most 110 | # current_prediction is averaged over nmodels 111 | # need to add in one more as a weighted average 112 | errors = np.array([self.error(y, current_prediction * (float(nmodels) / (nmodels + 1)) + ypred.astype(np.float) / float(nmodels + 1)) for ypred in ymodels]) 113 | 114 | if early_termination: 115 | min_error = errors.min() 116 | if min_error < last_error: last_error = min_error 117 | else:break 118 | 119 | model_to_add = errors.argmin() 120 | 121 | self.ensemble[model_to_add] += 1 122 | current_prediction = current_prediction * (float(nmodels) / (nmodels + 1)) + ymodels[model_to_add].astype(np.float) / float(nmodels + 1) 123 | nmodels += 1 124 | 125 | print(("Iteration 
%s, error=%s" % (k, errors.min()))) 126 | 127 | # pull out the indices of models included in the final ensemble 128 | self.ensemble_indices = np.arange(len(ymodels))[self.ensemble > 0.5] 129 | self.nmodels = nmodels 130 | 131 | 132 | def pred(self, ymodels): 133 | """Given the input from ymodels (same as input to select_ensemble), 134 | return the predicted probabilities""" 135 | pred = ymodels[self.ensemble_indices[0]] * self.ensemble[self.ensemble_indices[0]] 136 | for k in self.ensemble_indices[1:]: 137 | pred += ymodels[k] * self.ensemble[k] 138 | return pred.astype(np.float) / np.float(self.nmodels) 139 | 140 | def save_ensemble(self, fileout): 141 | """ 142 | Serialize the ensemble. 143 | :param fileout: name of the file to write the json string, or a file object. 144 | :return: None 145 | """ 146 | if self.ensemble is None or self.ensemble_indices is None: 147 | raise ValueError('The ensemble has not been properly trained.') 148 | 149 | model_json = { 150 | 'nmodels': self.nmodels, 151 | 'ensemble': self.ensemble[:].tolist(), 152 | 'ensemble_indices': self.ensemble_indices[:].tolist(), 153 | } 154 | 155 | # save to the file 156 | if isinstance(fileout, six.string_types): 157 | with open(fileout, 'w') as f: 158 | json.dump(model_json, f) 159 | else: 160 | json.dump(model_json, fileout) 161 | 162 | @classmethod 163 | def load_ensemble(cls, model_json): 164 | """ 165 | Load the serialized model. Afteer the loading, we can use pred method on new data sets. 166 | :param cls: 167 | :param model_json: name of the file to read in the json string, or a file object. 168 | :return: the new object. 169 | """ 170 | if isinstance(model_json, six.string_types): 171 | with open(model_json, 'r') as f: 172 | model_json = json.load(f) 173 | 174 | ensemble = cls() 175 | ensemble.nmodels = model_json['nmodels'] 176 | ensemble.ensemble = np.array(model_json['ensemble'], dtype = np.float64) 177 | ensemble.ensemble_indices = np.array(model_json['ensemble_indices'], dtype = np.int) 178 | 179 | return ensemble 180 | 181 | if __name__ == "__main__": 182 | 183 | import pylab as plt 184 | from .evaluation import classification_error 185 | 186 | np.random.seed(2) 187 | 188 | # make the data 189 | N = 1000 190 | 191 | # some predictons 192 | # actual = 5 * x - 4 > 0 193 | x = np.linspace(0, 1, N) 194 | y = 5 * x - 4 > 0 195 | 196 | nmodels = 500 197 | ymodels = [] 198 | for k in range(nmodels): 199 | m = np.random.rand(1) * 5 * (np.random.rand(N) - 0.5) + 5 200 | b = 3 * (np.random.rand(N) - 0.5) + 4 201 | thisy = (m * x - b > 0).astype(np.int) 202 | ymodels.append(thisy) 203 | 204 | ems = EnsembleModelSelector(classification_error, niter=25) 205 | ems.select_ensemble(y, ymodels) 206 | ypred = ems.pred(ymodels) 207 | classification_error(y, ypred) 208 | 209 | ems.select_ensemble_bagged(y, ymodels) 210 | ypred = ems.pred(ymodels) 211 | classification_error(y, ypred) 212 | 213 | 214 | fig = plt.figure(1) 215 | fig.clf() 216 | plt.scatter(x, y, marker='o', color='r') 217 | for k in range(40): 218 | plt.scatter(x, ymodels[k]+0.01 + k*0.01, marker='s', s=1, color='b') 219 | 220 | plt.scatter(x, ypred, marker='x', color='k') 221 | plt.plot(x, 0.5 * np.ones(x.shape), 'k') 222 | plt.plot(0.8 * np.ones((100, 1)), np.linspace(0, 1, 100), 'k') 223 | 224 | plt.title("Ensemble model selection via greedy sampling\nRed=actual, Blue=40 samples of noisy models, black=ensemble average") 225 | plt.xlabel("X") 226 | plt.ylabel("Y") 227 | fig.show() 228 | # fig.savefig("ensemble_model_average.png") 229 | 230 | 231 | 232 | 233 | 
-------------------------------------------------------------------------------- /mozsci/evaluation.py: -------------------------------------------------------------------------------- 1 | """Evaluate model performance including efficient C implementations""" 2 | from __future__ import absolute_import 3 | 4 | import numpy as np 5 | 6 | from .inputs import mean_std_weighted 7 | from .spearmanr_by_fast import spearmanr_by 8 | from ._c_utils import c_auc_wmw 9 | from six.moves import range 10 | 11 | def pearsonr_weighted(x, y, weights=None): 12 | """Weighted Pearson correlation coefficient. 13 | 14 | x, y = (N, ) numpy arrays or 15 | weights = (N, ) or None for no weights""" 16 | from scipy.stats import pearsonr 17 | if weights is None: 18 | return pearsonr(x, y)[0] 19 | else: 20 | mean_std_x = mean_std_weighted(x.flatten(), weights.flatten()) 21 | mean_std_y = mean_std_weighted(y.flatten(), weights.flatten()) 22 | cov_xy = np.sum((x - mean_std_x['mean']) * (y - mean_std_y['mean']) * weights.flatten()) / np.sum(weights) 23 | return cov_xy / mean_std_x['std'] / mean_std_y['std'] # r 24 | 25 | 26 | def auc_wmw_fast(t, p, weights=None): 27 | """Compute the AUC by using the Wilcoxon-Mann-Whitney 28 | statistic 29 | 30 | t = (Nobs, ) target values (-1/+1) or (0/1) 31 | p = (Nobs, ) predicted values 32 | weights = a (Nobs, ) array with the weights 33 | if omitted, uses uniform weights 34 | 35 | Returns AUC 36 | """ 37 | tarr = np.asarray(t, dtype=np.int).flatten() 38 | parr = np.asarray(p, dtype=np.float).flatten() 39 | 40 | if len(tarr) != len(parr): 41 | raise ValueError("t, p: shape mismatch") 42 | 43 | idxp = np.where(tarr == 1)[0] 44 | idxn = np.where(tarr <= 0)[0] 45 | nidxn = idxn.shape[0] 46 | nidxp = idxp.shape[0] 47 | 48 | if weights is not None: 49 | warr = np.asarray(weights, dtype=np.float).flatten() 50 | else: 51 | warr = np.ones(tarr.shape) 52 | 53 | auc = c_auc_wmw(idxp, idxn, parr, warr) 54 | if np.isnan(auc): 55 | auc = 0 56 | 57 | return auc 58 | 59 | 60 | def auc_wmw_error(t, p, weights=None): 61 | """Returns 1.0 - AUC to mimic an error function 62 | (to pass into minimization routines)""" 63 | return 1.0 - auc_wmw_fast(t, p, weights) 64 | 65 | 66 | def classification_error(y, ypred, thres=0.5, weights=None): 67 | """ y = 0, 1 68 | y pred = P(y == 1) is between 0 and 1 69 | Uses thres as the threshold 70 | y and ypred are numpy arrays 71 | weights = if provided is a y.shape() array with the weights 72 | take a weighted error in this case""" 73 | if weights is None: 74 | return ((ypred > thres).astype(np.int).reshape(-1, 1) != y.reshape(-1, 1)).sum() / float(len(y)) 75 | else: 76 | return (((ypred > thres).astype(np.int).reshape(-1, 1) != y.reshape(-1, 1)) * weights.reshape(-1, 1)).sum() / float(weights.sum()) 77 | 78 | 79 | def precision_recall_f1(y, ypred, thres=0.5, weights=None): 80 | """y = 0/1 or -1/+1 81 | ypred = P(y == 1) is between 0 and 1 82 | y and ypred are numpy arrays 83 | weights = if provided is a y.shape() array with the weights 84 | take a weighted error in this case""" 85 | # see http://en.wikipedia.org/wiki/Precision_and_recall 86 | # need to properly handle case where y = (10, ), ypred=(10, 1) 87 | ypred_1 = (ypred > thres).reshape(-1, 1) 88 | yy = y.reshape(-1, 1) 89 | if weights is None: 90 | tp = np.sum(np.logical_and(ypred_1, yy == 1)) 91 | fp = np.sum(np.logical_and(ypred_1, yy == 0)) 92 | fn = np.sum(np.logical_and(~ypred_1, yy == 1)) 93 | else: 94 | ww = weights.reshape(-1, 1) 95 | tp = np.sum(np.logical_and(ypred_1, yy == 1) * ww) 96 | fp = 
np.sum(np.logical_and(ypred_1, yy == 0) * ww) 97 | fn = np.sum(np.logical_and(~ypred_1, yy == 1) * ww) 98 | 99 | # precision = tp / float(tp + fp) 100 | # recall = tp / float(tp + fn) 101 | # f1 = 2.0 * precision * recall / (precision + recall) 102 | 103 | # we need to check for degenerate cases 104 | # that might happen if we have only 1 input 105 | if tp + fp > 0: 106 | precision = tp / float(tp + fp) 107 | else: 108 | precision = 0 109 | 110 | if tp + fn > 0: 111 | recall = tp / float(tp + fn) 112 | else: 113 | recall = 0 114 | 115 | if precision + recall > 0: 116 | f1 = 2.0 * precision * recall / (precision + recall) 117 | else: 118 | f1 = 0 119 | 120 | return precision, recall, f1 121 | 122 | 123 | """ 124 | All the performance measures that we will be using for classification problems live in this file below here. 125 | """ 126 | 127 | 128 | def classification_model_performance(observed, predicted, weight=None): 129 | """ 130 | This is to check the performance of a classification algorithm. 131 | The observed values should be 0, 1, 2, etc. The weight is a list of the float numbers whose indices are 132 | the classes. For ex, if weight is [1, 5], then we have two classes in the classification problem. And 133 | the error caused by assigning class 0 instance to a class 1 instance is 1. The error caused by assigning 134 | a class 1 instance to a class 0 instance is 5. 135 | 136 | I like the returned perf measure to be in the range of [0, 1]. We should do so for at least the 'no-weight' 137 | case. 138 | 139 | Currently the value is, the lower, the better. 140 | """ 141 | if weight is None: 142 | sum_incorrect = sum(observed != predicted) 143 | else: 144 | sum_incorrect = sum(weight[observed[ii]] for ii in range(len(observed)) if observed[ii] != predicted[ii]) 145 | 146 | return sum_incorrect / float(len(predicted)) 147 | 148 | 149 | def classification_model_performance_matrix(observed, predicted): 150 | """ 151 | This is to check the performance of a classification algorithm. 152 | The observed values should be 0, 1, 2, etc. 153 | 154 | We will use numpy's round number here - np.round(4.6) ( = 5.0). we can use int(np.round(4.6)) gives 5. 155 | """ 156 | # assume that the classe categories start from 0. 157 | num_classes = int(max(observed)) + 1 158 | 159 | perf_2d_array = np.zeros([num_classes] * 2, dtype=int) 160 | 161 | for ii in range(len(observed)): 162 | # in case some algorithms return float numbers. 163 | predicted_class = int(np.round(predicted[ii])) 164 | perf_2d_array[observed[ii], predicted_class] += 1 165 | 166 | return perf_2d_array 167 | 168 | 169 | def classification_model_performance_loss(observed, predicted, loss=None): 170 | """ 171 | loss is a function with two inputs (i, j) where i is the real category and j is the predicted category. 172 | It returns a float number as the loss of assigning a category i instance to category j. 173 | A simple one is implemented as the default (see below in the function body). 174 | 175 | Another way to call this function is to define a loss function or lambda as below. 
176 | classification_model_performance_loss(observed, predicted, loss=lambda i, j: (i-j)**2) 177 | """ 178 | def default_loss(class_i, class_j): 179 | if class_i == class_j: 180 | return 0 181 | else: 182 | return 1 183 | 184 | if loss is None: 185 | loss = default_loss 186 | 187 | total_loss = sum(loss(observed[ii], int(np.round(predicted[ii]))) for ii in range(len(observed))) 188 | 189 | return total_loss 190 | 191 | -------------------------------------------------------------------------------- /mozsci/glm/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /mozsci/glm/prob_distributions.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides all the probability distributions that the simplified generalized models supports. 3 | A better name might be likelihood. We will provide the eval, eval_gradient, and eval_hessian for the 4 | log likelihood here. Note, we do not add the negative here. That should be done by the caller. 5 | """ 6 | from __future__ import absolute_import 7 | from __future__ import print_function 8 | 9 | ### Attention, there is no special treatment of the constant column here. So before call any method here, 10 | ### add one columns of 1's to the feature matrix, ex, np.c_[features, np.ones(features.shape[0])] 11 | 12 | import numpy as np 13 | 14 | class GlmProbDistBase(object): 15 | """ 16 | The base class of the probability distributions. 17 | """ 18 | 19 | def __init__(self): 20 | pass 21 | 22 | def eval(self, beta, features, y): 23 | """ 24 | This method returns the log likelihood of the variables. Constants will be omitted because the goal of 25 | this evaluation is to maximize the log likelihood. So, this method might return positive numbers. 26 | """ 27 | pass 28 | 29 | def eval_gradient(self, beta, features, y): 30 | pass 31 | 32 | def eval_hessian(self, beta, features, y): 33 | pass 34 | 35 | def get_inverse_link(self): 36 | """ 37 | Get the inverse of the link function. The caller can use it to calculate the expected value from 38 | the linear predictors. 39 | """ 40 | pass 41 | 42 | class Poisson(GlmProbDistBase): 43 | """ 44 | Poisson regression. 45 | """ 46 | def eval(self, beta, features, y): 47 | """ 48 | return the log likelihood, with features 49 | """ 50 | 51 | log_miu = np.dot(features, beta) 52 | log_miu = np.minimum(log_miu, 5) 53 | tmp = np.sum(log_miu * y - np.exp(log_miu)) 54 | 55 | if np.isinf(tmp): 56 | print('WARNING -- Log likelihood got inf value. It has been replaced by float.max. ') 57 | print('max of y * log miu', np.max(y * log_miu)) 58 | print('max of miu', np.max(np.exp(log_miu))) 59 | print('max of y ', max(y)) 60 | 61 | return np.finfo(np.float).max 62 | else: 63 | return tmp 64 | 65 | def eval_gradient(self, beta, features, y): 66 | """ 67 | return the gradient of beta at y with feature features. 68 | y is the array of the observed values. 69 | This is the gradient against beta_k. beta_k[0] = k* which is the log of k. 70 | This is a faster version, compared with the eval_gradient_bk 71 | :param beta_k: one single array of k* and beta 72 | :param features: 73 | :param y: observed variable. 74 | :return: 75 | """ 76 | # setup the values we are going to need. 
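# For the Poisson model with log link, miu_i = exp(x_i . beta), so the
# log-likelihood gradient is d(loglik)/d(beta) = sum_i (y_i - miu_i) * x_i.
# The lines below compute exactly that, clipping log_miu at 5 before
# exponentiating to avoid overflow.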
77 | log_miu = np.dot(features, beta) 78 | # prevent overflows 79 | log_miu = np.minimum(log_miu, 5) 80 | miu = np.exp(log_miu) 81 | grad_tmp = y - miu 82 | 83 | gradient = np.sum(features * grad_tmp.reshape(-1,1), axis=0) 84 | if np.isnan(np.sum(gradient)): 85 | print('Warning--The grad_tmp has nan', gradient) 86 | 87 | return gradient 88 | 89 | def get_inverse_link(self): 90 | return np.exp 91 | 92 | class Exponential(GlmProbDistBase): 93 | """ 94 | The exponential probability distribution. The parameter lambda is the inner product of beta and x. 95 | This exponential uses a different link function. log(x). This solves the non-positive problem we have 96 | in Expontial class. 97 | """ 98 | 99 | def eval(self, beta, features, y): 100 | """ 101 | return the log likelihood 102 | theta = beta * feature. 103 | """ 104 | 105 | log_miu = np.dot(features, beta) 106 | tmp = -np.sum(log_miu + y * np.exp(-log_miu)) 107 | 108 | if np.isinf(tmp): 109 | print('WARNING -- Log likelihood got inf value. It has been replaced by float.max. ') 110 | print('max of log miu', np.max(log_miu)) 111 | print('max of y / miu', np.max(y * np.exp(-log_miu))) 112 | print('max of y ', max(y)) 113 | 114 | return np.finfo(np.float).max 115 | else: 116 | return tmp 117 | 118 | def eval_gradient(self, beta, features, y): 119 | """ 120 | return the gradient of beta at y with feature features. 121 | y is the array of the observed values. 122 | """ 123 | # setup the values we are going to need. 124 | log_miu = np.dot(features, beta) 125 | grad_tmp = 1.0 - y * np.exp(-log_miu) 126 | 127 | gradient = -np.sum(features * grad_tmp.reshape(-1,1), axis=0) 128 | if np.isnan(np.sum(gradient)): 129 | print('Warning--The grad_tmp has nan', gradient) 130 | 131 | return gradient 132 | 133 | def get_inverse_link(self): 134 | return np.exp 135 | 136 | class NegativeBinomialWithKstar(GlmProbDistBase): 137 | """ 138 | Negative Binomial regression. 139 | Parameter k is fixed. 140 | """ 141 | def eval(self, beta_k, features, y): 142 | """ 143 | return the log likelihood, with feature feature 144 | theta = beta * feature. 145 | Attention: We omit the ln((y-1)!) in the loglikelihood, because our goal is to optimize the loglikelihood. 146 | beta_k[0] = k* which is the log of k. 147 | """ 148 | beta = beta_k[1:] 149 | 150 | # underflow in some special cases. 151 | if beta_k[0] < -720.0: 152 | beta_k[0] = -720.0 153 | 154 | k = np.exp(beta_k[0]) ## exp(k*). 155 | ln_exp_k_star = beta_k[0] ## ln(e^(k*)). It's actually k*, ie. beta_k[0] 156 | 157 | max_y = int(y.max()) 158 | subsum_y = np.log(np.arange(max_y) + k).cumsum() 159 | log_miu = np.dot(features, beta) 160 | 161 | # log( 1 + exp( k* - log(miu))) 162 | log_1_plus_sth = np.log(1.0 + np.exp(beta_k[0] - log_miu)) 163 | log_1_plus_sth[beta_k[0] - log_miu > 50] = beta_k[0] - log_miu[beta_k[0] - log_miu > 50] 164 | 165 | subsum = subsum_y[y.astype(np.int) - 1] 166 | subsum[y.astype(np.int) == 0] = 0.0 167 | 168 | tmp = np.sum(subsum + k * ln_exp_k_star + y * log_miu) - np.sum((k + y) * (log_miu + log_1_plus_sth)) 169 | 170 | if np.isinf(tmp): 171 | print('WARNING -- Log likelihood got inf value. It has been replaced by float.max. 
') 172 | print('max of subsum', np.max(subsum)) 173 | print('max of y * log miu', np.max(y * log_miu)) 174 | print('max of (k+y) * log miu and k', np.max((k + y) * (log_miu + log_1_plus_sth))) 175 | print('max of log miu and log 1 puls sth', np.max(log_miu), np.max(log_1_plus_sth)) 176 | print('max of y ', max(y)) 177 | print('value of exp and k', k * ln_exp_k_star) 178 | 179 | return np.finfo(np.float).max 180 | else: 181 | return tmp 182 | 183 | def eval_gradient(self, beta_k, features, y): 184 | """ 185 | return the gradient of beta at y with feature features. 186 | y is the array of the observed values. 187 | This is the gradient against beta_k. beta_k[0] = k* which is the log of k. 188 | This is a faster version, compared with the eval_gradient_bk 189 | :param beta_k: one single array of k* and beta 190 | :param features: 191 | :param y: observed variable. 192 | :return: the gradient of the log likelihood 193 | """ 194 | # setup the values we are going to need. 195 | beta = beta_k[1:] 196 | 197 | if beta_k[0] < -720.0: # handling underflow. 198 | beta_k[0] = -720.0 199 | 200 | k = np.exp(beta_k[0]) ## exp(k*). 201 | 202 | log_miu = np.dot(features, beta) 203 | log_1_plus_sth = np.log(1.0 + np.exp(beta_k[0] - log_miu)) 204 | log_1_plus_sth[beta_k[0] - log_miu > 50] = beta_k[0] - log_miu[beta_k[0] - log_miu > 50] 205 | 206 | miu = np.exp(log_miu) 207 | miu[np.isinf(miu + k)] = np.finfo(np.float).max - 1.5 * k 208 | 209 | # gradient of beta 210 | grad_tmp = (y - miu) / (miu + k) 211 | # test of nan in the gradient calculation. 212 | if np.isnan(np.sum(grad_tmp)): 213 | if np.isnan(np.sum(miu)): 214 | print('The miu has nan', miu) 215 | print('min of miu + k is ', np.min(miu + k)) 216 | print('max of miu + k is ', np.max(miu + k)) 217 | print('min of y - miu is ', np.min(y - miu)) 218 | print('max of y - miu is ', np.max(y - miu)) 219 | print('The grad_tmp has nan', grad_tmp) 220 | 221 | gradient_beta = k * np.sum(features * grad_tmp.reshape(-1,1), axis=0) 222 | if np.isnan(np.sum(gradient_beta)): 223 | print('The grad_tmp has nan', gradient_beta) 224 | 225 | # derivative of k* 226 | max_y = int(y.max()) 227 | subsum_y = (1.0 / (np.arange(max_y) + k)).cumsum() 228 | subsum = subsum_y[y.astype(np.int) - 1] 229 | subsum[y.astype(np.int) == 0] = 0.0 230 | 231 | derivative_k = np.sum(subsum + 1.0 + beta_k[0] - (k + y)/(k + miu) - (log_miu + log_1_plus_sth)) 232 | 233 | if np.isinf(derivative_k): 234 | print('WARNING -- Derivative of kstar got inf value. It has been replaced by float.max. ') 235 | derivative_k = np.finfo(np.float).max 236 | 237 | # Assemble them together! 238 | gradient = np.zeros(beta_k.shape[0]) 239 | gradient[0] = k * derivative_k 240 | gradient[1:] = gradient_beta 241 | 242 | return gradient 243 | 244 | def get_inverse_link(self): 245 | return np.exp 246 | 247 | 248 | -------------------------------------------------------------------------------- /mozsci/glm/regularization.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | from six.moves import range 4 | 5 | class RegularizationBase(object): 6 | """ 7 | Base class of all the regularization methods. 8 | Super classes can provide gradient and Hessian methods. 9 | """ 10 | 11 | def __init__(self): 12 | pass 13 | 14 | def eval(self, x): 15 | pass 16 | 17 | class NullRegularization(RegularizationBase): 18 | """ 19 | This is a null regularization, ie. 0 regularization. 
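To disable the penalty in the GLM classes in simplified_glm.py, pass lam=None together with regular=NullRegularization() to the model constructor.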
20 | """ 21 | def eval(self, x): 22 | return 0 23 | 24 | def eval_gradient(self, x): 25 | return 0 26 | 27 | def eval_hessian(self, x): 28 | return 0 29 | 30 | class RidgeRegularization(RegularizationBase): 31 | """ 32 | Ridge regularization. 33 | It's lam/2.0 * ||x|| ** 2 34 | This regularization does not penalize the constant term. The constant term is supposed to be the last term. 35 | """ 36 | 37 | def __init__(self, lam): 38 | self.lam = lam 39 | 40 | def eval(self, x): 41 | return 0.5 * self.lam * np.inner(x[:-1], x[:-1]) 42 | 43 | def eval_gradient(self, x): 44 | tmp = self.lam * x 45 | tmp[-1] = 0.0 46 | 47 | # return self.lam * x 48 | return tmp 49 | 50 | def eval_hessian(self, x): 51 | hessian = self.lam * np.identity(x.shape[0]) 52 | hessian[-1, -1] = 0 53 | return hessian 54 | 55 | class RidgeRegularizationAll(RegularizationBase): 56 | """ 57 | Ridge regularization. 58 | It's lam/2.0 * ||x|| ** 2 59 | """ 60 | 61 | def __init__(self, lam): 62 | self.lam = lam 63 | 64 | def eval(self, x): 65 | return 0.5 * self.lam * np.inner(x, x) 66 | 67 | def eval_gradient(self, x): 68 | return self.lam * x 69 | 70 | def eval_hessian(self, x): 71 | return self.lam * np.identity(x.shape[0]) 72 | 73 | 74 | class RidgeRegularizationChosen(RegularizationBase): 75 | """ 76 | Ridge regularization on chosen terms. 77 | It's lam/2.0 * ||x|| ** 2 78 | """ 79 | 80 | def __init__(self, lam, dim, free_list=[]): 81 | self.lam = lam 82 | 83 | # this is the indices that will be penalized/regulated. 84 | self.index = list(set(range(dim)) - set(free_list)) 85 | self.free = free_list 86 | 87 | def eval(self, x): 88 | xx = x[self.index] 89 | return 0.5 * self.lam * np.inner(xx, xx) 90 | 91 | def eval_gradient(self, x): 92 | grad = self.lam * x 93 | grad[self.free] = 0.0 94 | 95 | return grad 96 | 97 | def eval_hessian(self, x): 98 | hessian = self.lam * np.identity(x.shape[0]) 99 | hessian[self.free, self.free] = 0.0 100 | return hessian 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /mozsci/glm/simplified_glm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import simplejson as json 4 | 5 | from . import regularization 6 | from . import prob_distributions 7 | from scipy import optimize 8 | import six 9 | 10 | """ 11 | This module defines a series of simplified glm functions. They are simplified because they use a much simpler way to 12 | link the probability distribution parameters and the observed feature variables. 13 | """ 14 | 15 | class SimplifiedGlmBase(object): 16 | """ 17 | This is the base class of the simplified glm classes. 18 | Each super class must provide both fit and predict methods. 19 | """ 20 | 21 | def __init__(self, lam=0.1, regular=None, seed=None, likelihood=None, initialize_params=None, param_len=None, 22 | maxiter=None): 23 | """ 24 | :param lam: the parameter for regularization. 25 | :param regular: 26 | :param seed: 27 | :param likelihood: 28 | :param initialize_params: the method to initialize the parameter array. It takes one parameter as the length 29 | of the params array. See the definition of random_initialize_params(len). 30 | :param maxiter: the number of iterations that can run when do the 'fitting' of the model. It controls the 31 | time spent on optimization routine. 
32 | """ 33 | 34 | if lam is not None: 35 | self.regularization = regularization.RidgeRegularization(lam) 36 | else: 37 | if regular is None: 38 | self.regularization = regularization.RidgeRegularization(0.1) 39 | else: 40 | self.regularization = regular 41 | 42 | self.params = None ## The last number in this 1-d array is for the constant (1.0) term. 43 | self.likelihood = likelihood 44 | 45 | if seed is not None: 46 | np.random.seed(seed) 47 | else: ## Likely I will delete these two lines in the future to let the numpy use its own. 48 | np.random.seed(4559) 49 | 50 | if initialize_params is None: 51 | #self.initialize_params = np.zeros 52 | self.initialize_params = np.random.rand 53 | else: 54 | self.initialize_params = initialize_params 55 | 56 | self.param_len = param_len 57 | self.maxiter = maxiter ## To control the optimization routine. 58 | 59 | def get_eval(self, features, y): 60 | """ 61 | A wrapper of the likelihood and regularization terms for easy use with scipy's optimization routines. 62 | :param features: 63 | :param y: 64 | :return: the value of the objective function which is -loglikelihood + regularization. We want to 65 | minimize it. 66 | """ 67 | 68 | def func(beta): 69 | return -self.likelihood.eval(beta, features, y) + self.regularization.eval(beta) 70 | return func 71 | 72 | def get_gradient(self, features, y): 73 | """ 74 | A wrapper of the likelihood and regularization terms for easy use with scipy's optimization routines. 75 | :param features: 76 | :param y: 77 | :return: 78 | """ 79 | 80 | def func(beta): 81 | return -self.likelihood.eval_gradient(beta, features, y) + self.regularization.eval_gradient(beta) 82 | return func 83 | 84 | def get_hessian(self, features, y): 85 | """ 86 | A wrapper of the likelihood and regularization terms for easy use with scipy's optimization routines. 87 | :param features: 88 | :param y: 89 | :return: 90 | """ 91 | 92 | def func(beta): 93 | return -self.likelihood.eval_hessian(beta, features, y) + self.regularization.eval_hessian(beta) 94 | return func 95 | 96 | 97 | def fit(self, x, y): 98 | """ 99 | training the model. 100 | :param x: the design matrix. It doesn't need to have the constant column, because we are adding one. 101 | :param y: the observed independent variables. 102 | :return: 103 | """ 104 | # add the constant column as the last column 105 | features = np.c_[x, np.ones(x.shape[0])] 106 | 107 | # setup the param length. This is usually the number of features plus 1 for the constant term. but there are exceptions, such as negative binomial. 108 | if self.param_len is None: 109 | self.param_len = features.shape[1] 110 | 111 | initial_params = self.initialize_params(self.param_len) 112 | 113 | eval_func = self.get_eval(features, y) 114 | eval_gradient_func = self.get_gradient(features, y) 115 | 116 | # I have tried Newton, Secant, Conjugate Gradient etc to see which one is more robust. Speed is less important. 117 | # http://scipy-lectures.github.com/advanced/mathematical_optimization/index.html 118 | # eval_hessian_func = self.get_hessian(features, y) 119 | 120 | self.params = optimize.fmin_bfgs(eval_func, initial_params, fprime=eval_gradient_func, maxiter=self.maxiter) 121 | 122 | def predict(self, x): 123 | """ 124 | This predict actually returns the the inverse of the expectation. 
125 | :param x: design matrix 126 | :return: 127 | """ 128 | features = np.c_[x, np.ones(x.shape[0])] 129 | return np.inner(features, self.params) 130 | 131 | def save_model(self, model_file=None): 132 | """Serialize model to model_file or return the json str if no file is provided.""" 133 | m = {'params':self.params.tolist()} 134 | 135 | if model_file is None: 136 | return json.dumps(m) 137 | else: 138 | with open(model_file, 'w') as f: 139 | json.dump(m, f) 140 | return None 141 | 142 | @classmethod 143 | def load_model(cls, model_file): 144 | """ 145 | load the model from a file or a json block. 146 | """ 147 | 148 | if isinstance(model_file, six.string_types): 149 | params = json.load(open(model_file, 'r')) 150 | else: 151 | params = model_file 152 | ret = cls() 153 | ret.params = np.array(params['params']) 154 | return ret 155 | 156 | class PoissonRegression(SimplifiedGlmBase): 157 | """ 158 | prob dist: Poisson. 159 | lambda is a linear function of feature variables. 160 | Expected value is exp(w * x) where x * x is the inner product. 161 | """ 162 | def __init__(self, *args, **kw): 163 | 164 | super(PoissonRegression, self).__init__(likelihood=prob_distributions.Poisson(), *args, **kw) 165 | 166 | class NegativeBinomialWithKstarRegression(SimplifiedGlmBase): 167 | """ 168 | prob dist: Poisson. 169 | lambda is a linear function of feature variables. 170 | Expected value is exp(w * x) where x * x is the inner product. 171 | """ 172 | def __init__(self, beta_k_len, initial_k_star=9, regular=None, lam=1.0, *args, **kw): 173 | """ 174 | :param initial_k_star: the initial value for k*, ie. log(k). 175 | """ 176 | 177 | if regular is None: 178 | ## The first entry is the k_star, ie. the number of failures in negative binomial. 179 | ## The last entry is the constant term in the linear regression. 180 | regular = regularization.RidgeRegularizationChosen(lam, dim=beta_k_len, free_list=[0, beta_k_len - 1]) 181 | 182 | self.initial_k_star = initial_k_star 183 | 184 | super(NegativeBinomialWithKstarRegression, self).__init__(lam=None, regular=regular, 185 | likelihood=prob_distributions.NegativeBinomialWithKstar(), 186 | initialize_params=self.initialize_params_withk, 187 | param_len=beta_k_len, *args, **kw) 188 | 189 | def initialize_params_withk(self, cnt): 190 | """ 191 | To return a function to initialize the beta and initial k*. 192 | cnt should be one more of all the features. 193 | """ 194 | params = np.zeros(cnt) 195 | params[0] = self.initial_k_star 196 | 197 | return params 198 | 199 | def predict(self, x): 200 | """ 201 | This overrides the base class's predict. 202 | :param x: design matrix 203 | :return: predicted y. 204 | """ 205 | # The first entry of the params is the k*, ie. log(k) 206 | params = self.params[1:] 207 | 208 | features = np.c_[x, np.ones(x.shape[0])] 209 | return np.inner(features, params) 210 | 211 | class ExponentialGlm(SimplifiedGlmBase): 212 | """ 213 | prob dist: Exponential. 214 | lambda is a linear function of feature variables. 215 | Expected value is exp(w * x) where w * x is actually the inner product of (w, x) 216 | """ 217 | 218 | def __init__(self, *args, **kw): 219 | 220 | super(ExponentialGlm, self).__init__(likelihood=prob_distributions.Exponential(), *args, **kw) 221 | 222 | def random_initialize_params(array_len): 223 | """ 224 | Create an 1-d array that is uniformly randomly chosen in [-0.5, 0.5] 225 | :param array_len: how long is the array. 226 | :return: the numpy array. 
227 | """ 228 | return np.random.rand(array_len) - 0.5 229 | 230 | 231 | -------------------------------------------------------------------------------- /mozsci/histogram.py: -------------------------------------------------------------------------------- 1 | """ 2 | Fast 1D empirical histogram sampler. 3 | 4 | Efficently compute binned histograms from large streaming 5 | data sets, using cython to speed up the slow steps. 6 | The speed is typically 10-100X faster then the corresponding numpy 7 | routine. 8 | 9 | Provides capabilities to estimate a probability density 10 | function from data, sample from a given distribution, 11 | plot, serialize to/from a file. 12 | """ 13 | from __future__ import absolute_import 14 | 15 | import numpy as np 16 | 17 | from ._c_utils import histogram1d_update, histogram1d_update_counts 18 | from ._c_utils import histogram1d_compute_indices 19 | 20 | class Histogram1DFast(object): 21 | """A fast 1D histogram sampler 22 | for evenly spaced bins""" 23 | def __init__(self, bins, mn, mx): 24 | """bins evenly spaced bins from mn to mx""" 25 | self.bins = int(bins) 26 | self.bin_width = (mx - mn) / float(bins) 27 | self.bin_count = np.zeros((bins, ), np.int) 28 | self.bin_edges = mn + self.bin_width * np.arange(self.bins + 1) 29 | self.bin_centers = 0.5 * (self.bin_edges[0:-1] + self.bin_edges[1:]) 30 | self.mx = float(mx) 31 | self.mn = float(mn) 32 | self._pdf_updated = False 33 | self.pdf = np.zeros((bins, ), np.float) 34 | self.cdf = np.zeros((bins, ), np.float) 35 | 36 | def update(self, data): 37 | """data is a 1D array to update histogram with 38 | Note: pdf, cdf are not updated after updating the counts 39 | if updated values are needed, client should call self.compute_pdf_cdf() 40 | before accessing. TODO: .pdf and .cdf attributes that lazily 41 | compute/return based on the value of self._pdf_updated""" 42 | bin_count = self.bin_count 43 | bin_width = self.bin_width 44 | mn = self.mn 45 | bins1 = self.bins - 1 46 | histogram1d_update(data.astype(np.float), bin_count, bin_width, 47 | bins1, mn) 48 | self._pdf_updated = False 49 | 50 | def plot(self, ti, fignum): 51 | """Plots the current histogram count 52 | ti = the title 53 | fignum = make this figure number 54 | plots both counts and log(counts) 55 | returns fig""" 56 | import pylab as plt 57 | 58 | fig = plt.figure(fignum) 59 | fig.clf() 60 | 61 | plt.subplot(211) 62 | plt.plot(self.bin_centers, self.bin_count) 63 | plt.ylabel("# " + ti) 64 | 65 | plt.subplot(212) 66 | plt.plot(self.bin_centers, np.log(self.bin_count + 1)) 67 | plt.ylabel("log(# " + ti + ')') 68 | 69 | return fig 70 | 71 | 72 | def update_counts(self, data, counts): 73 | """data is a 1D array of x values, counts is a 1D array 74 | of counts to add""" 75 | ndata = len(data) 76 | assert len(counts) == ndata 77 | bin_count = self.bin_count 78 | bin_width = float(self.bin_width) 79 | mn = float(self.mn) 80 | bins1 = self.bins - 1 81 | histogram1d_update_counts(data.astype(np.float), bin_count, bin_width, 82 | bins1, mn, counts.astype(np.float)) 83 | self._pdf_updated = False 84 | 85 | def compute_indices(self, data): 86 | """Compute the indices in the histogram corresponding to data, 87 | but do not update""" 88 | ndata = len(data) 89 | mn = self.mn 90 | bins1 = self.bins - 1 91 | bin_index = np.zeros(data.shape, np.int) 92 | bin_width = self.bin_width 93 | histogram1d_compute_indices(data.astype(np.float), bin_width, 94 | bins1, mn, bin_index) 95 | return bin_index 96 | 97 | 98 | def compute_pdf_cdf(self): 99 | """Compute and store the 
pdf and cdf of bin_count""" 100 | if not self._pdf_updated: 101 | ndata = self.bin_count.sum() 102 | if ndata > 0: 103 | self.pdf = self.bin_count / float(self.bin_count.sum()) 104 | self.cdf = self.pdf.cumsum() 105 | else: 106 | self.pdf = None 107 | self.cdf = None 108 | self._pdf_updated = True 109 | 110 | def sample(self, N, return_edge=False, return_index=False): 111 | """Returns N samples of x 112 | if return_edge = True then returns the left bin_edge 113 | instead of a random sample from the interval 114 | if return_index = True then return the index of 115 | the selected bin 116 | Can't have both return_index and return_edge""" 117 | assert not (return_edge and return_index) 118 | if not self._pdf_updated: 119 | self.compute_pdf_cdf() 120 | 121 | # sample the cdf 122 | # numpy's searchsorted uses binary search 123 | # and returns the left bin edge index 124 | rand1 = np.random.rand(N) 125 | samples = self.cdf.searchsorted(rand1) 126 | if return_index: 127 | ret = samples 128 | elif return_edge: 129 | ret = self.bin_edges[samples] 130 | else: 131 | rand2 = np.random.rand(N) 132 | ret = self.bin_edges[samples] + rand2 * self.bin_width 133 | return ret 134 | 135 | def stratified_sample(self, x, sample_size=None, indices=False, empty_bin_rate=0.01): 136 | """Input: 137 | X = (N, ) numpy vector of samples from this distribution, 138 | sample_size = (self.bins, ) vector. This gives the 139 | total number of samples to take from this distribution 140 | for each of the histogram bins. 141 | If None, then uses the last cached value 142 | empty_bin_rate = if the bin_count == 0 for any bins, then the sampling 143 | rate in them is set to empty_bin_rate. 144 | Note: this is only used if sample_size is also provided 145 | Output: 146 | if indices == False, return a sample from X stratified according 147 | to sample_size 148 | if indices == True, return the indices into X to make that sample""" 149 | 150 | if sample_size is not None: 151 | # update sampling rate 152 | gt0_count = self.bin_count > 0 153 | sz = np.asarray(sample_size) 154 | self._stratified_sampling_rate = np.zeros(sz.shape) 155 | self._stratified_sampling_rate[gt0_count] = sz[gt0_count] / self.bin_count[gt0_count].astype(np.float) 156 | self._stratified_sampling_rate[~gt0_count] = empty_bin_rate 157 | 158 | # strategy: find the sampling rate for each point in the input 159 | # vector x. choose it with that sampling rate 160 | xindices = self.compute_indices(x) 161 | nsamples = len(xindices) 162 | r = np.random.rand(nsamples) 163 | indices_accept = np.arange(nsamples)[r < self._stratified_sampling_rate[xindices]] 164 | if indices: 165 | return indices_accept 166 | else: 167 | return x[indices_accept] 168 | 169 | def plot_joint_marginal(x, y, 170 | N=50, range_x=None, range_y=None, log_joint=False, 171 | xtitle=None, ytitle=None, title=None, 172 | fignum=1, show=True, outfile=None): 173 | """ 174 | Makes a pretty joint/marginal probability plot 175 | 176 | In the main square we plot the joint PDF 177 | On each axis we also add the marginal PDFs 178 | Correlations optionally added to the title 179 | 180 | Input: 181 | N = number of bins 182 | range_x/range_y = the ranges for x and y. 
If None, uses 183 | min/max values 184 | log_joint = if True, then plot log(joint counts), 185 | otherwise just use joint(counts) 186 | xtitle/ytitle/title = strings to add for description 187 | 188 | fignum = plot in this figure 189 | show = if True, does a fig.show() 190 | Returns the fig object 191 | """ 192 | import pylab as plt 193 | from mpl_toolkits.axes_grid1 import make_axes_locatable 194 | 195 | if range_x is None: 196 | range_x = [x.min(), x.max()] 197 | if range_y is None: 198 | range_y = [y.min(), y.max()] 199 | 200 | # make a 2D histogram of the input for contour plotting 201 | # any bins with density 0 we will set to NaN so they aren't plotted 202 | data_hist_2D = np.histogram2d(x, y, bins=[N, N+1], range=[range_x, range_y]) 203 | x_bins = 0.5 * (data_hist_2D[1][0:-1] + data_hist_2D[1][1:]) 204 | y_bins = 0.5 * (data_hist_2D[2][0:-1] + data_hist_2D[2][1:]) 205 | data_hist_2D = data_hist_2D[0] 206 | data_hist_2D[data_hist_2D == 0] = np.nan 207 | if log_joint: 208 | data_hist_2D = np.log(data_hist_2D + 1) 209 | 210 | fig = plt.figure(fignum) 211 | fig.clf() 212 | 213 | # the contour plot in the middle with joint PDF 214 | axScatter = plt.subplot(111) 215 | axScatter.contourf(x_bins, y_bins, data_hist_2D.T, ncontours=10) 216 | plt.xlabel(xtitle) 217 | plt.ylabel(ytitle) 218 | 219 | divider = make_axes_locatable(axScatter) 220 | axHistx = divider.append_axes("top", 1.2, pad=0.1, sharex=axScatter) 221 | axHisty = divider.append_axes("right", 1.2, pad=0.1, sharey=axScatter) 222 | dummy = plt.setp(axHistx.get_xticklabels() + axHistx.get_yticklabels() + axHisty.get_xticklabels() + axHisty.get_yticklabels(), visible=False) 223 | 224 | axHisty.hist(y, N+1, range=range_y, orientation='horizontal') 225 | axHistx.hist(x, N, range=range_x) 226 | 227 | if title: 228 | plt.figtext(0.5, 0.94, title, 229 | ha='center', color='black', weight='bold', size='large') 230 | 231 | if show: 232 | fig.show() 233 | if outfile is not None: 234 | plt.savefig(outfile) 235 | return fig 236 | 237 | 238 | 239 | 240 | 241 | -------------------------------------------------------------------------------- /mozsci/inputs.py: -------------------------------------------------------------------------------- 1 | """Input feature manipulation, including normalizations""" 2 | from __future__ import absolute_import 3 | 4 | import numpy as np 5 | 6 | from sklearn.preprocessing import StandardScaler 7 | from six.moves import range 8 | 9 | def mean_std_weighted(x, weights=None): 10 | """Computes weighted mean and standard deviation. 
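    With weights w, the mean is sum(w * x) / sum(w) and the standard
    deviation is sqrt(sum(w * (x - mean)**2) / sum(w)); with equal weights
    this reduces to the ordinary numpy mean and (population) std.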
11 | 12 | x = a (N, ) or an (N, nx) numpy array 13 | weights = a (N, ) numpy array of weights or None (no weights) 14 | 15 | Returns {'mean':[means], 'std':[standard deviations]} 16 | where each value is a len(nx) array for each feature 17 | """ 18 | if weights is None: 19 | ret = {'mean': np.mean(x, axis=0), 'std': np.std(x, axis=0) } 20 | else: 21 | # weighted mean/std 22 | # reshape x to 1 dim 23 | m = np.average(x, axis=0, weights=weights) 24 | v = np.sqrt(np.dot(weights, (x - m)**2) / weights.sum()) 25 | ret = {'mean': m, 'std': v} 26 | 27 | # replace zero values 28 | if len(x.shape) == 1: 29 | if ret['std'] == 0: 30 | ret['std'] = 1 31 | else: 32 | zero_std = [k for k in range(x.shape[1]) if ret['std'][k] < 1e-16] 33 | for i in zero_std: 34 | ret['std'][i] = 1.0 35 | 36 | return ret 37 | 38 | 39 | class IdentityTransformer(object): 40 | ''' 41 | Identity transformer that implements sklearn Transformer API 42 | ''' 43 | def transform(self, X, *args, **kwargs): 44 | return X 45 | 46 | def fit(self, X, *args, **kwargs): 47 | pass 48 | 49 | 50 | class LogScaledTransformer(StandardScaler): 51 | def __init__(self, offset=0.0, **kwargs): 52 | ''' 53 | Take log(X+offset) then apply mean-std scaling. 54 | **kwargs: passed into StandardScaler.__init__ 55 | 56 | we ignore the copy options for convenience 57 | ''' 58 | super(LogScaledTransformer, self).__init__(**kwargs) 59 | self._offset = offset 60 | 61 | def _log(self, X): 62 | return np.log(X + self._offset) 63 | 64 | def fit(self, X, *args, **kwargs): 65 | XX = self._log(X) 66 | return super(LogScaledTransformer, self).fit(XX, *args, **kwargs) 67 | 68 | def transform(self, X, *args, **kwargs): 69 | XX = self._log(X) 70 | return super(LogScaledTransformer, self).transform( 71 | XX, *args, **kwargs) 72 | 73 | def inverse_transform(self, X, *args, **kwargs): 74 | XX = super(LogScaledTransformer, self).inverse_transform( 75 | X, *args, **kwargs) 76 | return np.exp(XX) - self._offset 77 | 78 | 79 | class BucketTransformer(object): 80 | ''' 81 | Transform a float to a categorical variable and represent as 82 | 1-in-k encoding. 83 | ''' 84 | def __init__(self, bin_edges): 85 | ''' 86 | bin_edges: edges for the len(bin_edges) + 1 bins. They are: 87 | 88 | bin_edges = [x0, x1, ..., xn] 89 | x <= x0 90 | x0 < x <= x1 91 | ... 92 | xn < x 93 | ''' 94 | from sklearn.preprocessing import Binarizer 95 | self._binarizers = [Binarizer(threshold=-np.inf)] 96 | self._binarizers.extend( 97 | [Binarizer(threshold=edge) for edge in bin_edges]) 98 | self._nbins = len(self._binarizers) 99 | 100 | def fit(self, *args, **kwargs): 101 | pass 102 | 103 | def transform(self, X): 104 | ''' 105 | X = len N vector 106 | return (N, nbins) matrix with 1-in-k encoding 107 | ''' 108 | assert len(X.shape) == 1 or min(X.shape) == 1 109 | 110 | ret = np.zeros((len(X), self._nbins)) 111 | for k, binarizer in enumerate(self._binarizers): 112 | ret[:, k] = binarizer.transform(X.flatten()) 113 | 114 | # since binarizer is 0-1 for whether X is less then the threshold 115 | # we need the last 1 in each column, e.g. 
116 | # 117 | # [1, 1, 0, 0] we change to [0, 1, 0, 0] 118 | # can get the value by subtracting the previous column 119 | for k in range(self._nbins-1): 120 | ret[:, k] = ret[:, k] - ret[:, k+1] 121 | return ret 122 | 123 | -------------------------------------------------------------------------------- /mozsci/map_train.py: -------------------------------------------------------------------------------- 1 | """Train models in parallel""" 2 | from __future__ import absolute_import 3 | 4 | import numpy as np 5 | from six.moves import range 6 | 7 | class TrainModelCV(object): 8 | def __init__(self, 9 | model_description=[None, None, '', (), {}], 10 | X=None, y=None, Xtest=None, ytest=None, 11 | folds=None, weights=None, weightstest=None, fit_kwargs={}): 12 | """ 13 | model_description = [model_init, error, model_save_file, args, kwargs] 14 | WHERE 15 | model_init = a callable thing model_init(args, kwargs) that returns a model 16 | object. This has an interface as follows: 17 | model.fit(X, y) = trains 18 | model.predict(X) = predicts 19 | model.save_model(filename) = serializes model to a file 20 | error = a callable thing that computes error as error(Yactual, Ypred) 21 | model_save_file = if provided, then saves the model to this file 22 | args, kwards = passed to model_init(*args, **kwargs) 23 | 24 | fit_kwargs = anything to pass down the model.fit routine (error tolerance, etc) 25 | 26 | X, y = training dataset (required) 27 | Xtest, Ytest = testing dataset (if provided, then computes error on this dataset 28 | 29 | folds = if provided, then gives a set of splits to use for k-fold cross validation. 30 | folds is a length-k list. Each element of the list is a tuple, where the first 31 | element of the tuple gives the training indices, the second the test indices. 32 | folds can easily be generated with a call to cv_kfold. 33 | 34 | If doing a k-fold CV, then Xtest and ytest are ignored, and X and y are split 35 | (and an error is raised if Xtest and ytest are provided). 36 | The model_save_file is also ignored in this case 37 | The errors data structure reports the average error for each fold. 
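        A minimal sketch of the intended call pattern (X_train, y_train,
        X_test and y_test are placeholder arrays; LogisticRegression and
        classification_error are this package's own model and error callables):

            trainer = TrainModelCV(
                [LogisticRegression, classification_error, None, (), {'lam': 0.5}],
                X=X_train, y=y_train, Xtest=X_test, ytest=y_test)
            errors = trainer.run()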
38 | """ 39 | self.model_description = model_description 40 | self.model_init = model_description[0] 41 | self.error = model_description[1] 42 | self.model_save_file = model_description[2] 43 | self.X = X 44 | self.y = y 45 | self.Xtest = Xtest 46 | self.ytest = ytest 47 | self.folds = folds 48 | if folds is not None: 49 | assert Xtest is None and ytest is None 50 | self.weights = weights 51 | self.weightstest = weightstest 52 | self._fit_kwargs = fit_kwargs 53 | 54 | 55 | def run(self): 56 | if self.folds is not None: 57 | errors = self._run_kfold() 58 | else: 59 | errors, model = self._run_one_train_test(self.X, self.y, self.Xtest, self.ytest, self.weights, self.weightstest, fit_kwargs=self._fit_kwargs) 60 | 61 | # save to file if needed 62 | if self.model_save_file is not None: 63 | model.save_model(self.model_save_file) 64 | 65 | # prepare errors for output 66 | errors_ret = {} 67 | errors_ret[str(self.model_description)] = errors 68 | 69 | return errors_ret 70 | 71 | 72 | def _run_kfold(self): 73 | # do k-fold cross validation 74 | errors = [] 75 | for k in range(len(self.folds)): 76 | train_indices = self.folds[k][0] 77 | test_indices = self.folds[k][1] 78 | 79 | if self.weights is None: 80 | this_error, model = self._run_one_train_test(self.X[train_indices, :], self.y[train_indices], self.X[test_indices, :], self.y[test_indices], fit_kwargs=self._fit_kwargs) 81 | else: 82 | this_error, model = self._run_one_train_test(self.X[train_indices, :], self.y[train_indices], self.X[test_indices, :], self.y[test_indices], self.weights[train_indices], self.weights[test_indices], fit_kwargs=self._fit_kwargs) 83 | 84 | errors.append(this_error) 85 | 86 | # return average error 87 | # for aggregate error functions, can return 88 | # errors['train'] = {'error1': 0.5, 'error2': 0.2}, ... 89 | # also support this case 90 | ret = {} 91 | if type(errors[0]['train']) == dict: 92 | for k in ['train', 'test']: 93 | ret[k] = {} 94 | for error_type in errors[0]['train'].keys(): 95 | ret[k][error_type] = np.mean([ele[k][error_type] for ele in errors]) 96 | else: 97 | for k in ['train', 'test']: 98 | ret[k] = np.mean([ele[k] for ele in errors]) 99 | 100 | return ret 101 | 102 | 103 | def _run_one_train_test(self, X, y, Xtest, ytest, weights=None, weightstest=None, fit_kwargs={}): 104 | # initialize model 105 | # train 106 | # compute error 107 | 108 | # initialize 109 | model = self.model_init(*self.model_description[3], **self.model_description[4]) 110 | 111 | # train 112 | try: 113 | model.fit(X, y, weights=weights, **fit_kwargs) 114 | except TypeError: # model doesn't do weighted learning 115 | model.fit(X, y, **fit_kwargs) 116 | 117 | # compute error 118 | errors = {} 119 | ypred = model.predict(X) 120 | if weights is None: 121 | errors['train'] = self.error(y, ypred) 122 | else: 123 | errors['train'] = self.error(y, ypred, weights=weights) 124 | 125 | 126 | if Xtest is not None: 127 | ypred = model.predict(Xtest) 128 | if weightstest is None: 129 | errors['test'] = self.error(ytest, ypred) 130 | else: 131 | errors['test'] = self.error(ytest, ypred, weights=weightstest) 132 | else: 133 | errors['test'] = None 134 | 135 | return errors, model 136 | 137 | 138 | def _pool_helper(model_description, X=None, y=None, Xtest=None, ytest=None, 139 | folds=None, weights=None, weightstest=None): 140 | # a helper for Pool class. 
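    # defined at module level (not as a method or closure) because
    # multiprocessing.Pool can only dispatch picklable callables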
141 | # this creates an instance of TrainModelCV and runs it 142 | trainer = TrainModelCV(model_description, 143 | X=X, y=y, Xtest=Xtest, ytest=ytest, 144 | folds=folds, weights=weights, weightstest=weightstest) 145 | return trainer.run() 146 | 147 | 148 | 149 | def run_train_models(processes, model_library, **kwargs): 150 | """Train many supervised learning problems in parallel 151 | 152 | model_library = a list specifying the model library for the dataset in 153 | format needed for TrainModelCV 154 | **kwargs: all the rest of the input to TrainModelCV""" 155 | # sample input for model_library: 156 | # [[LogisticRegression, classification_error, 'parameters.json', (), {'lam':0.5}], 157 | # [LogisticRegression, auc_wmw_fast, None, (), {'C':50}]] 158 | 159 | 160 | if processes > 1: 161 | 162 | # use a process pool top execute all the training jobs 163 | # collect the results and combine to return 164 | from multiprocessing import Pool 165 | 166 | p = Pool(processes) 167 | 168 | results = [] 169 | for model in model_library: 170 | results.append(p.apply_async(_pool_helper, (model, ), kwargs)) 171 | 172 | # wait on the pool to finish 173 | p.close() 174 | p.join() 175 | 176 | # collect the results 177 | ret = {} 178 | for result in results: 179 | ret.update(result.get()) 180 | 181 | else: 182 | # don't need a pool 183 | ret = {} 184 | for model in model_library: 185 | ret.update(_pool_helper(model, **kwargs)) 186 | 187 | return ret 188 | 189 | 190 | -------------------------------------------------------------------------------- /mozsci/models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .logistic_regression import LogisticRegression 3 | from .linear_regression import LinearRegression 4 | 5 | 6 | -------------------------------------------------------------------------------- /mozsci/models/linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import numpy as np 4 | from scipy.optimize import fmin_bfgs 5 | import json 6 | import six 7 | from six.moves import range 8 | 9 | class LinearRegression(object): 10 | def __init__(self, lam=1.0): 11 | """lam = regularization parameter""" 12 | self.lam = lam 13 | 14 | # these are set in fit 15 | self.b = None # float 16 | self.w = None # (nvars, ) array 17 | 18 | def predict(self, x): 19 | """Make a prediction. 20 | Return P(y == 1 | x) 21 | 22 | x = (Nobs, nvars) 23 | """ 24 | return np.sum(self.w * x, axis=1) + self.b 25 | 26 | def fit(self, x, yy, weights=None): 27 | """Train the model. 
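        Minimizes the squared-error loss plus an L2 penalty,
        sum((w . x + b - y)**2) + 0.5 * lam * sum(w**2), via BFGS.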
28 | 29 | x = (Nobs, nvars) 30 | y = (Nobs, ) 31 | 32 | Bias term automatically added 33 | 34 | Returns the loss""" 35 | # transform y to vector 36 | if len(yy.shape) > 1: 37 | assert len(yy.shape) == 2 and yy.shape[1] == 1 38 | y = yy.reshape(-1, ) 39 | else: 40 | y = yy 41 | 42 | def _loss_for_optimize(params): 43 | return LinearRegression._loss(x, y, params[0], params[1:], self.lam, weights) 44 | def _gradient_for_optimize(params): 45 | return LinearRegression._gradient_loss(x, y, params[0], params[1:], self.lam, weights) 46 | 47 | params_opt = fmin_bfgs(_loss_for_optimize, np.zeros(1 + x.shape[1]), fprime=_gradient_for_optimize, maxiter=200) 48 | 49 | self.b = params_opt[0] 50 | self.w = params_opt[1:] 51 | 52 | return _loss_for_optimize(params_opt) 53 | 54 | def save_model(self, model_file): 55 | """Serialize model to model_file""" 56 | m = {'b':self.b, 57 | 'w':self.w.tolist()} 58 | 59 | with open(model_file, 'w') as f: 60 | json.dump(m, f) 61 | 62 | @classmethod 63 | def load_model(cls, model_file): 64 | '''If a string is provided, it's assumed to be a path to a file 65 | containing a JSON blob describing the model. Otherwise, it should 66 | be a dictionary representing the model''' 67 | if isinstance(model_file, six.string_types): 68 | params = json.load(open(model_file, 'r')) 69 | else: 70 | params = model_file 71 | ret = cls() 72 | ret.b = float(params['b']) 73 | ret.w = np.array(params['w']) 74 | return ret 75 | 76 | @staticmethod 77 | def _loss(x, y, b, w, lam, weights=None): 78 | """Return loss function at x. 79 | loss = sum_squared loss + 0.5 * lambda * sum(w**2) 80 | weights = if provided an (N, ) list of weights 81 | """ 82 | loss = 0.5 * lam * np.sum(w ** 2) 83 | if weights is None: 84 | loss += np.sum((np.sum(w * x, axis=1) + b - y) ** 2) 85 | else: 86 | loss += np.sum(weights * (np.sum(w * x, axis=1) + b - y) ** 2) 87 | return loss 88 | 89 | @staticmethod 90 | def _gradient_loss(x, y, b, w, lam, weights=None): 91 | """Return the gradient of the loss. 92 | 93 | x0 = (N, nvars) numpy array of x 94 | y = prediction 95 | 96 | gradient = loss + self.lam * w 97 | 98 | weights = if provided an (N, ) array to add in to each 99 | """ 100 | nvars = len(w) 101 | gradient = np.zeros(nvars + 1) # first position is b 102 | gradient[1:] = lam * w 103 | 104 | # need sum(f(x) - y) * x for all variables 105 | error = np.sum(w * x, axis=1) + b - y 106 | if weights is None: 107 | gradient[0] = np.sum(error) # * 1 for bias term 108 | for k in range(nvars): 109 | gradient[k + 1] += np.sum(error * x[:, k]) 110 | else: 111 | gradient[0] = np.sum(error * weights) # * 1 for bias term 112 | for k in range(nvars): 113 | gradient[k + 1] += np.sum(weights * error * x[:, k]) 114 | 115 | gradient *= 2 116 | 117 | return gradient 118 | 119 | -------------------------------------------------------------------------------- /mozsci/models/logistic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | import json 6 | import six 7 | from six.moves import range 8 | 9 | class LogisticRegression(object): 10 | def __init__(self, lam=1.0): 11 | """lam = regularization parameter""" 12 | self.lam = lam 13 | 14 | # these are set in fit 15 | self.b = None # float 16 | self.w = None # (nvars, ) array 17 | 18 | def predict(self, x): 19 | """Make a prediction. 
20 | Return P(y == 1 | x) 21 | 22 | x = (Nobs, nvars) 23 | """ 24 | return LogisticRegression._sigmoid(x, self.b, self.w) 25 | 26 | def fit(self, x, y, weights=None, **kwargs): 27 | """Train the model. 28 | 29 | x = (Nobs, nvars) 30 | y = (Nobs, ) = {0, 1} 31 | 32 | Bias term automatically added 33 | 34 | Returns the loss 35 | 36 | **kwags passed into fmin_l_bfgs_b""" 37 | from scipy.optimize import fmin_l_bfgs_b 38 | 39 | assert len(y) == x.shape[0] 40 | assert weights is None or len(weights) == x.shape[0] 41 | 42 | y0 = y == 0 43 | x0 = x[y0, :] 44 | x1 = x[~y0, :] 45 | 46 | if weights is None: 47 | loss_weights = None 48 | else: 49 | loss_weights = [weights[y0], weights[~y0]] 50 | 51 | def _loss_for_optimize(params): 52 | return LogisticRegression._loss_gradient(x0, x1, params[0], params[1:], self.lam, loss_weights) 53 | 54 | params0 = np.zeros(1 + x.shape[1]) 55 | params_opt, loss_opt, info_opt = fmin_l_bfgs_b(_loss_for_optimize, params0, disp=0, **kwargs) 56 | print(("%s funcalls: %s" % (info_opt['task'], info_opt['funcalls']))) 57 | 58 | self.b = params_opt[0] 59 | self.w = params_opt[1:] 60 | 61 | def save_model(self, model_file): 62 | """Serialize model to model_file""" 63 | m = {'b':self.b, 64 | 'w':self.w.tolist()} 65 | 66 | with open(model_file, 'w') as f: 67 | json.dump(m, f) 68 | 69 | @classmethod 70 | def load_model(cls, model_file): 71 | '''If a string is provided, it's assumed to be a path to a file 72 | containing a JSON blob describing the model. Otherwise, it should 73 | be a dictionary representing the model''' 74 | if isinstance(model_file, six.string_types): 75 | params = json.load(open(model_file, 'r')) 76 | else: 77 | params = model_file 78 | ret = cls() 79 | ret.b = float(params['b']) 80 | ret.w = np.array(params['w']) 81 | return ret 82 | 83 | @staticmethod 84 | def _sigmoid(x, b, w): 85 | """Return sigma(x) = 1.0 / (1.0 + exp(-x * w - b)) 86 | X = N x (nvars) 87 | 88 | Returns a (N, ) array""" 89 | return np.minimum(np.maximum(1.0 / (1.0 + np.exp(-b - np.sum(w * x, axis=1))), 1.0e-12), 1 - 1.0e-12) 90 | 91 | @staticmethod 92 | def _loss_gradient(x0, x1, b, w, lam, weights=None): 93 | """Return loss/gradient function at x. 94 | x0 = (N0, nvars) numpy array of x where y == 0 95 | x1 = (N1, nvars) numpy array of x where y == 1 96 | 97 | loss = Logistic loss + 0.5 * lambda * sum(w**2) 98 | logistic loss = -sum ( log(sigmoid(x)) y == 1 99 | log(1 - sigmoid(x)) if y == 0 ) 100 | weights = if provided an [(N0, ), (N1, )] list of arrays to add in to each 101 | observation's contribution to error. 102 | first entry corresponds to x0, second to x1 103 | """ 104 | nvars = len(w) 105 | 106 | # initialize + regularization term 107 | loss = 0.5 * lam * np.sum(w ** 2) 108 | gradient = np.zeros(nvars + 1) # first position is b 109 | gradient[1:] = lam * w 110 | 111 | # we need prediction for x 112 | pred_x_0_1 = [LogisticRegression._sigmoid(x0, b, w), LogisticRegression._sigmoid(x1, b, w)] 113 | 114 | # the log likelihood 115 | log_like_x_0_1 = [np.log(1.0 - pred_x_0_1[0]), 116 | np.log(pred_x_0_1[1])] 117 | 118 | # also need the error for gradient. 
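        # d(-log(1 - sigma))/dz = sigma      for the y == 0 examples
        # d(-log(sigma))/dz     = sigma - 1  for the y == 1 examples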
119 | error = [pred_x_0_1[0], 120 | pred_x_0_1[1] - 1] 121 | 122 | if weights is None: 123 | loss += -np.sum(log_like_x_0_1[1]) - np.sum(log_like_x_0_1[0]) 124 | gradient[0] += np.sum(error[0]) + np.sum(error[1]) # * 1 for bias term 125 | for k in range(nvars): 126 | gradient[k + 1] += np.sum(error[0] * x0[:, k]) + np.sum(error[1] * x1[:, k]) 127 | else: 128 | loss += -np.sum(weights[1] * log_like_x_0_1[1]) - np.sum(weights[0] * log_like_x_0_1[0]) 129 | gradient[0] += np.sum(error[0] * weights[0]) + np.sum(error[1] * weights[1]) 130 | for k in range(nvars): 131 | gradient[k + 1] += ( np.sum(weights[0] * error[0] * x0[:, k]) + 132 | np.sum(weights[1] * error[1] * x1[:, k]) ) 133 | return loss, gradient 134 | 135 | -------------------------------------------------------------------------------- /mozsci/numpy_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import numpy as np 4 | import json 5 | 6 | class NumpyEncoder(json.JSONEncoder): 7 | """A JSON encoder for numpy arrays 8 | Use like json.dumps(data, cls=NumpyEncoder)""" 9 | def default(self, obj): 10 | if isinstance(obj, np.ndarray): 11 | return obj.tolist() 12 | return json.JSONEncoder.default(self, obj) 13 | 14 | 15 | def numpy_decoder(dct): 16 | """Decodes numpy arrays stored as values in a json dictionary 17 | Use like json.loads(j, object_hook=numpy_decoder)""" 18 | for k in dct.keys(): 19 | if isinstance(dct[k], list): 20 | try: 21 | dct[k] = np.asarray(dct[k]) 22 | except ValueError: 23 | pass # can't convert to numpy array so leave as is 24 | return dct 25 | 26 | 27 | def load_json_to_numpy(jsonfile): 28 | """Loads the data in the jsonfile using numpy_decoder to convert 29 | to numpy arrays. Returns the decoded data""" 30 | return json.load(open(jsonfile, 'r'), 31 | object_hook=numpy_decoder) 32 | 33 | 34 | -------------------------------------------------------------------------------- /mozsci/pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | # linear PCA 4 | import json 5 | 6 | import numpy as np 7 | 8 | from .numpy_util import numpy_decoder, NumpyEncoder 9 | from six.moves import range 10 | 11 | class LinearPCA(object): 12 | """Linear PCA by SVD""" 13 | 14 | def __init__(self, json_map=None): 15 | """Constructor 16 | If json_map is provided, then initializes from it""" 17 | if json_map is None: 18 | self.mean = None 19 | self.nvars = None 20 | self.eigval = None 21 | self.eigvec = None 22 | else: 23 | j = json.loads(json_map, object_hook=numpy_decoder) 24 | self.mean = j['mean'] 25 | self.nvars = j['nvars'] 26 | self.eigval = j['eigval'] 27 | self.eigvec = j['eigvec'] 28 | 29 | 30 | 31 | def train(self, data, fignum=None): 32 | """Train the PCA. 
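        Forms the covariance matrix of the mean-centred data, takes its
        eigendecomposition and sorts the components by decreasing eigenvalue.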
data is an (nobs, nvars) numpy array 33 | If fignum is not None, then plot the eigen values in the figure 34 | 35 | Returns nothing.""" 36 | assert isinstance(data, np.ndarray) and data.ndim == 2 37 | self.nvars = data.shape[1] 38 | self.mean = np.mean(data, 0) 39 | 40 | # do SVD of the data 41 | corr = np.cov((data - self.mean).T) 42 | (eigval, eigvec) = np.linalg.eig(corr) 43 | 44 | # sort eigenvalues, eigen vectors into ascending order 45 | sortindex = (-1.0 * eigval).argsort() 46 | eigval = eigval[sortindex] 47 | eigvec = eigvec[:, sortindex] 48 | 49 | self.eigval = eigval 50 | self.eigvec = eigvec 51 | 52 | # plot eigenvalues 53 | if fignum is not None: 54 | eigval_sum = self._compute_percent_explained() 55 | 56 | import pylab as plt 57 | fig = plt.figure(fignum) 58 | fig.clf() 59 | plt.plot(eigval_sum, 'bx') 60 | plt.title("Eigenvalues for PCA") 61 | fig.show() 62 | 63 | def _compute_percent_explained(self): 64 | """Computes percent explained from self.eigval""" 65 | eigval_sum = self.eigval.cumsum() 66 | eigval_cum_sum = eigval_sum / np.float(eigval_sum[-1]) 67 | percent_explain = np.hstack((eigval_cum_sum[0], eigval_cum_sum[1:] - eigval_cum_sum[0:-1])) 68 | return percent_explain 69 | 70 | def plot_eigvec(self, neig, fignum): 71 | """Plots the first neig eigenvectors for figure fignum""" 72 | import pylab as plt 73 | fig = plt.figure(fignum) 74 | fig.clf() 75 | pct_explain = self._compute_percent_explained() 76 | for k in range(neig): 77 | plt.plot(self.eigvec[:, k], label=str(k) + " " + str(round(pct_explain[k] * 100))) 78 | plt.legend() 79 | fig.show() 80 | 81 | 82 | def project(self, data, n): 83 | """Given the data, project onto the first n principle components. 84 | data must have the same number of variables as the data used 85 | in training. 86 | 87 | data is a (nobs, nvars) numpy array 88 | return is a (nobs, n) numpy array, the projection onto the PCA 89 | 90 | Note: the mean (self.mean) is removed from the data before 91 | projection so that the full projection is 92 | self.mean + SUM_k (projection_k * PC_k)""" 93 | assert data.ndim == 2 and data.shape[1] == self.nvars and n > 0 and n <= self.nvars 94 | return np.dot((data - self.mean), self.eigvec[:, 0:n]) 95 | 96 | def truncate(self, data, n): 97 | """Truncate the data to the n PCs. 98 | This projects on the first n PCs, then reconstructs data. 
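        The reconstruction is self.mean + dot(project(data, n), eigvec[:, 0:n].T),
        i.e. an n-component approximation of the input.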
99 | 100 | data is a (nobs, nvars) numpy array 101 | return is a (nobs, nvars) numpy array""" 102 | return self.mean + np.dot(self.project(data, n), self.eigvec[:, 0:n].T) 103 | 104 | def to_json(self): 105 | """Returns a json string with the PCA""" 106 | j = {} 107 | j['eigval'] = self.eigval 108 | j['eigvec'] = self.eigvec 109 | j['mean'] = self.mean 110 | j['nvars'] = self.nvars 111 | return json.dumps(j, cls=NumpyEncoder) 112 | 113 | 114 | -------------------------------------------------------------------------------- /mozsci/spearmanr_by_fast.pyx: -------------------------------------------------------------------------------- 1 | 2 | import cython 3 | 4 | # import both numpy and the Cython declarations for numpy 5 | import numpy as np 6 | cimport numpy as np 7 | 8 | # declare the interface to the C code 9 | cdef extern double c_spearman_for_python(double* a, double* b, np.int_t* byvar, int n) 10 | 11 | @cython.boundscheck(False) 12 | @cython.wraparound(False) 13 | def spearmanr_by(np.ndarray[double, ndim=1, mode="c"] a not None, 14 | np.ndarray[double, ndim=1, mode="c"] b not None, 15 | np.ndarray[np.int_t, ndim=1, mode="c"] byvar not None): 16 | """ 17 | Spearman correlation of a vs b by byvar 18 | 19 | Given a data set of x and y, grouped by the byvar, computes 20 | the spearman correlation for each group, then returns the average correlation 21 | across groups. 22 | 23 | byvar must be in sorted order. 24 | 25 | param: a -- a 1-d numpy array of np.float64 26 | param: b -- a 1-d numpy array of np.float64 27 | param: byvar -- the by groups, np.int64 28 | """ 29 | cdef int n 30 | n = a.shape[0] 31 | return c_spearman_for_python(&a[0], &b[0], &byvar[0], n) 32 | 33 | -------------------------------------------------------------------------------- /mozsci/variables.py: -------------------------------------------------------------------------------- 1 | """ 2 | A few useful abstractions for input/output variables in machine learning 3 | """ 4 | from __future__ import absolute_import 5 | import numpy as np 6 | from six.moves import zip 7 | 8 | 9 | class Variable(object): 10 | """ 11 | A Variable is one group of input or output to a model. 12 | """ 13 | def __init__(self, name, transformer, ndim=1, ndimout=1): 14 | """ 15 | name: the variable name 16 | transformer: implements the sklearn.Transformer API 17 | (fit, transform) 18 | ndim: the dimension of the variable (input) 19 | ndimout: the dimension of the output transform 20 | """ 21 | self.name = name 22 | self.ndim = ndim 23 | self.ndimout = ndimout 24 | self._transformer = transformer 25 | 26 | # forwarding methods 27 | def fit(self, *args, **kwargs): 28 | return self._transformer.fit(*args, **kwargs) 29 | 30 | def transform(self, *args, **kwargs): 31 | return self._transformer.transform(*args, **kwargs) 32 | 33 | 34 | class ModelVariables(object): 35 | """ 36 | Hold sets of input and output variables for the model 37 | """ 38 | def __init__(self, independent, dependent): 39 | """ 40 | independent: list of Variable instances for the model input 41 | dependent: list of Variable instances for the model output 42 | """ 43 | self.independent = independent 44 | self.dependent = dependent 45 | self.nin = sum([variable.ndim for variable in independent]) 46 | self.nout = sum([variable.ndim for variable in dependent]) 47 | 48 | 49 | class ModelDriver(object): 50 | """ 51 | This class is used to drive any model/algorithm for training and 52 | prediction purposes. 
It's specifically designed so that we don't need 53 | to worry about the normalization for cross validation procedures. It also 54 | supports the variable definitions that we use for data collection. 55 | """ 56 | def __init__(self, variables, model): 57 | """ 58 | variables: an instance of ModelVariables 59 | model: must implement the sklearn interface (fit, predict), as 60 | well as be picklable) 61 | """ 62 | self.variables = variables 63 | self.model = model 64 | 65 | def _get_array(self, data, variables, dim): 66 | ''' 67 | Get a numpy array from the data. 68 | ''' 69 | if isinstance(data, np.ndarray): 70 | shape = data.shape 71 | if len(shape) == 2 and shape[1] == dim: 72 | # data is already an array 73 | return data 74 | 75 | # otherwise data should implement __getitem__ 76 | first_var = variables[0] 77 | first_data = data[first_var.name] 78 | if isinstance(first_data, int) or isinstance(first_data, float): 79 | ret = np.zeros((1, dim)) 80 | else: 81 | ret = np.zeros((len(first_data), dim)) 82 | ind = first_var.ndim 83 | if ind == 1: 84 | ret[:, 0] = first_data 85 | else: 86 | ret[:, :ind] = first_data 87 | for variable in variables[1:]: 88 | if variable.ndim == 1: 89 | ret[:, ind] = data[variable.name] 90 | else: 91 | ret[:, ind:(ind + variable.ndim)] = data[variable.name] 92 | ind += variable.ndim 93 | return ret 94 | 95 | def _transform(self, X, variables, fit=False): 96 | ''' 97 | Transform the data 98 | ''' 99 | # get the output dimensions 100 | try: 101 | ndimout = [v.ndimout for v in variables] 102 | except AttributeError: 103 | ndimout = [] 104 | for v in variables: 105 | if hasattr(v, 'ndimout'): 106 | ndimout.append(v.ndimout) 107 | else: 108 | ndimout.append(1) 109 | nout = sum(ndimout) 110 | ret = np.zeros((len(X), nout)) 111 | ind = 0 112 | indout = 0 113 | for variable, dimout in zip(variables, ndimout): 114 | if fit: 115 | variable.fit(X[:, ind:(ind + variable.ndim)]) 116 | ret[:, indout:(indout + dimout)] = variable.transform( 117 | X[:, ind:(ind + variable.ndim)]) 118 | ind += variable.ndim 119 | indout += dimout 120 | return ret 121 | 122 | def fit(self, predictors, y): 123 | """ 124 | train the model using observations. 125 | :param X: independent variables. 2-d numpy array or something 126 | implementing __getitem__ 127 | :param y: dependent variable. numpy array or something implementing 128 | __getitem__ 129 | :return: Nothing. 130 | """ 131 | # (1) get predictor, predicted array 132 | X = self._get_array(predictors, self.variables.independent, 133 | self.variables.nin) 134 | yy = self._get_array(y, self.variables.dependent, 135 | self.variables.nout) 136 | 137 | # (2) fit transforms 138 | XX = self._transform(X, self.variables.independent, True) 139 | YY = self._transform(yy, self.variables.dependent, True) 140 | 141 | # (3) fit the model 142 | self.model.fit(XX, YY) 143 | 144 | def predict(self, predictors, predict_prob=False): 145 | """ 146 | This method does the prediction using the model and saved 147 | normalization parameters. 
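        A rough sketch of the call pattern (model_variables, the feature
        names and the wrapped classifier are illustrative; model_variables
        is assumed to declare an independent Variable named 'math' and a
        dependent one named 'passed'):

            driver = ModelDriver(model_variables, some_sklearn_classifier)
            driver.fit({'math': math_scores}, {'passed': passed_flags})
            probs = driver.predict({'math': math_scores}, predict_prob=True)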
148 | """ 149 | X = self._get_array(predictors, self.variables.independent, 150 | self.variables.nin) 151 | XX = self._transform(X, self.variables.independent, False) 152 | 153 | if predict_prob: 154 | return self.model.predict_proba(XX) 155 | else: 156 | return self.model.predict(XX) 157 | 158 | def dumps(self): 159 | ''' 160 | Return a string representation of this instance 161 | ''' 162 | import six.moves.cPickle 163 | return six.moves.cPickle.dumps(self) 164 | 165 | @classmethod 166 | def loads(cls, string): 167 | ''' 168 | Return an instance from the serialized string 169 | ''' 170 | import six.moves.cPickle 171 | return six.moves.cPickle.loads(string) 172 | 173 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | nose 4 | coverage 5 | Cython>=0.17 6 | simplejson 7 | scikit-learn 8 | six 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2012 SEOmoz 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | from setuptools import setup 25 | from distutils.extension import Extension 26 | from Cython.Distutils import build_ext 27 | import numpy as np 28 | 29 | ext_modules = [Extension('mozsci.spearmanr_by_fast', 30 | sources=["mozsci/spearmanr_by_fast.pyx", "mozsci/cspearmanr_by_fast.cc"], 31 | include_dirs = [np.get_include()], 32 | language="c++"), 33 | Extension('mozsci._c_utils', 34 | sources=["mozsci/_c_utils.pyx"], 35 | include_dirs = [np.get_include()], 36 | language="c++"), 37 | ] 38 | 39 | with open('requirements.txt', 'r') as fin: 40 | requires = fin.read().strip().split('\n') 41 | 42 | setup( 43 | name = 'mozsci', 44 | version = '0.9.2', 45 | description = 'Data science tools from Moz', 46 | author = 'Moz Data Science', 47 | author_email = 'science@moz.com', 48 | url = 'http://github.com/seomoz/mozsci', 49 | packages = ['mozsci', 'mozsci.models', 'mozsci.glm'], 50 | license = 'MIT', 51 | platforms = 'Posix; MacOS X', 52 | cmdclass = {'build_ext': build_ext}, 53 | ext_modules = ext_modules, 54 | install_requires = requires, 55 | classifiers = [ 56 | 'License :: OSI Approved :: MIT License', 57 | 'Development Status :: 4 - Beta', 58 | 'Intended Audience :: Developers', 59 | 'Topic :: Scientific/Engineering', 60 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 61 | 'Intended Audience :: Science/Research', 62 | 'Programming Language :: Python :: 2', 63 | 'Programming Language :: Python :: 2.7', 64 | 'Programming Language :: Python :: 3', 65 | 'Programming Language :: Python :: 3.3', 66 | 'Programming Language :: Python :: 3.4', 67 | ], 68 | ) 69 | -------------------------------------------------------------------------------- /test/data/poissonreg.csv: -------------------------------------------------------------------------------- 1 | id,school,male,math,langarts,daysatt,daysabs 2 | 1001,1,1,56.98883,42.45086,73,4 3 | 1002,1,1,37.09416,46.82059,73,4 4 | 1003,1,0,32.27546,43.56657,76,2 5 | 1004,1,0,29.05672,43.56657,74,3 6 | 1005,1,0,6.748048,27.24847,73,3 7 | 1006,1,0,61.65428,48.41482,62,13 8 | 1007,1,0,56.98883,40.73543,66,11 9 | 1008,1,1,10.39049,15.35938,72,7 10 | 1009,1,1,50.52795,52.11514,63,10 11 | 1010,1,1,49.47205,42.45086,68,9 12 | 1011,1,0,39.55739,36.45115,72,4 13 | 1012,1,1,33.73761,13.13055,74,5 14 | 1013,1,0,62.90584,62.27464,72,5 15 | 1014,1,0,65.56011,44.66451,74,3 16 | 1015,1,1,23.01052,25.25478,76,1 17 | 1016,1,1,75.83068,61.04388,76,0 18 | 1017,1,0,41.31353,49.47205,75,1 19 | 1018,1,0,41.88515,65.56011,74,0 20 | 1019,1,1,65.56011,46.82059,75,2 21 | 1020,1,1,13.13055,6.748048,55,24 22 | 1021,1,0,33.01677,42.45086,75,2 23 | 1022,1,1,55.88246,64.87473,76,0 24 | 1023,1,1,45.2079,55.33549,76,1 25 | 1024,1,1,56.98883,44.66451,76,0 26 | 1025,1,0,31.5115,38.34572,71,8 27 | 1026,1,1,52.64643,50,67,3 28 | 1027,1,1,17.25647,6.748048,70,7 29 | 1028,1,0,33.01677,40.15026,76,0 30 | 1029,1,1,61.04388,57.54914,74,2 31 | 1030,1,1,66.98323,71.82729,77,0 32 | 1031,1,1,1.007114,45.2079,77,0 33 | 1032,1,0,38.34572,35.12527,76,1 34 | 1033,1,1,44.66451,46.82059,32,3 35 | 1034,1,1,44.11754,46.82059,77,0 36 | 1035,1,1,59.84974,46.28556,77,0 37 | 1036,1,0,32.27546,47.35357,49,28 38 | 1037,1,1,23.01052,49.47205,71,8 39 | 1038,1,1,70.94328,61.04388,72,5 40 | 1039,1,1,1.007114,1.007114,75,2 41 | 1040,1,0,41.88515,52.11514,46,27 42 | 1041,1,0,40.15026,35.12527,72,5 43 | 1042,1,0,41.31353,38.34572,59,18 44 | 1043,1,0,44.66451,58.68647,57,19 45 | 1044,1,1,38.34572,42.45086,67,9 46 | 1045,1,1,32.27546,1.007114,68,9 47 | 1046,1,0,37.09416,32.27546,75,4 48 | 
1047,1,1,63.54885,57.54914,75,2 49 | 1048,1,1,43.56657,41.31353,68,3 50 | 1049,1,1,33.01677,24.16932,68,9 51 | 1050,1,0,68.48849,59.26457,56,20 52 | 1051,1,1,29.05672,21.7637,65,6 53 | 1052,1,1,54.7921,54.7921,76,0 54 | 1053,1,1,48.94376,51.58518,50,27 55 | 1054,1,0,52.64643,50.52795,65,12 56 | 1055,1,0,13.13055,32.27546,43,34 57 | 1056,1,1,53.71444,41.88515,76,1 58 | 1057,1,1,15.35938,32.27546,52,25 59 | 1058,1,1,10.39049,17.25647,71,5 60 | 1059,1,0,34.43988,38.34572,74,3 61 | 1060,1,0,41.88515,42.45086,77,2 62 | 1061,1,1,40.73543,55.33549,76,1 63 | 1062,1,0,37.09416,52.64643,70,7 64 | 1063,1,1,26.2782,26.2782,73,4 65 | 1064,1,1,47.88486,36.45115,71,8 66 | 1065,1,0,38.95612,55.88246,71,6 67 | 1066,1,1,40.15026,47.35357,61,16 68 | 1067,1,0,36.45115,40.15026,74,3 69 | 1068,1,0,27.24847,39.55739,71,4 70 | 1069,1,0,39.55739,43.56657,73,3 71 | 1070,1,0,48.94376,36.45115,72,5 72 | 1071,1,1,62.27464,46.28556,73,0 73 | 1072,1,1,66.98323,59.84974,67,9 74 | 1073,1,1,28.17271,59.84974,77,0 75 | 1074,1,1,35.12527,37.09416,69,8 76 | 1075,1,0,52.64643,70.09472,77,0 77 | 1076,1,0,26.2782,35.12527,65,11 78 | 1077,1,1,41.31353,42.45086,75,4 79 | 1078,1,1,59.26457,71.82729,75,2 80 | 1079,1,1,1.007114,17.25647,35,35 81 | 1080,1,0,33.01677,41.88515,54,23 82 | 1081,1,0,40.73543,42.45086,64,13 83 | 1082,1,1,66.26239,64.87473,71,6 84 | 1083,1,0,70.94328,45.74812,79,0 85 | 1084,1,1,54.7921,60.44261,71,6 86 | 1085,1,1,64.20476,50.52795,77,0 87 | 1086,1,1,33.73761,40.15026,66,8 88 | 1087,1,0,49.47205,44.66451,66,11 89 | 1088,1,0,26.2782,73.72179,66,11 90 | 1089,1,1,70.94328,54.25188,73,4 91 | 1090,1,1,37.09416,41.31353,71,6 92 | 1091,1,0,37.09416,49.47205,54,23 93 | 1092,1,0,61.04388,69.27759,72,5 94 | 1093,1,1,86.86945,86.86945,72,5 95 | 1094,1,0,61.04388,64.20476,51,26 96 | 1095,1,0,23.01052,6.748048,69,7 97 | 1096,1,1,27.24847,35.12527,76,1 98 | 1097,1,1,32.27546,46.82059,68,9 99 | 1098,1,1,24.16932,17.25647,66,11 100 | 1099,1,1,45.74812,50,60,18 101 | 1100,1,1,6.748048,13.13055,56,12 102 | 1101,1,1,35.12527,27.24847,73,3 103 | 1102,1,1,43.01117,28.17271,69,0 104 | 1103,1,1,26.2782,13.13055,73,4 105 | 1104,1,1,10.39049,29.05672,68,10 106 | 1105,1,1,61.04388,66.26239,61,16 107 | 1106,1,0,37.09416,51.58518,74,1 108 | 1107,1,0,1.007114,1.007114,70,9 109 | 1108,1,0,55.88246,45.2079,74,3 110 | 1109,1,1,46.82059,38.34572,77,0 111 | 1110,1,1,41.88515,56.98883,68,9 112 | 1111,1,1,66.26239,35.79525,65,14 113 | 1112,1,0,10.39049,38.95612,70,7 114 | 1113,1,1,33.01677,40.15026,74,3 115 | 1114,1,1,36.45115,20.40919,67,10 116 | 1115,1,1,46.82059,65.56011,64,12 117 | 1116,1,0,48.41482,34.43988,71,6 118 | 1117,1,0,41.88515,41.88515,42,35 119 | 1118,1,0,20.40919,25.25478,64,13 120 | 1119,1,0,73.72179,74.74522,74,3 121 | 1120,1,1,10.39049,40.15026,65,10 122 | 1121,1,1,17.25647,20.40919,71,6 123 | 1122,1,1,75.83068,64.87473,76,0 124 | 1123,1,1,26.2782,23.01052,75,2 125 | 1124,1,1,70.94328,82.74353,71,6 126 | 1125,1,1,76.98948,68.48849,72,5 127 | 1126,1,0,61.65428,58.11485,62,13 128 | 1127,1,0,28.17271,6.748048,73,4 129 | 1128,1,0,45.2079,48.94376,72,5 130 | 1129,1,0,56.98883,65.56011,74,3 131 | 1130,1,0,41.31353,48.94376,44,30 132 | 1131,1,0,47.35357,46.82059,61,16 133 | 1132,1,0,38.34572,43.01117,62,15 134 | 1133,1,0,10.39049,6.748048,65,12 135 | 1134,1,1,41.88515,36.45115,78,1 136 | 1135,1,1,40.15026,38.95612,72,1 137 | 1136,1,1,33.73761,35.79525,70,7 138 | 1137,1,0,68.48849,52.64643,74,1 139 | 1138,1,0,98.99289,64.20476,13,45 140 | 1139,1,0,62.90584,43.01117,69,10 141 | 1140,1,1,60.44261,36.45115,74,3 142 | 
1141,1,0,37.09416,10.39049,52,27 143 | 1142,1,0,26.2782,33.01677,74,2 144 | 1143,1,0,64.87473,53.71444,58,13 145 | 1144,1,0,18.91984,29.05672,70,2 146 | 1145,1,1,54.7921,47.88486,74,5 147 | 1146,1,0,38.34572,38.95612,64,5 148 | 1147,1,0,53.71444,52.11514,69,4 149 | 1148,1,0,46.82059,51.58518,76,3 150 | 1149,1,0,51.58518,71.82729,44,20 151 | 1150,1,0,34.43988,40.15026,63,12 152 | 1151,1,0,38.34572,44.66451,48,31 153 | 1152,1,1,44.66451,40.73543,70,6 154 | 1153,1,1,49.47205,31.5115,63,14 155 | 1154,1,0,43.56657,46.82059,56,13 156 | 1155,1,1,39.55739,49.47205,71,6 157 | 1156,1,0,33.01677,47.35357,65,12 158 | 1157,1,1,20.40919,29.05672,65,12 159 | 1158,1,1,43.01117,34.43988,77,0 160 | 1159,1,1,51.58518,43.56657,78,1 161 | 2001,2,0,39.55739,50.52795,82,4 162 | 2002,2,1,53.71444,1.007114,86,0 163 | 2003,2,1,53.71444,38.95612,86,0 164 | 2004,2,1,54.7921,64.20476,84,2 165 | 2005,2,0,38.34572,54.7921,85,1 166 | 2006,2,0,45.2079,71.82729,84,2 167 | 2007,2,1,61.65428,58.68647,86,0 168 | 2008,2,0,54.7921,50,86,0 169 | 2009,2,1,61.65428,54.7921,86,0 170 | 2010,2,0,61.04388,58.68647,79,7 171 | 2011,2,1,65.56011,66.26239,84,2 172 | 2012,2,0,61.04388,71.82729,76,9 173 | 2013,2,0,73.72179,55.33549,80,6 174 | 2014,2,1,54.7921,51.58518,82,4 175 | 2015,2,1,58.11485,58.11485,85,1 176 | 2016,2,0,37.72536,46.82059,79,7 177 | 2017,2,1,65.56011,70.09472,86,0 178 | 2018,2,1,58.11485,68.48849,86,0 179 | 2019,2,0,62.90584,71.82729,82,4 180 | 2020,2,1,98.99289,55.33549,82,2 181 | 2021,2,0,53.71444,66.98323,86,0 182 | 2022,2,0,65.56011,50,82,4 183 | 2023,2,0,53.71444,65.56011,84,2 184 | 2024,2,0,48.41482,29.05672,67,18 185 | 2025,2,0,35.12527,52.11514,85,1 186 | 2026,2,1,70.94328,70.94328,86,0 187 | 2027,2,0,62.90584,65.56011,85,1 188 | 2028,2,1,68.48849,58.68647,70,16 189 | 2029,2,0,51.05624,41.31353,79,6 190 | 2030,2,0,64.20476,67.72454,70,16 191 | 2031,2,1,58.11485,78.2363,86,0 192 | 2032,2,0,46.82059,42.45086,77,8 193 | 2033,2,1,46.82059,52.64643,85,1 194 | 2034,2,0,66.98323,86.86945,84,2 195 | 2035,2,0,32.27546,51.05624,83,3 196 | 2036,2,0,29.05672,29.05672,82,4 197 | 2037,2,0,98.99289,42.45086,84,2 198 | 2038,2,0,61.65428,84.64062,82,4 199 | 2039,2,1,47.88486,37.09416,79,7 200 | 2040,2,0,53.71444,56.98883,86,0 201 | 2041,2,0,59.26457,51.58518,85,1 202 | 2042,2,0,73.72179,93.25195,85,1 203 | 2043,2,0,61.04388,51.58518,86,0 204 | 2044,2,0,33.01677,45.2079,73,13 205 | 2045,2,0,39.55739,41.31353,85,1 206 | 2046,2,1,56.98883,62.27464,86,0 207 | 2047,2,0,46.82059,47.88486,85,1 208 | 2048,2,1,41.31353,44.66451,86,0 209 | 2049,2,0,65.56011,71.82729,85,1 210 | 2050,2,0,82.74353,98.99289,86,0 211 | 2051,2,0,45.2079,44.11754,85,1 212 | 2052,2,0,59.26457,61.04388,86,0 213 | 2053,2,0,55.88246,56.98883,84,2 214 | 2054,2,1,69.27759,79.59081,86,0 215 | 2055,2,0,56.98883,59.26457,85,1 216 | 2056,2,1,56.98883,47.35357,86,0 217 | 2057,2,0,65.56011,62.27464,82,4 218 | 2058,2,1,47.88486,51.05624,83,3 219 | 2059,2,1,20.40919,40.15026,85,1 220 | 2060,2,0,55.88246,81.08016,86,0 221 | 2061,2,0,61.65428,60.44261,85,1 222 | 2062,2,0,58.11485,68.48849,82,4 223 | 2063,2,0,45.2079,47.35357,85,1 224 | 2064,2,1,29.05672,32.27546,77,9 225 | 2065,2,0,33.73761,59.84974,86,0 226 | 2066,2,1,53.71444,40.15026,82,4 227 | 2067,2,0,50.52795,53.71444,54,8 228 | 2068,2,1,63.54885,54.25188,73,13 229 | 2069,2,1,45.2079,48.94376,86,0 230 | 2070,2,1,73.72179,56.43343,86,0 231 | 2071,2,1,59.26457,49.47205,86,0 232 | 2072,2,0,98.99289,69.27759,86,0 233 | 2073,2,0,56.98883,74.74522,84,2 234 | 2074,2,0,58.11485,58.11485,81,5 235 | 2075,2,0,61.65428,86.86945,85,1 
236 | 2076,2,1,65.56011,66.98323,86,0 237 | 2077,2,0,59.26457,56.98883,86,0 238 | 2078,2,0,66.98323,63.54885,85,1 239 | 2079,2,0,32.27546,50,84,2 240 | 2080,2,1,70.94328,73.72179,84,1 241 | 2081,2,1,37.72536,37.72536,81,5 242 | 2082,2,0,32.27546,29.90528,83,3 243 | 2083,2,1,36.45115,32.27546,85,1 244 | 2084,2,0,64.20476,71.82729,83,3 245 | 2085,2,0,56.98883,71.82729,80,6 246 | 2086,2,1,44.11754,24.16932,78,8 247 | 2087,2,1,40.15026,40.15026,65,21 248 | 2088,2,1,28.17271,30.72241,19,1 249 | 2089,2,0,52.64643,79.59081,79,7 250 | 2090,2,0,40.15026,44.66451,81,5 251 | 2091,2,1,41.31353,40.15026,85,1 252 | 2092,2,0,59.26457,89.60951,85,1 253 | 2093,2,0,45.2079,46.82059,86,0 254 | 2094,2,1,45.2079,53.17941,82,4 255 | 2095,2,0,38.34572,55.33549,86,0 256 | 2096,2,1,46.28556,55.33549,72,14 257 | 2097,2,0,65.56011,51.05624,44,2 258 | 2098,2,0,51.58518,65.56011,84,2 259 | 2099,2,0,46.82059,68.48849,84,2 260 | 2100,2,1,50.52795,43.01117,86,0 261 | 2101,2,1,59.26457,55.33549,85,1 262 | 2102,2,1,49.47205,43.01117,67,19 263 | 2103,2,0,43.01117,46.82059,84,2 264 | 2104,2,1,58.11485,71.82729,74,11 265 | 2105,2,1,46.28556,70.94328,86,0 266 | 2106,2,0,48.94376,53.71444,81,5 267 | 2107,2,0,98.99289,56.98883,72,13 268 | 2108,2,0,41.31353,50.52795,86,0 269 | 2109,2,1,73.72179,82.74353,81,5 270 | 2110,2,1,64.20476,65.56011,83,3 271 | 2111,2,0,53.71444,51.58518,84,2 272 | 2112,2,1,55.88246,98.99289,84,2 273 | 2113,2,1,62.90584,56.98883,81,5 274 | 2114,2,0,61.04388,71.82729,86,0 275 | 2115,2,0,98.99289,98.99289,83,3 276 | 2116,2,1,62.90584,54.7921,86,0 277 | 2117,2,1,58.68647,60.44261,85,1 278 | 2118,2,1,53.71444,35.12527,77,9 279 | 2119,2,1,65.56011,44.11754,85,1 280 | 2120,2,1,51.58518,40.15026,86,0 281 | 2121,2,1,62.90584,62.27464,86,0 282 | 2122,2,1,70.94328,58.11485,85,1 283 | 2123,2,1,70.09472,48.41482,73,12 284 | 2124,2,0,65.56011,62.27464,83,3 285 | 2125,2,1,46.82059,65.56011,86,0 286 | 2126,2,0,66.98323,51.58518,85,1 287 | 2127,2,1,51.58518,59.26457,81,5 288 | 2128,2,0,46.28556,64.20476,85,1 289 | 2129,2,1,47.88486,58.68647,85,1 290 | 2130,2,0,41.88515,58.11485,78,7 291 | 2131,2,0,56.98883,59.84974,80,6 292 | 2132,2,0,64.20476,81.08016,86,0 293 | 2133,2,0,70.94328,58.68647,78,8 294 | 2134,2,1,45.2079,57.54914,86,0 295 | 2135,2,0,40.15026,48.41482,84,1 296 | 2136,2,1,25.25478,13.13055,86,0 297 | 2137,2,1,55.88246,86.86945,82,4 298 | 2138,2,0,98.99289,93.25195,63,17 299 | 2139,2,1,59.26457,44.11754,80,6 300 | 2140,2,0,41.88515,41.31353,86,0 301 | 2141,2,0,61.04388,56.98883,86,0 302 | 2142,2,0,73.72179,70.94328,85,1 303 | 2143,2,1,61.04388,47.88486,83,3 304 | 2144,2,1,53.71444,61.04388,84,1 305 | 2145,2,1,46.82059,59.26457,84,2 306 | 2146,2,0,36.45115,46.82059,84,2 307 | 2147,2,1,56.98883,61.04388,84,2 308 | 2148,2,0,20.40919,15.35938,81,5 309 | 2149,2,0,47.88486,54.7921,45,41 310 | 2150,2,1,56.98883,43.01117,83,3 311 | 2151,2,0,54.7921,71.82729,79,7 312 | 2152,2,0,47.88486,69.27759,85,1 313 | 2153,2,1,36.45115,47.88486,85,1 314 | 2154,2,0,66.98323,68.48849,83,3 315 | 2155,2,0,54.7921,53.17941,86,0 316 | 2156,2,0,76.98948,69.27759,86,0 317 | 2157,2,0,65.56011,70.94328,84,2 318 | -------------------------------------------------------------------------------- /test/test_PCA.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | 6 | from mozsci import pca 7 | from six.moves import range 8 | 9 | class TestLinearPCA(unittest.TestCase): 10 | 11 | def test_linearPCA(self): 12 | """Test linear 
PCA""" 13 | 14 | # make the data 15 | N = 1000 16 | data = np.zeros((N, 3)) 17 | for k in range(N): 18 | data[k, 0] = (np.random.random() - 0.5) * 5.0 + 2.0 19 | #data[k, 1] = 3.5 * data[k, 0] + (np.random.random() - 0.5) 20 | data[k, 1] = (np.random.random() - 0.5) * 5.0 21 | data[k, 2] = 3.5 + data[k, 0] - 0.55 * data[k, 1] + (np.random.random() - 0.5) 22 | 23 | # make the PCA, do the training, then project on all three eigenvectors, 24 | # and reconstruct the original data 25 | p = pca.LinearPCA() 26 | p.train(data) 27 | data_proj = p.project(data, 3) 28 | 29 | # reconstruct the data from the projection 30 | data_reconstruct = np.zeros((N, 3)) 31 | for k in range(N): 32 | data_reconstruct[k, :] = p.mean + data_proj[k, 0] * p.eigvec[:, 0] + data_proj[k, 1] * p.eigvec[:, 1] + data_proj[k, 2] * p.eigvec[:, 2] 33 | 34 | self.assertTrue((np.abs(data_reconstruct - data) < 1.0e-12).all()) 35 | 36 | # test out truncate 37 | self.assertTrue((np.abs(p.truncate(data, 3) - data) < 1.0e-12).all()) 38 | 39 | # test json 40 | json_map = p.to_json() 41 | p_from_json = pca.LinearPCA(json_map=json_map) 42 | self.assertEqual(p.nvars, p_from_json.nvars) 43 | self.assertTrue((np.abs(p.mean - p_from_json.mean) < 1.0e-12).all()) 44 | self.assertTrue((np.abs(p.eigval - p_from_json.eigval) < 1.0e-12).all()) 45 | self.assertTrue((np.abs(p.eigvec - p_from_json.eigvec) < 1.0e-12).all()) 46 | 47 | 48 | 49 | if __name__ == "__main__": 50 | suite1 = unittest.TestLoader().loadTestsFromTestCase(TestLinearPCA) 51 | suitelist = [suite1] 52 | suite = unittest.TestSuite(suitelist) 53 | unittest.TextTestRunner(verbosity=2).run(suite) 54 | 55 | 56 | -------------------------------------------------------------------------------- /test/test_cross_validate.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | import time 6 | 7 | from mozsci import cross_validate 8 | 9 | 10 | class Test_cv_kfold(unittest.TestCase): 11 | def test_cv_kfold(self): 12 | folds = cross_validate.cv_kfold(20, 4, seed=2) 13 | 14 | sum_training = np.sum([len(ele[0]) for ele in folds]) 15 | self.assertTrue(sum_training == 3 * 20) 16 | 17 | sum_training = np.sum([len(ele[1]) for ele in folds]) 18 | self.assertTrue(sum_training == 20) 19 | 20 | actual_folds = [ 21 | [[0, 3, 4, 5, 8, 9, 17, 2, 7, 10, 11, 13, 15, 16, 18], [1, 6, 12, 14, 19]], 22 | [[1, 6, 12, 14, 19, 2, 7, 10, 11, 13, 15, 16, 18], [0, 3, 4, 5, 8, 9, 17]], 23 | [[1, 6, 12, 14, 19, 0, 3, 4, 5, 8, 9, 17, 15, 16, 18], [2, 7, 10, 11, 13]], 24 | [[1, 6, 12, 14, 19, 0, 3, 4, 5, 8, 9, 17, 2, 7, 10, 11, 13], [15, 16, 18]]] 25 | 26 | self.assertEqual(actual_folds, folds) 27 | 28 | 29 | if __name__ == "__main__": 30 | unittest.main() 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /test/test_evaluation.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | 6 | from mozsci import evaluation 7 | from mozsci.inputs import mean_std_weighted 8 | from six.moves import range 9 | 10 | 11 | class TestAUCFast(unittest.TestCase): 12 | def test_auc_wmw_fast(self): 13 | 14 | t = [-1, -1, -1, 1, 1, 1, -1, 1, -1, 1, 1, -1, 1] 15 | p = [0.01, 0.05, 0.2, 0.25, 0.1, 0.9, 0.6, 0.01, 0.90, 1.0, 0.33, 0.55, 0.555] 16 | 17 | auc_act = 0.54761904761904767 18 | auc = evaluation.auc_wmw_fast(t, p) 19 | 20 | 
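        # the expected value is the WMW statistic: the fraction of all
        # (positive, negative) pairs in which the positive example scores higher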
self.assertTrue(abs(auc_act - auc) < 1.0e-8) 21 | 22 | def test_auc_degenerate(self): 23 | y = np.array([0]) 24 | ypred = np.array([[ 1.0]]) 25 | weights = np.array([1]) 26 | auc = evaluation.auc_wmw_fast(y, ypred, weights=weights) 27 | self.assertTrue(auc == 0) 28 | 29 | 30 | class Testclassification_error(unittest.TestCase): 31 | def test_classification_error(self): 32 | y = np.array([0, 1, 1, 0]) 33 | ypred = np.array([0.1, 0.9, 0.4, 0.2]) 34 | 35 | self.assertTrue(abs(evaluation.classification_error(y, ypred) - 0.25) < 36 | 1e-12) 37 | self.assertTrue(abs(evaluation.classification_error(y, ypred, thres=0.3 38 | ) - 0.0) < 1e-12) 39 | 40 | weights = np.array([1.0, 0.8, 0.7, 0.6]) 41 | self.assertTrue(abs(evaluation.classification_error(y, ypred, weights=weights) - (1.0 - (1.0 + 0.8 + 0.6) / (weights.sum()))) < 1.0e-12) 42 | 43 | 44 | 45 | 46 | class Test_precision_recall_f1(unittest.TestCase): 47 | 48 | def setUp(self): 49 | self.yactual = np.array([0, 0, 0, 0, 1, 1, 1]) 50 | self.ypred = np.array([0, 1, 1, 1, 1, 0, 0]) 51 | self.weights = np.array([1, 2, 3, 4, 5, 6, 7]) 52 | 53 | self.yactual1 = self.yactual.reshape(7, 1) 54 | self.ypred1 = self.ypred.reshape(1, 7) 55 | self.weights1 = self.weights.reshape(1, 7) 56 | 57 | def test_precision_recall_f1(self): 58 | tp = 1.0 59 | fp = 3.0 60 | fn = 2.0 61 | 62 | actual_prec_rec_f1 = Test_precision_recall_f1.prec_rec_f1_from_tp_fp_fn(tp, fp, fn) 63 | for y in [self.yactual, self.yactual1]: 64 | for ypred in [self.ypred, self.ypred1]: 65 | prec_rec_f1 = evaluation.precision_recall_f1(y, ypred) 66 | for k in range(3): 67 | self.assertTrue(abs(actual_prec_rec_f1[k] - prec_rec_f1[k]) < 1e-12) 68 | 69 | def test_precision_recall_f1_weighted(self): 70 | tp = 5.0 71 | fp = 2.0 + 3 + 4 72 | fn = 6.0 + 7 73 | 74 | actual_prec_rec_f1 = Test_precision_recall_f1.prec_rec_f1_from_tp_fp_fn(tp, fp, fn) 75 | 76 | for y in [self.yactual, self.yactual1]: 77 | for ypred in [self.ypred, self.ypred1]: 78 | for weights in [self.weights, self.weights1]: 79 | prec_rec_f1 = evaluation.precision_recall_f1(y, ypred, weights=weights) 80 | for k in range(3): 81 | self.assertTrue(abs(actual_prec_rec_f1[k] - prec_rec_f1[k]) < 1e-12) 82 | 83 | 84 | def test_degenerate(self): 85 | # test case with degenerate input 86 | y = np.array([0]) 87 | ypred = np.array([[ 1.0]]) 88 | weights = np.array([1]) 89 | prf = evaluation.precision_recall_f1(y, ypred, weights=weights) 90 | 91 | # check that none are NaN 92 | self.assertFalse(np.array([np.isnan(ele) for ele in prf]).any()) 93 | 94 | # and they should all be 0 95 | self.assertTrue(np.allclose(prf, [0, 0, 0])) 96 | 97 | 98 | @staticmethod 99 | def prec_rec_f1_from_tp_fp_fn(tp, fp, fn): 100 | actual_prec_rec_f1 = np.zeros(3) 101 | actual_prec_rec_f1[0] = tp / (tp + fp) # precision 102 | actual_prec_rec_f1[1] = tp / (tp + fn) # recall 103 | actual_prec_rec_f1[2] = 2.0 * actual_prec_rec_f1[0] * actual_prec_rec_f1[1] / (actual_prec_rec_f1[0] + actual_prec_rec_f1[1]) # f1 104 | return actual_prec_rec_f1 105 | 106 | 107 | 108 | class Test_pearson_weighted(unittest.TestCase): 109 | def test_pearson_weighted(self): 110 | from scipy.stats import pearsonr 111 | 112 | x = np.array([1, 2, 3, 4, 5]) 113 | y = np.array([1.0, 1.5, -0.5, 3.4, 2.9]) 114 | weights = np.array([1, 0, 0.5, 2, 1.5]) 115 | 116 | r_no_wgt = pearsonr(x, y)[0] 117 | r_no_wgt_test = evaluation.pearsonr_weighted(x, y) 118 | r_ones_wgt = evaluation.pearsonr_weighted(x, y, np.ones(x.shape)) 119 | 120 | self.assertTrue(abs(r_no_wgt - r_no_wgt_test) < 1e-12) 121 | 
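        # a vector of uniform weights should reproduce the unweighted correlation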
self.assertTrue(abs(r_no_wgt - r_ones_wgt) < 1e-12) 122 | 123 | xm = mean_std_weighted(x, weights) 124 | ym = mean_std_weighted(y, weights) 125 | r_wgt = np.sum((x - xm['mean']) * (y - ym['mean']) * weights) / np.sum(weights) 126 | self.assertTrue((evaluation.pearsonr_weighted(x, y, weights) - r_wgt) < 1e-12) 127 | 128 | 129 | 130 | 131 | class Test_spearmanr_by(unittest.TestCase): 132 | 133 | def test_spearmanr_by(self): 134 | 135 | f = np.array([50, 52.19589972, 44.97281905, 50, 136 | 47.6719409 , 45.96619825, 50, 50, 137 | 48.18824048, 54.88529706, 42.67667074, 41.80373588, 138 | 37.29934119, 57.98812747, 45.04782628, 38.10858417, 139 | 46.44031713, 40.59823939, 26.29936944, 23.96820474, 140 | 47.98343799, 36.4455311 , 43.92931621, 55.19172514, 141 | 33.44633285, 37.38381116, 39.03392758, 41.43285553, 142 | 28.63082987, 31.86069758, 41.19551474, 29.04928565, 143 | 39.09690404, 36.75441683, 29.66390582, 70.4035713 , 144 | 63.53532854, 49.78916058, 64.39911984, 65.41353192, 145 | 48.42353021, 60.38572122, 42.44357922, 42.86378695, 146 | 58.93821467, 61.93862217, 36.23459784, 64.57533596, 147 | 40.09399141, 45.57233379, 44.7748158 , 50.88705955, 148 | 47.24016865, 51.75866967, 36.17935042, 46.73933887, 149 | 52.7136634 , 47.0337377 , 34.19077012, 18.5836512 , 150 | 41.63257011, 9.8698871 , 37.63277795, 47.71676464, 151 | 34.89667886, 35.10845963, 44.56638481, 36.70884056, 152 | 57.9185177 , 50.65260932, 58.53307806, 43.25154747, 153 | 40.59802125, 38.97005406, 35.19682907, 51.94755877, 154 | 44.04430199, 35.84048228, 36.25006727, 46.35317423, 155 | 37.44668618, 16.90596421, 38.87970562, 47.33515849, 156 | 27.41230181, 29.47142008]) 157 | 158 | position = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 12., 159 | 13., 15., 16., 17., 19., 23., 24., 25., 26., 27., 28., 160 | 29., 1., 2., 3., 6., 8., 9., 11., 12., 13., 17., 161 | 19., 21., 1., 2., 3., 4., 5., 6., 7., 8., 9., 162 | 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 163 | 22., 23., 24., 25., 26., 27., 1., 2., 4., 5., 6., 164 | 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 165 | 18., 20., 21., 22., 23., 24., 25., 26., 27.]) 166 | 167 | queryid = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 168 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 169 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 170 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 171 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 172 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 173 | 3, 3, 3, 3, 3, 3, 3, 3], np.int) 174 | 175 | fast_spearman = evaluation.spearmanr_by(f, position, queryid) 176 | self.assertTrue(abs(fast_spearman - -0.42666971560358913) < 1e-1) 177 | 178 | 179 | class TestClassificationPerfMeasure(unittest.TestCase): 180 | 181 | def test_basic_measure_1(self): 182 | """ 183 | Test classification_model_performance. All correct case. 184 | """ 185 | observed = np.array([0, 1, 1, 0, 0, 0, 1]) 186 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 187 | 188 | measure = evaluation.classification_model_performance(observed, calculated) 189 | 190 | self.assertEqual(measure, 0) 191 | 192 | def test_basic_measure_2(self): 193 | """ 194 | Test classification_model_performance. All correct case. 195 | """ 196 | observed = np.array([0, 1, 0, 1, 0, 0, 1]) 197 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 198 | 199 | measure = evaluation.classification_model_performance(observed, calculated) 200 | 201 | self.assertAlmostEqual(measure, 0.2857142857140) 202 | 203 | def test_basic_measure_3(self): 204 | """ 205 | Test classification_model_performance. weighted case. 
206 | """ 207 | observed = np.array([0, 1, 0, 1, 0, 0, 1]) 208 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 209 | 210 | measure = evaluation.classification_model_performance(observed, calculated, [1.0, 3.0]) 211 | 212 | def test_matrix_measure_1(self): 213 | """ 214 | Test classification_model_performance_matrix. All correct case. 215 | """ 216 | observed = np.array([0, 1, 1, 0, 0, 0, 1]) 217 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 218 | 219 | measure = evaluation.classification_model_performance_matrix(observed, calculated) 220 | expected_measure = np.array([[4, 0], [0, 3]]) 221 | 222 | np.testing.assert_array_almost_equal(measure, expected_measure) 223 | 224 | def test_matrix_measure_2(self): 225 | """ 226 | Test classification_model_performance_matrix. All correct case. 227 | """ 228 | observed = np.array([0, 1, 0, 1, 0, 0, 1]) 229 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 230 | 231 | measure = evaluation.classification_model_performance_matrix(observed, calculated) 232 | expected_measure = np.array([[3, 1], [1, 2]]) 233 | 234 | np.testing.assert_array_almost_equal(measure, expected_measure) 235 | 236 | def test_matrix_measure_3(self): 237 | """ 238 | Test classification_model_performance_matrix. multiple classes case. 239 | """ 240 | observed = np.array([1, 0, 1, 0, 1, 0, 2, 3]) 241 | calculated = np.array([1, 0, 1, 1, 0, 2, 3, 0]) 242 | 243 | measure = evaluation.classification_model_performance_matrix(observed, calculated) 244 | expected_measure = np.array([[1, 1, 1, 0], [1, 2, 0, 0], [0, 0, 0, 1], [1, 0, 0, 0]]) 245 | 246 | np.testing.assert_array_almost_equal(measure, expected_measure) 247 | 248 | def test_loss_measure_1(self): 249 | """ 250 | Test classification_model_performance_loss. default loss (0-1 loss). 251 | """ 252 | observed = np.array([0, 1, 1, 0, 1, 0, 1]) 253 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 254 | 255 | measure = evaluation.classification_model_performance_loss(observed, calculated) 256 | 257 | self.assertEqual(measure, 1) 258 | 259 | def test_loss_measure_2(self): 260 | """ 261 | Test classification_model_performance_loss. user defined loss measure - squared loss. 262 | """ 263 | observed = np.array([0, 1, 0, 1, 0, 2, 1]) 264 | calculated = np.array([0, 1, 1, 0, 2, 0, 1]) 265 | 266 | loss = lambda i, j: (i-j)*(i-j) 267 | 268 | measure = evaluation.classification_model_performance_loss(observed, calculated, loss) 269 | 270 | self.assertEqual(measure, 10) 271 | 272 | 273 | if __name__ == "__main__": 274 | unittest.main() 275 | 276 | 277 | 278 | -------------------------------------------------------------------------------- /test/test_glm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import unittest 3 | import numpy as np 4 | 5 | from mozsci.glm import prob_distributions 6 | from mozsci.glm import regularization 7 | from mozsci.glm import simplified_glm 8 | 9 | class TestGlm(unittest.TestCase): 10 | 11 | def test_negative_binomial_dist_likelihood(self): 12 | """ 13 | Test the calculation of the log likelihood of the negative binomial distribution. 
14 | :return: 15 | """ 16 | features = np.array([ 17 | [1,56.98883,42.45086, 1.0], 18 | [1,37.09416,46.82059, 1.0], 19 | [0,32.27546,43.56657, 1.0], 20 | [0,29.05672,43.56657, 1.0], 21 | [0,6.748048,27.24847, 1.0], 22 | [0,61.65428,48.41482, 1.0] 23 | ]) 24 | 25 | Y = np.array([4, 4, 2, 3, 3, 13 ]) 26 | beta_k = np.array([10.0, 0, 0, 0, 0]) 27 | 28 | dist = prob_distributions.NegativeBinomialWithKstar() 29 | 30 | calculated = dist.eval(beta_k, features, Y) 31 | 32 | self.assertAlmostEqual(calculated, -5.9967772892) 33 | 34 | 35 | def test_negative_binomial_dist_gradient(self): 36 | """ 37 | Test the gradient of the log likelihood of negative binomial distribution. 38 | """ 39 | # input data. 40 | features = np.array([ 41 | [1,56.98883,42.45086, 1.0], 42 | [1,37.09416,46.82059, 1.0], 43 | [0,32.27546,43.56657, 1.0], 44 | [0,29.05672,43.56657, 1.0], 45 | [0,6.748048,27.24847, 1.0], 46 | [0,61.65428,48.41482, 1.0] 47 | ]) 48 | Y = np.array([4, 4, 2, 3, 3, 13 ]) 49 | beta_k = np.array([10.0, 0, 0, 0, 0]) 50 | 51 | # expected output 52 | expected = np.array([-3.22202699e-03, 5.99972761e+00, 1.12593421e+03, 1.03394190e+03, 2.29989558e+01]) 53 | 54 | # calculation. 55 | dist = prob_distributions.NegativeBinomialWithKstar() 56 | 57 | calculated = dist.eval_gradient(beta_k, features, Y) 58 | 59 | np.testing.assert_almost_equal(calculated, expected, decimal=5) 60 | 61 | def test_poisson_regression(self): 62 | """ 63 | This method is used to test the poisson regression works as it should. 64 | The data is from: http://www.oxfordjournals.org/our_journals/tropej/online/ma_chap13.pdf 65 | :return: 66 | """ 67 | features = np.array( [ 68 | [236,0], [739,1], [970,1], [2371,1], [309,1], [679,1], [26,0], [1272,1], [3246,1], [1904,1], 69 | [357,1], [1080,1], [1027,1], [28,0], [2507,1], [138,0], [502,1], [1501,1], [2750,1], [192,1], ] ) 70 | 71 | Y = np.array([ 8, 16, 15, 23, 5, 13, 4, 19, 33, 19, 10, 16, 22, 2, 22, 2, 18, 21, 24, 9]) 72 | 73 | regular = regularization.NullRegularization() 74 | 75 | # or we can use regular = regularization.NullRegularization() 76 | reg = simplified_glm.PoissonRegression(lam=0, maxiter=500) 77 | reg.fit(features, Y) 78 | 79 | # The correct result should be [0.00033, 1.045, 1.351], The last one is the constant. 80 | # bfgs gives [ 3.26073590e-04 1.04513753e+00 1.35099878e+00] 81 | expected = np.array([0.00033, 1.045, 1.351]) 82 | np.testing.assert_almost_equal(reg.params, expected, decimal=2) 83 | 84 | def test_negative_binomial(self): 85 | """ 86 | This method is used to test the negative binomial 'regression' works as it should. 87 | Data is from : http://www.ats.ucla.edu/stat/sas/dae/negbinreg.htm 88 | What they got: loglikelihood: Log Likelihood 2149.3649 89 | Parameter DF Estimate Error Limits Chi-Square Pr > ChiSq 90 | Intercept 1 2.7161 0.2326 2.2602 3.1719 136.38 <.0001 91 | male 1 -0.4312 0.1397 -0.7049 -0.1574 9.53 0.0020 92 | math 1 -0.0016 0.0048 -0.0111 0.0079 0.11 0.7413 93 | langarts 1 -0.0143 0.0056 -0.0253 -0.0034 6.61 0.0102 94 | Dispersion 1 1.2884 0.1231 1.0471 1.5296 95 | 96 | NOTE: The negative binomial dispersion parameter was estimated by maximum likelihood. 97 | 98 | What we got: (Under the same condition - no regularization. No max iteration limit.) 99 | the likelihood term value and the regularization term value are -2149.36485714 0.0 100 | Optimization terminated successfully. 
101 | Current function value: -2149.364857 102 | Iterations: 27 103 | Function evaluations: 184 104 | Gradient evaluations: 161 105 | The linear coefficients are: 106 | [ (This is k*) -2.53387660e-01 -4.31184391e-01 -1.60095828e-03 -1.43475268e-02 107 | (This is the intercept) 2.71606920e+00] 108 | """ 109 | mydata = np.genfromtxt('test/data/poissonreg.csv', delimiter=',', skip_header=1) 110 | features = mydata[:, 2:5] 111 | 112 | Y = mydata[:, 6] 113 | 114 | reg = simplified_glm.NegativeBinomialWithKstarRegression(3 + 2, lam=0) 115 | reg.fit(features, Y) 116 | 117 | ## data from ucla. 118 | expected = np.array([-0.4312, -0.0016, -0.0143, 2.7161]) 119 | 120 | np.testing.assert_almost_equal(reg.params[1:], expected, decimal=2) 121 | 122 | if __name__ == "__main__": 123 | unittest.main() 124 | -------------------------------------------------------------------------------- /test/test_histogram.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import unittest 5 | import numpy as np 6 | import time 7 | 8 | from mozsci import histogram 9 | 10 | 11 | class TestHistogram1D(unittest.TestCase): 12 | def test_histogram1d(self): 13 | 14 | h = histogram.Histogram1DFast(10, 0, 10) 15 | self.assertTrue((np.abs(h.bin_edges - np.arange(11)) < 1.0e-12).all()) 16 | 17 | x = np.array([-1.0, 0.5, 3.2, 0.77, 9.99, 10.1, 8.2]) 18 | h.update(x) 19 | 20 | xc = np.array([1.5, 2.5, 8.3]) 21 | cc = np.array([10, 5, 22]) 22 | h.update_counts(xc, cc) 23 | self.assertTrue((h.bin_count == np.array([3, 10, 5, 1, 0, 0, 0, 0, 23, 2])).all()) 24 | 25 | # check compute_indices 26 | self.assertTrue((h.compute_indices(np.arange(12) - 0.5) == np.array([0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9])).all()) 27 | 28 | 29 | # benchmark 30 | x = np.random.randn(1e7) 31 | time1 = time.time() 32 | h = histogram.Histogram1DFast(100, -5, 5) 33 | h.update(x) 34 | time2 = time.time() 35 | out = np.histogram(x, bins=100, range=[-5, 5]) 36 | time3 = time.time() 37 | 38 | print("Time for fast = " + str(time2 - time1) + " s") 39 | print("Time for numpy = " + str(time3- time2) + " s") 40 | 41 | 42 | # check sampler 43 | t1 = time.time() 44 | samples = h.sample(5e6) 45 | t2 = time.time() 46 | 47 | (counts, edges) = np.histogram(samples, 50, normed=True) 48 | centers = 0.5 * (edges[1:] + edges[0:-1]) 49 | actual_pdf = 1.0 / np.sqrt(2.0 * 3.14159) * np.exp(-centers ** 2 / 2.0) 50 | self.assertTrue(np.allclose(counts, actual_pdf, atol=5e-3)) 51 | 52 | def test_stratified_sample(self): 53 | hist = histogram.Histogram1DFast(5, 0, 5) 54 | hist.update_counts(np.array([0.5, 1.5, 2.5, 3.5, 4.5]), 55 | np.array([5e6, 1e6, 1e4, 1e3, 2])) 56 | 57 | hist.compute_pdf_cdf() 58 | 59 | # generate a sample 60 | x = hist.sample(int(hist.bin_count.sum())) 61 | 62 | # now do a stratified sample of the large sample 63 | sample_size = [5000, 3000, 1000, 250, 2] 64 | x_stratified_sample = hist.stratified_sample(x, sample_size) 65 | hist_check = histogram.Histogram1DFast(5, 0, 5) 66 | hist_check.update(x_stratified_sample) 67 | 68 | # check that the actual sample distribution matches the expected 69 | # one. 
We expect a small relative difference in all entries 70 | # except the last (where we expect a small absolute difference) 71 | self.assertTrue(np.allclose(1.0, 72 | hist_check.bin_count[:-1].astype(np.float) / sample_size[:-1], 73 | atol=0.10, rtol=0.0)) 74 | self.assertTrue(abs(hist_check.bin_count[-1] - sample_size[-1]) < 3) 75 | 76 | 77 | if __name__ == "__main__": 78 | unittest.main() 79 | 80 | 81 | -------------------------------------------------------------------------------- /test/test_inputs.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | from mozsci import inputs 6 | from six.moves import range 7 | 8 | class Test_mean_std_weightd(unittest.TestCase): 9 | def test_mean_std(self): 10 | 11 | # test 1D case 12 | x = np.array([1, 2, 3, 4, 5]) 13 | weights = np.array([0.2, 0.1, 2,0.5, 1]) 14 | 15 | ret = inputs.mean_std_weighted(x) 16 | self.assertTrue(abs(ret['mean'] - 3.0) < 1e-8) 17 | self.assertTrue(abs(ret['std'] - np.sqrt(2 * (4 + 1) / 5)) < 1e-8) 18 | 19 | ret = inputs.mean_std_weighted(x, np.ones(x.shape)) 20 | self.assertTrue(abs(ret['mean'] - 3.0) < 1e-8) 21 | self.assertTrue(abs(ret['std'] - np.sqrt(2 * (4 + 1) / 5)) < 1e-8) 22 | 23 | ret = inputs.mean_std_weighted(x, weights) 24 | m = np.sum(weights * x) / np.sum(weights) 25 | s = np.sqrt(np.sum((x - m)**2 * weights) / np.sum(weights)) 26 | self.assertTrue(abs(ret['mean'] - m) < 1e-8) 27 | self.assertTrue(abs(ret['std'] - s) < 1e-8) 28 | 29 | # 2D case 30 | x = np.array([[1, 2], 31 | [-0.5, 0.0], 32 | [3, -0.55]]) 33 | weights = np.array([0.5, 2, 1.55]) 34 | 35 | ret = inputs.mean_std_weighted(x, weights) 36 | 37 | sum_weights = np.sum(weights) 38 | m1 = (1.0 * 0.5 + -0.5 * 2 + 3 * 1.55) / sum_weights 39 | m2 = (2.0 * 0.5 + 0.0 * 2 + -0.55 * 1.55) / sum_weights 40 | self.assertTrue(np.allclose(ret['mean'], [m1, m2])) 41 | 42 | s1 = np.sqrt(((1.0 - m1) ** 2 * 0.5 + (-0.5 - m1)**2 * 2.0 + (3 - m1)**2 * 1.55) / sum_weights) 43 | s2 = np.sqrt(((2 - m2) ** 2 * 0.5 + (0.0 - m2)**2 * 2.0 + (-0.55 - m2)**2 * 1.55) / sum_weights) 44 | self.assertTrue(np.allclose(ret['std'], [s1, s2])) 45 | 46 | 47 | class TestLogScaledTransformer(unittest.TestCase): 48 | def test_log_transformer(self): 49 | mean = np.array([0.5, 1.0]) 50 | std = np.array([0.3, 0.8]) 51 | offset = 2.0 52 | nsamples = int(1e6) 53 | samples = np.zeros((nsamples, 2)) 54 | for k in range(2): 55 | samples[:, k] = np.random.normal(mean[k], std[k], nsamples) 56 | exp_samples = np.exp(samples) - offset 57 | 58 | transformer = inputs.LogScaledTransformer(offset=offset) 59 | 60 | # check fit 61 | transformer.fit(exp_samples) 62 | self.assertTrue(np.allclose(transformer.mean_, mean, atol=1e-2)) 63 | self.assertTrue(np.allclose(transformer.std_, std, atol=1e-2)) 64 | 65 | # check transform 66 | X = exp_samples[:10] 67 | transformed = transformer.transform(X) 68 | expected = 1.0 / transformer.std_ * ( 69 | np.log(X + offset) - transformer.mean_) 70 | self.assertTrue(np.allclose(transformed, expected)) 71 | 72 | # inverse transform 73 | self.assertTrue(np.allclose(X, 74 | transformer.inverse_transform(transformer.transform(X)))) 75 | 76 | class TestBucketTransformer(unittest.TestCase): 77 | def test_bucket_transformer(self): 78 | transformer = inputs.BucketTransformer([0, 1, 2.4]) 79 | X = np.array([0.5, 1.2, -1, 3.9, 1.9, 2.1]) 80 | Y = transformer.transform(X) 81 | expectedY = np.array( 82 | [[ 0., 1., 0., 0.], 83 | [ 0., 0., 1., 0.], 84 | [ 1., 0., 
0., 0.], 85 | [ 0., 0., 0., 1.], 86 | [ 0., 0., 1., 0.], 87 | [ 0., 0., 1., 0.]] 88 | ) 89 | self.assertTrue(np.allclose(Y, expectedY)) 90 | 91 | 92 | if __name__ == "__main__": 93 | unittest.main() 94 | 95 | -------------------------------------------------------------------------------- /test/test_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | from mozsci.models import LinearRegression 5 | from mozsci.evaluation import pearsonr_weighted 6 | import numpy as np 7 | 8 | class TestLogisticRegression(unittest.TestCase): 9 | def test_linear_regression(self): 10 | np.random.seed(55) 11 | X = np.random.rand(1000, 3) 12 | w = [0.5, 1.3, -2.5] 13 | b = 12.5 14 | y = X[:, 0] * w[0] + X[:, 1] * w[1] + X[:, 2] * w[2] + b 15 | 16 | # should convert to the exact solution with only a little regularization 17 | lr = LinearRegression(lam=0.001) 18 | lr.fit(X, y) 19 | ypred = lr.predict(X) 20 | self.assertTrue(pearsonr_weighted(y, ypred) > 0.99) 21 | 22 | # try weighted 23 | weights = np.random.rand(1000) 24 | lr = LinearRegression(lam=0.001) 25 | lr.fit(X, y, weights=weights) 26 | ypred = lr.predict(X) 27 | self.assertTrue(pearsonr_weighted(y, ypred, weights)) 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest.main() 32 | 33 | 34 | -------------------------------------------------------------------------------- /test/test_logistic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | from mozsci.models import LogisticRegression 5 | import numpy as np 6 | 7 | class TestLogisticRegression(unittest.TestCase): 8 | 9 | def setUp(self): 10 | self.x = np.array([[1, -2], [-0.5, -2]]) 11 | self.t = np.array([0, 1]) 12 | self.w = np.array([3, -1]) 13 | self.b = 1 14 | self.lam = 7 15 | 16 | def test_sigmoid(self): 17 | y = LogisticRegression._sigmoid(self.x, self.b, self.w) 18 | yact = np.array([1.0 / (1.0 + np.exp(-6)), 1.0 / (1.0 + np.exp(-1.5))]) 19 | 20 | self.assertTrue(np.all(np.abs(y - yact) < 1.0e-12)) 21 | 22 | def test_error_gradient(self): 23 | x0 = np.array([self.x[0]]) 24 | x1 = np.array([self.x[1]]) 25 | error, gradient = LogisticRegression._loss_gradient(x0, x1, self.b, self.w, self.lam) 26 | 27 | # this assumes test_sigmoid pases 28 | err_act = -np.log(LogisticRegression._sigmoid(x1, self.b, self.w)) - np.log(1.0 - LogisticRegression._sigmoid(x0, self.b, self.w)) + 0.5 * 7 * 10 29 | 30 | pred_error = LogisticRegression._sigmoid(self.x, self.b, self.w) - self.t 31 | gradient_act = np.array([0.0, 7 * 3, 7 * -1]) 32 | gradient_act[0] = np.sum(pred_error) 33 | gradient_act[1] += np.sum(pred_error * self.x[:, 0]) 34 | gradient_act[2] += np.sum(pred_error * self.x[:, 1]) 35 | 36 | self.assertTrue( abs(float(err_act) - error) < 1.0e-12 ) 37 | self.assertTrue(np.all(np.abs(gradient - gradient_act) < 1.0e-12)) 38 | 39 | # weighted case 40 | x00 = np.array([self.x[0], [55, -2]]) 41 | error_weighted, gradient_weighted = LogisticRegression._loss_gradient(x00, x1, self.b, self.w, self.lam, [np.array([0.4, 0.75]), np.array(0.35)]) 42 | err_weighted_act = -np.log(LogisticRegression._sigmoid(x1, self.b, self.w)) * 0.35 - np.log(1.0 - LogisticRegression._sigmoid(x0, self.b, self.w)) * 0.4 - np.log(1.0 - LogisticRegression._sigmoid([x00[1, :]], self.b, self.w)) * 0.75 + 0.5 * 7 * 10 43 | self.assertTrue( abs(float(err_weighted_act) - error_weighted) < 1.0e-12 ) 44 | 45 | def 
test_fit(self): 46 | from mozsci.evaluation import classification_error 47 | np.random.seed(5) 48 | N = int(1e5) 49 | x = np.random.rand(N, 2) 50 | y = (3 * x[:, 0] - 2 * x[:, 1] - 1.5 > 0.0).astype(np.int) 51 | lr = LogisticRegression() 52 | lr.fit(x, y, factr=1e4) 53 | ypred = lr.predict(x) 54 | self.assertTrue(classification_error(y, ypred) < 0.002) 55 | 56 | 57 | 58 | if __name__ == "__main__": 59 | unittest.main() 60 | 61 | 62 | -------------------------------------------------------------------------------- /test/test_map_train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | 6 | from mozsci.map_train import TrainModelCV, run_train_models 7 | from mozsci.evaluation import classification_error, auc_wmw_fast 8 | from mozsci.cross_validate import cv_kfold 9 | from mozsci.models import LogisticRegression 10 | 11 | 12 | class DataTest(unittest.TestCase): 13 | def setUp(self): 14 | np.random.seed(5) 15 | self.X = np.linspace(0, 1, 100).reshape(100, 1) 16 | self.y = (5 * self.X.reshape(100, ) - 2 + np.random.rand(100) > 0).astype(np.int) 17 | 18 | self.folds = cv_kfold(100, 4, seed=2) 19 | 20 | class TestTrainModelCV(DataTest): 21 | @staticmethod 22 | def agg_err(yactual, ypred): 23 | ret = {} 24 | ret['accuracy'] = classification_error(yactual, ypred) 25 | ret['auc'] = auc_wmw_fast(yactual, ypred) 26 | return ret 27 | 28 | 29 | def test_map_train_model(self): 30 | trainer = TrainModelCV([LogisticRegression, classification_error, '/tmp/logistic.json', (), {'lam':0.5}], X=self.X, y=self.y) 31 | errors = trainer.run() 32 | 33 | # load model 34 | trained_model = LogisticRegression.load_model('/tmp/logistic.json') 35 | loaded_model_error = classification_error(self.y, trained_model.predict(self.X)) 36 | 37 | # check the errors 38 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['train'] - 0.06) < 1e-12) 39 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['train'] - loaded_model_error) < 1e-12) 40 | 41 | def test_aggregate_error(self): 42 | # test an aggregate error function (that returns more than one value) 43 | trainer = TrainModelCV([LogisticRegression, TestTrainModelCV.agg_err, None, (), {'lam':0.5}], 44 | X=self.X, y=self.y, Xtest=self.X[:50, :], ytest=self.y[:50]) 45 | errors = trainer.run() 46 | 47 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['train']['accuracy'] - 0.06) < 1e-8) 48 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['train']['auc'] - 0.99310661764705888) < 1e-8) 49 | 50 | 51 | def test_kfold_cv(self): 52 | trainer = TrainModelCV([LogisticRegression, classification_error, None, (), {'lam':0.5}], 53 | X=self.X, y=self.y, folds=self.folds) 54 | errors = trainer.run() 55 | 56 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['train'] - 0.063340259665816398) < 1e-12) 57 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['test'] - 0.049633305762338022)< 1e-12) 58 | 59 | 60 | class Test_run_train_models(DataTest): 61 | def test_run_train_models(self): 62 | import re 63 | 64 | model_library = [[LogisticRegression, classification_error, None, (), {'lam':0.5}], 65 | [LogisticRegression, classification_error, None, (), {'lam':50}]] 66 | 67 | errors = run_train_models(2, model_library, X=self.X, y=self.y) 68 | for k in errors.keys(): 69 | if re.search("{'lam': 0.5}", k): 70 | err_check = errors[k] 71 | 72 | self.assertTrue(abs(err_check['train'] - 0.06) < 1e-8) 73 | 74 | 75 | if __name__ == "__main__": 76 | 
unittest.main() 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /test/test_variables.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | 6 | from mozsci.evaluation import classification_error 7 | from mozsci.inputs import IdentityTransformer, LogScaledTransformer 8 | from mozsci import variables 9 | from sklearn.linear_model import LogisticRegression 10 | 11 | 12 | class TestModelDriver(unittest.TestCase): 13 | def test_model_driver(self): 14 | independents = [ 15 | variables.Variable('x0', IdentityTransformer()), 16 | variables.Variable('x1', LogScaledTransformer()) 17 | ] 18 | dependents = [variables.Variable('y', IdentityTransformer())] 19 | model_variables = variables.ModelVariables(independents, dependents) 20 | 21 | # make some test data 22 | N = int(1e5) 23 | data = np.zeros( 24 | N, dtype=[('x0', np.float), ('x1', np.float), ('y', np.int)]) 25 | np.random.seed(5) 26 | data['x0'] = np.random.rand(N) 27 | data['x1'] = np.random.normal(0.5, 2.0, N) 28 | data['y'] = 3 * data['x0'] - 2 * data['x1'] - 1.5 > 0.0 29 | 30 | # rescale x1 31 | data['x1'] = np.exp(data['x1']) 32 | 33 | # create driver and fit 34 | model = variables.ModelDriver(model_variables, LogisticRegression(C=1e5)) 35 | 36 | # first try to fit with regular numpy arrays 37 | X = data.view(dtype=np.float).reshape(-1, 3)[:, :2] 38 | y = data.view(dtype=np.int).reshape(-1, 3)[:, 2].reshape(-1, 1) 39 | model.fit(X, y) 40 | ypred = model.predict(X) 41 | self.assertTrue(classification_error(y, ypred) < 0.002) 42 | 43 | # now try using __getitem__ 44 | model.fit(data, data) 45 | ypred = model.predict(data) 46 | self.assertTrue(classification_error(data['y'], ypred) < 0.002) 47 | 48 | # serialization 49 | model_string = model.dumps() 50 | model_loaded = variables.ModelDriver.loads(model_string) 51 | self.assertTrue(np.allclose( 52 | model.predict(data, predict_prob=True), 53 | model_loaded.predict(data, predict_prob=True))) 54 | 55 | 56 | if __name__ == "__main__": 57 | unittest.main() 58 | --------------------------------------------------------------------------------
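
Taken together, the tests above show how the library's pieces are meant to compose: `cv_kfold` builds the folds, `TrainModelCV` / `run_train_models` drive training, and the functions in `mozsci.evaluation` score the results. The snippet below is a minimal usage sketch assembled from the call patterns exercised in `test_map_train.py` and `test_cross_validate.py` — the toy data and the `lam` values are lifted from those tests, and nothing here adds to the library's API.

```python
# Illustrative sketch, not part of the repository source.
# Assumes mozsci has been built/installed (python setup.py build_ext --inplace).
import numpy as np

from mozsci.cross_validate import cv_kfold
from mozsci.evaluation import classification_error
from mozsci.map_train import TrainModelCV, run_train_models
from mozsci.models import LogisticRegression

# Toy 1-D classification problem (same construction as the test fixture).
np.random.seed(5)
X = np.linspace(0, 1, 100).reshape(100, 1)
y = (5 * X.reshape(100,) - 2 + np.random.rand(100) > 0).astype(int)

# k-fold split: a list of [train_indices, test_indices] pairs.
folds = cv_kfold(100, 4, seed=2)

# Train one model specification with cross validation. Each specification is
# [model class, error function, output file or None, positional args, keyword args].
trainer = TrainModelCV(
    [LogisticRegression, classification_error, None, (), {'lam': 0.5}],
    X=X, y=y, folds=folds)
errors = trainer.run()  # maps a model description to its 'train'/'test' errors

# Or sweep a small model library across 2 worker processes.
model_library = [
    [LogisticRegression, classification_error, None, (), {'lam': 0.5}],
    [LogisticRegression, classification_error, None, (), {'lam': 50}],
]
all_errors = run_train_models(2, model_library, X=X, y=y)
```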