├── .gitignore ├── .travis.yml ├── Makefile ├── README.md ├── mozsci ├── __init__.py ├── _c_utils.pyx ├── cross_validate.py ├── cspearmanr_by_fast.cc ├── ems.py ├── evaluation.py ├── glm │ ├── __init__.py │ ├── prob_distributions.py │ ├── regularization.py │ └── simplified_glm.py ├── histogram.py ├── inputs.py ├── map_train.py ├── models │ ├── __init__.py │ ├── linear_regression.py │ └── logistic_regression.py ├── numpy_util.py ├── pca.py ├── spearmanr_by_fast.pyx └── variables.py ├── requirements.txt ├── setup.py └── test ├── data └── poissonreg.csv ├── test_PCA.py ├── test_cross_validate.py ├── test_evaluation.py ├── test_glm.py ├── test_histogram.py ├── test_inputs.py ├── test_linear_regression.py ├── test_logistic_regression.py ├── test_map_train.py └── test_variables.py /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | mozsci.egg-info/ 4 | 5 | mozsci/spearmanr_by_fast.cpp 6 | mozsci/_c_utils.cpp 7 | 8 | 9 | *.pyc 10 | 11 | .coverage 12 | mozsci/*.so 13 | 14 | # vim files 15 | *.swp 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | - 3.3 5 | - 3.4 6 | script: make test 7 | cache: 8 | - apt 9 | - pip 10 | install: 11 | - sudo apt-get -y install libatlas-base-dev libatlas-dev lib{blas,lapack}-dev 12 | # some conda magic to install numpy, scipy 13 | # see http://sburns.org/2014/03/28/faster-travis-builds.html 14 | - sudo pip install conda 15 | - conda_deps='pip numpy scipy matplotlib cython scikit-learn' 16 | - conda create -p $HOME/py --yes $conda_deps "python=$TRAVIS_PYTHON_VERSION" 17 | - export PATH=$HOME/py/bin:$PATH 18 | - pip install -r requirements.txt 19 | - python setup.py build_ext --inplace 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | clean: 2 | # Remove the build 3 | rm -rf build dist 4 | # And all of our pyc files 5 | rm -f mozsci/*.pyc test/*.pyc 6 | # All compiled files 7 | rm -f mozsci/*.so mozsci/spearmanr_by_fast.cpp mozsci/_c_utils.cpp 8 | # And lastly, .coverage files 9 | rm -f .coverage 10 | 11 | test: nose 12 | 13 | nose: 14 | rm -rf .coverage 15 | nosetests --exe --cover-package=mozsci --with-coverage --cover-branches -v --cover-erase 16 | 17 | unittest: 18 | python -m unittest discover -s test 19 | 20 | # build inplace for unit tests to pass (since they are run from this 21 | # top level directory we need the .so files to be in the src tree 22 | # when they run. 23 | build: clean 24 | python setup.py build_ext --inplace 25 | 26 | install: build 27 | python setup.py install 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mozsci 2 | ====== 3 | 4 | [![Build Status](https://api.travis-ci.org/seomoz/mozsci.png)](https://api.travis-ci.org/seomoz/mozsci.png) 5 | 6 | A grab bag of assorted Data science tools from Moz. 
7 | 8 | Currently includes: 9 | 10 | * Utilities for training/evaluating machine learning models: 11 | * Cross validation 12 | * Evaluation metrics (AUC, F1, etc) 13 | * Training models in parallel 14 | * Ensemble model selection 15 | * PCA 16 | * A generic way to specify model inputs 17 | * Some linear models: 18 | * Linear Regression 19 | * Logistic Regression 20 | * GLM 21 | 22 | ## Installing 23 | 24 | ``` 25 | pip install mozsci 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /mozsci/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | #from .evaluation import pearsonr_weighted, auc_wmw_fast, auc_wmw_error, classification_error, precision_recall_f1 3 | #from .cross_validate import 4 | 5 | 6 | -------------------------------------------------------------------------------- /mozsci/_c_utils.pyx: -------------------------------------------------------------------------------- 1 | 2 | cimport cython 3 | cimport numpy as np 4 | np.import_array() 5 | 6 | import numpy as np 7 | 8 | @cython.boundscheck(False) 9 | @cython.cdivision(True) 10 | def histogram1d_update( 11 | np.ndarray[np.float64_t, ndim=1] data, 12 | np.ndarray[np.int64_t, ndim=1] bin_count, 13 | double bin_width, 14 | int bins1, 15 | float mn): 16 | cdef int ndata = len(data) 17 | cdef int i 18 | cdef int bin_index 19 | 20 | for i in range(ndata): 21 | bin_index = int((data[i] - mn) / bin_width) 22 | bin_index = min(max(bin_index, 0), bins1) 23 | bin_count[bin_index] += 1 24 | 25 | 26 | @cython.boundscheck(False) 27 | @cython.cdivision(True) 28 | def histogram1d_update_counts( 29 | np.ndarray[np.float64_t, ndim=1] data, 30 | np.ndarray[np.int64_t, ndim=1] bin_count, 31 | double bin_width, 32 | int bins1, 33 | float mn, 34 | np.ndarray[np.float64_t, ndim=1] counts): 35 | cdef int ndata = len(data) 36 | cdef int i 37 | cdef int bin_index 38 | 39 | for i in range(ndata): 40 | bin_index = int((data[i] - mn) / bin_width) 41 | bin_index = min(max(bin_index, 0), bins1) 42 | bin_count[bin_index] += counts[i] 43 | 44 | 45 | @cython.boundscheck(False) 46 | @cython.cdivision(True) 47 | def histogram1d_compute_indices( 48 | np.ndarray[np.float64_t, ndim=1] data, 49 | double bin_width, 50 | int bins1, 51 | float mn, 52 | np.ndarray[np.int64_t, ndim=1] bin_index): 53 | cdef int ndata = len(data) 54 | cdef int i 55 | cdef int this_index 56 | 57 | for i in range(ndata): 58 | this_index = int((data[i] - mn) / bin_width) 59 | bin_index[i] = min(max(this_index, 0), bins1) 60 | 61 | 62 | @cython.boundscheck(False) 63 | @cython.cdivision(True) 64 | def c_auc_wmw( 65 | np.ndarray[np.int64_t, ndim=1] idxp, 66 | np.ndarray[np.int64_t, ndim=1] idxn, 67 | np.ndarray[np.float64_t, ndim=1] parr, 68 | np.ndarray[np.float64_t, ndim=1] warr): 69 | 70 | cdef int i, j 71 | cdef double auc = 0.0 72 | cdef double sum_weights = 0.0 73 | cdef int nidxp = len(idxp) 74 | cdef int nidxn = len(idxn) 75 | cdef double this_weight 76 | for i in range(nidxp): 77 | for j in range(nidxn): 78 | this_weight = warr[idxp[i]] + warr[idxn[j]] 79 | sum_weights += this_weight 80 | if parr[idxp[i]] - parr[idxn[j]] > 0.0: 81 | auc += this_weight 82 | 83 | return auc / sum_weights 84 | 85 | -------------------------------------------------------------------------------- /mozsci/cross_validate.py: -------------------------------------------------------------------------------- 1 | """Things to do cross validation""" 2 | from __future__ import absolute_import 3 | 4 | import numpy as np 5 
| from .map_train import TrainModelCV 6 | import six 7 | from six.moves import range 8 | 9 | def cv_kfold(ntrain, nk, seed=None): 10 | """k-fold cross validation 11 | 12 | ntrain = the integer number of training data points to sample 13 | nk = the number of splits of the training data 14 | optionally sets seed 15 | 16 | returns a list length nk. Each element is a tuple: 17 | (train_indices, test_indices) 18 | 19 | NOTE: this is an approximate sampler, so the test set size 20 | isn't guaranteed to be 1 / nk, especially for small values of 21 | ntrain. 22 | """ 23 | # need k probability splits 0-1 24 | 25 | # optionally set seed 26 | if seed is not None: 27 | np.random.seed(seed) 28 | 29 | # need k probability splits 0-1 30 | # the end points to sample 31 | fold_edges = np.linspace(0, 1, nk + 1) 32 | 33 | r = np.random.rand(ntrain) 34 | indices = np.arange(ntrain) 35 | folds = [] 36 | for k in range(nk): 37 | folds.append(indices[np.logical_and(fold_edges[k] <= r, r < fold_edges[k + 1])]) 38 | 39 | # make training + test arrays 40 | training_test = [] 41 | for k in range(nk): 42 | training = [] 43 | test = [] 44 | for i in range(nk): 45 | if i != k: 46 | training.extend(folds[i]) 47 | else: 48 | test.extend(folds[i]) 49 | training_test.append([training, test]) 50 | 51 | return training_test 52 | 53 | 54 | def plot_cv_errors(errors, model, regparm, fignum): 55 | """Plots test vs training error for cross validation, as return from run_train_models 56 | 57 | errors = as returned from run_train_models 58 | model = a string with model name, e.g. "LogisticRegression" 59 | regparm = the name of regularization parameter, e.g. "lam" 60 | """ 61 | import pylab as plt 62 | import re 63 | 64 | # accumulate the erorrs + the regularization parameters 65 | # errors_plot = [train, test] list 66 | errors_plot = [] 67 | reg = [] 68 | 69 | for desc, err in six.iteritems(errors): 70 | if re.search(model, desc): 71 | # it corresponds to this model 72 | # get the regularization parameter 73 | c = float(re.search("'%s':\s+([\.0-9-e]+)(}|,)" % regparm, desc).group(1)) 74 | reg.append(c) 75 | errors_plot.append([err['train'], err['test']]) 76 | 77 | errors_plot = np.asarray(errors_plot) 78 | reg = np.asarray(reg) 79 | plot_order = reg.argsort() 80 | 81 | fig = plt.figure(fignum) 82 | fig.clf() 83 | plt.plot(np.log(reg[plot_order]), errors_plot[plot_order, 0], label='train') 84 | plt.plot(np.log(reg[plot_order]), errors_plot[plot_order, 1], label='test') 85 | plt.legend() 86 | plt.grid(True) 87 | fig.show() 88 | 89 | 90 | def learning_curves(model_description, X, y, kfolds=5, fignum=1): 91 | """Plot learning curves 92 | 93 | uses k-fold cross validation for 25, 50, 75, 100% of data 94 | 95 | model_description as input to TrainModelCV the model. This defines 96 | the model to check. 
97 | kfolds = use this many folds 98 | fignum = plot in this figure number 99 | """ 100 | # use TrainModelCV to do so 101 | import pylab as plt 102 | 103 | indices = np.arange(X.shape[0]) 104 | np.random.shuffle(indices) 105 | 106 | pct_data = np.array([0.25, 0.5, 0.75, 1.0]) 107 | npct = len(pct_data) 108 | ndata = (pct_data * X.shape[0]).astype(np.int) 109 | test_errors = [] 110 | train_errors = [] 111 | for N in ndata: 112 | folds = cv_kfold(N, kfolds) 113 | trainer = TrainModelCV(model_description, 114 | X=X[indices][:N, :], y=y[indices][:N], 115 | folds=folds) 116 | errors = trainer.run() 117 | test_errors.append(errors[list(errors.keys())[0]]['test']) 118 | train_errors.append(errors[list(errors.keys())[0]]['train']) 119 | 120 | fig = plt.figure(fignum) 121 | fig.clf() 122 | plt.plot(pct_data, train_errors, label='train') 123 | plt.plot(pct_data, test_errors, label='test') 124 | plt.xlabel("Percent of data") 125 | plt.legend() 126 | plt.ylabel(model_description[1].__name__) 127 | fig.show() 128 | 129 | 130 | -------------------------------------------------------------------------------- /mozsci/cspearmanr_by_fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | template 9 | struct indexed_compare 10 | { 11 | iter_t begin; 12 | indexed_compare(iter_t begin) : begin(begin) {} 13 | bool operator()(std::size_t a, std::size_t b) const { 14 | // sort in ascending order 15 | return *(begin+a) < *(begin+b); 16 | } 17 | }; 18 | 19 | typedef indexed_compare::iterator> index_compare_double_vector; 20 | 21 | std::vector to_ranked (std::vector a) 22 | { 23 | int n = a.size(); 24 | std::vector ret(n); 25 | for (std::size_t i = 0; i < n; i++) ret[i] = i; 26 | std::sort::iterator, index_compare_double_vector>(ret.begin(), ret.end(), index_compare_double_vector(a.begin())); 27 | 28 | // need to take ties into account and assign the average rank 29 | std::vector ret2(a.size()); 30 | int sumranks = 0.0; 31 | int dupcount = 0.0; 32 | double eps = 1.0e-8; 33 | for (std::size_t i = 0; i < n; i++) 34 | { 35 | sumranks = sumranks + i; 36 | dupcount++; 37 | if (i == (n - 1) || abs(a[ret[i]] != a[ret[i+1]]) > eps) 38 | { 39 | double avgrank = double(sumranks) / double(dupcount) + 1; 40 | for (int j = i - dupcount + 1; j < i + 1; j++) 41 | { 42 | ret2[ret[j]] = avgrank; 43 | } 44 | sumranks = 0; 45 | dupcount = 0; 46 | } 47 | } 48 | 49 | return ret2; 50 | } 51 | 52 | template 53 | double pearson_correlation(std::vector a, std::vector b) 54 | { 55 | // for (int i = 0; i < a.size(); i ++) { 56 | // std::cout << a[i] << " " << b[i] << " " << std::endl; 57 | // } 58 | if (a.size() != b.size()) abort(); 59 | 60 | double sum_a_b = inner_product(a.begin(), a.end(), b.begin(), 0.0); 61 | double sum_a = accumulate(a.begin(), a.end(), 0.0); 62 | double sum_b = accumulate(b.begin(), b.end(), 0.0); 63 | double sum_a_a = inner_product(a.begin(), a.end(), a.begin(), 0.0); 64 | double sum_b_b = inner_product(b.begin(), b.end(), b.begin(), 0.0); 65 | double n = a.size(); 66 | 67 | double r = (sum_a_b - sum_a*sum_b/n) / sqrt((sum_a_a- sum_a*sum_a/n)*(sum_b_b-sum_b*sum_b/n)); 68 | return r; 69 | } 70 | 71 | double spearman_correlation(std::vector a, std::vector b) 72 | { 73 | // for (int i = 0; i < a.size(); i ++) { 74 | // std::cout << a[i] << " " << b[i] << " " << std::endl; 75 | // } 76 | return pearson_correlation(to_ranked(a), to_ranked(b)); 77 | } 78 | 79 | double spearman_by(std::vector a, 
std::vector b, std::vector byvar) 80 | { 81 | // data must be sorted byvar in ascending order 82 | double ret = 0.0; 83 | int ngroups = 0; 84 | 85 | // the minimum number of elements in a by group to add into the overall result 86 | int min_by = 25; 87 | 88 | std::size_t last_by = byvar[0]; 89 | int nby = 0; 90 | int start_index = 0; 91 | for (std::size_t k = 0; k < a.size(); k++) 92 | { 93 | if (byvar[k] == last_by) 94 | { 95 | nby += 1; 96 | } 97 | else 98 | { 99 | // we are at a new group 100 | if (nby >= min_by) 101 | { 102 | // compute stuff 103 | std::vector a_by_group(&a[start_index], &a[start_index + nby]); 104 | std::vector b_by_group(&b[start_index], &b[start_index + nby]); 105 | double sc = spearman_correlation(a_by_group, b_by_group); 106 | if (!isnan(sc)) 107 | { 108 | ret = ret + sc; 109 | ngroups++; 110 | } 111 | } 112 | 113 | // reset 114 | nby = 1; 115 | start_index = k; 116 | last_by = byvar[k]; 117 | } 118 | } 119 | 120 | // last group 121 | if (nby >= min_by) 122 | { 123 | // compute stuff 124 | std::vector a_by_group(&a[start_index], &a[start_index + nby]); 125 | std::vector b_by_group(&b[start_index], &b[start_index + nby]); 126 | double sc = spearman_correlation(a_by_group, b_by_group); 127 | if (!isnan(sc)) 128 | { 129 | ret = ret + sc; 130 | ngroups++; 131 | } 132 | } 133 | 134 | return ret / ngroups; 135 | } 136 | 137 | extern "C" double c_spearman_for_python(double* a, double* b, std::size_t* byvar, std::size_t n) 138 | { 139 | // wrapper function for python 140 | std::vector avec (a, a + n); 141 | std::vector bvec (b, b + n); 142 | std::vector byvarvec (byvar, byvar + n); 143 | return spearman_by(avec, bvec, byvarvec); 144 | 145 | } 146 | 147 | 148 | int main(void) 149 | { 150 | // initialize vectors 151 | //static const double arr[] = {1, 2, 3, 4, 5}; 152 | //std::vector position (arr, arr + sizeof(arr) / sizeof(arr[0]) ); 153 | 154 | //static const double arr_pa[] = {0.4, 0.1, 0.22, -0.88, 0.55}; 155 | //std::vector pa (arr_pa, arr_pa + sizeof(arr_pa) / sizeof(arr_pa[0]) ); 156 | 157 | static const double arr[] = { 0.33117374, 0.80947619, 3. , 0.25457016, 158 | 0.52897721, 3. , 0.51733111, 0.60862871, 159 | 0.21389315, 0.35368557, 10. , 10. , 160 | 0.72061731, 0.23078359, 0.38791586, 0.43954613, 161 | 0.91398124, 0.29594647, 10. , 0.78991894}; 162 | std::vector position (arr, arr + sizeof(arr) / sizeof(arr[0]) ); 163 | 164 | static const double arr_pa[] = { 0.10526316, 1.15789474, 1.94736842, 2.21052632, -1.73684211, 165 | -1.47368421, -0.68421053, 1.68421053, 0.63157895, 0.36842105, 166 | -0.94736842, 1.42105263, 3. , -0.42105263, 0.89473684, 167 | 2.47368421, -1.21052632, -0.15789474, 2.73684211, -2. 
}; 168 | std::vector pa (arr_pa, arr_pa + sizeof(arr_pa) / sizeof(arr_pa[0]) ); 169 | 170 | 171 | std::cout << spearman_correlation(position, pa) << std::endl; 172 | 173 | static const double by_arr_pa[] = { 51.73402682, 52.19589972, 44.97281905, 54.73404694, 174 | 47.6719409 , 45.96619825, 50.36193419, 46.27607543, 175 | 48.18824048, 54.88529706, 42.67667074, 41.80373588, 176 | 37.29934119, 57.98812747, 45.04782628, 38.10858417, 177 | 46.44031713, 40.59823939, 26.29936944, 23.96820474, 178 | 47.98343799, 36.4455311 , 43.92931621, 55.19172514, 179 | 33.44633285, 37.38381116, 39.03392758, 41.43285553, 180 | 28.63082987, 31.86069758, 41.19551474, 29.04928565, 181 | 39.09690404, 36.75441683, 29.66390582, 70.4035713 , 182 | 63.53532854, 49.78916058, 64.39911984, 65.41353192, 183 | 48.42353021, 60.38572122, 42.44357922, 42.86378695, 184 | 58.93821467, 61.93862217, 36.23459784, 64.57533596, 185 | 40.09399141, 45.57233379, 44.7748158 , 50.88705955, 186 | 47.24016865, 51.75866967, 36.17935042, 46.73933887, 187 | 52.7136634 , 47.0337377 , 34.19077012, 18.5836512 , 188 | 41.63257011, 9.8698871 , 37.63277795, 47.71676464, 189 | 34.89667886, 35.10845963, 44.56638481, 36.70884056, 190 | 57.9185177 , 50.65260932, 58.53307806, 43.25154747, 191 | 40.59802125, 38.97005406, 35.19682907, 51.94755877, 192 | 44.04430199, 35.84048228, 36.25006727, 46.35317423, 193 | 37.44668618, 16.90596421, 38.87970562, 47.33515849, 194 | 27.41230181, 29.47142008 } ; 195 | std::vector by_pa (by_arr_pa, by_arr_pa + sizeof(by_arr_pa) / sizeof(by_arr_pa[0]) ); 196 | 197 | static const double by_arr_position[] = { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 12., 198 | 13., 15., 16., 17., 19., 23., 24., 25., 26., 27., 28., 199 | 29., 1., 2., 3., 6., 8., 9., 11., 12., 13., 17., 200 | 19., 21., 1., 2., 3., 4., 5., 6., 7., 8., 9., 201 | 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 202 | 22., 23., 24., 25., 26., 27., 1., 2., 4., 5., 6., 203 | 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 204 | 18., 20., 21., 22., 23., 24., 25., 26., 27. 
}; 205 | std::vector by_position (by_arr_position, by_arr_position + sizeof(by_arr_position) / sizeof(by_arr_position[0]) ); 206 | 207 | static const std::size_t by_arr_queryid[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 208 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 209 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 210 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 211 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 212 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 213 | 3, 3, 3, 3, 3, 3, 3, 3}; 214 | std::vector by_queryid (by_arr_queryid, by_arr_queryid + sizeof(by_arr_queryid) / sizeof(by_arr_queryid[0]) ); 215 | 216 | std::cout << spearman_by(by_pa, by_position, by_queryid) << std::endl; 217 | 218 | 219 | 220 | } 221 | 222 | -------------------------------------------------------------------------------- /mozsci/ems.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | 5 | # ensemble model selection 6 | # 7 | # based on "Ensemble Selection from Libraries of Models", 8 | # Caruana, Niculescu-Mizil, Crew, Ksikes 9 | # Proceedings of the 21st International Conference on ML, Banff Canada 2004 10 | # 11 | 12 | import numpy as np 13 | import json 14 | import six 15 | from six.moves import range 16 | 17 | class EnsembleModelSelector(object): 18 | """Implements 19 | "Ensemble Selection from Libraries of Models", 20 | Caruana, Niculescu-Mizil, Crew, Ksikes 21 | Proceedings of the 21st International Conference on ML, Banff Canada 2004 22 | 23 | Holds data 24 | .error = error function 25 | .ensemble = numpy array of the model weights 26 | .nmodels = the number of models added (=sum(ensemble) 27 | .ensemble_indices = the indices of models in ensemble included in final model 28 | 29 | .niter = number of iterations in an ensemble selction 30 | .nsort = the number of models to add at the beginning of the iteration 31 | 32 | For bagged selection: 33 | .nbags = number of bags to use for bagged 34 | .pbags = the percent of models to use in each bag 35 | """ 36 | 37 | def __init__(self, error=None, niter=10, nsort=5, nbags=20, pbags=0.5): 38 | """error = a callable thing (y, ypred) that computes the error 39 | it is minimized by the ensemble. 
40 | needs to accept ypred that are averages of the individual model predictions 41 | 42 | niter = the number of iterations to use to add models 43 | nsort = the number of models added to start the ensemble (section 2.2) 44 | nbags = the number of bags to use for bagged selection 45 | pbags = the percentage of each models to include in each bag""" 46 | self.error = error 47 | self.ensemble = None 48 | self.ensemble_indices = None 49 | self.nmodels = 0 50 | self.niter = niter 51 | self.nsort = nsort 52 | self.nbags = nbags 53 | self.pbags = pbags 54 | 55 | 56 | def select_ensemble_bagged(self, y, ymodels, verbose=False): 57 | """Ensemble selection using bagged selection 58 | (Section 2.3 of the paper)""" 59 | ensemble = np.zeros(len(ymodels)) 60 | indices = np.arange(len(ymodels)) 61 | max_keep = int(self.pbags * len(ymodels)) 62 | nmodels = 0 63 | for k in range(self.nbags): 64 | if verbose: 65 | print("Bagging number %s" % str(k+1)) 66 | np.random.shuffle(indices) 67 | ymodels_bagged = np.array(ymodels)[indices[:max_keep]] 68 | self.select_ensemble(y, ymodels_bagged) 69 | # NOW self.ensemble is the selection of models in the bag 70 | # need to unroll these selected indices to those in the original ymodels 71 | ensemble[indices[:max_keep]] += self.ensemble 72 | nmodels += self.nmodels 73 | 74 | # set final ensemble 75 | self.ensemble = ensemble 76 | self.nmodels = nmodels 77 | self.ensemble_indices = np.arange(len(ymodels))[self.ensemble > 0.5] 78 | 79 | def select_ensemble(self, y, ymodels, early_termination = False): 80 | """Y = actual y = (N, ) numpy array 81 | ymodels = a list of predictions from different models. 82 | len(ymodels) = nmodels 83 | ymodels[k] = prediction for model k (N, ) numpy array 84 | DOESN'T do any bagging (section 2.3). use select_ensemble_bagged""" 85 | # process: 86 | # (1) set the initial ensemble 87 | # (2) for each iteration, choose the model that decrease the error the most 88 | # and update the current ensemble 89 | 90 | # (1) 91 | self.ensemble = np.zeros(len(ymodels)) 92 | 93 | # do initial sort and insert these models into the ensemble 94 | # errors = a vector of errors corresponing to each model 95 | # it will be updated for each iteration corresponding to the 96 | # error for adding each model to the current ensemble 97 | errors = np.array([self.error(y, ypred) for ypred in ymodels]) 98 | initial_models_to_add = errors.argsort()[0:self.nsort] 99 | self.ensemble[initial_models_to_add] = 1 100 | current_prediction = ymodels[initial_models_to_add[0]].astype(np.float) 101 | for i in initial_models_to_add[1:]: 102 | current_prediction += ymodels[i] 103 | current_prediction /= float(self.nsort) 104 | nmodels = self.nsort 105 | 106 | if early_termination: last_error = np.finfo(np.float).max 107 | # (2) 108 | for k in range(self.niter): 109 | # find the model that reduces error the most 110 | # current_prediction is averaged over nmodels 111 | # need to add in one more as a weighted average 112 | errors = np.array([self.error(y, current_prediction * (float(nmodels) / (nmodels + 1)) + ypred.astype(np.float) / float(nmodels + 1)) for ypred in ymodels]) 113 | 114 | if early_termination: 115 | min_error = errors.min() 116 | if min_error < last_error: last_error = min_error 117 | else:break 118 | 119 | model_to_add = errors.argmin() 120 | 121 | self.ensemble[model_to_add] += 1 122 | current_prediction = current_prediction * (float(nmodels) / (nmodels + 1)) + ymodels[model_to_add].astype(np.float) / float(nmodels + 1) 123 | nmodels += 1 124 | 125 | print(("Iteration 
%s, error=%s" % (k, errors.min()))) 126 | 127 | # pull out the indices of models included in the final ensemble 128 | self.ensemble_indices = np.arange(len(ymodels))[self.ensemble > 0.5] 129 | self.nmodels = nmodels 130 | 131 | 132 | def pred(self, ymodels): 133 | """Given the input from ymodels (same as input to select_ensemble), 134 | return the predicted probabilities""" 135 | pred = ymodels[self.ensemble_indices[0]] * self.ensemble[self.ensemble_indices[0]] 136 | for k in self.ensemble_indices[1:]: 137 | pred += ymodels[k] * self.ensemble[k] 138 | return pred.astype(np.float) / np.float(self.nmodels) 139 | 140 | def save_ensemble(self, fileout): 141 | """ 142 | Serialize the ensemble. 143 | :param fileout: name of the file to write the json string, or a file object. 144 | :return: None 145 | """ 146 | if self.ensemble is None or self.ensemble_indices is None: 147 | raise ValueError('The ensemble has not been properly trained.') 148 | 149 | model_json = { 150 | 'nmodels': self.nmodels, 151 | 'ensemble': self.ensemble[:].tolist(), 152 | 'ensemble_indices': self.ensemble_indices[:].tolist(), 153 | } 154 | 155 | # save to the file 156 | if isinstance(fileout, six.string_types): 157 | with open(fileout, 'w') as f: 158 | json.dump(model_json, f) 159 | else: 160 | json.dump(model_json, fileout) 161 | 162 | @classmethod 163 | def load_ensemble(cls, model_json): 164 | """ 165 | Load the serialized model. Afteer the loading, we can use pred method on new data sets. 166 | :param cls: 167 | :param model_json: name of the file to read in the json string, or a file object. 168 | :return: the new object. 169 | """ 170 | if isinstance(model_json, six.string_types): 171 | with open(model_json, 'r') as f: 172 | model_json = json.load(f) 173 | 174 | ensemble = cls() 175 | ensemble.nmodels = model_json['nmodels'] 176 | ensemble.ensemble = np.array(model_json['ensemble'], dtype = np.float64) 177 | ensemble.ensemble_indices = np.array(model_json['ensemble_indices'], dtype = np.int) 178 | 179 | return ensemble 180 | 181 | if __name__ == "__main__": 182 | 183 | import pylab as plt 184 | from .evaluation import classification_error 185 | 186 | np.random.seed(2) 187 | 188 | # make the data 189 | N = 1000 190 | 191 | # some predictons 192 | # actual = 5 * x - 4 > 0 193 | x = np.linspace(0, 1, N) 194 | y = 5 * x - 4 > 0 195 | 196 | nmodels = 500 197 | ymodels = [] 198 | for k in range(nmodels): 199 | m = np.random.rand(1) * 5 * (np.random.rand(N) - 0.5) + 5 200 | b = 3 * (np.random.rand(N) - 0.5) + 4 201 | thisy = (m * x - b > 0).astype(np.int) 202 | ymodels.append(thisy) 203 | 204 | ems = EnsembleModelSelector(classification_error, niter=25) 205 | ems.select_ensemble(y, ymodels) 206 | ypred = ems.pred(ymodels) 207 | classification_error(y, ypred) 208 | 209 | ems.select_ensemble_bagged(y, ymodels) 210 | ypred = ems.pred(ymodels) 211 | classification_error(y, ypred) 212 | 213 | 214 | fig = plt.figure(1) 215 | fig.clf() 216 | plt.scatter(x, y, marker='o', color='r') 217 | for k in range(40): 218 | plt.scatter(x, ymodels[k]+0.01 + k*0.01, marker='s', s=1, color='b') 219 | 220 | plt.scatter(x, ypred, marker='x', color='k') 221 | plt.plot(x, 0.5 * np.ones(x.shape), 'k') 222 | plt.plot(0.8 * np.ones((100, 1)), np.linspace(0, 1, 100), 'k') 223 | 224 | plt.title("Ensemble model selection via greedy sampling\nRed=actual, Blue=40 samples of noisy models, black=ensemble average") 225 | plt.xlabel("X") 226 | plt.ylabel("Y") 227 | fig.show() 228 | # fig.savefig("ensemble_model_average.png") 229 | 230 | 231 | 232 | 233 | 
-------------------------------------------------------------------------------- /mozsci/evaluation.py: -------------------------------------------------------------------------------- 1 | """Evaluate model performance including efficient C implementations""" 2 | from __future__ import absolute_import 3 | 4 | import numpy as np 5 | 6 | from .inputs import mean_std_weighted 7 | from .spearmanr_by_fast import spearmanr_by 8 | from ._c_utils import c_auc_wmw 9 | from six.moves import range 10 | 11 | def pearsonr_weighted(x, y, weights=None): 12 | """Weighted Pearson correlation coefficient. 13 | 14 | x, y = (N, ) numpy arrays or 15 | weights = (N, ) or None for no weights""" 16 | from scipy.stats import pearsonr 17 | if weights is None: 18 | return pearsonr(x, y)[0] 19 | else: 20 | mean_std_x = mean_std_weighted(x.flatten(), weights.flatten()) 21 | mean_std_y = mean_std_weighted(y.flatten(), weights.flatten()) 22 | cov_xy = np.sum((x - mean_std_x['mean']) * (y - mean_std_y['mean']) * weights.flatten()) / np.sum(weights) 23 | return cov_xy / mean_std_x['std'] / mean_std_y['std'] # r 24 | 25 | 26 | def auc_wmw_fast(t, p, weights=None): 27 | """Compute the AUC by using the Wilcoxon-Mann-Whitney 28 | statistic 29 | 30 | t = (Nobs, ) target values (-1/+1) or (0/1) 31 | p = (Nobs, ) predicted values 32 | weights = a (Nobs, ) array with the weights 33 | if omitted, uses uniform weights 34 | 35 | Returns AUC 36 | """ 37 | tarr = np.asarray(t, dtype=np.int).flatten() 38 | parr = np.asarray(p, dtype=np.float).flatten() 39 | 40 | if len(tarr) != len(parr): 41 | raise ValueError("t, p: shape mismatch") 42 | 43 | idxp = np.where(tarr == 1)[0] 44 | idxn = np.where(tarr <= 0)[0] 45 | nidxn = idxn.shape[0] 46 | nidxp = idxp.shape[0] 47 | 48 | if weights is not None: 49 | warr = np.asarray(weights, dtype=np.float).flatten() 50 | else: 51 | warr = np.ones(tarr.shape) 52 | 53 | auc = c_auc_wmw(idxp, idxn, parr, warr) 54 | if np.isnan(auc): 55 | auc = 0 56 | 57 | return auc 58 | 59 | 60 | def auc_wmw_error(t, p, weights=None): 61 | """Returns 1.0 - AUC to mimic an error function 62 | (to pass into minimization routines)""" 63 | return 1.0 - auc_wmw_fast(t, p, weights) 64 | 65 | 66 | def classification_error(y, ypred, thres=0.5, weights=None): 67 | """ y = 0, 1 68 | y pred = P(y == 1) is between 0 and 1 69 | Uses thres as the threshold 70 | y and ypred are numpy arrays 71 | weights = if provided is a y.shape() array with the weights 72 | take a weighted error in this case""" 73 | if weights is None: 74 | return ((ypred > thres).astype(np.int).reshape(-1, 1) != y.reshape(-1, 1)).sum() / float(len(y)) 75 | else: 76 | return (((ypred > thres).astype(np.int).reshape(-1, 1) != y.reshape(-1, 1)) * weights.reshape(-1, 1)).sum() / float(weights.sum()) 77 | 78 | 79 | def precision_recall_f1(y, ypred, thres=0.5, weights=None): 80 | """y = 0/1 or -1/+1 81 | ypred = P(y == 1) is between 0 and 1 82 | y and ypred are numpy arrays 83 | weights = if provided is a y.shape() array with the weights 84 | take a weighted error in this case""" 85 | # see http://en.wikipedia.org/wiki/Precision_and_recall 86 | # need to properly handle case where y = (10, ), ypred=(10, 1) 87 | ypred_1 = (ypred > thres).reshape(-1, 1) 88 | yy = y.reshape(-1, 1) 89 | if weights is None: 90 | tp = np.sum(np.logical_and(ypred_1, yy == 1)) 91 | fp = np.sum(np.logical_and(ypred_1, yy == 0)) 92 | fn = np.sum(np.logical_and(~ypred_1, yy == 1)) 93 | else: 94 | ww = weights.reshape(-1, 1) 95 | tp = np.sum(np.logical_and(ypred_1, yy == 1) * ww) 96 | fp = 
np.sum(np.logical_and(ypred_1, yy == 0) * ww) 97 | fn = np.sum(np.logical_and(~ypred_1, yy == 1) * ww) 98 | 99 | # precision = tp / float(tp + fp) 100 | # recall = tp / float(tp + fn) 101 | # f1 = 2.0 * precision * recall / (precision + recall) 102 | 103 | # we need to check for degenerate cases 104 | # that might happen if we have only 1 input 105 | if tp + fp > 0: 106 | precision = tp / float(tp + fp) 107 | else: 108 | precision = 0 109 | 110 | if tp + fn > 0: 111 | recall = tp / float(tp + fn) 112 | else: 113 | recall = 0 114 | 115 | if precision + recall > 0: 116 | f1 = 2.0 * precision * recall / (precision + recall) 117 | else: 118 | f1 = 0 119 | 120 | return precision, recall, f1 121 | 122 | 123 | """ 124 | All the performance measures that we will be using for classification problems live in this file below here. 125 | """ 126 | 127 | 128 | def classification_model_performance(observed, predicted, weight=None): 129 | """ 130 | This is to check the performance of a classification algorithm. 131 | The observed values should be 0, 1, 2, etc. The weight is a list of the float numbers whose indices are 132 | the classes. For ex, if weight is [1, 5], then we have two classes in the classification problem. And 133 | the error caused by assigning class 0 instance to a class 1 instance is 1. The error caused by assigning 134 | a class 1 instance to a class 0 instance is 5. 135 | 136 | I like the returned perf measure to be in the range of [0, 1]. We should do so for at least the 'no-weight' 137 | case. 138 | 139 | Currently the value is, the lower, the better. 140 | """ 141 | if weight is None: 142 | sum_incorrect = sum(observed != predicted) 143 | else: 144 | sum_incorrect = sum(weight[observed[ii]] for ii in range(len(observed)) if observed[ii] != predicted[ii]) 145 | 146 | return sum_incorrect / float(len(predicted)) 147 | 148 | 149 | def classification_model_performance_matrix(observed, predicted): 150 | """ 151 | This is to check the performance of a classification algorithm. 152 | The observed values should be 0, 1, 2, etc. 153 | 154 | We will use numpy's round number here - np.round(4.6) ( = 5.0). we can use int(np.round(4.6)) gives 5. 155 | """ 156 | # assume that the classe categories start from 0. 157 | num_classes = int(max(observed)) + 1 158 | 159 | perf_2d_array = np.zeros([num_classes] * 2, dtype=int) 160 | 161 | for ii in range(len(observed)): 162 | # in case some algorithms return float numbers. 163 | predicted_class = int(np.round(predicted[ii])) 164 | perf_2d_array[observed[ii], predicted_class] += 1 165 | 166 | return perf_2d_array 167 | 168 | 169 | def classification_model_performance_loss(observed, predicted, loss=None): 170 | """ 171 | loss is a function with two inputs (i, j) where i is the real category and j is the predicted category. 172 | It returns a float number as the loss of assigning a category i instance to category j. 173 | A simple one is implemented as the default (see below in the function body). 174 | 175 | Another way to call this function is to define a loss function or lambda as below. 
176 | classification_model_performance_loss(observed, predicted, loss=lambda i, j: (i-j)**2) 177 | """ 178 | def default_loss(class_i, class_j): 179 | if class_i == class_j: 180 | return 0 181 | else: 182 | return 1 183 | 184 | if loss is None: 185 | loss = default_loss 186 | 187 | total_loss = sum(loss(observed[ii], int(np.round(predicted[ii]))) for ii in range(len(observed))) 188 | 189 | return total_loss 190 | 191 | -------------------------------------------------------------------------------- /mozsci/glm/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /mozsci/glm/prob_distributions.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides all the probability distributions that the simplified generalized models supports. 3 | A better name might be likelihood. We will provide the eval, eval_gradient, and eval_hessian for the 4 | log likelihood here. Note, we do not add the negative here. That should be done by the caller. 5 | """ 6 | from __future__ import absolute_import 7 | from __future__ import print_function 8 | 9 | ### Attention, there is no special treatment of the constant column here. So before call any method here, 10 | ### add one columns of 1's to the feature matrix, ex, np.c_[features, np.ones(features.shape[0])] 11 | 12 | import numpy as np 13 | 14 | class GlmProbDistBase(object): 15 | """ 16 | The base class of the probability distributions. 17 | """ 18 | 19 | def __init__(self): 20 | pass 21 | 22 | def eval(self, beta, features, y): 23 | """ 24 | This method returns the log likelihood of the variables. Constants will be omitted because the goal of 25 | this evaluation is to maximize the log likelihood. So, this method might return positive numbers. 26 | """ 27 | pass 28 | 29 | def eval_gradient(self, beta, features, y): 30 | pass 31 | 32 | def eval_hessian(self, beta, features, y): 33 | pass 34 | 35 | def get_inverse_link(self): 36 | """ 37 | Get the inverse of the link function. The caller can use it to calculate the expected value from 38 | the linear predictors. 39 | """ 40 | pass 41 | 42 | class Poisson(GlmProbDistBase): 43 | """ 44 | Poisson regression. 45 | """ 46 | def eval(self, beta, features, y): 47 | """ 48 | return the log likelihood, with features 49 | """ 50 | 51 | log_miu = np.dot(features, beta) 52 | log_miu = np.minimum(log_miu, 5) 53 | tmp = np.sum(log_miu * y - np.exp(log_miu)) 54 | 55 | if np.isinf(tmp): 56 | print('WARNING -- Log likelihood got inf value. It has been replaced by float.max. ') 57 | print('max of y * log miu', np.max(y * log_miu)) 58 | print('max of miu', np.max(np.exp(log_miu))) 59 | print('max of y ', max(y)) 60 | 61 | return np.finfo(np.float).max 62 | else: 63 | return tmp 64 | 65 | def eval_gradient(self, beta, features, y): 66 | """ 67 | return the gradient of beta at y with feature features. 68 | y is the array of the observed values. 69 | This is the gradient against beta_k. beta_k[0] = k* which is the log of k. 70 | This is a faster version, compared with the eval_gradient_bk 71 | :param beta_k: one single array of k* and beta 72 | :param features: 73 | :param y: observed variable. 74 | :return: 75 | """ 76 | # setup the values we are going to need. 
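# For the Poisson model with log link, miu_i = exp(x_i . beta), so the
# log-likelihood gradient is d(loglik)/d(beta) = sum_i (y_i - miu_i) * x_i.
# The lines below compute exactly that, clipping log_miu at 5 before
# exponentiating to avoid overflow.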
77 | log_miu = np.dot(features, beta) 78 | # prevent overflows 79 | log_miu = np.minimum(log_miu, 5) 80 | miu = np.exp(log_miu) 81 | grad_tmp = y - miu 82 | 83 | gradient = np.sum(features * grad_tmp.reshape(-1,1), axis=0) 84 | if np.isnan(np.sum(gradient)): 85 | print('Warning--The grad_tmp has nan', gradient) 86 | 87 | return gradient 88 | 89 | def get_inverse_link(self): 90 | return np.exp 91 | 92 | class Exponential(GlmProbDistBase): 93 | """ 94 | The exponential probability distribution. The parameter lambda is the inner product of beta and x. 95 | This exponential uses a different link function. log(x). This solves the non-positive problem we have 96 | in Expontial class. 97 | """ 98 | 99 | def eval(self, beta, features, y): 100 | """ 101 | return the log likelihood 102 | theta = beta * feature. 103 | """ 104 | 105 | log_miu = np.dot(features, beta) 106 | tmp = -np.sum(log_miu + y * np.exp(-log_miu)) 107 | 108 | if np.isinf(tmp): 109 | print('WARNING -- Log likelihood got inf value. It has been replaced by float.max. ') 110 | print('max of log miu', np.max(log_miu)) 111 | print('max of y / miu', np.max(y * np.exp(-log_miu))) 112 | print('max of y ', max(y)) 113 | 114 | return np.finfo(np.float).max 115 | else: 116 | return tmp 117 | 118 | def eval_gradient(self, beta, features, y): 119 | """ 120 | return the gradient of beta at y with feature features. 121 | y is the array of the observed values. 122 | """ 123 | # setup the values we are going to need. 124 | log_miu = np.dot(features, beta) 125 | grad_tmp = 1.0 - y * np.exp(-log_miu) 126 | 127 | gradient = -np.sum(features * grad_tmp.reshape(-1,1), axis=0) 128 | if np.isnan(np.sum(gradient)): 129 | print('Warning--The grad_tmp has nan', gradient) 130 | 131 | return gradient 132 | 133 | def get_inverse_link(self): 134 | return np.exp 135 | 136 | class NegativeBinomialWithKstar(GlmProbDistBase): 137 | """ 138 | Negative Binomial regression. 139 | Parameter k is fixed. 140 | """ 141 | def eval(self, beta_k, features, y): 142 | """ 143 | return the log likelihood, with feature feature 144 | theta = beta * feature. 145 | Attention: We omit the ln((y-1)!) in the loglikelihood, because our goal is to optimize the loglikelihood. 146 | beta_k[0] = k* which is the log of k. 147 | """ 148 | beta = beta_k[1:] 149 | 150 | # underflow in some special cases. 151 | if beta_k[0] < -720.0: 152 | beta_k[0] = -720.0 153 | 154 | k = np.exp(beta_k[0]) ## exp(k*). 155 | ln_exp_k_star = beta_k[0] ## ln(e^(k*)). It's actually k*, ie. beta_k[0] 156 | 157 | max_y = int(y.max()) 158 | subsum_y = np.log(np.arange(max_y) + k).cumsum() 159 | log_miu = np.dot(features, beta) 160 | 161 | # log( 1 + exp( k* - log(miu))) 162 | log_1_plus_sth = np.log(1.0 + np.exp(beta_k[0] - log_miu)) 163 | log_1_plus_sth[beta_k[0] - log_miu > 50] = beta_k[0] - log_miu[beta_k[0] - log_miu > 50] 164 | 165 | subsum = subsum_y[y.astype(np.int) - 1] 166 | subsum[y.astype(np.int) == 0] = 0.0 167 | 168 | tmp = np.sum(subsum + k * ln_exp_k_star + y * log_miu) - np.sum((k + y) * (log_miu + log_1_plus_sth)) 169 | 170 | if np.isinf(tmp): 171 | print('WARNING -- Log likelihood got inf value. It has been replaced by float.max. 
') 172 | print('max of subsum', np.max(subsum)) 173 | print('max of y * log miu', np.max(y * log_miu)) 174 | print('max of (k+y) * log miu and k', np.max((k + y) * (log_miu + log_1_plus_sth))) 175 | print('max of log miu and log 1 puls sth', np.max(log_miu), np.max(log_1_plus_sth)) 176 | print('max of y ', max(y)) 177 | print('value of exp and k', k * ln_exp_k_star) 178 | 179 | return np.finfo(np.float).max 180 | else: 181 | return tmp 182 | 183 | def eval_gradient(self, beta_k, features, y): 184 | """ 185 | return the gradient of beta at y with feature features. 186 | y is the array of the observed values. 187 | This is the gradient against beta_k. beta_k[0] = k* which is the log of k. 188 | This is a faster version, compared with the eval_gradient_bk 189 | :param beta_k: one single array of k* and beta 190 | :param features: 191 | :param y: observed variable. 192 | :return: the gradient of the log likelihood 193 | """ 194 | # setup the values we are going to need. 195 | beta = beta_k[1:] 196 | 197 | if beta_k[0] < -720.0: # handling underflow. 198 | beta_k[0] = -720.0 199 | 200 | k = np.exp(beta_k[0]) ## exp(k*). 201 | 202 | log_miu = np.dot(features, beta) 203 | log_1_plus_sth = np.log(1.0 + np.exp(beta_k[0] - log_miu)) 204 | log_1_plus_sth[beta_k[0] - log_miu > 50] = beta_k[0] - log_miu[beta_k[0] - log_miu > 50] 205 | 206 | miu = np.exp(log_miu) 207 | miu[np.isinf(miu + k)] = np.finfo(np.float).max - 1.5 * k 208 | 209 | # gradient of beta 210 | grad_tmp = (y - miu) / (miu + k) 211 | # test of nan in the gradient calculation. 212 | if np.isnan(np.sum(grad_tmp)): 213 | if np.isnan(np.sum(miu)): 214 | print('The miu has nan', miu) 215 | print('min of miu + k is ', np.min(miu + k)) 216 | print('max of miu + k is ', np.max(miu + k)) 217 | print('min of y - miu is ', np.min(y - miu)) 218 | print('max of y - miu is ', np.max(y - miu)) 219 | print('The grad_tmp has nan', grad_tmp) 220 | 221 | gradient_beta = k * np.sum(features * grad_tmp.reshape(-1,1), axis=0) 222 | if np.isnan(np.sum(gradient_beta)): 223 | print('The grad_tmp has nan', gradient_beta) 224 | 225 | # derivative of k* 226 | max_y = int(y.max()) 227 | subsum_y = (1.0 / (np.arange(max_y) + k)).cumsum() 228 | subsum = subsum_y[y.astype(np.int) - 1] 229 | subsum[y.astype(np.int) == 0] = 0.0 230 | 231 | derivative_k = np.sum(subsum + 1.0 + beta_k[0] - (k + y)/(k + miu) - (log_miu + log_1_plus_sth)) 232 | 233 | if np.isinf(derivative_k): 234 | print('WARNING -- Derivative of kstar got inf value. It has been replaced by float.max. ') 235 | derivative_k = np.finfo(np.float).max 236 | 237 | # Assemble them together! 238 | gradient = np.zeros(beta_k.shape[0]) 239 | gradient[0] = k * derivative_k 240 | gradient[1:] = gradient_beta 241 | 242 | return gradient 243 | 244 | def get_inverse_link(self): 245 | return np.exp 246 | 247 | 248 | -------------------------------------------------------------------------------- /mozsci/glm/regularization.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | from six.moves import range 4 | 5 | class RegularizationBase(object): 6 | """ 7 | Base class of all the regularization methods. 8 | Super classes can provide gradient and Hessian methods. 9 | """ 10 | 11 | def __init__(self): 12 | pass 13 | 14 | def eval(self, x): 15 | pass 16 | 17 | class NullRegularization(RegularizationBase): 18 | """ 19 | This is a null regularization, ie. 0 regularization. 
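To disable the penalty in the GLM classes in simplified_glm.py, pass lam=None together with regular=NullRegularization() to the model constructor.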
20 | """ 21 | def eval(self, x): 22 | return 0 23 | 24 | def eval_gradient(self, x): 25 | return 0 26 | 27 | def eval_hessian(self, x): 28 | return 0 29 | 30 | class RidgeRegularization(RegularizationBase): 31 | """ 32 | Ridge regularization. 33 | It's lam/2.0 * ||x|| ** 2 34 | This regularization does not penalize the constant term. The constant term is supposed to be the last term. 35 | """ 36 | 37 | def __init__(self, lam): 38 | self.lam = lam 39 | 40 | def eval(self, x): 41 | return 0.5 * self.lam * np.inner(x[:-1], x[:-1]) 42 | 43 | def eval_gradient(self, x): 44 | tmp = self.lam * x 45 | tmp[-1] = 0.0 46 | 47 | # return self.lam * x 48 | return tmp 49 | 50 | def eval_hessian(self, x): 51 | hessian = self.lam * np.identity(x.shape[0]) 52 | hessian[-1, -1] = 0 53 | return hessian 54 | 55 | class RidgeRegularizationAll(RegularizationBase): 56 | """ 57 | Ridge regularization. 58 | It's lam/2.0 * ||x|| ** 2 59 | """ 60 | 61 | def __init__(self, lam): 62 | self.lam = lam 63 | 64 | def eval(self, x): 65 | return 0.5 * self.lam * np.inner(x, x) 66 | 67 | def eval_gradient(self, x): 68 | return self.lam * x 69 | 70 | def eval_hessian(self, x): 71 | return self.lam * np.identity(x.shape[0]) 72 | 73 | 74 | class RidgeRegularizationChosen(RegularizationBase): 75 | """ 76 | Ridge regularization on chosen terms. 77 | It's lam/2.0 * ||x|| ** 2 78 | """ 79 | 80 | def __init__(self, lam, dim, free_list=[]): 81 | self.lam = lam 82 | 83 | # this is the indices that will be penalized/regulated. 84 | self.index = list(set(range(dim)) - set(free_list)) 85 | self.free = free_list 86 | 87 | def eval(self, x): 88 | xx = x[self.index] 89 | return 0.5 * self.lam * np.inner(xx, xx) 90 | 91 | def eval_gradient(self, x): 92 | grad = self.lam * x 93 | grad[self.free] = 0.0 94 | 95 | return grad 96 | 97 | def eval_hessian(self, x): 98 | hessian = self.lam * np.identity(x.shape[0]) 99 | hessian[self.free, self.free] = 0.0 100 | return hessian 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /mozsci/glm/simplified_glm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import simplejson as json 4 | 5 | from . import regularization 6 | from . import prob_distributions 7 | from scipy import optimize 8 | import six 9 | 10 | """ 11 | This module defines a series of simplified glm functions. They are simplified because they use a much simpler way to 12 | link the probability distribution parameters and the observed feature variables. 13 | """ 14 | 15 | class SimplifiedGlmBase(object): 16 | """ 17 | This is the base class of the simplified glm classes. 18 | Each super class must provide both fit and predict methods. 19 | """ 20 | 21 | def __init__(self, lam=0.1, regular=None, seed=None, likelihood=None, initialize_params=None, param_len=None, 22 | maxiter=None): 23 | """ 24 | :param lam: the parameter for regularization. 25 | :param regular: 26 | :param seed: 27 | :param likelihood: 28 | :param initialize_params: the method to initialize the parameter array. It takes one parameter as the length 29 | of the params array. See the definition of random_initialize_params(len). 30 | :param maxiter: the number of iterations that can run when do the 'fitting' of the model. It controls the 31 | time spent on optimization routine. 
32 | """ 33 | 34 | if lam is not None: 35 | self.regularization = regularization.RidgeRegularization(lam) 36 | else: 37 | if regular is None: 38 | self.regularization = regularization.RidgeRegularization(0.1) 39 | else: 40 | self.regularization = regular 41 | 42 | self.params = None ## The last number in this 1-d array is for the constant (1.0) term. 43 | self.likelihood = likelihood 44 | 45 | if seed is not None: 46 | np.random.seed(seed) 47 | else: ## Likely I will delete these two lines in the future to let the numpy use its own. 48 | np.random.seed(4559) 49 | 50 | if initialize_params is None: 51 | #self.initialize_params = np.zeros 52 | self.initialize_params = np.random.rand 53 | else: 54 | self.initialize_params = initialize_params 55 | 56 | self.param_len = param_len 57 | self.maxiter = maxiter ## To control the optimization routine. 58 | 59 | def get_eval(self, features, y): 60 | """ 61 | A wrapper of the likelihood and regularization terms for easy use with scipy's optimization routines. 62 | :param features: 63 | :param y: 64 | :return: the value of the objective function which is -loglikelihood + regularization. We want to 65 | minimize it. 66 | """ 67 | 68 | def func(beta): 69 | return -self.likelihood.eval(beta, features, y) + self.regularization.eval(beta) 70 | return func 71 | 72 | def get_gradient(self, features, y): 73 | """ 74 | A wrapper of the likelihood and regularization terms for easy use with scipy's optimization routines. 75 | :param features: 76 | :param y: 77 | :return: 78 | """ 79 | 80 | def func(beta): 81 | return -self.likelihood.eval_gradient(beta, features, y) + self.regularization.eval_gradient(beta) 82 | return func 83 | 84 | def get_hessian(self, features, y): 85 | """ 86 | A wrapper of the likelihood and regularization terms for easy use with scipy's optimization routines. 87 | :param features: 88 | :param y: 89 | :return: 90 | """ 91 | 92 | def func(beta): 93 | return -self.likelihood.eval_hessian(beta, features, y) + self.regularization.eval_hessian(beta) 94 | return func 95 | 96 | 97 | def fit(self, x, y): 98 | """ 99 | training the model. 100 | :param x: the design matrix. It doesn't need to have the constant column, because we are adding one. 101 | :param y: the observed independent variables. 102 | :return: 103 | """ 104 | # add the constant column as the last column 105 | features = np.c_[x, np.ones(x.shape[0])] 106 | 107 | # setup the param length. This is usually the number of features plus 1 for the constant term. but there are exceptions, such as negative binomial. 108 | if self.param_len is None: 109 | self.param_len = features.shape[1] 110 | 111 | initial_params = self.initialize_params(self.param_len) 112 | 113 | eval_func = self.get_eval(features, y) 114 | eval_gradient_func = self.get_gradient(features, y) 115 | 116 | # I have tried Newton, Secant, Conjugate Gradient etc to see which one is more robust. Speed is less important. 117 | # http://scipy-lectures.github.com/advanced/mathematical_optimization/index.html 118 | # eval_hessian_func = self.get_hessian(features, y) 119 | 120 | self.params = optimize.fmin_bfgs(eval_func, initial_params, fprime=eval_gradient_func, maxiter=self.maxiter) 121 | 122 | def predict(self, x): 123 | """ 124 | This predict actually returns the the inverse of the expectation. 
125 | :param x: design matrix 126 | :return: 127 | """ 128 | features = np.c_[x, np.ones(x.shape[0])] 129 | return np.inner(features, self.params) 130 | 131 | def save_model(self, model_file=None): 132 | """Serialize model to model_file or return the json str if no file is provided.""" 133 | m = {'params':self.params.tolist()} 134 | 135 | if model_file is None: 136 | return json.dumps(m) 137 | else: 138 | with open(model_file, 'w') as f: 139 | json.dump(m, f) 140 | return None 141 | 142 | @classmethod 143 | def load_model(cls, model_file): 144 | """ 145 | load the model from a file or a json block. 146 | """ 147 | 148 | if isinstance(model_file, six.string_types): 149 | params = json.load(open(model_file, 'r')) 150 | else: 151 | params = model_file 152 | ret = cls() 153 | ret.params = np.array(params['params']) 154 | return ret 155 | 156 | class PoissonRegression(SimplifiedGlmBase): 157 | """ 158 | prob dist: Poisson. 159 | lambda is a linear function of feature variables. 160 | Expected value is exp(w * x) where x * x is the inner product. 161 | """ 162 | def __init__(self, *args, **kw): 163 | 164 | super(PoissonRegression, self).__init__(likelihood=prob_distributions.Poisson(), *args, **kw) 165 | 166 | class NegativeBinomialWithKstarRegression(SimplifiedGlmBase): 167 | """ 168 | prob dist: Poisson. 169 | lambda is a linear function of feature variables. 170 | Expected value is exp(w * x) where x * x is the inner product. 171 | """ 172 | def __init__(self, beta_k_len, initial_k_star=9, regular=None, lam=1.0, *args, **kw): 173 | """ 174 | :param initial_k_star: the initial value for k*, ie. log(k). 175 | """ 176 | 177 | if regular is None: 178 | ## The first entry is the k_star, ie. the number of failures in negative binomial. 179 | ## The last entry is the constant term in the linear regression. 180 | regular = regularization.RidgeRegularizationChosen(lam, dim=beta_k_len, free_list=[0, beta_k_len - 1]) 181 | 182 | self.initial_k_star = initial_k_star 183 | 184 | super(NegativeBinomialWithKstarRegression, self).__init__(lam=None, regular=regular, 185 | likelihood=prob_distributions.NegativeBinomialWithKstar(), 186 | initialize_params=self.initialize_params_withk, 187 | param_len=beta_k_len, *args, **kw) 188 | 189 | def initialize_params_withk(self, cnt): 190 | """ 191 | To return a function to initialize the beta and initial k*. 192 | cnt should be one more of all the features. 193 | """ 194 | params = np.zeros(cnt) 195 | params[0] = self.initial_k_star 196 | 197 | return params 198 | 199 | def predict(self, x): 200 | """ 201 | This overrides the base class's predict. 202 | :param x: design matrix 203 | :return: predicted y. 204 | """ 205 | # The first entry of the params is the k*, ie. log(k) 206 | params = self.params[1:] 207 | 208 | features = np.c_[x, np.ones(x.shape[0])] 209 | return np.inner(features, params) 210 | 211 | class ExponentialGlm(SimplifiedGlmBase): 212 | """ 213 | prob dist: Exponential. 214 | lambda is a linear function of feature variables. 215 | Expected value is exp(w * x) where w * x is actually the inner product of (w, x) 216 | """ 217 | 218 | def __init__(self, *args, **kw): 219 | 220 | super(ExponentialGlm, self).__init__(likelihood=prob_distributions.Exponential(), *args, **kw) 221 | 222 | def random_initialize_params(array_len): 223 | """ 224 | Create an 1-d array that is uniformly randomly chosen in [-0.5, 0.5] 225 | :param array_len: how long is the array. 226 | :return: the numpy array. 
227 | """ 228 | return np.random.rand(array_len) - 0.5 229 | 230 | 231 | -------------------------------------------------------------------------------- /mozsci/histogram.py: -------------------------------------------------------------------------------- 1 | """ 2 | Fast 1D empirical histogram sampler. 3 | 4 | Efficently compute binned histograms from large streaming 5 | data sets, using cython to speed up the slow steps. 6 | The speed is typically 10-100X faster then the corresponding numpy 7 | routine. 8 | 9 | Provides capabilities to estimate a probability density 10 | function from data, sample from a given distribution, 11 | plot, serialize to/from a file. 12 | """ 13 | from __future__ import absolute_import 14 | 15 | import numpy as np 16 | 17 | from ._c_utils import histogram1d_update, histogram1d_update_counts 18 | from ._c_utils import histogram1d_compute_indices 19 | 20 | class Histogram1DFast(object): 21 | """A fast 1D histogram sampler 22 | for evenly spaced bins""" 23 | def __init__(self, bins, mn, mx): 24 | """bins evenly spaced bins from mn to mx""" 25 | self.bins = int(bins) 26 | self.bin_width = (mx - mn) / float(bins) 27 | self.bin_count = np.zeros((bins, ), np.int) 28 | self.bin_edges = mn + self.bin_width * np.arange(self.bins + 1) 29 | self.bin_centers = 0.5 * (self.bin_edges[0:-1] + self.bin_edges[1:]) 30 | self.mx = float(mx) 31 | self.mn = float(mn) 32 | self._pdf_updated = False 33 | self.pdf = np.zeros((bins, ), np.float) 34 | self.cdf = np.zeros((bins, ), np.float) 35 | 36 | def update(self, data): 37 | """data is a 1D array to update histogram with 38 | Note: pdf, cdf are not updated after updating the counts 39 | if updated values are needed, client should call self.compute_pdf_cdf() 40 | before accessing. TODO: .pdf and .cdf attributes that lazily 41 | compute/return based on the value of self._pdf_updated""" 42 | bin_count = self.bin_count 43 | bin_width = self.bin_width 44 | mn = self.mn 45 | bins1 = self.bins - 1 46 | histogram1d_update(data.astype(np.float), bin_count, bin_width, 47 | bins1, mn) 48 | self._pdf_updated = False 49 | 50 | def plot(self, ti, fignum): 51 | """Plots the current histogram count 52 | ti = the title 53 | fignum = make this figure number 54 | plots both counts and log(counts) 55 | returns fig""" 56 | import pylab as plt 57 | 58 | fig = plt.figure(fignum) 59 | fig.clf() 60 | 61 | plt.subplot(211) 62 | plt.plot(self.bin_centers, self.bin_count) 63 | plt.ylabel("# " + ti) 64 | 65 | plt.subplot(212) 66 | plt.plot(self.bin_centers, np.log(self.bin_count + 1)) 67 | plt.ylabel("log(# " + ti + ')') 68 | 69 | return fig 70 | 71 | 72 | def update_counts(self, data, counts): 73 | """data is a 1D array of x values, counts is a 1D array 74 | of counts to add""" 75 | ndata = len(data) 76 | assert len(counts) == ndata 77 | bin_count = self.bin_count 78 | bin_width = float(self.bin_width) 79 | mn = float(self.mn) 80 | bins1 = self.bins - 1 81 | histogram1d_update_counts(data.astype(np.float), bin_count, bin_width, 82 | bins1, mn, counts.astype(np.float)) 83 | self._pdf_updated = False 84 | 85 | def compute_indices(self, data): 86 | """Compute the indices in the histogram corresponding to data, 87 | but do not update""" 88 | ndata = len(data) 89 | mn = self.mn 90 | bins1 = self.bins - 1 91 | bin_index = np.zeros(data.shape, np.int) 92 | bin_width = self.bin_width 93 | histogram1d_compute_indices(data.astype(np.float), bin_width, 94 | bins1, mn, bin_index) 95 | return bin_index 96 | 97 | 98 | def compute_pdf_cdf(self): 99 | """Compute and store the 
pdf and cdf of bin_count""" 100 | if not self._pdf_updated: 101 | ndata = self.bin_count.sum() 102 | if ndata > 0: 103 | self.pdf = self.bin_count / float(self.bin_count.sum()) 104 | self.cdf = self.pdf.cumsum() 105 | else: 106 | self.pdf = None 107 | self.cdf = None 108 | self._pdf_updated = True 109 | 110 | def sample(self, N, return_edge=False, return_index=False): 111 | """Returns N samples of x 112 | if return_edge = True then returns the left bin_edge 113 | instead of a random sample from the interval 114 | if return_index = True then return the index of 115 | the selected bin 116 | Can't have both return_index and return_edge""" 117 | assert not (return_edge and return_index) 118 | if not self._pdf_updated: 119 | self.compute_pdf_cdf() 120 | 121 | # sample the cdf 122 | # numpy's searchsorted uses binary search 123 | # and returns the left bin edge index 124 | rand1 = np.random.rand(N) 125 | samples = self.cdf.searchsorted(rand1) 126 | if return_index: 127 | ret = samples 128 | elif return_edge: 129 | ret = self.bin_edges[samples] 130 | else: 131 | rand2 = np.random.rand(N) 132 | ret = self.bin_edges[samples] + rand2 * self.bin_width 133 | return ret 134 | 135 | def stratified_sample(self, x, sample_size=None, indices=False, empty_bin_rate=0.01): 136 | """Input: 137 | X = (N, ) numpy vector of samples from this distribution, 138 | sample_size = (self.bins, ) vector. This gives the 139 | total number of samples to take from this distribution 140 | for each of the histogram bins. 141 | If None, then uses the last cached value 142 | empty_bin_rate = if the bin_count == 0 for any bins, then the sampling 143 | rate in them is set to empty_bin_rate. 144 | Note: this is only used if sample_size is also provided 145 | Output: 146 | if indices == False, return a sample from X stratified according 147 | to sample_size 148 | if indices == True, return the indices into X to make that sample""" 149 | 150 | if sample_size is not None: 151 | # update sampling rate 152 | gt0_count = self.bin_count > 0 153 | sz = np.asarray(sample_size) 154 | self._stratified_sampling_rate = np.zeros(sz.shape) 155 | self._stratified_sampling_rate[gt0_count] = sz[gt0_count] / self.bin_count[gt0_count].astype(np.float) 156 | self._stratified_sampling_rate[~gt0_count] = empty_bin_rate 157 | 158 | # strategy: find the sampling rate for each point in the input 159 | # vector x. choose it with that sampling rate 160 | xindices = self.compute_indices(x) 161 | nsamples = len(xindices) 162 | r = np.random.rand(nsamples) 163 | indices_accept = np.arange(nsamples)[r < self._stratified_sampling_rate[xindices]] 164 | if indices: 165 | return indices_accept 166 | else: 167 | return x[indices_accept] 168 | 169 | def plot_joint_marginal(x, y, 170 | N=50, range_x=None, range_y=None, log_joint=False, 171 | xtitle=None, ytitle=None, title=None, 172 | fignum=1, show=True, outfile=None): 173 | """ 174 | Makes a pretty joint/marginal probability plot 175 | 176 | In the main square we plot the joint PDF 177 | On each axis we also add the marginal PDFs 178 | Correlations optionally added to the title 179 | 180 | Input: 181 | N = number of bins 182 | range_x/range_y = the ranges for x and y. 
If None, uses 183 | min/max values 184 | log_joint = if True, then plot log(joint counts), 185 | otherwise just use joint(counts) 186 | xtitle/ytitle/title = strings to add for description 187 | 188 | fignum = plot in this figure 189 | show = if True, does a fig.show() 190 | Returns the fig object 191 | """ 192 | import pylab as plt 193 | from mpl_toolkits.axes_grid1 import make_axes_locatable 194 | 195 | if range_x is None: 196 | range_x = [x.min(), x.max()] 197 | if range_y is None: 198 | range_y = [y.min(), y.max()] 199 | 200 | # make a 2D histogram of the input for contour plotting 201 | # any bins with density 0 we will set to NaN so they aren't plotted 202 | data_hist_2D = np.histogram2d(x, y, bins=[N, N+1], range=[range_x, range_y]) 203 | x_bins = 0.5 * (data_hist_2D[1][0:-1] + data_hist_2D[1][1:]) 204 | y_bins = 0.5 * (data_hist_2D[2][0:-1] + data_hist_2D[2][1:]) 205 | data_hist_2D = data_hist_2D[0] 206 | data_hist_2D[data_hist_2D == 0] = np.nan 207 | if log_joint: 208 | data_hist_2D = np.log(data_hist_2D + 1) 209 | 210 | fig = plt.figure(fignum) 211 | fig.clf() 212 | 213 | # the contour plot in the middle with joint PDF 214 | axScatter = plt.subplot(111) 215 | axScatter.contourf(x_bins, y_bins, data_hist_2D.T, ncontours=10) 216 | plt.xlabel(xtitle) 217 | plt.ylabel(ytitle) 218 | 219 | divider = make_axes_locatable(axScatter) 220 | axHistx = divider.append_axes("top", 1.2, pad=0.1, sharex=axScatter) 221 | axHisty = divider.append_axes("right", 1.2, pad=0.1, sharey=axScatter) 222 | dummy = plt.setp(axHistx.get_xticklabels() + axHistx.get_yticklabels() + axHisty.get_xticklabels() + axHisty.get_yticklabels(), visible=False) 223 | 224 | axHisty.hist(y, N+1, range=range_y, orientation='horizontal') 225 | axHistx.hist(x, N, range=range_x) 226 | 227 | if title: 228 | plt.figtext(0.5, 0.94, title, 229 | ha='center', color='black', weight='bold', size='large') 230 | 231 | if show: 232 | fig.show() 233 | if outfile is not None: 234 | plt.savefig(outfile) 235 | return fig 236 | 237 | 238 | 239 | 240 | 241 | -------------------------------------------------------------------------------- /mozsci/inputs.py: -------------------------------------------------------------------------------- 1 | """Input feature manipulation, including normalizations""" 2 | from __future__ import absolute_import 3 | 4 | import numpy as np 5 | 6 | from sklearn.preprocessing import StandardScaler 7 | from six.moves import range 8 | 9 | def mean_std_weighted(x, weights=None): 10 | """Computes weighted mean and standard deviation. 
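    With weights w, the mean is sum(w * x) / sum(w) and the standard
    deviation is sqrt(sum(w * (x - mean)**2) / sum(w)); with equal weights
    this reduces to the ordinary numpy mean and (population) std.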
11 | 12 | x = a (N, ) or an (N, nx) numpy array 13 | weights = a (N, ) numpy array of weights or None (no weights) 14 | 15 | Returns {'mean':[means], 'std':[standard deviations]} 16 | where each value is a len(nx) array for each feature 17 | """ 18 | if weights is None: 19 | ret = {'mean': np.mean(x, axis=0), 'std': np.std(x, axis=0) } 20 | else: 21 | # weighted mean/std 22 | # reshape x to 1 dim 23 | m = np.average(x, axis=0, weights=weights) 24 | v = np.sqrt(np.dot(weights, (x - m)**2) / weights.sum()) 25 | ret = {'mean': m, 'std': v} 26 | 27 | # replace zero values 28 | if len(x.shape) == 1: 29 | if ret['std'] == 0: 30 | ret['std'] = 1 31 | else: 32 | zero_std = [k for k in range(x.shape[1]) if ret['std'][k] < 1e-16] 33 | for i in zero_std: 34 | ret['std'][i] = 1.0 35 | 36 | return ret 37 | 38 | 39 | class IdentityTransformer(object): 40 | ''' 41 | Identity transformer that implements sklearn Transformer API 42 | ''' 43 | def transform(self, X, *args, **kwargs): 44 | return X 45 | 46 | def fit(self, X, *args, **kwargs): 47 | pass 48 | 49 | 50 | class LogScaledTransformer(StandardScaler): 51 | def __init__(self, offset=0.0, **kwargs): 52 | ''' 53 | Take log(X+offset) then apply mean-std scaling. 54 | **kwargs: passed into StandardScaler.__init__ 55 | 56 | we ignore the copy options for convenience 57 | ''' 58 | super(LogScaledTransformer, self).__init__(**kwargs) 59 | self._offset = offset 60 | 61 | def _log(self, X): 62 | return np.log(X + self._offset) 63 | 64 | def fit(self, X, *args, **kwargs): 65 | XX = self._log(X) 66 | return super(LogScaledTransformer, self).fit(XX, *args, **kwargs) 67 | 68 | def transform(self, X, *args, **kwargs): 69 | XX = self._log(X) 70 | return super(LogScaledTransformer, self).transform( 71 | XX, *args, **kwargs) 72 | 73 | def inverse_transform(self, X, *args, **kwargs): 74 | XX = super(LogScaledTransformer, self).inverse_transform( 75 | X, *args, **kwargs) 76 | return np.exp(XX) - self._offset 77 | 78 | 79 | class BucketTransformer(object): 80 | ''' 81 | Transform a float to a categorical variable and represent as 82 | 1-in-k encoding. 83 | ''' 84 | def __init__(self, bin_edges): 85 | ''' 86 | bin_edges: edges for the len(bin_edges) + 1 bins. They are: 87 | 88 | bin_edges = [x0, x1, ..., xn] 89 | x <= x0 90 | x0 < x <= x1 91 | ... 92 | xn < x 93 | ''' 94 | from sklearn.preprocessing import Binarizer 95 | self._binarizers = [Binarizer(threshold=-np.inf)] 96 | self._binarizers.extend( 97 | [Binarizer(threshold=edge) for edge in bin_edges]) 98 | self._nbins = len(self._binarizers) 99 | 100 | def fit(self, *args, **kwargs): 101 | pass 102 | 103 | def transform(self, X): 104 | ''' 105 | X = len N vector 106 | return (N, nbins) matrix with 1-in-k encoding 107 | ''' 108 | assert len(X.shape) == 1 or min(X.shape) == 1 109 | 110 | ret = np.zeros((len(X), self._nbins)) 111 | for k, binarizer in enumerate(self._binarizers): 112 | ret[:, k] = binarizer.transform(X.flatten()) 113 | 114 | # since binarizer is 0-1 for whether X is less then the threshold 115 | # we need the last 1 in each column, e.g. 
116 | # 117 | # [1, 1, 0, 0] we change to [0, 1, 0, 0] 118 | # can get the value by subtracting the previous column 119 | for k in range(self._nbins-1): 120 | ret[:, k] = ret[:, k] - ret[:, k+1] 121 | return ret 122 | 123 | -------------------------------------------------------------------------------- /mozsci/map_train.py: -------------------------------------------------------------------------------- 1 | """Train models in parallel""" 2 | from __future__ import absolute_import 3 | 4 | import numpy as np 5 | from six.moves import range 6 | 7 | class TrainModelCV(object): 8 | def __init__(self, 9 | model_description=[None, None, '', (), {}], 10 | X=None, y=None, Xtest=None, ytest=None, 11 | folds=None, weights=None, weightstest=None, fit_kwargs={}): 12 | """ 13 | model_description = [model_init, error, model_save_file, args, kwargs] 14 | WHERE 15 | model_init = a callable thing model_init(args, kwargs) that returns a model 16 | object. This has an interface as follows: 17 | model.fit(X, y) = trains 18 | model.predict(X) = predicts 19 | model.save_model(filename) = serializes model to a file 20 | error = a callable thing that computes error as error(Yactual, Ypred) 21 | model_save_file = if provided, then saves the model to this file 22 | args, kwards = passed to model_init(*args, **kwargs) 23 | 24 | fit_kwargs = anything to pass down the model.fit routine (error tolerance, etc) 25 | 26 | X, y = training dataset (required) 27 | Xtest, Ytest = testing dataset (if provided, then computes error on this dataset 28 | 29 | folds = if provided, then gives a set of splits to use for k-fold cross validation. 30 | folds is a length-k list. Each element of the list is a tuple, where the first 31 | element of the tuple gives the training indices, the second the test indices. 32 | folds can easily be generated with a call to cv_kfold. 33 | 34 | If doing a k-fold CV, then Xtest and ytest are ignored, and X and y are split 35 | (and an error is raised if Xtest and ytest are provided). 36 | The model_save_file is also ignored in this case 37 | The errors data structure reports the average error for each fold. 
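        A minimal sketch of the intended call pattern (X_train, y_train,
        X_test and y_test are placeholder arrays; LogisticRegression and
        classification_error are this package's own model and error callables):

            trainer = TrainModelCV(
                [LogisticRegression, classification_error, None, (), {'lam': 0.5}],
                X=X_train, y=y_train, Xtest=X_test, ytest=y_test)
            errors = trainer.run()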
38 | """ 39 | self.model_description = model_description 40 | self.model_init = model_description[0] 41 | self.error = model_description[1] 42 | self.model_save_file = model_description[2] 43 | self.X = X 44 | self.y = y 45 | self.Xtest = Xtest 46 | self.ytest = ytest 47 | self.folds = folds 48 | if folds is not None: 49 | assert Xtest is None and ytest is None 50 | self.weights = weights 51 | self.weightstest = weightstest 52 | self._fit_kwargs = fit_kwargs 53 | 54 | 55 | def run(self): 56 | if self.folds is not None: 57 | errors = self._run_kfold() 58 | else: 59 | errors, model = self._run_one_train_test(self.X, self.y, self.Xtest, self.ytest, self.weights, self.weightstest, fit_kwargs=self._fit_kwargs) 60 | 61 | # save to file if needed 62 | if self.model_save_file is not None: 63 | model.save_model(self.model_save_file) 64 | 65 | # prepare errors for output 66 | errors_ret = {} 67 | errors_ret[str(self.model_description)] = errors 68 | 69 | return errors_ret 70 | 71 | 72 | def _run_kfold(self): 73 | # do k-fold cross validation 74 | errors = [] 75 | for k in range(len(self.folds)): 76 | train_indices = self.folds[k][0] 77 | test_indices = self.folds[k][1] 78 | 79 | if self.weights is None: 80 | this_error, model = self._run_one_train_test(self.X[train_indices, :], self.y[train_indices], self.X[test_indices, :], self.y[test_indices], fit_kwargs=self._fit_kwargs) 81 | else: 82 | this_error, model = self._run_one_train_test(self.X[train_indices, :], self.y[train_indices], self.X[test_indices, :], self.y[test_indices], self.weights[train_indices], self.weights[test_indices], fit_kwargs=self._fit_kwargs) 83 | 84 | errors.append(this_error) 85 | 86 | # return average error 87 | # for aggregate error functions, can return 88 | # errors['train'] = {'error1': 0.5, 'error2': 0.2}, ... 89 | # also support this case 90 | ret = {} 91 | if type(errors[0]['train']) == dict: 92 | for k in ['train', 'test']: 93 | ret[k] = {} 94 | for error_type in errors[0]['train'].keys(): 95 | ret[k][error_type] = np.mean([ele[k][error_type] for ele in errors]) 96 | else: 97 | for k in ['train', 'test']: 98 | ret[k] = np.mean([ele[k] for ele in errors]) 99 | 100 | return ret 101 | 102 | 103 | def _run_one_train_test(self, X, y, Xtest, ytest, weights=None, weightstest=None, fit_kwargs={}): 104 | # initialize model 105 | # train 106 | # compute error 107 | 108 | # initialize 109 | model = self.model_init(*self.model_description[3], **self.model_description[4]) 110 | 111 | # train 112 | try: 113 | model.fit(X, y, weights=weights, **fit_kwargs) 114 | except TypeError: # model doesn't do weighted learning 115 | model.fit(X, y, **fit_kwargs) 116 | 117 | # compute error 118 | errors = {} 119 | ypred = model.predict(X) 120 | if weights is None: 121 | errors['train'] = self.error(y, ypred) 122 | else: 123 | errors['train'] = self.error(y, ypred, weights=weights) 124 | 125 | 126 | if Xtest is not None: 127 | ypred = model.predict(Xtest) 128 | if weightstest is None: 129 | errors['test'] = self.error(ytest, ypred) 130 | else: 131 | errors['test'] = self.error(ytest, ypred, weights=weightstest) 132 | else: 133 | errors['test'] = None 134 | 135 | return errors, model 136 | 137 | 138 | def _pool_helper(model_description, X=None, y=None, Xtest=None, ytest=None, 139 | folds=None, weights=None, weightstest=None): 140 | # a helper for Pool class. 
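    # defined at module level (not as a method or closure) because
    # multiprocessing.Pool can only dispatch picklable callables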
141 | # this creates an instance of TrainModelCV and runs it 142 | trainer = TrainModelCV(model_description, 143 | X=X, y=y, Xtest=Xtest, ytest=ytest, 144 | folds=folds, weights=weights, weightstest=weightstest) 145 | return trainer.run() 146 | 147 | 148 | 149 | def run_train_models(processes, model_library, **kwargs): 150 | """Train many supervised learning problems in parallel 151 | 152 | model_library = a list specifying the model library for the dataset in 153 | format needed for TrainModelCV 154 | **kwargs: all the rest of the input to TrainModelCV""" 155 | # sample input for model_library: 156 | # [[LogisticRegression, classification_error, 'parameters.json', (), {'lam':0.5}], 157 | # [LogisticRegression, auc_wmw_fast, None, (), {'C':50}]] 158 | 159 | 160 | if processes > 1: 161 | 162 | # use a process pool top execute all the training jobs 163 | # collect the results and combine to return 164 | from multiprocessing import Pool 165 | 166 | p = Pool(processes) 167 | 168 | results = [] 169 | for model in model_library: 170 | results.append(p.apply_async(_pool_helper, (model, ), kwargs)) 171 | 172 | # wait on the pool to finish 173 | p.close() 174 | p.join() 175 | 176 | # collect the results 177 | ret = {} 178 | for result in results: 179 | ret.update(result.get()) 180 | 181 | else: 182 | # don't need a pool 183 | ret = {} 184 | for model in model_library: 185 | ret.update(_pool_helper(model, **kwargs)) 186 | 187 | return ret 188 | 189 | 190 | -------------------------------------------------------------------------------- /mozsci/models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .logistic_regression import LogisticRegression 3 | from .linear_regression import LinearRegression 4 | 5 | 6 | -------------------------------------------------------------------------------- /mozsci/models/linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import numpy as np 4 | from scipy.optimize import fmin_bfgs 5 | import json 6 | import six 7 | from six.moves import range 8 | 9 | class LinearRegression(object): 10 | def __init__(self, lam=1.0): 11 | """lam = regularization parameter""" 12 | self.lam = lam 13 | 14 | # these are set in fit 15 | self.b = None # float 16 | self.w = None # (nvars, ) array 17 | 18 | def predict(self, x): 19 | """Make a prediction. 20 | Return P(y == 1 | x) 21 | 22 | x = (Nobs, nvars) 23 | """ 24 | return np.sum(self.w * x, axis=1) + self.b 25 | 26 | def fit(self, x, yy, weights=None): 27 | """Train the model. 
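        Minimizes the squared-error loss plus an L2 penalty,
        sum((w . x + b - y)**2) + 0.5 * lam * sum(w**2), via BFGS.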
28 | 29 | x = (Nobs, nvars) 30 | y = (Nobs, ) 31 | 32 | Bias term automatically added 33 | 34 | Returns the loss""" 35 | # transform y to vector 36 | if len(yy.shape) > 1: 37 | assert len(yy.shape) == 2 and yy.shape[1] == 1 38 | y = yy.reshape(-1, ) 39 | else: 40 | y = yy 41 | 42 | def _loss_for_optimize(params): 43 | return LinearRegression._loss(x, y, params[0], params[1:], self.lam, weights) 44 | def _gradient_for_optimize(params): 45 | return LinearRegression._gradient_loss(x, y, params[0], params[1:], self.lam, weights) 46 | 47 | params_opt = fmin_bfgs(_loss_for_optimize, np.zeros(1 + x.shape[1]), fprime=_gradient_for_optimize, maxiter=200) 48 | 49 | self.b = params_opt[0] 50 | self.w = params_opt[1:] 51 | 52 | return _loss_for_optimize(params_opt) 53 | 54 | def save_model(self, model_file): 55 | """Serialize model to model_file""" 56 | m = {'b':self.b, 57 | 'w':self.w.tolist()} 58 | 59 | with open(model_file, 'w') as f: 60 | json.dump(m, f) 61 | 62 | @classmethod 63 | def load_model(cls, model_file): 64 | '''If a string is provided, it's assumed to be a path to a file 65 | containing a JSON blob describing the model. Otherwise, it should 66 | be a dictionary representing the model''' 67 | if isinstance(model_file, six.string_types): 68 | params = json.load(open(model_file, 'r')) 69 | else: 70 | params = model_file 71 | ret = cls() 72 | ret.b = float(params['b']) 73 | ret.w = np.array(params['w']) 74 | return ret 75 | 76 | @staticmethod 77 | def _loss(x, y, b, w, lam, weights=None): 78 | """Return loss function at x. 79 | loss = sum_squared loss + 0.5 * lambda * sum(w**2) 80 | weights = if provided an (N, ) list of weights 81 | """ 82 | loss = 0.5 * lam * np.sum(w ** 2) 83 | if weights is None: 84 | loss += np.sum((np.sum(w * x, axis=1) + b - y) ** 2) 85 | else: 86 | loss += np.sum(weights * (np.sum(w * x, axis=1) + b - y) ** 2) 87 | return loss 88 | 89 | @staticmethod 90 | def _gradient_loss(x, y, b, w, lam, weights=None): 91 | """Return the gradient of the loss. 92 | 93 | x0 = (N, nvars) numpy array of x 94 | y = prediction 95 | 96 | gradient = loss + self.lam * w 97 | 98 | weights = if provided an (N, ) array to add in to each 99 | """ 100 | nvars = len(w) 101 | gradient = np.zeros(nvars + 1) # first position is b 102 | gradient[1:] = lam * w 103 | 104 | # need sum(f(x) - y) * x for all variables 105 | error = np.sum(w * x, axis=1) + b - y 106 | if weights is None: 107 | gradient[0] = np.sum(error) # * 1 for bias term 108 | for k in range(nvars): 109 | gradient[k + 1] += np.sum(error * x[:, k]) 110 | else: 111 | gradient[0] = np.sum(error * weights) # * 1 for bias term 112 | for k in range(nvars): 113 | gradient[k + 1] += np.sum(weights * error * x[:, k]) 114 | 115 | gradient *= 2 116 | 117 | return gradient 118 | 119 | -------------------------------------------------------------------------------- /mozsci/models/logistic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | import json 6 | import six 7 | from six.moves import range 8 | 9 | class LogisticRegression(object): 10 | def __init__(self, lam=1.0): 11 | """lam = regularization parameter""" 12 | self.lam = lam 13 | 14 | # these are set in fit 15 | self.b = None # float 16 | self.w = None # (nvars, ) array 17 | 18 | def predict(self, x): 19 | """Make a prediction. 
20 | Return P(y == 1 | x) 21 | 22 | x = (Nobs, nvars) 23 | """ 24 | return LogisticRegression._sigmoid(x, self.b, self.w) 25 | 26 | def fit(self, x, y, weights=None, **kwargs): 27 | """Train the model. 28 | 29 | x = (Nobs, nvars) 30 | y = (Nobs, ) = {0, 1} 31 | 32 | Bias term automatically added 33 | 34 | Returns the loss 35 | 36 | **kwags passed into fmin_l_bfgs_b""" 37 | from scipy.optimize import fmin_l_bfgs_b 38 | 39 | assert len(y) == x.shape[0] 40 | assert weights is None or len(weights) == x.shape[0] 41 | 42 | y0 = y == 0 43 | x0 = x[y0, :] 44 | x1 = x[~y0, :] 45 | 46 | if weights is None: 47 | loss_weights = None 48 | else: 49 | loss_weights = [weights[y0], weights[~y0]] 50 | 51 | def _loss_for_optimize(params): 52 | return LogisticRegression._loss_gradient(x0, x1, params[0], params[1:], self.lam, loss_weights) 53 | 54 | params0 = np.zeros(1 + x.shape[1]) 55 | params_opt, loss_opt, info_opt = fmin_l_bfgs_b(_loss_for_optimize, params0, disp=0, **kwargs) 56 | print(("%s funcalls: %s" % (info_opt['task'], info_opt['funcalls']))) 57 | 58 | self.b = params_opt[0] 59 | self.w = params_opt[1:] 60 | 61 | def save_model(self, model_file): 62 | """Serialize model to model_file""" 63 | m = {'b':self.b, 64 | 'w':self.w.tolist()} 65 | 66 | with open(model_file, 'w') as f: 67 | json.dump(m, f) 68 | 69 | @classmethod 70 | def load_model(cls, model_file): 71 | '''If a string is provided, it's assumed to be a path to a file 72 | containing a JSON blob describing the model. Otherwise, it should 73 | be a dictionary representing the model''' 74 | if isinstance(model_file, six.string_types): 75 | params = json.load(open(model_file, 'r')) 76 | else: 77 | params = model_file 78 | ret = cls() 79 | ret.b = float(params['b']) 80 | ret.w = np.array(params['w']) 81 | return ret 82 | 83 | @staticmethod 84 | def _sigmoid(x, b, w): 85 | """Return sigma(x) = 1.0 / (1.0 + exp(-x * w - b)) 86 | X = N x (nvars) 87 | 88 | Returns a (N, ) array""" 89 | return np.minimum(np.maximum(1.0 / (1.0 + np.exp(-b - np.sum(w * x, axis=1))), 1.0e-12), 1 - 1.0e-12) 90 | 91 | @staticmethod 92 | def _loss_gradient(x0, x1, b, w, lam, weights=None): 93 | """Return loss/gradient function at x. 94 | x0 = (N0, nvars) numpy array of x where y == 0 95 | x1 = (N1, nvars) numpy array of x where y == 1 96 | 97 | loss = Logistic loss + 0.5 * lambda * sum(w**2) 98 | logistic loss = -sum ( log(sigmoid(x)) y == 1 99 | log(1 - sigmoid(x)) if y == 0 ) 100 | weights = if provided an [(N0, ), (N1, )] list of arrays to add in to each 101 | observation's contribution to error. 102 | first entry corresponds to x0, second to x1 103 | """ 104 | nvars = len(w) 105 | 106 | # initialize + regularization term 107 | loss = 0.5 * lam * np.sum(w ** 2) 108 | gradient = np.zeros(nvars + 1) # first position is b 109 | gradient[1:] = lam * w 110 | 111 | # we need prediction for x 112 | pred_x_0_1 = [LogisticRegression._sigmoid(x0, b, w), LogisticRegression._sigmoid(x1, b, w)] 113 | 114 | # the log likelihood 115 | log_like_x_0_1 = [np.log(1.0 - pred_x_0_1[0]), 116 | np.log(pred_x_0_1[1])] 117 | 118 | # also need the error for gradient. 
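        # d(-log(1 - sigma))/dz = sigma      for the y == 0 examples
        # d(-log(sigma))/dz     = sigma - 1  for the y == 1 examples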
119 | error = [pred_x_0_1[0], 120 | pred_x_0_1[1] - 1] 121 | 122 | if weights is None: 123 | loss += -np.sum(log_like_x_0_1[1]) - np.sum(log_like_x_0_1[0]) 124 | gradient[0] += np.sum(error[0]) + np.sum(error[1]) # * 1 for bias term 125 | for k in range(nvars): 126 | gradient[k + 1] += np.sum(error[0] * x0[:, k]) + np.sum(error[1] * x1[:, k]) 127 | else: 128 | loss += -np.sum(weights[1] * log_like_x_0_1[1]) - np.sum(weights[0] * log_like_x_0_1[0]) 129 | gradient[0] += np.sum(error[0] * weights[0]) + np.sum(error[1] * weights[1]) 130 | for k in range(nvars): 131 | gradient[k + 1] += ( np.sum(weights[0] * error[0] * x0[:, k]) + 132 | np.sum(weights[1] * error[1] * x1[:, k]) ) 133 | return loss, gradient 134 | 135 | -------------------------------------------------------------------------------- /mozsci/numpy_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import numpy as np 4 | import json 5 | 6 | class NumpyEncoder(json.JSONEncoder): 7 | """A JSON encoder for numpy arrays 8 | Use like json.dumps(data, cls=NumpyEncoder)""" 9 | def default(self, obj): 10 | if isinstance(obj, np.ndarray): 11 | return obj.tolist() 12 | return json.JSONEncoder.default(self, obj) 13 | 14 | 15 | def numpy_decoder(dct): 16 | """Decodes numpy arrays stored as values in a json dictionary 17 | Use like json.loads(j, object_hook=numpy_decoder)""" 18 | for k in dct.keys(): 19 | if isinstance(dct[k], list): 20 | try: 21 | dct[k] = np.asarray(dct[k]) 22 | except ValueError: 23 | pass # can't convert to numpy array so leave as is 24 | return dct 25 | 26 | 27 | def load_json_to_numpy(jsonfile): 28 | """Loads the data in the jsonfile using numpy_decoder to convert 29 | to numpy arrays. Returns the decoded data""" 30 | return json.load(open(jsonfile, 'r'), 31 | object_hook=numpy_decoder) 32 | 33 | 34 | -------------------------------------------------------------------------------- /mozsci/pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | # linear PCA 4 | import json 5 | 6 | import numpy as np 7 | 8 | from .numpy_util import numpy_decoder, NumpyEncoder 9 | from six.moves import range 10 | 11 | class LinearPCA(object): 12 | """Linear PCA by SVD""" 13 | 14 | def __init__(self, json_map=None): 15 | """Constructor 16 | If json_map is provided, then initializes from it""" 17 | if json_map is None: 18 | self.mean = None 19 | self.nvars = None 20 | self.eigval = None 21 | self.eigvec = None 22 | else: 23 | j = json.loads(json_map, object_hook=numpy_decoder) 24 | self.mean = j['mean'] 25 | self.nvars = j['nvars'] 26 | self.eigval = j['eigval'] 27 | self.eigvec = j['eigvec'] 28 | 29 | 30 | 31 | def train(self, data, fignum=None): 32 | """Train the PCA. 
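        Forms the covariance matrix of the mean-centred data, takes its
        eigendecomposition and sorts the components by decreasing eigenvalue.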
data is an (nobs, nvars) numpy array 33 | If fignum is not None, then plot the eigen values in the figure 34 | 35 | Returns nothing.""" 36 | assert isinstance(data, np.ndarray) and data.ndim == 2 37 | self.nvars = data.shape[1] 38 | self.mean = np.mean(data, 0) 39 | 40 | # do SVD of the data 41 | corr = np.cov((data - self.mean).T) 42 | (eigval, eigvec) = np.linalg.eig(corr) 43 | 44 | # sort eigenvalues, eigen vectors into ascending order 45 | sortindex = (-1.0 * eigval).argsort() 46 | eigval = eigval[sortindex] 47 | eigvec = eigvec[:, sortindex] 48 | 49 | self.eigval = eigval 50 | self.eigvec = eigvec 51 | 52 | # plot eigenvalues 53 | if fignum is not None: 54 | eigval_sum = self._compute_percent_explained() 55 | 56 | import pylab as plt 57 | fig = plt.figure(fignum) 58 | fig.clf() 59 | plt.plot(eigval_sum, 'bx') 60 | plt.title("Eigenvalues for PCA") 61 | fig.show() 62 | 63 | def _compute_percent_explained(self): 64 | """Computes percent explained from self.eigval""" 65 | eigval_sum = self.eigval.cumsum() 66 | eigval_cum_sum = eigval_sum / np.float(eigval_sum[-1]) 67 | percent_explain = np.hstack((eigval_cum_sum[0], eigval_cum_sum[1:] - eigval_cum_sum[0:-1])) 68 | return percent_explain 69 | 70 | def plot_eigvec(self, neig, fignum): 71 | """Plots the first neig eigenvectors for figure fignum""" 72 | import pylab as plt 73 | fig = plt.figure(fignum) 74 | fig.clf() 75 | pct_explain = self._compute_percent_explained() 76 | for k in range(neig): 77 | plt.plot(self.eigvec[:, k], label=str(k) + " " + str(round(pct_explain[k] * 100))) 78 | plt.legend() 79 | fig.show() 80 | 81 | 82 | def project(self, data, n): 83 | """Given the data, project onto the first n principle components. 84 | data must have the same number of variables as the data used 85 | in training. 86 | 87 | data is a (nobs, nvars) numpy array 88 | return is a (nobs, n) numpy array, the projection onto the PCA 89 | 90 | Note: the mean (self.mean) is removed from the data before 91 | projection so that the full projection is 92 | self.mean + SUM_k (projection_k * PC_k)""" 93 | assert data.ndim == 2 and data.shape[1] == self.nvars and n > 0 and n <= self.nvars 94 | return np.dot((data - self.mean), self.eigvec[:, 0:n]) 95 | 96 | def truncate(self, data, n): 97 | """Truncate the data to the n PCs. 98 | This projects on the first n PCs, then reconstructs data. 
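        The reconstruction is self.mean + dot(project(data, n), eigvec[:, 0:n].T),
        i.e. an n-component approximation of the input.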
99 | 100 | data is a (nobs, nvars) numpy array 101 | return is a (nobs, nvars) numpy array""" 102 | return self.mean + np.dot(self.project(data, n), self.eigvec[:, 0:n].T) 103 | 104 | def to_json(self): 105 | """Returns a json string with the PCA""" 106 | j = {} 107 | j['eigval'] = self.eigval 108 | j['eigvec'] = self.eigvec 109 | j['mean'] = self.mean 110 | j['nvars'] = self.nvars 111 | return json.dumps(j, cls=NumpyEncoder) 112 | 113 | 114 | -------------------------------------------------------------------------------- /mozsci/spearmanr_by_fast.pyx: -------------------------------------------------------------------------------- 1 | 2 | import cython 3 | 4 | # import both numpy and the Cython declarations for numpy 5 | import numpy as np 6 | cimport numpy as np 7 | 8 | # declare the interface to the C code 9 | cdef extern double c_spearman_for_python(double* a, double* b, np.int_t* byvar, int n) 10 | 11 | @cython.boundscheck(False) 12 | @cython.wraparound(False) 13 | def spearmanr_by(np.ndarray[double, ndim=1, mode="c"] a not None, 14 | np.ndarray[double, ndim=1, mode="c"] b not None, 15 | np.ndarray[np.int_t, ndim=1, mode="c"] byvar not None): 16 | """ 17 | Spearman correlation of a vs b by byvar 18 | 19 | Given a data set of x and y, grouped by the byvar, computes 20 | the spearman correlation for each group, then returns the average correlation 21 | across groups. 22 | 23 | byvar must be in sorted order. 24 | 25 | param: a -- a 1-d numpy array of np.float64 26 | param: b -- a 1-d numpy array of np.float64 27 | param: byvar -- the by groups, np.int64 28 | """ 29 | cdef int n 30 | n = a.shape[0] 31 | return c_spearman_for_python(&a[0], &b[0], &byvar[0], n) 32 | 33 | -------------------------------------------------------------------------------- /mozsci/variables.py: -------------------------------------------------------------------------------- 1 | """ 2 | A few useful abstractions for input/output variables in machine learning 3 | """ 4 | from __future__ import absolute_import 5 | import numpy as np 6 | from six.moves import zip 7 | 8 | 9 | class Variable(object): 10 | """ 11 | A Variable is one group of input or output to a model. 12 | """ 13 | def __init__(self, name, transformer, ndim=1, ndimout=1): 14 | """ 15 | name: the variable name 16 | transformer: implements the sklearn.Transformer API 17 | (fit, transform) 18 | ndim: the dimension of the variable (input) 19 | ndimout: the dimension of the output transform 20 | """ 21 | self.name = name 22 | self.ndim = ndim 23 | self.ndimout = ndimout 24 | self._transformer = transformer 25 | 26 | # forwarding methods 27 | def fit(self, *args, **kwargs): 28 | return self._transformer.fit(*args, **kwargs) 29 | 30 | def transform(self, *args, **kwargs): 31 | return self._transformer.transform(*args, **kwargs) 32 | 33 | 34 | class ModelVariables(object): 35 | """ 36 | Hold sets of input and output variables for the model 37 | """ 38 | def __init__(self, independent, dependent): 39 | """ 40 | independent: list of Variable instances for the model input 41 | dependent: list of Variable instances for the model output 42 | """ 43 | self.independent = independent 44 | self.dependent = dependent 45 | self.nin = sum([variable.ndim for variable in independent]) 46 | self.nout = sum([variable.ndim for variable in dependent]) 47 | 48 | 49 | class ModelDriver(object): 50 | """ 51 | This class is used to drive any model/algorithm for training and 52 | prediction purposes. 
It's specifically designed so that we don't need 53 | to worry about the normalization for cross validation procedures. It also 54 | supports the variable definitions that we use for data collection. 55 | """ 56 | def __init__(self, variables, model): 57 | """ 58 | variables: an instance of ModelVariables 59 | model: must implement the sklearn interface (fit, predict), as 60 | well as be picklable) 61 | """ 62 | self.variables = variables 63 | self.model = model 64 | 65 | def _get_array(self, data, variables, dim): 66 | ''' 67 | Get a numpy array from the data. 68 | ''' 69 | if isinstance(data, np.ndarray): 70 | shape = data.shape 71 | if len(shape) == 2 and shape[1] == dim: 72 | # data is already an array 73 | return data 74 | 75 | # otherwise data should implement __getitem__ 76 | first_var = variables[0] 77 | first_data = data[first_var.name] 78 | if isinstance(first_data, int) or isinstance(first_data, float): 79 | ret = np.zeros((1, dim)) 80 | else: 81 | ret = np.zeros((len(first_data), dim)) 82 | ind = first_var.ndim 83 | if ind == 1: 84 | ret[:, 0] = first_data 85 | else: 86 | ret[:, :ind] = first_data 87 | for variable in variables[1:]: 88 | if variable.ndim == 1: 89 | ret[:, ind] = data[variable.name] 90 | else: 91 | ret[:, ind:(ind + variable.ndim)] = data[variable.name] 92 | ind += variable.ndim 93 | return ret 94 | 95 | def _transform(self, X, variables, fit=False): 96 | ''' 97 | Transform the data 98 | ''' 99 | # get the output dimensions 100 | try: 101 | ndimout = [v.ndimout for v in variables] 102 | except AttributeError: 103 | ndimout = [] 104 | for v in variables: 105 | if hasattr(v, 'ndimout'): 106 | ndimout.append(v.ndimout) 107 | else: 108 | ndimout.append(1) 109 | nout = sum(ndimout) 110 | ret = np.zeros((len(X), nout)) 111 | ind = 0 112 | indout = 0 113 | for variable, dimout in zip(variables, ndimout): 114 | if fit: 115 | variable.fit(X[:, ind:(ind + variable.ndim)]) 116 | ret[:, indout:(indout + dimout)] = variable.transform( 117 | X[:, ind:(ind + variable.ndim)]) 118 | ind += variable.ndim 119 | indout += dimout 120 | return ret 121 | 122 | def fit(self, predictors, y): 123 | """ 124 | train the model using observations. 125 | :param X: independent variables. 2-d numpy array or something 126 | implementing __getitem__ 127 | :param y: dependent variable. numpy array or something implementing 128 | __getitem__ 129 | :return: Nothing. 130 | """ 131 | # (1) get predictor, predicted array 132 | X = self._get_array(predictors, self.variables.independent, 133 | self.variables.nin) 134 | yy = self._get_array(y, self.variables.dependent, 135 | self.variables.nout) 136 | 137 | # (2) fit transforms 138 | XX = self._transform(X, self.variables.independent, True) 139 | YY = self._transform(yy, self.variables.dependent, True) 140 | 141 | # (3) fit the model 142 | self.model.fit(XX, YY) 143 | 144 | def predict(self, predictors, predict_prob=False): 145 | """ 146 | This method does the prediction using the model and saved 147 | normalization parameters. 
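        A rough sketch of the call pattern (model_variables, the feature
        names and the wrapped classifier are illustrative; model_variables
        is assumed to declare an independent Variable named 'math' and a
        dependent one named 'passed'):

            driver = ModelDriver(model_variables, some_sklearn_classifier)
            driver.fit({'math': math_scores}, {'passed': passed_flags})
            probs = driver.predict({'math': math_scores}, predict_prob=True)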
148 | """ 149 | X = self._get_array(predictors, self.variables.independent, 150 | self.variables.nin) 151 | XX = self._transform(X, self.variables.independent, False) 152 | 153 | if predict_prob: 154 | return self.model.predict_proba(XX) 155 | else: 156 | return self.model.predict(XX) 157 | 158 | def dumps(self): 159 | ''' 160 | Return a string representation of this instance 161 | ''' 162 | import six.moves.cPickle 163 | return six.moves.cPickle.dumps(self) 164 | 165 | @classmethod 166 | def loads(cls, string): 167 | ''' 168 | Return an instance from the serialized string 169 | ''' 170 | import six.moves.cPickle 171 | return six.moves.cPickle.loads(string) 172 | 173 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | nose 4 | coverage 5 | Cython>=0.17 6 | simplejson 7 | scikit-learn 8 | six 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2012 SEOmoz 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | from setuptools import setup 25 | from distutils.extension import Extension 26 | from Cython.Distutils import build_ext 27 | import numpy as np 28 | 29 | ext_modules = [Extension('mozsci.spearmanr_by_fast', 30 | sources=["mozsci/spearmanr_by_fast.pyx", "mozsci/cspearmanr_by_fast.cc"], 31 | include_dirs = [np.get_include()], 32 | language="c++"), 33 | Extension('mozsci._c_utils', 34 | sources=["mozsci/_c_utils.pyx"], 35 | include_dirs = [np.get_include()], 36 | language="c++"), 37 | ] 38 | 39 | with open('requirements.txt', 'r') as fin: 40 | requires = fin.read().strip().split('\n') 41 | 42 | setup( 43 | name = 'mozsci', 44 | version = '0.9.2', 45 | description = 'Data science tools from Moz', 46 | author = 'Moz Data Science', 47 | author_email = 'science@moz.com', 48 | url = 'http://github.com/seomoz/mozsci', 49 | packages = ['mozsci', 'mozsci.models', 'mozsci.glm'], 50 | license = 'MIT', 51 | platforms = 'Posix; MacOS X', 52 | cmdclass = {'build_ext': build_ext}, 53 | ext_modules = ext_modules, 54 | install_requires = requires, 55 | classifiers = [ 56 | 'License :: OSI Approved :: MIT License', 57 | 'Development Status :: 4 - Beta', 58 | 'Intended Audience :: Developers', 59 | 'Topic :: Scientific/Engineering', 60 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 61 | 'Intended Audience :: Science/Research', 62 | 'Programming Language :: Python :: 2', 63 | 'Programming Language :: Python :: 2.7', 64 | 'Programming Language :: Python :: 3', 65 | 'Programming Language :: Python :: 3.3', 66 | 'Programming Language :: Python :: 3.4', 67 | ], 68 | ) 69 | -------------------------------------------------------------------------------- /test/data/poissonreg.csv: -------------------------------------------------------------------------------- 1 | id,school,male,math,langarts,daysatt,daysabs 2 | 1001,1,1,56.98883,42.45086,73,4 3 | 1002,1,1,37.09416,46.82059,73,4 4 | 1003,1,0,32.27546,43.56657,76,2 5 | 1004,1,0,29.05672,43.56657,74,3 6 | 1005,1,0,6.748048,27.24847,73,3 7 | 1006,1,0,61.65428,48.41482,62,13 8 | 1007,1,0,56.98883,40.73543,66,11 9 | 1008,1,1,10.39049,15.35938,72,7 10 | 1009,1,1,50.52795,52.11514,63,10 11 | 1010,1,1,49.47205,42.45086,68,9 12 | 1011,1,0,39.55739,36.45115,72,4 13 | 1012,1,1,33.73761,13.13055,74,5 14 | 1013,1,0,62.90584,62.27464,72,5 15 | 1014,1,0,65.56011,44.66451,74,3 16 | 1015,1,1,23.01052,25.25478,76,1 17 | 1016,1,1,75.83068,61.04388,76,0 18 | 1017,1,0,41.31353,49.47205,75,1 19 | 1018,1,0,41.88515,65.56011,74,0 20 | 1019,1,1,65.56011,46.82059,75,2 21 | 1020,1,1,13.13055,6.748048,55,24 22 | 1021,1,0,33.01677,42.45086,75,2 23 | 1022,1,1,55.88246,64.87473,76,0 24 | 1023,1,1,45.2079,55.33549,76,1 25 | 1024,1,1,56.98883,44.66451,76,0 26 | 1025,1,0,31.5115,38.34572,71,8 27 | 1026,1,1,52.64643,50,67,3 28 | 1027,1,1,17.25647,6.748048,70,7 29 | 1028,1,0,33.01677,40.15026,76,0 30 | 1029,1,1,61.04388,57.54914,74,2 31 | 1030,1,1,66.98323,71.82729,77,0 32 | 1031,1,1,1.007114,45.2079,77,0 33 | 1032,1,0,38.34572,35.12527,76,1 34 | 1033,1,1,44.66451,46.82059,32,3 35 | 1034,1,1,44.11754,46.82059,77,0 36 | 1035,1,1,59.84974,46.28556,77,0 37 | 1036,1,0,32.27546,47.35357,49,28 38 | 1037,1,1,23.01052,49.47205,71,8 39 | 1038,1,1,70.94328,61.04388,72,5 40 | 1039,1,1,1.007114,1.007114,75,2 41 | 1040,1,0,41.88515,52.11514,46,27 42 | 1041,1,0,40.15026,35.12527,72,5 43 | 1042,1,0,41.31353,38.34572,59,18 44 | 1043,1,0,44.66451,58.68647,57,19 45 | 1044,1,1,38.34572,42.45086,67,9 46 | 1045,1,1,32.27546,1.007114,68,9 47 | 1046,1,0,37.09416,32.27546,75,4 48 | 
1047,1,1,63.54885,57.54914,75,2 49 | 1048,1,1,43.56657,41.31353,68,3 50 | 1049,1,1,33.01677,24.16932,68,9 51 | 1050,1,0,68.48849,59.26457,56,20 52 | 1051,1,1,29.05672,21.7637,65,6 53 | 1052,1,1,54.7921,54.7921,76,0 54 | 1053,1,1,48.94376,51.58518,50,27 55 | 1054,1,0,52.64643,50.52795,65,12 56 | 1055,1,0,13.13055,32.27546,43,34 57 | 1056,1,1,53.71444,41.88515,76,1 58 | 1057,1,1,15.35938,32.27546,52,25 59 | 1058,1,1,10.39049,17.25647,71,5 60 | 1059,1,0,34.43988,38.34572,74,3 61 | 1060,1,0,41.88515,42.45086,77,2 62 | 1061,1,1,40.73543,55.33549,76,1 63 | 1062,1,0,37.09416,52.64643,70,7 64 | 1063,1,1,26.2782,26.2782,73,4 65 | 1064,1,1,47.88486,36.45115,71,8 66 | 1065,1,0,38.95612,55.88246,71,6 67 | 1066,1,1,40.15026,47.35357,61,16 68 | 1067,1,0,36.45115,40.15026,74,3 69 | 1068,1,0,27.24847,39.55739,71,4 70 | 1069,1,0,39.55739,43.56657,73,3 71 | 1070,1,0,48.94376,36.45115,72,5 72 | 1071,1,1,62.27464,46.28556,73,0 73 | 1072,1,1,66.98323,59.84974,67,9 74 | 1073,1,1,28.17271,59.84974,77,0 75 | 1074,1,1,35.12527,37.09416,69,8 76 | 1075,1,0,52.64643,70.09472,77,0 77 | 1076,1,0,26.2782,35.12527,65,11 78 | 1077,1,1,41.31353,42.45086,75,4 79 | 1078,1,1,59.26457,71.82729,75,2 80 | 1079,1,1,1.007114,17.25647,35,35 81 | 1080,1,0,33.01677,41.88515,54,23 82 | 1081,1,0,40.73543,42.45086,64,13 83 | 1082,1,1,66.26239,64.87473,71,6 84 | 1083,1,0,70.94328,45.74812,79,0 85 | 1084,1,1,54.7921,60.44261,71,6 86 | 1085,1,1,64.20476,50.52795,77,0 87 | 1086,1,1,33.73761,40.15026,66,8 88 | 1087,1,0,49.47205,44.66451,66,11 89 | 1088,1,0,26.2782,73.72179,66,11 90 | 1089,1,1,70.94328,54.25188,73,4 91 | 1090,1,1,37.09416,41.31353,71,6 92 | 1091,1,0,37.09416,49.47205,54,23 93 | 1092,1,0,61.04388,69.27759,72,5 94 | 1093,1,1,86.86945,86.86945,72,5 95 | 1094,1,0,61.04388,64.20476,51,26 96 | 1095,1,0,23.01052,6.748048,69,7 97 | 1096,1,1,27.24847,35.12527,76,1 98 | 1097,1,1,32.27546,46.82059,68,9 99 | 1098,1,1,24.16932,17.25647,66,11 100 | 1099,1,1,45.74812,50,60,18 101 | 1100,1,1,6.748048,13.13055,56,12 102 | 1101,1,1,35.12527,27.24847,73,3 103 | 1102,1,1,43.01117,28.17271,69,0 104 | 1103,1,1,26.2782,13.13055,73,4 105 | 1104,1,1,10.39049,29.05672,68,10 106 | 1105,1,1,61.04388,66.26239,61,16 107 | 1106,1,0,37.09416,51.58518,74,1 108 | 1107,1,0,1.007114,1.007114,70,9 109 | 1108,1,0,55.88246,45.2079,74,3 110 | 1109,1,1,46.82059,38.34572,77,0 111 | 1110,1,1,41.88515,56.98883,68,9 112 | 1111,1,1,66.26239,35.79525,65,14 113 | 1112,1,0,10.39049,38.95612,70,7 114 | 1113,1,1,33.01677,40.15026,74,3 115 | 1114,1,1,36.45115,20.40919,67,10 116 | 1115,1,1,46.82059,65.56011,64,12 117 | 1116,1,0,48.41482,34.43988,71,6 118 | 1117,1,0,41.88515,41.88515,42,35 119 | 1118,1,0,20.40919,25.25478,64,13 120 | 1119,1,0,73.72179,74.74522,74,3 121 | 1120,1,1,10.39049,40.15026,65,10 122 | 1121,1,1,17.25647,20.40919,71,6 123 | 1122,1,1,75.83068,64.87473,76,0 124 | 1123,1,1,26.2782,23.01052,75,2 125 | 1124,1,1,70.94328,82.74353,71,6 126 | 1125,1,1,76.98948,68.48849,72,5 127 | 1126,1,0,61.65428,58.11485,62,13 128 | 1127,1,0,28.17271,6.748048,73,4 129 | 1128,1,0,45.2079,48.94376,72,5 130 | 1129,1,0,56.98883,65.56011,74,3 131 | 1130,1,0,41.31353,48.94376,44,30 132 | 1131,1,0,47.35357,46.82059,61,16 133 | 1132,1,0,38.34572,43.01117,62,15 134 | 1133,1,0,10.39049,6.748048,65,12 135 | 1134,1,1,41.88515,36.45115,78,1 136 | 1135,1,1,40.15026,38.95612,72,1 137 | 1136,1,1,33.73761,35.79525,70,7 138 | 1137,1,0,68.48849,52.64643,74,1 139 | 1138,1,0,98.99289,64.20476,13,45 140 | 1139,1,0,62.90584,43.01117,69,10 141 | 1140,1,1,60.44261,36.45115,74,3 142 | 
1141,1,0,37.09416,10.39049,52,27 143 | 1142,1,0,26.2782,33.01677,74,2 144 | 1143,1,0,64.87473,53.71444,58,13 145 | 1144,1,0,18.91984,29.05672,70,2 146 | 1145,1,1,54.7921,47.88486,74,5 147 | 1146,1,0,38.34572,38.95612,64,5 148 | 1147,1,0,53.71444,52.11514,69,4 149 | 1148,1,0,46.82059,51.58518,76,3 150 | 1149,1,0,51.58518,71.82729,44,20 151 | 1150,1,0,34.43988,40.15026,63,12 152 | 1151,1,0,38.34572,44.66451,48,31 153 | 1152,1,1,44.66451,40.73543,70,6 154 | 1153,1,1,49.47205,31.5115,63,14 155 | 1154,1,0,43.56657,46.82059,56,13 156 | 1155,1,1,39.55739,49.47205,71,6 157 | 1156,1,0,33.01677,47.35357,65,12 158 | 1157,1,1,20.40919,29.05672,65,12 159 | 1158,1,1,43.01117,34.43988,77,0 160 | 1159,1,1,51.58518,43.56657,78,1 161 | 2001,2,0,39.55739,50.52795,82,4 162 | 2002,2,1,53.71444,1.007114,86,0 163 | 2003,2,1,53.71444,38.95612,86,0 164 | 2004,2,1,54.7921,64.20476,84,2 165 | 2005,2,0,38.34572,54.7921,85,1 166 | 2006,2,0,45.2079,71.82729,84,2 167 | 2007,2,1,61.65428,58.68647,86,0 168 | 2008,2,0,54.7921,50,86,0 169 | 2009,2,1,61.65428,54.7921,86,0 170 | 2010,2,0,61.04388,58.68647,79,7 171 | 2011,2,1,65.56011,66.26239,84,2 172 | 2012,2,0,61.04388,71.82729,76,9 173 | 2013,2,0,73.72179,55.33549,80,6 174 | 2014,2,1,54.7921,51.58518,82,4 175 | 2015,2,1,58.11485,58.11485,85,1 176 | 2016,2,0,37.72536,46.82059,79,7 177 | 2017,2,1,65.56011,70.09472,86,0 178 | 2018,2,1,58.11485,68.48849,86,0 179 | 2019,2,0,62.90584,71.82729,82,4 180 | 2020,2,1,98.99289,55.33549,82,2 181 | 2021,2,0,53.71444,66.98323,86,0 182 | 2022,2,0,65.56011,50,82,4 183 | 2023,2,0,53.71444,65.56011,84,2 184 | 2024,2,0,48.41482,29.05672,67,18 185 | 2025,2,0,35.12527,52.11514,85,1 186 | 2026,2,1,70.94328,70.94328,86,0 187 | 2027,2,0,62.90584,65.56011,85,1 188 | 2028,2,1,68.48849,58.68647,70,16 189 | 2029,2,0,51.05624,41.31353,79,6 190 | 2030,2,0,64.20476,67.72454,70,16 191 | 2031,2,1,58.11485,78.2363,86,0 192 | 2032,2,0,46.82059,42.45086,77,8 193 | 2033,2,1,46.82059,52.64643,85,1 194 | 2034,2,0,66.98323,86.86945,84,2 195 | 2035,2,0,32.27546,51.05624,83,3 196 | 2036,2,0,29.05672,29.05672,82,4 197 | 2037,2,0,98.99289,42.45086,84,2 198 | 2038,2,0,61.65428,84.64062,82,4 199 | 2039,2,1,47.88486,37.09416,79,7 200 | 2040,2,0,53.71444,56.98883,86,0 201 | 2041,2,0,59.26457,51.58518,85,1 202 | 2042,2,0,73.72179,93.25195,85,1 203 | 2043,2,0,61.04388,51.58518,86,0 204 | 2044,2,0,33.01677,45.2079,73,13 205 | 2045,2,0,39.55739,41.31353,85,1 206 | 2046,2,1,56.98883,62.27464,86,0 207 | 2047,2,0,46.82059,47.88486,85,1 208 | 2048,2,1,41.31353,44.66451,86,0 209 | 2049,2,0,65.56011,71.82729,85,1 210 | 2050,2,0,82.74353,98.99289,86,0 211 | 2051,2,0,45.2079,44.11754,85,1 212 | 2052,2,0,59.26457,61.04388,86,0 213 | 2053,2,0,55.88246,56.98883,84,2 214 | 2054,2,1,69.27759,79.59081,86,0 215 | 2055,2,0,56.98883,59.26457,85,1 216 | 2056,2,1,56.98883,47.35357,86,0 217 | 2057,2,0,65.56011,62.27464,82,4 218 | 2058,2,1,47.88486,51.05624,83,3 219 | 2059,2,1,20.40919,40.15026,85,1 220 | 2060,2,0,55.88246,81.08016,86,0 221 | 2061,2,0,61.65428,60.44261,85,1 222 | 2062,2,0,58.11485,68.48849,82,4 223 | 2063,2,0,45.2079,47.35357,85,1 224 | 2064,2,1,29.05672,32.27546,77,9 225 | 2065,2,0,33.73761,59.84974,86,0 226 | 2066,2,1,53.71444,40.15026,82,4 227 | 2067,2,0,50.52795,53.71444,54,8 228 | 2068,2,1,63.54885,54.25188,73,13 229 | 2069,2,1,45.2079,48.94376,86,0 230 | 2070,2,1,73.72179,56.43343,86,0 231 | 2071,2,1,59.26457,49.47205,86,0 232 | 2072,2,0,98.99289,69.27759,86,0 233 | 2073,2,0,56.98883,74.74522,84,2 234 | 2074,2,0,58.11485,58.11485,81,5 235 | 2075,2,0,61.65428,86.86945,85,1 
236 | 2076,2,1,65.56011,66.98323,86,0 237 | 2077,2,0,59.26457,56.98883,86,0 238 | 2078,2,0,66.98323,63.54885,85,1 239 | 2079,2,0,32.27546,50,84,2 240 | 2080,2,1,70.94328,73.72179,84,1 241 | 2081,2,1,37.72536,37.72536,81,5 242 | 2082,2,0,32.27546,29.90528,83,3 243 | 2083,2,1,36.45115,32.27546,85,1 244 | 2084,2,0,64.20476,71.82729,83,3 245 | 2085,2,0,56.98883,71.82729,80,6 246 | 2086,2,1,44.11754,24.16932,78,8 247 | 2087,2,1,40.15026,40.15026,65,21 248 | 2088,2,1,28.17271,30.72241,19,1 249 | 2089,2,0,52.64643,79.59081,79,7 250 | 2090,2,0,40.15026,44.66451,81,5 251 | 2091,2,1,41.31353,40.15026,85,1 252 | 2092,2,0,59.26457,89.60951,85,1 253 | 2093,2,0,45.2079,46.82059,86,0 254 | 2094,2,1,45.2079,53.17941,82,4 255 | 2095,2,0,38.34572,55.33549,86,0 256 | 2096,2,1,46.28556,55.33549,72,14 257 | 2097,2,0,65.56011,51.05624,44,2 258 | 2098,2,0,51.58518,65.56011,84,2 259 | 2099,2,0,46.82059,68.48849,84,2 260 | 2100,2,1,50.52795,43.01117,86,0 261 | 2101,2,1,59.26457,55.33549,85,1 262 | 2102,2,1,49.47205,43.01117,67,19 263 | 2103,2,0,43.01117,46.82059,84,2 264 | 2104,2,1,58.11485,71.82729,74,11 265 | 2105,2,1,46.28556,70.94328,86,0 266 | 2106,2,0,48.94376,53.71444,81,5 267 | 2107,2,0,98.99289,56.98883,72,13 268 | 2108,2,0,41.31353,50.52795,86,0 269 | 2109,2,1,73.72179,82.74353,81,5 270 | 2110,2,1,64.20476,65.56011,83,3 271 | 2111,2,0,53.71444,51.58518,84,2 272 | 2112,2,1,55.88246,98.99289,84,2 273 | 2113,2,1,62.90584,56.98883,81,5 274 | 2114,2,0,61.04388,71.82729,86,0 275 | 2115,2,0,98.99289,98.99289,83,3 276 | 2116,2,1,62.90584,54.7921,86,0 277 | 2117,2,1,58.68647,60.44261,85,1 278 | 2118,2,1,53.71444,35.12527,77,9 279 | 2119,2,1,65.56011,44.11754,85,1 280 | 2120,2,1,51.58518,40.15026,86,0 281 | 2121,2,1,62.90584,62.27464,86,0 282 | 2122,2,1,70.94328,58.11485,85,1 283 | 2123,2,1,70.09472,48.41482,73,12 284 | 2124,2,0,65.56011,62.27464,83,3 285 | 2125,2,1,46.82059,65.56011,86,0 286 | 2126,2,0,66.98323,51.58518,85,1 287 | 2127,2,1,51.58518,59.26457,81,5 288 | 2128,2,0,46.28556,64.20476,85,1 289 | 2129,2,1,47.88486,58.68647,85,1 290 | 2130,2,0,41.88515,58.11485,78,7 291 | 2131,2,0,56.98883,59.84974,80,6 292 | 2132,2,0,64.20476,81.08016,86,0 293 | 2133,2,0,70.94328,58.68647,78,8 294 | 2134,2,1,45.2079,57.54914,86,0 295 | 2135,2,0,40.15026,48.41482,84,1 296 | 2136,2,1,25.25478,13.13055,86,0 297 | 2137,2,1,55.88246,86.86945,82,4 298 | 2138,2,0,98.99289,93.25195,63,17 299 | 2139,2,1,59.26457,44.11754,80,6 300 | 2140,2,0,41.88515,41.31353,86,0 301 | 2141,2,0,61.04388,56.98883,86,0 302 | 2142,2,0,73.72179,70.94328,85,1 303 | 2143,2,1,61.04388,47.88486,83,3 304 | 2144,2,1,53.71444,61.04388,84,1 305 | 2145,2,1,46.82059,59.26457,84,2 306 | 2146,2,0,36.45115,46.82059,84,2 307 | 2147,2,1,56.98883,61.04388,84,2 308 | 2148,2,0,20.40919,15.35938,81,5 309 | 2149,2,0,47.88486,54.7921,45,41 310 | 2150,2,1,56.98883,43.01117,83,3 311 | 2151,2,0,54.7921,71.82729,79,7 312 | 2152,2,0,47.88486,69.27759,85,1 313 | 2153,2,1,36.45115,47.88486,85,1 314 | 2154,2,0,66.98323,68.48849,83,3 315 | 2155,2,0,54.7921,53.17941,86,0 316 | 2156,2,0,76.98948,69.27759,86,0 317 | 2157,2,0,65.56011,70.94328,84,2 318 | -------------------------------------------------------------------------------- /test/test_PCA.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | 6 | from mozsci import pca 7 | from six.moves import range 8 | 9 | class TestLinearPCA(unittest.TestCase): 10 | 11 | def test_linearPCA(self): 12 | """Test linear 
PCA""" 13 | 14 | # make the data 15 | N = 1000 16 | data = np.zeros((N, 3)) 17 | for k in range(N): 18 | data[k, 0] = (np.random.random() - 0.5) * 5.0 + 2.0 19 | #data[k, 1] = 3.5 * data[k, 0] + (np.random.random() - 0.5) 20 | data[k, 1] = (np.random.random() - 0.5) * 5.0 21 | data[k, 2] = 3.5 + data[k, 0] - 0.55 * data[k, 1] + (np.random.random() - 0.5) 22 | 23 | # make the PCA, do the training, then project on all three eigenvectors, 24 | # and reconstruct the original data 25 | p = pca.LinearPCA() 26 | p.train(data) 27 | data_proj = p.project(data, 3) 28 | 29 | # reconstruct the data from the projection 30 | data_reconstruct = np.zeros((N, 3)) 31 | for k in range(N): 32 | data_reconstruct[k, :] = p.mean + data_proj[k, 0] * p.eigvec[:, 0] + data_proj[k, 1] * p.eigvec[:, 1] + data_proj[k, 2] * p.eigvec[:, 2] 33 | 34 | self.assertTrue((np.abs(data_reconstruct - data) < 1.0e-12).all()) 35 | 36 | # test out truncate 37 | self.assertTrue((np.abs(p.truncate(data, 3) - data) < 1.0e-12).all()) 38 | 39 | # test json 40 | json_map = p.to_json() 41 | p_from_json = pca.LinearPCA(json_map=json_map) 42 | self.assertEqual(p.nvars, p_from_json.nvars) 43 | self.assertTrue((np.abs(p.mean - p_from_json.mean) < 1.0e-12).all()) 44 | self.assertTrue((np.abs(p.eigval - p_from_json.eigval) < 1.0e-12).all()) 45 | self.assertTrue((np.abs(p.eigvec - p_from_json.eigvec) < 1.0e-12).all()) 46 | 47 | 48 | 49 | if __name__ == "__main__": 50 | suite1 = unittest.TestLoader().loadTestsFromTestCase(TestLinearPCA) 51 | suitelist = [suite1] 52 | suite = unittest.TestSuite(suitelist) 53 | unittest.TextTestRunner(verbosity=2).run(suite) 54 | 55 | 56 | -------------------------------------------------------------------------------- /test/test_cross_validate.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | import time 6 | 7 | from mozsci import cross_validate 8 | 9 | 10 | class Test_cv_kfold(unittest.TestCase): 11 | def test_cv_kfold(self): 12 | folds = cross_validate.cv_kfold(20, 4, seed=2) 13 | 14 | sum_training = np.sum([len(ele[0]) for ele in folds]) 15 | self.assertTrue(sum_training == 3 * 20) 16 | 17 | sum_training = np.sum([len(ele[1]) for ele in folds]) 18 | self.assertTrue(sum_training == 20) 19 | 20 | actual_folds = [ 21 | [[0, 3, 4, 5, 8, 9, 17, 2, 7, 10, 11, 13, 15, 16, 18], [1, 6, 12, 14, 19]], 22 | [[1, 6, 12, 14, 19, 2, 7, 10, 11, 13, 15, 16, 18], [0, 3, 4, 5, 8, 9, 17]], 23 | [[1, 6, 12, 14, 19, 0, 3, 4, 5, 8, 9, 17, 15, 16, 18], [2, 7, 10, 11, 13]], 24 | [[1, 6, 12, 14, 19, 0, 3, 4, 5, 8, 9, 17, 2, 7, 10, 11, 13], [15, 16, 18]]] 25 | 26 | self.assertEqual(actual_folds, folds) 27 | 28 | 29 | if __name__ == "__main__": 30 | unittest.main() 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /test/test_evaluation.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | 6 | from mozsci import evaluation 7 | from mozsci.inputs import mean_std_weighted 8 | from six.moves import range 9 | 10 | 11 | class TestAUCFast(unittest.TestCase): 12 | def test_auc_wmw_fast(self): 13 | 14 | t = [-1, -1, -1, 1, 1, 1, -1, 1, -1, 1, 1, -1, 1] 15 | p = [0.01, 0.05, 0.2, 0.25, 0.1, 0.9, 0.6, 0.01, 0.90, 1.0, 0.33, 0.55, 0.555] 16 | 17 | auc_act = 0.54761904761904767 18 | auc = evaluation.auc_wmw_fast(t, p) 19 | 20 | 
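        # the expected value is the WMW statistic: the fraction of all
        # (positive, negative) pairs in which the positive example scores higher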
self.assertTrue(abs(auc_act - auc) < 1.0e-8) 21 | 22 | def test_auc_degenerate(self): 23 | y = np.array([0]) 24 | ypred = np.array([[ 1.0]]) 25 | weights = np.array([1]) 26 | auc = evaluation.auc_wmw_fast(y, ypred, weights=weights) 27 | self.assertTrue(auc == 0) 28 | 29 | 30 | class Testclassification_error(unittest.TestCase): 31 | def test_classification_error(self): 32 | y = np.array([0, 1, 1, 0]) 33 | ypred = np.array([0.1, 0.9, 0.4, 0.2]) 34 | 35 | self.assertTrue(abs(evaluation.classification_error(y, ypred) - 0.25) < 36 | 1e-12) 37 | self.assertTrue(abs(evaluation.classification_error(y, ypred, thres=0.3 38 | ) - 0.0) < 1e-12) 39 | 40 | weights = np.array([1.0, 0.8, 0.7, 0.6]) 41 | self.assertTrue(abs(evaluation.classification_error(y, ypred, weights=weights) - (1.0 - (1.0 + 0.8 + 0.6) / (weights.sum()))) < 1.0e-12) 42 | 43 | 44 | 45 | 46 | class Test_precision_recall_f1(unittest.TestCase): 47 | 48 | def setUp(self): 49 | self.yactual = np.array([0, 0, 0, 0, 1, 1, 1]) 50 | self.ypred = np.array([0, 1, 1, 1, 1, 0, 0]) 51 | self.weights = np.array([1, 2, 3, 4, 5, 6, 7]) 52 | 53 | self.yactual1 = self.yactual.reshape(7, 1) 54 | self.ypred1 = self.ypred.reshape(1, 7) 55 | self.weights1 = self.weights.reshape(1, 7) 56 | 57 | def test_precision_recall_f1(self): 58 | tp = 1.0 59 | fp = 3.0 60 | fn = 2.0 61 | 62 | actual_prec_rec_f1 = Test_precision_recall_f1.prec_rec_f1_from_tp_fp_fn(tp, fp, fn) 63 | for y in [self.yactual, self.yactual1]: 64 | for ypred in [self.ypred, self.ypred1]: 65 | prec_rec_f1 = evaluation.precision_recall_f1(y, ypred) 66 | for k in range(3): 67 | self.assertTrue(abs(actual_prec_rec_f1[k] - prec_rec_f1[k]) < 1e-12) 68 | 69 | def test_precision_recall_f1_weighted(self): 70 | tp = 5.0 71 | fp = 2.0 + 3 + 4 72 | fn = 6.0 + 7 73 | 74 | actual_prec_rec_f1 = Test_precision_recall_f1.prec_rec_f1_from_tp_fp_fn(tp, fp, fn) 75 | 76 | for y in [self.yactual, self.yactual1]: 77 | for ypred in [self.ypred, self.ypred1]: 78 | for weights in [self.weights, self.weights1]: 79 | prec_rec_f1 = evaluation.precision_recall_f1(y, ypred, weights=weights) 80 | for k in range(3): 81 | self.assertTrue(abs(actual_prec_rec_f1[k] - prec_rec_f1[k]) < 1e-12) 82 | 83 | 84 | def test_degenerate(self): 85 | # test case with degenerate input 86 | y = np.array([0]) 87 | ypred = np.array([[ 1.0]]) 88 | weights = np.array([1]) 89 | prf = evaluation.precision_recall_f1(y, ypred, weights=weights) 90 | 91 | # check that none are NaN 92 | self.assertFalse(np.array([np.isnan(ele) for ele in prf]).any()) 93 | 94 | # and they should all be 0 95 | self.assertTrue(np.allclose(prf, [0, 0, 0])) 96 | 97 | 98 | @staticmethod 99 | def prec_rec_f1_from_tp_fp_fn(tp, fp, fn): 100 | actual_prec_rec_f1 = np.zeros(3) 101 | actual_prec_rec_f1[0] = tp / (tp + fp) # precision 102 | actual_prec_rec_f1[1] = tp / (tp + fn) # recall 103 | actual_prec_rec_f1[2] = 2.0 * actual_prec_rec_f1[0] * actual_prec_rec_f1[1] / (actual_prec_rec_f1[0] + actual_prec_rec_f1[1]) # f1 104 | return actual_prec_rec_f1 105 | 106 | 107 | 108 | class Test_pearson_weighted(unittest.TestCase): 109 | def test_pearson_weighted(self): 110 | from scipy.stats import pearsonr 111 | 112 | x = np.array([1, 2, 3, 4, 5]) 113 | y = np.array([1.0, 1.5, -0.5, 3.4, 2.9]) 114 | weights = np.array([1, 0, 0.5, 2, 1.5]) 115 | 116 | r_no_wgt = pearsonr(x, y)[0] 117 | r_no_wgt_test = evaluation.pearsonr_weighted(x, y) 118 | r_ones_wgt = evaluation.pearsonr_weighted(x, y, np.ones(x.shape)) 119 | 120 | self.assertTrue(abs(r_no_wgt - r_no_wgt_test) < 1e-12) 121 | 
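        # a vector of uniform weights should reproduce the unweighted correlation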
self.assertTrue(abs(r_no_wgt - r_ones_wgt) < 1e-12) 122 | 123 | xm = mean_std_weighted(x, weights) 124 | ym = mean_std_weighted(y, weights) 125 | r_wgt = np.sum((x - xm['mean']) * (y - ym['mean']) * weights) / np.sum(weights) 126 | self.assertTrue((evaluation.pearsonr_weighted(x, y, weights) - r_wgt) < 1e-12) 127 | 128 | 129 | 130 | 131 | class Test_spearmanr_by(unittest.TestCase): 132 | 133 | def test_spearmanr_by(self): 134 | 135 | f = np.array([50, 52.19589972, 44.97281905, 50, 136 | 47.6719409 , 45.96619825, 50, 50, 137 | 48.18824048, 54.88529706, 42.67667074, 41.80373588, 138 | 37.29934119, 57.98812747, 45.04782628, 38.10858417, 139 | 46.44031713, 40.59823939, 26.29936944, 23.96820474, 140 | 47.98343799, 36.4455311 , 43.92931621, 55.19172514, 141 | 33.44633285, 37.38381116, 39.03392758, 41.43285553, 142 | 28.63082987, 31.86069758, 41.19551474, 29.04928565, 143 | 39.09690404, 36.75441683, 29.66390582, 70.4035713 , 144 | 63.53532854, 49.78916058, 64.39911984, 65.41353192, 145 | 48.42353021, 60.38572122, 42.44357922, 42.86378695, 146 | 58.93821467, 61.93862217, 36.23459784, 64.57533596, 147 | 40.09399141, 45.57233379, 44.7748158 , 50.88705955, 148 | 47.24016865, 51.75866967, 36.17935042, 46.73933887, 149 | 52.7136634 , 47.0337377 , 34.19077012, 18.5836512 , 150 | 41.63257011, 9.8698871 , 37.63277795, 47.71676464, 151 | 34.89667886, 35.10845963, 44.56638481, 36.70884056, 152 | 57.9185177 , 50.65260932, 58.53307806, 43.25154747, 153 | 40.59802125, 38.97005406, 35.19682907, 51.94755877, 154 | 44.04430199, 35.84048228, 36.25006727, 46.35317423, 155 | 37.44668618, 16.90596421, 38.87970562, 47.33515849, 156 | 27.41230181, 29.47142008]) 157 | 158 | position = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 12., 159 | 13., 15., 16., 17., 19., 23., 24., 25., 26., 27., 28., 160 | 29., 1., 2., 3., 6., 8., 9., 11., 12., 13., 17., 161 | 19., 21., 1., 2., 3., 4., 5., 6., 7., 8., 9., 162 | 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 163 | 22., 23., 24., 25., 26., 27., 1., 2., 4., 5., 6., 164 | 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 165 | 18., 20., 21., 22., 23., 24., 25., 26., 27.]) 166 | 167 | queryid = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 168 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 169 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 170 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 171 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 172 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 173 | 3, 3, 3, 3, 3, 3, 3, 3], np.int) 174 | 175 | fast_spearman = evaluation.spearmanr_by(f, position, queryid) 176 | self.assertTrue(abs(fast_spearman - -0.42666971560358913) < 1e-1) 177 | 178 | 179 | class TestClassificationPerfMeasure(unittest.TestCase): 180 | 181 | def test_basic_measure_1(self): 182 | """ 183 | Test classification_model_performance. All correct case. 184 | """ 185 | observed = np.array([0, 1, 1, 0, 0, 0, 1]) 186 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 187 | 188 | measure = evaluation.classification_model_performance(observed, calculated) 189 | 190 | self.assertEqual(measure, 0) 191 | 192 | def test_basic_measure_2(self): 193 | """ 194 | Test classification_model_performance. All correct case. 195 | """ 196 | observed = np.array([0, 1, 0, 1, 0, 0, 1]) 197 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 198 | 199 | measure = evaluation.classification_model_performance(observed, calculated) 200 | 201 | self.assertAlmostEqual(measure, 0.2857142857140) 202 | 203 | def test_basic_measure_3(self): 204 | """ 205 | Test classification_model_performance. weighted case. 
206 | """ 207 | observed = np.array([0, 1, 0, 1, 0, 0, 1]) 208 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 209 | 210 | measure = evaluation.classification_model_performance(observed, calculated, [1.0, 3.0]) 211 | 212 | def test_matrix_measure_1(self): 213 | """ 214 | Test classification_model_performance_matrix. All correct case. 215 | """ 216 | observed = np.array([0, 1, 1, 0, 0, 0, 1]) 217 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 218 | 219 | measure = evaluation.classification_model_performance_matrix(observed, calculated) 220 | expected_measure = np.array([[4, 0], [0, 3]]) 221 | 222 | np.testing.assert_array_almost_equal(measure, expected_measure) 223 | 224 | def test_matrix_measure_2(self): 225 | """ 226 | Test classification_model_performance_matrix. All correct case. 227 | """ 228 | observed = np.array([0, 1, 0, 1, 0, 0, 1]) 229 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 230 | 231 | measure = evaluation.classification_model_performance_matrix(observed, calculated) 232 | expected_measure = np.array([[3, 1], [1, 2]]) 233 | 234 | np.testing.assert_array_almost_equal(measure, expected_measure) 235 | 236 | def test_matrix_measure_3(self): 237 | """ 238 | Test classification_model_performance_matrix. multiple classes case. 239 | """ 240 | observed = np.array([1, 0, 1, 0, 1, 0, 2, 3]) 241 | calculated = np.array([1, 0, 1, 1, 0, 2, 3, 0]) 242 | 243 | measure = evaluation.classification_model_performance_matrix(observed, calculated) 244 | expected_measure = np.array([[1, 1, 1, 0], [1, 2, 0, 0], [0, 0, 0, 1], [1, 0, 0, 0]]) 245 | 246 | np.testing.assert_array_almost_equal(measure, expected_measure) 247 | 248 | def test_loss_measure_1(self): 249 | """ 250 | Test classification_model_performance_loss. default loss (0-1 loss). 251 | """ 252 | observed = np.array([0, 1, 1, 0, 1, 0, 1]) 253 | calculated = np.array([0, 1, 1, 0, 0, 0, 1]) 254 | 255 | measure = evaluation.classification_model_performance_loss(observed, calculated) 256 | 257 | self.assertEqual(measure, 1) 258 | 259 | def test_loss_measure_2(self): 260 | """ 261 | Test classification_model_performance_loss. user defined loss measure - squared loss. 262 | """ 263 | observed = np.array([0, 1, 0, 1, 0, 2, 1]) 264 | calculated = np.array([0, 1, 1, 0, 2, 0, 1]) 265 | 266 | loss = lambda i, j: (i-j)*(i-j) 267 | 268 | measure = evaluation.classification_model_performance_loss(observed, calculated, loss) 269 | 270 | self.assertEqual(measure, 10) 271 | 272 | 273 | if __name__ == "__main__": 274 | unittest.main() 275 | 276 | 277 | 278 | -------------------------------------------------------------------------------- /test/test_glm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import unittest 3 | import numpy as np 4 | 5 | from mozsci.glm import prob_distributions 6 | from mozsci.glm import regularization 7 | from mozsci.glm import simplified_glm 8 | 9 | class TestGlm(unittest.TestCase): 10 | 11 | def test_negative_binomial_dist_likelihood(self): 12 | """ 13 | Test the calculation of the log likelihood of the negative binomial distribution. 
14 | :return: 15 | """ 16 | features = np.array([ 17 | [1,56.98883,42.45086, 1.0], 18 | [1,37.09416,46.82059, 1.0], 19 | [0,32.27546,43.56657, 1.0], 20 | [0,29.05672,43.56657, 1.0], 21 | [0,6.748048,27.24847, 1.0], 22 | [0,61.65428,48.41482, 1.0] 23 | ]) 24 | 25 | Y = np.array([4, 4, 2, 3, 3, 13 ]) 26 | beta_k = np.array([10.0, 0, 0, 0, 0]) 27 | 28 | dist = prob_distributions.NegativeBinomialWithKstar() 29 | 30 | calculated = dist.eval(beta_k, features, Y) 31 | 32 | self.assertAlmostEqual(calculated, -5.9967772892) 33 | 34 | 35 | def test_negative_binomial_dist_gradient(self): 36 | """ 37 | Test the gradient of the log likelihood of negative binomial distribution. 38 | """ 39 | # input data. 40 | features = np.array([ 41 | [1,56.98883,42.45086, 1.0], 42 | [1,37.09416,46.82059, 1.0], 43 | [0,32.27546,43.56657, 1.0], 44 | [0,29.05672,43.56657, 1.0], 45 | [0,6.748048,27.24847, 1.0], 46 | [0,61.65428,48.41482, 1.0] 47 | ]) 48 | Y = np.array([4, 4, 2, 3, 3, 13 ]) 49 | beta_k = np.array([10.0, 0, 0, 0, 0]) 50 | 51 | # expected output 52 | expected = np.array([-3.22202699e-03, 5.99972761e+00, 1.12593421e+03, 1.03394190e+03, 2.29989558e+01]) 53 | 54 | # calculation. 55 | dist = prob_distributions.NegativeBinomialWithKstar() 56 | 57 | calculated = dist.eval_gradient(beta_k, features, Y) 58 | 59 | np.testing.assert_almost_equal(calculated, expected, decimal=5) 60 | 61 | def test_poisson_regression(self): 62 | """ 63 | This method is used to test the poisson regression works as it should. 64 | The data is from: http://www.oxfordjournals.org/our_journals/tropej/online/ma_chap13.pdf 65 | :return: 66 | """ 67 | features = np.array( [ 68 | [236,0], [739,1], [970,1], [2371,1], [309,1], [679,1], [26,0], [1272,1], [3246,1], [1904,1], 69 | [357,1], [1080,1], [1027,1], [28,0], [2507,1], [138,0], [502,1], [1501,1], [2750,1], [192,1], ] ) 70 | 71 | Y = np.array([ 8, 16, 15, 23, 5, 13, 4, 19, 33, 19, 10, 16, 22, 2, 22, 2, 18, 21, 24, 9]) 72 | 73 | regular = regularization.NullRegularization() 74 | 75 | # or we can use regular = regularization.NullRegularization() 76 | reg = simplified_glm.PoissonRegression(lam=0, maxiter=500) 77 | reg.fit(features, Y) 78 | 79 | # The correct result should be [0.00033, 1.045, 1.351], The last one is the constant. 80 | # bfgs gives [ 3.26073590e-04 1.04513753e+00 1.35099878e+00] 81 | expected = np.array([0.00033, 1.045, 1.351]) 82 | np.testing.assert_almost_equal(reg.params, expected, decimal=2) 83 | 84 | def test_negative_binomial(self): 85 | """ 86 | This method is used to test the negative binomial 'regression' works as it should. 87 | Data is from : http://www.ats.ucla.edu/stat/sas/dae/negbinreg.htm 88 | What they got: loglikelihood: Log Likelihood 2149.3649 89 | Parameter DF Estimate Error Limits Chi-Square Pr > ChiSq 90 | Intercept 1 2.7161 0.2326 2.2602 3.1719 136.38 <.0001 91 | male 1 -0.4312 0.1397 -0.7049 -0.1574 9.53 0.0020 92 | math 1 -0.0016 0.0048 -0.0111 0.0079 0.11 0.7413 93 | langarts 1 -0.0143 0.0056 -0.0253 -0.0034 6.61 0.0102 94 | Dispersion 1 1.2884 0.1231 1.0471 1.5296 95 | 96 | NOTE: The negative binomial dispersion parameter was estimated by maximum likelihood. 97 | 98 | What we got: (Under the same condition - no regularization. No max iteration limit.) 99 | the likelihood term value and the regularization term value are -2149.36485714 0.0 100 | Optimization terminated successfully. 
101 | Current function value: -2149.364857 102 | Iterations: 27 103 | Function evaluations: 184 104 | Gradient evaluations: 161 105 | The linear coefficients are: 106 | [ (This is k*) -2.53387660e-01 -4.31184391e-01 -1.60095828e-03 -1.43475268e-02 107 | (This is the intercept) 2.71606920e+00] 108 | """ 109 | mydata = np.genfromtxt('test/data/poissonreg.csv', delimiter=',', skip_header=1) 110 | features = mydata[:, 2:5] 111 | 112 | Y = mydata[:, 6] 113 | 114 | reg = simplified_glm.NegativeBinomialWithKstarRegression(3 + 2, lam=0) 115 | reg.fit(features, Y) 116 | 117 | ## data from ucla. 118 | expected = np.array([-0.4312, -0.0016, -0.0143, 2.7161]) 119 | 120 | np.testing.assert_almost_equal(reg.params[1:], expected, decimal=2) 121 | 122 | if __name__ == "__main__": 123 | unittest.main() 124 | -------------------------------------------------------------------------------- /test/test_histogram.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import unittest 5 | import numpy as np 6 | import time 7 | 8 | from mozsci import histogram 9 | 10 | 11 | class TestHistogram1D(unittest.TestCase): 12 | def test_histogram1d(self): 13 | 14 | h = histogram.Histogram1DFast(10, 0, 10) 15 | self.assertTrue((np.abs(h.bin_edges - np.arange(11)) < 1.0e-12).all()) 16 | 17 | x = np.array([-1.0, 0.5, 3.2, 0.77, 9.99, 10.1, 8.2]) 18 | h.update(x) 19 | 20 | xc = np.array([1.5, 2.5, 8.3]) 21 | cc = np.array([10, 5, 22]) 22 | h.update_counts(xc, cc) 23 | self.assertTrue((h.bin_count == np.array([3, 10, 5, 1, 0, 0, 0, 0, 23, 2])).all()) 24 | 25 | # check compute_indices 26 | self.assertTrue((h.compute_indices(np.arange(12) - 0.5) == np.array([0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9])).all()) 27 | 28 | 29 | # benchmark 30 | x = np.random.randn(1e7) 31 | time1 = time.time() 32 | h = histogram.Histogram1DFast(100, -5, 5) 33 | h.update(x) 34 | time2 = time.time() 35 | out = np.histogram(x, bins=100, range=[-5, 5]) 36 | time3 = time.time() 37 | 38 | print("Time for fast = " + str(time2 - time1) + " s") 39 | print("Time for numpy = " + str(time3- time2) + " s") 40 | 41 | 42 | # check sampler 43 | t1 = time.time() 44 | samples = h.sample(5e6) 45 | t2 = time.time() 46 | 47 | (counts, edges) = np.histogram(samples, 50, normed=True) 48 | centers = 0.5 * (edges[1:] + edges[0:-1]) 49 | actual_pdf = 1.0 / np.sqrt(2.0 * 3.14159) * np.exp(-centers ** 2 / 2.0) 50 | self.assertTrue(np.allclose(counts, actual_pdf, atol=5e-3)) 51 | 52 | def test_stratified_sample(self): 53 | hist = histogram.Histogram1DFast(5, 0, 5) 54 | hist.update_counts(np.array([0.5, 1.5, 2.5, 3.5, 4.5]), 55 | np.array([5e6, 1e6, 1e4, 1e3, 2])) 56 | 57 | hist.compute_pdf_cdf() 58 | 59 | # generate a sample 60 | x = hist.sample(int(hist.bin_count.sum())) 61 | 62 | # now do a stratified sample of the large sample 63 | sample_size = [5000, 3000, 1000, 250, 2] 64 | x_stratified_sample = hist.stratified_sample(x, sample_size) 65 | hist_check = histogram.Histogram1DFast(5, 0, 5) 66 | hist_check.update(x_stratified_sample) 67 | 68 | # check that the actual sample distribution matches the expected 69 | # one. 
We expect a small relative difference in all entries 70 | # except the last (where we expect a small absolute difference) 71 | self.assertTrue(np.allclose(1.0, 72 | hist_check.bin_count[:-1].astype(np.float) / sample_size[:-1], 73 | atol=0.10, rtol=0.0)) 74 | self.assertTrue(abs(hist_check.bin_count[-1] - sample_size[-1]) < 3) 75 | 76 | 77 | if __name__ == "__main__": 78 | unittest.main() 79 | 80 | 81 | -------------------------------------------------------------------------------- /test/test_inputs.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | from mozsci import inputs 6 | from six.moves import range 7 | 8 | class Test_mean_std_weightd(unittest.TestCase): 9 | def test_mean_std(self): 10 | 11 | # test 1D case 12 | x = np.array([1, 2, 3, 4, 5]) 13 | weights = np.array([0.2, 0.1, 2,0.5, 1]) 14 | 15 | ret = inputs.mean_std_weighted(x) 16 | self.assertTrue(abs(ret['mean'] - 3.0) < 1e-8) 17 | self.assertTrue(abs(ret['std'] - np.sqrt(2 * (4 + 1) / 5)) < 1e-8) 18 | 19 | ret = inputs.mean_std_weighted(x, np.ones(x.shape)) 20 | self.assertTrue(abs(ret['mean'] - 3.0) < 1e-8) 21 | self.assertTrue(abs(ret['std'] - np.sqrt(2 * (4 + 1) / 5)) < 1e-8) 22 | 23 | ret = inputs.mean_std_weighted(x, weights) 24 | m = np.sum(weights * x) / np.sum(weights) 25 | s = np.sqrt(np.sum((x - m)**2 * weights) / np.sum(weights)) 26 | self.assertTrue(abs(ret['mean'] - m) < 1e-8) 27 | self.assertTrue(abs(ret['std'] - s) < 1e-8) 28 | 29 | # 2D case 30 | x = np.array([[1, 2], 31 | [-0.5, 0.0], 32 | [3, -0.55]]) 33 | weights = np.array([0.5, 2, 1.55]) 34 | 35 | ret = inputs.mean_std_weighted(x, weights) 36 | 37 | sum_weights = np.sum(weights) 38 | m1 = (1.0 * 0.5 + -0.5 * 2 + 3 * 1.55) / sum_weights 39 | m2 = (2.0 * 0.5 + 0.0 * 2 + -0.55 * 1.55) / sum_weights 40 | self.assertTrue(np.allclose(ret['mean'], [m1, m2])) 41 | 42 | s1 = np.sqrt(((1.0 - m1) ** 2 * 0.5 + (-0.5 - m1)**2 * 2.0 + (3 - m1)**2 * 1.55) / sum_weights) 43 | s2 = np.sqrt(((2 - m2) ** 2 * 0.5 + (0.0 - m2)**2 * 2.0 + (-0.55 - m2)**2 * 1.55) / sum_weights) 44 | self.assertTrue(np.allclose(ret['std'], [s1, s2])) 45 | 46 | 47 | class TestLogScaledTransformer(unittest.TestCase): 48 | def test_log_transformer(self): 49 | mean = np.array([0.5, 1.0]) 50 | std = np.array([0.3, 0.8]) 51 | offset = 2.0 52 | nsamples = int(1e6) 53 | samples = np.zeros((nsamples, 2)) 54 | for k in range(2): 55 | samples[:, k] = np.random.normal(mean[k], std[k], nsamples) 56 | exp_samples = np.exp(samples) - offset 57 | 58 | transformer = inputs.LogScaledTransformer(offset=offset) 59 | 60 | # check fit 61 | transformer.fit(exp_samples) 62 | self.assertTrue(np.allclose(transformer.mean_, mean, atol=1e-2)) 63 | self.assertTrue(np.allclose(transformer.std_, std, atol=1e-2)) 64 | 65 | # check transform 66 | X = exp_samples[:10] 67 | transformed = transformer.transform(X) 68 | expected = 1.0 / transformer.std_ * ( 69 | np.log(X + offset) - transformer.mean_) 70 | self.assertTrue(np.allclose(transformed, expected)) 71 | 72 | # inverse transform 73 | self.assertTrue(np.allclose(X, 74 | transformer.inverse_transform(transformer.transform(X)))) 75 | 76 | class TestBucketTransformer(unittest.TestCase): 77 | def test_bucket_transformer(self): 78 | transformer = inputs.BucketTransformer([0, 1, 2.4]) 79 | X = np.array([0.5, 1.2, -1, 3.9, 1.9, 2.1]) 80 | Y = transformer.transform(X) 81 | expectedY = np.array( 82 | [[ 0., 1., 0., 0.], 83 | [ 0., 0., 1., 0.], 84 | [ 1., 0., 
0., 0.], 85 | [ 0., 0., 0., 1.], 86 | [ 0., 0., 1., 0.], 87 | [ 0., 0., 1., 0.]] 88 | ) 89 | self.assertTrue(np.allclose(Y, expectedY)) 90 | 91 | 92 | if __name__ == "__main__": 93 | unittest.main() 94 | 95 | -------------------------------------------------------------------------------- /test/test_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | from mozsci.models import LinearRegression 5 | from mozsci.evaluation import pearsonr_weighted 6 | import numpy as np 7 | 8 | class TestLogisticRegression(unittest.TestCase): 9 | def test_linear_regression(self): 10 | np.random.seed(55) 11 | X = np.random.rand(1000, 3) 12 | w = [0.5, 1.3, -2.5] 13 | b = 12.5 14 | y = X[:, 0] * w[0] + X[:, 1] * w[1] + X[:, 2] * w[2] + b 15 | 16 | # should convert to the exact solution with only a little regularization 17 | lr = LinearRegression(lam=0.001) 18 | lr.fit(X, y) 19 | ypred = lr.predict(X) 20 | self.assertTrue(pearsonr_weighted(y, ypred) > 0.99) 21 | 22 | # try weighted 23 | weights = np.random.rand(1000) 24 | lr = LinearRegression(lam=0.001) 25 | lr.fit(X, y, weights=weights) 26 | ypred = lr.predict(X) 27 | self.assertTrue(pearsonr_weighted(y, ypred, weights)) 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest.main() 32 | 33 | 34 | -------------------------------------------------------------------------------- /test/test_logistic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | from mozsci.models import LogisticRegression 5 | import numpy as np 6 | 7 | class TestLogisticRegression(unittest.TestCase): 8 | 9 | def setUp(self): 10 | self.x = np.array([[1, -2], [-0.5, -2]]) 11 | self.t = np.array([0, 1]) 12 | self.w = np.array([3, -1]) 13 | self.b = 1 14 | self.lam = 7 15 | 16 | def test_sigmoid(self): 17 | y = LogisticRegression._sigmoid(self.x, self.b, self.w) 18 | yact = np.array([1.0 / (1.0 + np.exp(-6)), 1.0 / (1.0 + np.exp(-1.5))]) 19 | 20 | self.assertTrue(np.all(np.abs(y - yact) < 1.0e-12)) 21 | 22 | def test_error_gradient(self): 23 | x0 = np.array([self.x[0]]) 24 | x1 = np.array([self.x[1]]) 25 | error, gradient = LogisticRegression._loss_gradient(x0, x1, self.b, self.w, self.lam) 26 | 27 | # this assumes test_sigmoid pases 28 | err_act = -np.log(LogisticRegression._sigmoid(x1, self.b, self.w)) - np.log(1.0 - LogisticRegression._sigmoid(x0, self.b, self.w)) + 0.5 * 7 * 10 29 | 30 | pred_error = LogisticRegression._sigmoid(self.x, self.b, self.w) - self.t 31 | gradient_act = np.array([0.0, 7 * 3, 7 * -1]) 32 | gradient_act[0] = np.sum(pred_error) 33 | gradient_act[1] += np.sum(pred_error * self.x[:, 0]) 34 | gradient_act[2] += np.sum(pred_error * self.x[:, 1]) 35 | 36 | self.assertTrue( abs(float(err_act) - error) < 1.0e-12 ) 37 | self.assertTrue(np.all(np.abs(gradient - gradient_act) < 1.0e-12)) 38 | 39 | # weighted case 40 | x00 = np.array([self.x[0], [55, -2]]) 41 | error_weighted, gradient_weighted = LogisticRegression._loss_gradient(x00, x1, self.b, self.w, self.lam, [np.array([0.4, 0.75]), np.array(0.35)]) 42 | err_weighted_act = -np.log(LogisticRegression._sigmoid(x1, self.b, self.w)) * 0.35 - np.log(1.0 - LogisticRegression._sigmoid(x0, self.b, self.w)) * 0.4 - np.log(1.0 - LogisticRegression._sigmoid([x00[1, :]], self.b, self.w)) * 0.75 + 0.5 * 7 * 10 43 | self.assertTrue( abs(float(err_weighted_act) - error_weighted) < 1.0e-12 ) 44 | 45 | def 
test_fit(self): 46 | from mozsci.evaluation import classification_error 47 | np.random.seed(5) 48 | N = int(1e5) 49 | x = np.random.rand(N, 2) 50 | y = (3 * x[:, 0] - 2 * x[:, 1] - 1.5 > 0.0).astype(np.int) 51 | lr = LogisticRegression() 52 | lr.fit(x, y, factr=1e4) 53 | ypred = lr.predict(x) 54 | self.assertTrue(classification_error(y, ypred) < 0.002) 55 | 56 | 57 | 58 | if __name__ == "__main__": 59 | unittest.main() 60 | 61 | 62 | -------------------------------------------------------------------------------- /test/test_map_train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | 6 | from mozsci.map_train import TrainModelCV, run_train_models 7 | from mozsci.evaluation import classification_error, auc_wmw_fast 8 | from mozsci.cross_validate import cv_kfold 9 | from mozsci.models import LogisticRegression 10 | 11 | 12 | class DataTest(unittest.TestCase): 13 | def setUp(self): 14 | np.random.seed(5) 15 | self.X = np.linspace(0, 1, 100).reshape(100, 1) 16 | self.y = (5 * self.X.reshape(100, ) - 2 + np.random.rand(100) > 0).astype(np.int) 17 | 18 | self.folds = cv_kfold(100, 4, seed=2) 19 | 20 | class TestTrainModelCV(DataTest): 21 | @staticmethod 22 | def agg_err(yactual, ypred): 23 | ret = {} 24 | ret['accuracy'] = classification_error(yactual, ypred) 25 | ret['auc'] = auc_wmw_fast(yactual, ypred) 26 | return ret 27 | 28 | 29 | def test_map_train_model(self): 30 | trainer = TrainModelCV([LogisticRegression, classification_error, '/tmp/logistic.json', (), {'lam':0.5}], X=self.X, y=self.y) 31 | errors = trainer.run() 32 | 33 | # load model 34 | trained_model = LogisticRegression.load_model('/tmp/logistic.json') 35 | loaded_model_error = classification_error(self.y, trained_model.predict(self.X)) 36 | 37 | # check the errors 38 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['train'] - 0.06) < 1e-12) 39 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['train'] - loaded_model_error) < 1e-12) 40 | 41 | def test_aggregate_error(self): 42 | # test an aggregate error function (that returns more than one value) 43 | trainer = TrainModelCV([LogisticRegression, TestTrainModelCV.agg_err, None, (), {'lam':0.5}], 44 | X=self.X, y=self.y, Xtest=self.X[:50, :], ytest=self.y[:50]) 45 | errors = trainer.run() 46 | 47 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['train']['accuracy'] - 0.06) < 1e-8) 48 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['train']['auc'] - 0.99310661764705888) < 1e-8) 49 | 50 | 51 | def test_kfold_cv(self): 52 | trainer = TrainModelCV([LogisticRegression, classification_error, None, (), {'lam':0.5}], 53 | X=self.X, y=self.y, folds=self.folds) 54 | errors = trainer.run() 55 | 56 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['train'] - 0.063340259665816398) < 1e-12) 57 | self.assertTrue(np.abs(errors[list(errors.keys())[0]]['test'] - 0.049633305762338022)< 1e-12) 58 | 59 | 60 | class Test_run_train_models(DataTest): 61 | def test_run_train_models(self): 62 | import re 63 | 64 | model_library = [[LogisticRegression, classification_error, None, (), {'lam':0.5}], 65 | [LogisticRegression, classification_error, None, (), {'lam':50}]] 66 | 67 | errors = run_train_models(2, model_library, X=self.X, y=self.y) 68 | for k in errors.keys(): 69 | if re.search("{'lam': 0.5}", k): 70 | err_check = errors[k] 71 | 72 | self.assertTrue(abs(err_check['train'] - 0.06) < 1e-8) 73 | 74 | 75 | if __name__ == "__main__": 76 | 
unittest.main() 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /test/test_variables.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import unittest 4 | import numpy as np 5 | 6 | from mozsci.evaluation import classification_error 7 | from mozsci.inputs import IdentityTransformer, LogScaledTransformer 8 | from mozsci import variables 9 | from sklearn.linear_model import LogisticRegression 10 | 11 | 12 | class TestModelDriver(unittest.TestCase): 13 | def test_model_driver(self): 14 | independents = [ 15 | variables.Variable('x0', IdentityTransformer()), 16 | variables.Variable('x1', LogScaledTransformer()) 17 | ] 18 | dependents = [variables.Variable('y', IdentityTransformer())] 19 | model_variables = variables.ModelVariables(independents, dependents) 20 | 21 | # make some test data 22 | N = int(1e5) 23 | data = np.zeros( 24 | N, dtype=[('x0', np.float), ('x1', np.float), ('y', np.int)]) 25 | np.random.seed(5) 26 | data['x0'] = np.random.rand(N) 27 | data['x1'] = np.random.normal(0.5, 2.0, N) 28 | data['y'] = 3 * data['x0'] - 2 * data['x1'] - 1.5 > 0.0 29 | 30 | # rescale x1 31 | data['x1'] = np.exp(data['x1']) 32 | 33 | # create driver and fit 34 | model = variables.ModelDriver(model_variables, LogisticRegression(C=1e5)) 35 | 36 | # first try to fit with regular numpy arrays 37 | X = data.view(dtype=np.float).reshape(-1, 3)[:, :2] 38 | y = data.view(dtype=np.int).reshape(-1, 3)[:, 2].reshape(-1, 1) 39 | model.fit(X, y) 40 | ypred = model.predict(X) 41 | self.assertTrue(classification_error(y, ypred) < 0.002) 42 | 43 | # now try using __getitem__ 44 | model.fit(data, data) 45 | ypred = model.predict(data) 46 | self.assertTrue(classification_error(data['y'], ypred) < 0.002) 47 | 48 | # serialization 49 | model_string = model.dumps() 50 | model_loaded = variables.ModelDriver.loads(model_string) 51 | self.assertTrue(np.allclose( 52 | model.predict(data, predict_prob=True), 53 | model_loaded.predict(data, predict_prob=True))) 54 | 55 | 56 | if __name__ == "__main__": 57 | unittest.main() 58 | --------------------------------------------------------------------------------
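
Taken together, the tests above show how the library's pieces are meant to compose: `cv_kfold` builds the folds, `TrainModelCV` / `run_train_models` drive training, and the functions in `mozsci.evaluation` score the results. The snippet below is a minimal usage sketch assembled from the call patterns exercised in `test_map_train.py` and `test_cross_validate.py` — the toy data and the `lam` values are lifted from those tests, and nothing here adds to the library's API.

```python
# Illustrative sketch, not part of the repository source.
# Assumes mozsci has been built/installed (python setup.py build_ext --inplace).
import numpy as np

from mozsci.cross_validate import cv_kfold
from mozsci.evaluation import classification_error
from mozsci.map_train import TrainModelCV, run_train_models
from mozsci.models import LogisticRegression

# Toy 1-D classification problem (same construction as the test fixture).
np.random.seed(5)
X = np.linspace(0, 1, 100).reshape(100, 1)
y = (5 * X.reshape(100,) - 2 + np.random.rand(100) > 0).astype(int)

# k-fold split: a list of [train_indices, test_indices] pairs.
folds = cv_kfold(100, 4, seed=2)

# Train one model specification with cross validation. Each specification is
# [model class, error function, output file or None, positional args, keyword args].
trainer = TrainModelCV(
    [LogisticRegression, classification_error, None, (), {'lam': 0.5}],
    X=X, y=y, folds=folds)
errors = trainer.run()  # maps a model description to its 'train'/'test' errors

# Or sweep a small model library across 2 worker processes.
model_library = [
    [LogisticRegression, classification_error, None, (), {'lam': 0.5}],
    [LogisticRegression, classification_error, None, (), {'lam': 50}],
]
all_errors = run_train_models(2, model_library, X=X, y=y)
```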