├── LICENSE ├── Makefile ├── README.rst ├── setup.py └── src ├── pyksc ├── __init__.py ├── _trend.pyx ├── dhwt.pyx ├── dist.pxd ├── dist.pyx ├── ksc.py ├── metrics.py ├── regression.py ├── test │ ├── __init__.py │ ├── test_dhwt.py │ ├── test_dist.py │ ├── test_ksc.py │ ├── test_regression.py │ └── test_trend.py └── trend.py ├── scripts ├── __init__.py ├── class_predict.py ├── cluster_jaccard.py ├── cluster_mutualinfo.py ├── cluster_vol.py ├── col_to_cluster.py ├── create_mic_input.py ├── learn_base.py ├── leave_k.py ├── plot_centroids.py ├── plot_members.py ├── plot_quality.py ├── plot_time_to_peak.py ├── pop_predict.py ├── radar.py ├── tags_io.py └── tree_infogain.py └── trend-learner-scripts ├── boosting.py ├── classify_pts.py ├── classify_pts_all.py ├── classify_pts_test.py ├── classify_theta.py ├── classify_theta_train.py ├── cluster.py ├── cotrain.py ├── create_test_assign.py ├── generate_cross_vals.py ├── ioutil.py ├── learn_base.py ├── multimodel_class.py ├── pipeline.sh ├── regression.py ├── sim_folds.py ├── stacking.py ├── summarize_results.py └── translation-final-results-to-paper-new.png /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2014, pyksc developers 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the pyksc nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Simple makefile 2 | 3 | PYTHON ?= python 4 | NOSETESTS ?= nosetests 5 | 6 | all: clean build 7 | 8 | build: 9 | $(PYTHON) setup.py build_ext --inplace 10 | 11 | clean: 12 | rm -rf build/ 13 | rm -rf src/build/ 14 | find . -name "*.pyc" | xargs rm -f 15 | find . -name "*.c" | xargs rm -f 16 | find . 
-name "*.so" | xargs rm -f 17 | 18 | test: clean build 19 | $(NOSETESTS) 20 | 21 | trailing-spaces: 22 | find -name "*.py" | xargs sed 's/^M$$//' 23 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PY-KSC 2 | ====== 3 | 4 | Implementation of the KSC time series clustering algorithm. 5 | See [1]_ for details: 6 | 7 | Dependencies for library 8 | ------------------------ 9 | * Numpy 10 | * Cython 11 | 12 | Dependencies for scripts 13 | ------------------------ 14 | * Scipy 15 | * Matplotlib 16 | 17 | How to install 18 | -------------- 19 | 20 | Clone the repo 21 | 22 | :: 23 | 24 | $ git clone https://github.com/flaviovdf/pyksc.git 25 | 26 | Make sure you have cython and numpy. If not run as root (or use your distros package manager) 27 | 28 | :: 29 | 30 | $ pip install numpy 31 | 32 | :: 33 | 34 | $ pip install Cython 35 | 36 | Install 37 | 38 | :: 39 | 40 | $ python setup.py install 41 | 42 | If you see the following error ``/usr/bin/ld: cannot find -lblas`` on linux, try installing the following two libraries 43 | 44 | :: 45 | 46 | $ sudo apt-get install libblas-dev liblapack-dev 47 | 48 | 49 | 50 | References 51 | ---------- 52 | .. [1] J. Yang and J. Leskovec, 53 | "Patterns of Temporal Variation in Online Media" - WSDM'11 54 | http://dl.acm.org/citation.cfm?id=1935863 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 3 | from __future__ import division, print_function 4 | '''Setup script''' 5 | 6 | import glob 7 | import numpy 8 | import os 9 | import sys 10 | 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | SOURCE = 'src/' 16 | os.chdir(SOURCE) 17 | 18 | import platform 19 | if platform.system() == 'Darwin': 20 | os.environ["CC"] = "gcc-6" 21 | os.environ["CXX"] = "gcc-6" 22 | 23 | if sys.version_info[:2] < (2, 7): 24 | print('Requires Python version 2.7 or later (%d.%d detected).' 
% 25 | sys.version_info[:2]) 26 | sys.exit(-1) 27 | 28 | def get_packages(): 29 | '''Appends all packages (based on recursive sub dirs)''' 30 | 31 | packages = ['pyksc'] 32 | 33 | for package in packages: 34 | base = os.path.join(package, '**/') 35 | sub_dirs = glob.glob(base) 36 | while len(sub_dirs) != 0: 37 | for sub_dir in sub_dirs: 38 | package_name = sub_dir.replace('/', '.') 39 | if package_name.endswith('.'): 40 | package_name = package_name[:-1] 41 | 42 | packages.append(package_name) 43 | 44 | base = os.path.join(base, '**/') 45 | sub_dirs = glob.glob(base) 46 | 47 | return packages 48 | 49 | def get_extensions(): 50 | '''Get's all .pyx and.pxd files''' 51 | 52 | extensions = [] 53 | for base in ['pyksc']: 54 | pyx_files = glob.glob(os.path.join(base, '*.pyx')) 55 | 56 | for pyx in pyx_files: 57 | pxd = pyx.replace('pyx', 'pxd') 58 | module = pyx.replace('.pyx', '').replace('/', '.') 59 | 60 | if os.path.exists(pxd): 61 | ext_files = [pyx, pxd] 62 | else: 63 | ext_files = [pyx] 64 | 65 | extension = Extension(module, ext_files, 66 | include_dirs=[numpy.get_include()], 67 | libraries=['blas'], 68 | extra_compile_args=['-fopenmp', 69 | '-msse', '-msse2', '-mfpmath=sse'], 70 | extra_link_args=['-fopenmp']) 71 | 72 | extensions.append(extension) 73 | 74 | return extensions 75 | 76 | if __name__ == "__main__": 77 | packages = get_packages() 78 | extensions = get_extensions() 79 | 80 | setup( 81 | cmdclass = {'build_ext': build_ext}, 82 | name = 'pyksc', 83 | packages = packages, 84 | ext_modules = extensions 85 | ) 86 | -------------------------------------------------------------------------------- /src/pyksc/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Implementation of the KSC algorithm. See [1] for details: 3 | 4 | References 5 | ---------- 6 | .. [1] J. Yang and J. 
Leskovec, 7 | "Patterns of Temporal Variation in Online Media" - WSDM'11 8 | http://dl.acm.org/citation.cfm?id=1935863 9 | ''' -------------------------------------------------------------------------------- /src/pyksc/_trend.pyx: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | # cython: boundscheck = False 3 | # cython: wraparound = False 4 | 5 | from cython.parallel import prange 6 | from cython.view cimport array as cvarray 7 | 8 | from libc.stdlib cimport free 9 | from libc.stdio cimport printf 10 | 11 | from pyksc cimport dist 12 | 13 | cimport cython 14 | cimport numpy as np 15 | 16 | import numpy as np 17 | np.import_array() 18 | 19 | #Basic math functions 20 | cdef extern from "math.h" nogil: 21 | double exp(double) 22 | 23 | cdef inline double dmin(double a, double b) nogil: return a if a < b else b 24 | 25 | cdef double dist_to_reference(double[::1] s, double[::1] r) nogil: 26 | cdef Py_ssize_t n_obs = s.shape[0] 27 | cdef Py_ssize_t n_ref = r.shape[0] 28 | 29 | cdef double min_dist = 1 30 | cdef Py_ssize_t i 31 | cdef dist.dist_struct_t *d 32 | 33 | for i in range(n_ref - n_obs + 1): 34 | d = dist.cdist(r[i:i + n_obs], s, 1) 35 | min_dist = dmin(min_dist, d.dist) 36 | free(d) 37 | 38 | return min_dist 39 | 40 | cdef void predict_one(double[::1] s, double[:, ::1] R_pos, 41 | double gamma, double[:, ::1] probs, 42 | int store_at_row, int store_at_col) nogil: 43 | 44 | cdef Py_ssize_t num_windows = s.shape[0] + 1 45 | cdef Py_ssize_t num_pos = R_pos.shape[0] 46 | 47 | cdef double prob = 0 48 | cdef Py_ssize_t i = 0 49 | for i in range(num_pos): 50 | prob += exp(-gamma * dist_to_reference(s, R_pos[i])) 51 | 52 | probs[store_at_row, store_at_col] = prob 53 | 54 | def predict(np.ndarray[double, ndim=2, mode='c'] X not None, 55 | np.ndarray[double, ndim=2, mode='c'] R not None, 56 | np.ndarray[long, ndim=1, mode='c'] labels not None, 57 | int num_labels, double gamma): 58 | 59 | cdef Py_ssize_t num_samples = X.shape[0] 60 | cdef Py_ssize_t num_points = X.shape[1] 61 | 62 | cdef double[::1] s 63 | cdef double[:, ::1] R_pos 64 | 65 | cdef double[:, ::1] probs = \ 66 | np.zeros(shape=(num_samples, num_labels), dtype=np.float64, 67 | order='C') 68 | 69 | cdef double[:, ::1] Xview = X #For nogil 70 | 71 | cdef Py_ssize_t i = 0 72 | cdef Py_ssize_t l = 0 73 | for l from 0 <= l < num_labels: 74 | #TODO: Maybe this copy is not necessary, need to check. 75 | R_pos = np.asanyarray(R[labels == l], dtype=np.float64, order='C') 76 | 77 | for i in prange(num_samples, schedule='static', nogil=True): 78 | predict_one(Xview[i], R_pos, gamma, probs, i, l) 79 | 80 | return probs.base 81 | -------------------------------------------------------------------------------- /src/pyksc/dhwt.pyx: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | ''' 3 | Implements Discrete Harr Wavelet Transform (also inverse) for a time series. 4 | This is simply done by computing the average of consecutive elements in the 5 | vector that correspond to the time series. See [1] and [2] for details. 6 | 7 | References 8 | ---------- 9 | .. [1] P. Van Fleet 10 | "The Discrete Haar Wavelet Tranformation" 11 | http://goo.gl/IPz25 12 | (last access December 2011) 13 | 14 | .. [2] I. 
Kaplan 15 | "Applying the Haar Wavelet Transform to Time Series Information" 16 | http://www.bearcave.com/misl/misl_tech/wavelets/haar.html 17 | (last access December 2011) 18 | ''' 19 | from __future__ import division, print_function 20 | 21 | cimport cython 22 | cimport numpy as np 23 | 24 | import numpy as np 25 | np.import_array() 26 | 27 | @cython.boundscheck(False) 28 | @cython.wraparound(False) 29 | def transform(np.ndarray[double, ndim=1] array): 30 | ''' 31 | Transform the array to a new form using the discrete Haar 32 | transform operation. This is done by computing the average of consecutive 33 | elements in the array. 34 | 35 | Arguments 36 | --------- 37 | array: np.ndarray[double, ndim=1] 38 | the array to transform 39 | 40 | Returns 41 | ------- 42 | This method returns a tuple whose first element is the wavelet and 43 | whose second element holds the coefficients needed to transform the wavelet back 44 | to the original array. 45 | ''' 46 | cdef Py_ssize_t n = array.shape[0] 47 | cdef Py_ssize_t new_dim 48 | 49 | if n % 2 == 0: 50 | new_dim = n // 2 51 | else: 52 | new_dim = (n // 2) + 1 53 | 54 | cdef np.ndarray[double, ndim=1] wavelet = np.zeros(new_dim) 55 | cdef np.ndarray[double, ndim=1] coefficient = np.zeros(new_dim) 56 | 57 | cdef double first 58 | cdef double second 59 | cdef Py_ssize_t i = 0 60 | cdef Py_ssize_t j = 0 61 | 62 | for i in range(0, n, 2): 63 | first = array[i] 64 | if i < n - 1: 65 | second = array[i + 1] 66 | else: 67 | second = 0 68 | 69 | wavelet[j] = (first + second) / 2 70 | coefficient[j] = (first - second) / 2 71 | j += 1 72 | 73 | return wavelet, coefficient 74 | 75 | @cython.boundscheck(False) 76 | @cython.wraparound(False) 77 | def inverse(np.ndarray[double, ndim=1] wavelet, 78 | np.ndarray[double, ndim=1] coefficient): 79 | ''' 80 | Given a wavelet and its coefficients this method can be used to 81 | transform the wavelet to the original array. 
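For example (mirroring the behaviour asserted in the unit tests), `transform(np.array([1., 2, 3, 0]))` yields the wavelet `[1.5, 1.5]` and the coefficients `[-0.5, 1.5]`; feeding that pair back into `inverse` recovers the original array.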
82 | 83 | Arguments 84 | --------- 85 | wavelet: np.ndarray[np.float_t, ndim=1] 86 | the wavelet to transform back 87 | coefficient: np.ndarray[np.float_t, ndim=1] 88 | the coefficients needed for the transform 89 | ''' 90 | cdef Py_ssize_t n = wavelet.shape[0] 91 | 92 | #sanity check 93 | if n != coefficient.shape[0]: 94 | return None 95 | 96 | cdef Py_ssize_t new_dim 97 | if n % 2 == 0 or n == 1: 98 | new_dim = n * 2 99 | else: 100 | new_dim = n * 2 - 1 101 | 102 | cdef np.ndarray[np.float_t, ndim=1] array = np.zeros(new_dim) 103 | 104 | cdef double first 105 | cdef double second 106 | cdef Py_ssize_t i = 0 107 | cdef Py_ssize_t j = 0 108 | for i in range(n): 109 | first = wavelet[i] + coefficient[i] 110 | second = wavelet[i] - coefficient[i] 111 | 112 | if j < new_dim: 113 | array[j] = first 114 | 115 | if j + 1 < new_dim: 116 | array[j + 1] = second 117 | 118 | j += 2 119 | 120 | return array 121 | -------------------------------------------------------------------------------- /src/pyksc/dist.pxd: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | 3 | #A basic structure for the return value of the distance func 4 | cdef struct dist_struct_t: 5 | double dist 6 | double alpha 7 | int shift 8 | 9 | #Distance function 10 | cdef dist_struct_t* cdist(double[::1] array1, double[::1] array2, int rolling)\ 11 | nogil 12 | 13 | cdef dist_struct_t* cshift_dist(double[::1] array1, double[::1] array2,\ 14 | int shift_amount, int rolling) nogil 15 | 16 | -------------------------------------------------------------------------------- /src/pyksc/dist.pyx: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | # cython: cdivision = True 3 | # cython: boundscheck = False 4 | # cython: wraparound = False 5 | 6 | ''' 7 | Basic array functions are kept here. Also, in this module 8 | we implement the time series distance metric defined in [1]. 9 | 10 | References 11 | ---------- 12 | .. [1] J. Yang and J. Leskovec, 13 | "Patterns of Temporal Variation in Online Media" - WSDM'11 14 | http://dl.acm.org/citation.cfm?id=1935863 15 | ''' 16 | from __future__ import division, print_function 17 | 18 | from cpython cimport bool 19 | from libc.stdlib cimport abort 20 | from libc.stdlib cimport free 21 | from libc.stdlib cimport malloc 22 | from libc.stdio cimport printf 23 | 24 | from cython.parallel import parallel 25 | from cython.parallel import prange 26 | 27 | cimport cython 28 | cimport numpy as np 29 | import numpy as np 30 | 31 | np.import_array() 32 | 33 | #Basic math functions 34 | cdef extern from "math.h" nogil: 35 | double sqrt(double) 36 | 37 | cdef extern from "cblas.h" nogil: 38 | double cblas_dnrm2(int N, double *X, int incX) 39 | double cblas_ddot(int N, double *X, int incX, double *Y, int incY) 40 | 41 | #Inlines, some basic blas vector stuff renamed for legacy and disabling gil 42 | cdef inline double cinner_prod(double *array1, double *array2, \ 43 | Py_ssize_t size) nogil: \ 44 | return cblas_ddot(size, array1, 1, array2, 1) 45 | 46 | cdef inline double csqsum(double *array1, Py_ssize_t size) nogil: \ 47 | return cblas_dnrm2(size, array1, 1) ** 2 48 | 49 | cdef inline double cnorm(double *array1, Py_ssize_t size) nogil: \ 50 | return cblas_dnrm2(size, array1, 1) 51 | 52 | #CDEF functions 53 | cdef double* cshift_drop(double[::1] array, int amount) nogil: 54 | ''' 55 | Shifts the array by N positions. This is similar to a binary shift where 56 | the element's fall of at the ends. 
57 | ''' 58 | cdef Py_ssize_t size = array.shape[0] 59 | 60 | cdef double *shifted 61 | shifted = malloc(size * sizeof(double)) 62 | if shifted == NULL: 63 | abort() 64 | 65 | cdef Py_ssize_t delta_shifted = 0 66 | cdef Py_ssize_t delta_array = 0 67 | if amount > 0: 68 | delta_shifted = amount 69 | else: 70 | delta_array = -amount 71 | amount = -amount 72 | 73 | cdef Py_ssize_t i = 0 74 | for i in range(size): 75 | shifted[i] = 0 76 | 77 | i = 0 78 | for i in range(size - amount): 79 | shifted[i + delta_shifted] = array[i + delta_array] 80 | 81 | return shifted 82 | 83 | cdef double* cshift_roll(double[::1] array, int amount) nogil: 84 | ''' 85 | Shifts the array by N positions. This is a rolling shifts, where elements 86 | come back at the other side of the array. 87 | ''' 88 | cdef Py_ssize_t size = array.shape[0] 89 | 90 | cdef Py_ssize_t delta_shifted = 0 91 | cdef Py_ssize_t delta_array = 0 92 | if amount > 0: 93 | delta_shifted = amount 94 | else: 95 | delta_array = -amount 96 | 97 | cdef double *shifted 98 | shifted = malloc(size * sizeof(double)) 99 | if shifted == NULL: 100 | abort() 101 | 102 | cdef Py_ssize_t i = 0 103 | for i in range(size): 104 | shifted[(i + delta_shifted) % size] = array[(i + delta_array) % size] 105 | 106 | return shifted 107 | 108 | cdef dist_struct_t* cshift_dist(double[::1] array1, double[::1] array2, \ 109 | int shift_amount, int rolling) nogil: 110 | ''' 111 | Computes the distance between two time series using a given shift. 112 | ''' 113 | cdef Py_ssize_t size = array1.shape[0] 114 | 115 | #return val 116 | cdef dist_struct_t* rv = malloc(sizeof(dist_struct_t)) 117 | if rv == NULL: 118 | abort() 119 | rv.shift = shift_amount 120 | 121 | if size == 0: 122 | rv.dist = 0 123 | rv.alpha = 0 124 | return rv 125 | 126 | cdef double *shifted 127 | if rolling: 128 | shifted = cshift_roll(array2, shift_amount) 129 | else: 130 | shifted = cshift_drop(array2, shift_amount) 131 | 132 | #computing scaling 133 | cdef double alpha 134 | cdef double sqsum_shift = csqsum(shifted, size) 135 | if sqsum_shift != 0: 136 | alpha = cinner_prod(&array1[0], shifted, size) / sqsum_shift 137 | else: 138 | alpha = 0 139 | 140 | rv.alpha = alpha 141 | 142 | #actual distance 143 | cdef Py_ssize_t i = 0 144 | cdef double dist = 0 145 | for i in range(size): 146 | dist += (array1[i] - alpha * shifted[i]) ** 2 147 | 148 | free(shifted) 149 | 150 | cdef double norm1 = cnorm(&array1[0], size) 151 | if norm1 != 0: 152 | rv.dist = sqrt(dist) / norm1 153 | elif sqsum_shift != 0: #array one is all zeros, but 2 is not 154 | rv.dist = 1 155 | else: #both are all zeros 156 | rv.dist = 0 157 | 158 | return rv 159 | 160 | cdef dist_struct_t* cdist(double[::1] array1, double[::1] array2, int rolling) \ 161 | nogil: 162 | ''' 163 | Computes the distance between two time series by searching for the optimal 164 | shifting parameter. 
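Every integer shift in the range (-size + 1, size) is tried via `cshift_dist` and the candidate with the smallest distance is kept.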
165 | ''' 166 | 167 | cdef Py_ssize_t size = array1.shape[0] 168 | cdef dist_struct_t* rv = malloc(sizeof(dist_struct_t)) 169 | if rv == NULL: 170 | abort() 171 | 172 | rv.dist = 1 173 | rv.shift = 0 174 | rv.alpha = 0 175 | if size == 0: 176 | rv.dist = 0 177 | return rv 178 | 179 | cdef double best_distance = 1 180 | cdef Py_ssize_t best_shift = 0 181 | 182 | cdef dist_struct_t* curr_dist 183 | cdef Py_ssize_t i 184 | for i in range(-size + 1, size): 185 | curr_dist = cshift_dist(array1, array2, i, rolling) 186 | if curr_dist.dist < best_distance: 187 | free(rv) 188 | rv = curr_dist 189 | rv.shift = i 190 | best_distance = rv.dist 191 | else: 192 | free(curr_dist) 193 | 194 | return rv 195 | 196 | cdef tuple cdist_all(double[:, ::1] matrix1, double[:, ::1] matrix2, int rolling): 197 | ''' 198 | Computes the distance between all pairs of rows in the given matrices. 199 | The elements of the first matrix are the ones which will be shifted. 200 | ''' 201 | 202 | cdef Py_ssize_t n_rows1 = matrix1.shape[0] 203 | cdef Py_ssize_t n_rows2 = matrix2.shape[0] 204 | cdef Py_ssize_t n_cols = matrix1.shape[1] 205 | 206 | cdef np.ndarray[double, ndim=2] rv_dist = np.ndarray((n_rows1, n_rows2)) 207 | cdef np.ndarray[int, ndim=2] rv_shifts = np.ndarray((n_rows1, n_rows2), 208 | dtype='i') 209 | 210 | cdef dist_struct_t*** aux = \ 211 | malloc(n_rows1 * sizeof(dist_struct_t**)) 212 | if aux == NULL: 213 | abort() 214 | 215 | cdef Py_ssize_t i 216 | cdef Py_ssize_t j 217 | for i in prange(n_rows1, nogil=True, schedule='static'): 218 | aux[i] = malloc(n_rows2 * sizeof(dist_struct_t*)) 219 | if aux[i] == NULL: 220 | abort() 221 | 222 | for j in range(n_rows2): 223 | aux[i][j] = cdist(matrix1[i], matrix2[j], rolling) 224 | rv_dist[i, j] = aux[i][j].dist 225 | rv_shifts[i, j] = aux[i][j].shift 226 | 227 | free(aux[i][j]) 228 | free(aux[i]) 229 | free(aux) 230 | 231 | return (rv_dist, rv_shifts) 232 | 233 | #Python wrappers 234 | def shift(np.ndarray[double, ndim=1, mode='c'] array not None, int amount, 235 | bool rolling=False): 236 | ''' 237 | Shifts the array by N positions. The shift can be rolling, where elements 238 | come back at the other side of the array, or dropping. This method returns a new array, 239 | it does not do inplace shifts. 240 | 241 | Arguments 242 | --------- 243 | array: np.ndarray[np.float_t, ndim=1] 244 | The array to shift 245 | amount: int 246 | The amount to shift by; positive integers signal right shifts while 247 | negative ones signal left shifts 248 | rolling: bool (default `False`) 249 | indicates whether we should use a rolling distance (i.e. elements at 250 | one end reappear at another) or a drop distance (i.e. elements fall 251 | and zeroes take their place, similar to a binary shift) 252 | ''' 253 | 254 | cdef Py_ssize_t size = array.shape[0] 255 | cdef double *shift_buff 256 | if rolling: 257 | shift_buff = cshift_roll(array, amount) 258 | else: 259 | shift_buff = cshift_drop(array, amount) 260 | 261 | cdef np.ndarray[double, ndim=1] rv = np.ndarray(size) 262 | free(rv.data) 263 | rv.data = shift_buff 264 | return rv 265 | 266 | def inner_prod(np.ndarray[double, ndim=1, mode='c'] array1 not None, 267 | np.ndarray[double, ndim=1, mode='c'] array2 not None): 268 | ''' 269 | Returns the inner product between two arrays. Both 270 | arrays must have the same shape. 
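For example, the inner product of `[1., 2, 3]` and `[2., 3, 4]` is `1*2 + 2*3 + 3*4 = 20`.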
271 | 272 | Arguments 273 | --------- 274 | array1: np.ndarray[np.float_t, ndim=1] 275 | First array 276 | array2: np.ndarray[np.float_t, ndim=1] 277 | Second array 278 | ''' 279 | 280 | assert array1.shape[0] == array2.shape[0] 281 | cdef Py_ssize_t size = array1.shape[0] 282 | return cinner_prod(&array1[0], &array2[0], size) 283 | 284 | def sqsum(np.ndarray[double, ndim=1, mode='c'] array not None): 285 | ''' 286 | Returns the sum of the squared elements in the given array. 287 | 288 | Arguments 289 | --------- 290 | array: np.ndarray[np.float_t, ndim=1] 291 | The array whose elements will be summed 292 | ''' 293 | 294 | return csqsum(&array[0], array.shape[0]) 295 | 296 | def shift_dist(np.ndarray[double, ndim=1, mode='c'] array1 not None, 297 | np.ndarray[double, ndim=1, mode='c'] array2 not None, 298 | int shift_amount, bool rolling=False): 299 | ''' 300 | Computes the distance between two time series. This is an implementation 301 | of the distance metric defined in Section 2.2 of [1]. This is the distance 302 | metric for a fixed shifting parameter, where the scaling can be easily 303 | computed. 304 | 305 | Arguments 306 | --------- 307 | array1: np.ndarray[np.float_t, ndim=1] 308 | First time series 309 | array2: np.ndarray[np.float_t, ndim=1] 310 | Second time series 311 | shift_amount: int 312 | the shifting parameter 313 | rolling: bool (default `False`) 314 | indicates whether we should use a rolling distance (i.e. elements at 315 | one end reappear at another) or a drop distance (i.e. elements fall 316 | and zeroes take their place, similar to a binary shift) 317 | 318 | References 319 | ---------- 320 | .. [1] J. Yang and J. Leskovec, 321 | "Patterns of Temporal Variation in Online Media" - WSDM'11 322 | http://dl.acm.org/citation.cfm?id=1935863 323 | ''' 324 | assert array1.shape[0] == array2.shape[0] 325 | 326 | cdef dist_struct_t* rv 327 | cdef double dist 328 | try: 329 | if rolling: 330 | rv = cshift_dist(array1, array2, shift_amount, 1) 331 | else: 332 | rv = cshift_dist(array1, array2, shift_amount, 0) 333 | 334 | dist = rv.dist 335 | return dist 336 | finally: 337 | free(rv) 338 | 339 | def dist(np.ndarray[double, ndim=1, mode='c'] array1 not None, 340 | np.ndarray[double, ndim=1, mode='c'] array2 not None, 341 | bool rolling=False): 342 | ''' 343 | Computes the distance between two time series. This is an implementation 344 | of the distance metric defined in Section 2.2 of [1]. It searches for optimal 345 | scaling and shifting parameters to align both series and compare similarity 346 | mostly based on *shape*. 347 | 348 | This is a symmetric measure *only* when using rolling shifts. 349 | 350 | Arguments 351 | --------- 352 | array1: np.ndarray[np.float_t, ndim=1, mode='c'] 353 | First time series 354 | array2: np.ndarray[np.float_t, ndim=1, mode='c'] 355 | Second time series 356 | rolling: bool (default `False`) 357 | indicates whether we should use a rolling distance (i.e. elements at 358 | one end reappear at another) or a drop distance (i.e. elements fall 359 | and zeroes take their place, similar to a binary shift) 360 | 361 | References 362 | ---------- 363 | .. [1] J. Yang and J. 
Leskovec, 364 | "Patterns of Temporal Variation in Online Media" - WSDM'11 365 | http://dl.acm.org/citation.cfm?id=1935863 366 | ''' 367 | assert array1.shape[0] == array2.shape[0] 368 | 369 | cdef dist_struct_t *rv 370 | cdef int roll = 0 371 | if rolling: 372 | roll = 1 373 | 374 | try: 375 | rv = cdist(array1, array2, roll) 376 | return rv.dist 377 | finally: 378 | free(rv) 379 | 380 | def dist_all(np.ndarray[double, ndim=2, mode='c'] matrix1 not None, 381 | np.ndarray[double, ndim=2, mode='c'] matrix2 not None, 382 | bool rolling=False): 383 | 384 | ''' 385 | Computes the distance between all of examples (rows) from the first 386 | matrix to all other examples in the second matrix. The return value 387 | is a matrix of n_rows1, n_rows2 containing the distances. 388 | 389 | The elements of the first matrix are the ones which will be shifted. 390 | 391 | Both matrices must have the same number of columns. 392 | 393 | Arguments 394 | --------- 395 | matrix1: np.ndarray[np.float_t, ndim=2, mode='c'] 396 | A matrix of time series 397 | matrix2: np.ndarray[np.float_t, ndim=2, mode='c'] 398 | A matrix of time series 399 | rolling: bool (default `False`) 400 | indicates whether we should use a rolling distance (i.e. elements at 401 | one end reappear at another) or a drop distance (i.e. elements fall 402 | and zeroes take their place, similar to a binary shift) 403 | ''' 404 | 405 | assert matrix1.shape[1] == matrix2.shape[1] 406 | cdef int roll = 0 407 | if rolling: 408 | roll = 1 409 | 410 | return cdist_all(matrix1, matrix2, roll) 411 | -------------------------------------------------------------------------------- /src/pyksc/ksc.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | ''' 3 | Implementation of the KSC and IncrementalKSC algorithms. See [1] for details. 4 | Both algorithms can be used for clustering time series data, the second 5 | (IncrementalKSC) being an optimization of the initial clusters heuristic to 6 | be used by the first. 7 | 8 | References 9 | ---------- 10 | .. [1] J. Yang and J. Leskovec, 11 | "Patterns of Temporal Variation in Online Media" - WSDM'11 12 | http://dl.acm.org/citation.cfm?id=1935863 13 | ''' 14 | from __future__ import division, print_function 15 | 16 | from pyksc.dhwt import transform 17 | from pyksc.dist import dist_all 18 | from pyksc.dist import shift 19 | 20 | from pyksc.metrics import cost 21 | 22 | import numpy as np 23 | import scipy.linalg as LA 24 | 25 | def _compute_centroids(tseries, assign, num_clusters, to_shift=None): 26 | ''' 27 | Given a time series matrix and cluster assignments, this method will 28 | compute the spectral centroids for each cluster. 29 | 30 | Arguments 31 | --------- 32 | tseries: matrix (n_series, n_points) 33 | Time series beng clustered 34 | assign: array of ints (size = n_series) 35 | The cluster assignment for each time series 36 | num_clusters: int 37 | The number of clusters being searched for 38 | to_shift (optional): array of ints (size = n_series) 39 | Determines if time series should be shifted, if different from `None`. 40 | In this case, each series will be shifted by the corresponding amount 41 | in the array. 
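This method returns a matrix of shape (num_clusters, series_size) with one spectral centroid per row; clusters with no members receive an all-zeros centroid.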
42 | ''' 43 | 44 | series_size = tseries.shape[1] 45 | centroids = np.ndarray((num_clusters, series_size)) 46 | 47 | #shift series for best centroid distance 48 | #TODO: this method can be cythonized and done in parallel 49 | shifted = tseries 50 | if to_shift is not None: 51 | for i in xrange(tseries.shape[0]): 52 | shifted[i] = shift(tseries[i], to_shift[i], rolling=True) 53 | 54 | #compute centroids 55 | for k in xrange(num_clusters): 56 | members = shifted[assign == k] 57 | if members.any(): 58 | num_members = 0 59 | if members.ndim == 2: 60 | axis = 1 61 | num_members = members.shape[0] 62 | else: 63 | axis = 0 64 | num_members = 1 65 | 66 | ssqs = np.tile(np.sum(members**2, axis=axis), (series_size, 1)) 67 | #the original paper divides by ssqs only, while the author's 68 | #example implementation uses sqrt. We chose sqrt because it appears 69 | #to yield better centroids. 70 | aux = members / np.sqrt(ssqs.T) 71 | 72 | x_mat = np.dot(aux.T, aux) 73 | i_mat = num_members * np.eye(series_size) 74 | m_mat = i_mat - x_mat 75 | 76 | #compute eigenvalues and choose the vector for the smallest one 77 | #TODO: Check if using scipy's linalg is faster (has more options 78 | # such as finding only the smallest eigval) 79 | _, eig_vectors = LA.eigh(m_mat, eigvals=(0, 0)) 80 | centroids[k] = eig_vectors[:,0] 81 | 82 | if centroids[k].sum() < 0: 83 | centroids[k] = -centroids[k] 84 | else: 85 | centroids[k] = np.zeros(series_size) 86 | 87 | return centroids 88 | 89 | def _base_ksc(tseries, initial_centroids, n_iters=-1): 90 | ''' 91 | This is the base of the KSC algorithm. It follows the same idea as the K-Means 92 | algorithm. Firstly, we assign time series to a new cluster based on the 93 | distance to the centroids. For each time series, the best 94 | shift to minimize the distance to the closest centroid is computed. 95 | 96 | The assignment step is followed by an update step where new centroids are 97 | computed based on the new clustering. 98 | 99 | Both steps above are repeated `n_iters` times. If this parameter is negative 100 | then the steps are repeated until convergence, that is, until no time series 101 | changes cluster between consecutive steps. 102 | 103 | Arguments 104 | --------- 105 | tseries: a matrix of shape (number of time series, size of each series) 106 | The time series to cluster 107 | initial_centroids: a matrix of shape (num. of clusters, size of time series) 108 | The initial centroid estimates 109 | n_iters: int 110 | The number of iterations which the algorithm will run 111 | 112 | Returns 113 | ------- 114 | centroids: a matrix of shape (num. of clusters, size of time series) 115 | The final centroids found by the algorithm 116 | assign: an array of num. series size 117 | The cluster id which each time series belongs to 118 | best_shift: an array of num. series size 119 | The shift amount applied to each time series 120 | cent_dists: a matrix of shape (num. centroids, num. series) 121 | The distance of each centroid to each time series 122 | 123 | References 124 | ---------- 125 | .. [1] J. Yang and J. Leskovec, 126 | "Patterns of Temporal Variation in Online Media" - WSDM'11 127 | http://dl.acm.org/citation.cfm?id=1935863 128 | .. 
[2] Wikipedia, 129 | "K-means clustering" 130 | http://en.wikipedia.org/wiki/K-means_clustering 131 | 132 | 133 | 134 | 135 | ''' 136 | 137 | num_clusters = initial_centroids.shape[0] 138 | num_series = tseries.shape[0] 139 | 140 | centroids = initial_centroids 141 | 142 | #KSC algorithm 143 | cent_dists = None 144 | assign = None 145 | prev_assign = None 146 | best_shift = None 147 | 148 | iters = n_iters 149 | converged = False 150 | 151 | while iters != 0 and not converged: 152 | #assign elements to new clusters 153 | cent_dists, shifts = dist_all(centroids, tseries, rolling=True) 154 | 155 | assign = cent_dists.argmin(axis=0) 156 | best_shift = np.ndarray(num_series, dtype='i') 157 | for i in xrange(shifts.shape[1]): 158 | best_shift[i] = shifts[assign[i], i] 159 | 160 | #check if converged, if not compute new centroids 161 | if prev_assign is not None and not (prev_assign - assign).any(): 162 | converged = True 163 | else: 164 | centroids = _compute_centroids(tseries, assign, num_clusters, 165 | best_shift) 166 | 167 | prev_assign = assign 168 | iters -= 1 169 | 170 | return centroids, assign, best_shift, cent_dists 171 | 172 | def ksc(tseries, num_clusters, n_iters=-1, n_runs=10): 173 | ''' 174 | This method will make `n_runs` calls to `_base_ksc`, returning the results 175 | from the run with the lowest overall clustering cost. In each run, 176 | a random initialization of centroids is performed. This is done by assigning 177 | time series to clusters in a uniform random manner and then computing the 178 | centroid of each cluster. 179 | 180 | Please refer to the documentation of `_base_ksc` for a detailed summary 181 | of the KSC algorithm. 182 | 183 | Arguments 184 | --------- 185 | tseries: a matrix of shape (number of time series, size of each series) 186 | The time series to cluster 187 | n_iters: int 188 | The number of iterations which the algorithm will run 189 | n_runs: int 190 | The number of times to run the KSC algorithm 191 | 192 | Returns 193 | ------- 194 | centroids: a matrix of shape (num. of clusters, size of time series) 195 | The final centroids found by the algorithm 196 | assign: an array of num. series size 197 | The cluster id which each time series belongs to 198 | best_shift: an array of num. series size 199 | The shift amount applied to each time series 200 | cent_dists: a matrix of shape (num. centroids, num. series) 201 | The distance of each centroid to each time series 202 | 203 | References 204 | ---------- 205 | .. [1] J. Yang and J. 
Leskovec, 206 | "Patterns of Temporal Variation in Online Media" - WSDM'11 207 | http://dl.acm.org/citation.cfm?id=1935863 208 | ''' 209 | 210 | min_cost = float('+inf') 211 | 212 | best_cents = None 213 | best_assign = None 214 | best_shift = None 215 | best_dist = None 216 | 217 | for _ in xrange(n_runs): 218 | assign = np.random.randint(0, num_clusters, tseries.shape[0]) 219 | cents = _compute_centroids(tseries, assign, num_clusters) 220 | 221 | cents, assign, series_shift, dists = _base_ksc(tseries, cents, n_iters) 222 | clust_cost = cost(tseries, assign, cents, dists) 223 | 224 | if clust_cost < min_cost: 225 | min_cost = clust_cost 226 | best_cents = cents 227 | best_assign = assign 228 | best_shift = series_shift 229 | best_dist = dists 230 | 231 | return best_cents, best_assign, best_shift, best_dist 232 | 233 | def inc_ksc(tseries, num_clusters, n_iters=-1, num_wavelets=2): 234 | ''' 235 | Given the number `num_wavelets`, this method will compute subsequent 236 | Discrete Haar Wavelet Transforms of the time series to be clustered. At 237 | each transform the number of points of the time series is decreased, thus 238 | we say that we are viewing the time series at a higher resolution. 239 | 240 | Clustering will begin at the highest resolution (last transform), and the 241 | results from the previous resolution are used to initialize the current one. 242 | Only the highest resolution is initialized randomly. This technique can 243 | improve the run-time of the KSC algorithm, since it is faster to cluster 244 | at higher resolutions (fewer data points), and for subsequent resolutions 245 | the centroids from the previous resolution are already a close approximation of 246 | the actual centroids. See [1] for details. 247 | 248 | Please refer to the documentation of `_base_ksc` for a detailed summary 249 | of the KSC algorithm. 250 | 251 | Arguments 252 | --------- 253 | tseries: a matrix of shape (number of time series, size of each series) 254 | The time series to cluster 255 | n_iters: int 256 | The number of iterations which the algorithm will run 257 | num_wavelets: int 258 | The number of wavelets to use 259 | 260 | Returns 261 | ------- 262 | centroids: a matrix of shape (num. of clusters, size of time series) 263 | The final centroids found by the algorithm 264 | assign: an array of num. series size 265 | The cluster id which each time series belongs to 266 | best_shift: an array of num. series size 267 | The shift amount applied to each time series 268 | cent_dists: a matrix of shape (num. centroids, num. series) 269 | The distance of each centroid to each time series 270 | 271 | References 272 | ---------- 273 | .. [1] J. Yang and J. 
Leskovec, 274 | "Patterns of Temporal Variation in Online Media" - WSDM'11 275 | http://dl.acm.org/citation.cfm?id=1935863 276 | ''' 277 | 278 | dhw_series = [] 279 | dhw_series.append(tseries) 280 | previous = tseries 281 | for _ in xrange(num_wavelets): 282 | new_series = [] 283 | for j in xrange(tseries.shape[0]): 284 | wave = transform(previous[j])[0] 285 | new_series.append(wave) 286 | 287 | previous = np.array(new_series) 288 | dhw_series.append(previous) 289 | 290 | assign = np.random.randint(0, num_clusters, tseries.shape[0]) 291 | cents = None 292 | series_shift = None 293 | for dhw in reversed(dhw_series): 294 | cents = _compute_centroids(dhw, assign, num_clusters, series_shift) 295 | cents, assign, series_shift, dists = _base_ksc(dhw, cents, n_iters) 296 | 297 | return cents, assign, series_shift, dists 298 | -------------------------------------------------------------------------------- /src/pyksc/metrics.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from pyksc.dist import dist_all 6 | 7 | import numpy as np 8 | 9 | def cost(tseries, assign, centroids, dist_centroids=None): 10 | 11 | num_series = tseries.shape[0] 12 | if dist_centroids is None: 13 | dist_centroids = dist_all(centroids, tseries) 14 | 15 | cost_f = 0.0 16 | for i in xrange(num_series): 17 | k = assign[i] 18 | cost_f += dist_centroids[k, i] ** 2 19 | 20 | return cost_f / num_series 21 | 22 | def avg_intra_dist(tseries, assign, dists_all_pairs=None): 23 | 24 | num_series = tseries.shape[0] 25 | 26 | if dists_all_pairs is None: 27 | dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0] 28 | 29 | dists = [] 30 | for i in xrange(num_series): 31 | k = assign[i] 32 | members = assign == k 33 | dists_i = dists_all_pairs[i] 34 | dists.extend(dists_i[members]) 35 | 36 | return np.mean(dists), np.std(dists) 37 | 38 | def avg_inter_dist(tseries, assign, dists_all_pairs=None): 39 | 40 | num_series = tseries.shape[0] 41 | 42 | if dists_all_pairs is None: 43 | dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0] 44 | 45 | dists = [] 46 | for i in xrange(num_series): 47 | k = assign[i] 48 | non_members = assign != k 49 | dists_i = dists_all_pairs[i] 50 | dists.extend(dists_i[non_members]) 51 | 52 | return np.mean(dists), np.std(dists) 53 | 54 | def beta_cv(tseries, assign, dists_all_pairs=None): 55 | 56 | intra_mean, intra_std = avg_intra_dist(tseries, assign, dists_all_pairs) 57 | inter_mean, inter_std = avg_inter_dist(tseries, assign, dists_all_pairs) 58 | 59 | return (inter_std / inter_mean) / (intra_std / intra_mean) 60 | 61 | def silhouette(tseries, assign, dists_all_pairs=None): 62 | 63 | if dists_all_pairs is None: 64 | dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0] 65 | 66 | num_series = tseries.shape[0] 67 | sils = np.zeros(num_series, dtype='f') 68 | labels = set(assign) 69 | for i in xrange(num_series): 70 | 71 | k = assign[i] 72 | dists_i = dists_all_pairs[i] 73 | intra = np.mean(dists_i[assign == k]) 74 | 75 | min_inter = float('inf') 76 | for o in labels: 77 | if o != k: 78 | inter = np.mean(dists_i[assign == o]) 79 | if inter < min_inter: 80 | min_inter = inter 81 | 82 | sils[i] = (min_inter - intra) / max(intra, min_inter) 83 | 84 | return np.mean(sils) 85 | -------------------------------------------------------------------------------- /src/pyksc/regression.py: -------------------------------------------------------------------------------- 1 | #-*- 
coding: utf8 2 | ''' 3 | Implementation of some Machine Learning regression models. Basically, we 4 | implement simple wrappers around the scikit-learn library which perform 5 | the transformations and fit the specific models we need. 6 | ''' 7 | from __future__ import division, print_function 8 | 9 | from sklearn.base import clone 10 | from sklearn.base import BaseEstimator 11 | from sklearn.base import RegressorMixin 12 | from sklearn.externals.joblib.parallel import Parallel, delayed 13 | from sklearn.linear_model.base import LinearRegression 14 | from sklearn.utils.validation import safe_asarray 15 | 16 | import numpy as np 17 | 18 | def mean_absolute_error(y_true, y_pred): 19 | """ 20 | Mean absolute error regression loss 21 | 22 | Positive floating point value: the best value is 0.0. 23 | 24 | Parameters 25 | ---------- 26 | y_true : array-like 27 | 28 | y_pred : array-like 29 | 30 | Returns 31 | ------- 32 | mae : float 33 | """ 34 | 35 | y_true = np.asarray(y_true) 36 | y_pred = np.asarray(y_pred) 37 | 38 | return np.mean(np.abs(y_true - y_pred)) 39 | 40 | def mean_relative_square_error(y_true, y_pred): 41 | """ 42 | Mean relative square error regression loss 43 | 44 | Positive floating point value: the best value is 0.0. 45 | 46 | Parameters 47 | ---------- 48 | y_true : array-like 49 | 50 | y_pred : array-like 51 | 52 | Returns 53 | ------- 54 | mrse : float 55 | """ 56 | y_true = np.asarray(y_true) 57 | y_pred = np.asarray(y_pred) 58 | return np.mean(((y_pred / y_true) - 1) ** 2) 59 | 60 | class RSELinearRegression(LinearRegression): 61 | ''' 62 | Implements an ordinary least squares (OLS) linear regression in which 63 | the objective function is the relative squared error (RSE) and not the 64 | absolute error. 65 | 66 | This class will use the same parameters and arguments as: 67 | sklearn.linear_model.LinearRegression. Unlike the linear 68 | regression, we set `fit_intercept` to False by default. 69 | 70 | Parameters 71 | ---------- 72 | fit_intercept : boolean, optional 73 | whether to calculate the intercept for this model. If set 74 | to false, no intercept will be used in calculations 75 | (e.g. data is expected to be already centered). 76 | normalize : boolean, optional 77 | If True, the regressors X are normalized 78 | 79 | See 80 | --- 81 | sklearn.linear_model.LinearRegression 82 | ''' 83 | 84 | def __init__(self, fit_intercept=False, normalize=False, copy_X=True): 85 | super(RSELinearRegression, self).__init__(fit_intercept, normalize, 86 | copy_X) 87 | 88 | def fit(self, X, y): 89 | X = safe_asarray(X) 90 | y = np.asarray(y) 91 | 92 | X = (X.T / y).T 93 | return super(RSELinearRegression, self).fit(X, y / y) 94 | 95 | def _fit_helper(class_, X, y, learner): 96 | return class_, clone(learner).fit(X, y) 97 | 98 | def _predict_helper(examples, X, learner): 99 | return examples, learner.predict(X) 100 | 101 | class MultiClassRegression(BaseEstimator, RegressorMixin): 102 | ''' 103 | This class implements what we call a multi-class regression. In simple 104 | terms, for a dataset with class labels one specialized regression model 105 | is learned for each label. Also, a classification model is learned for the 106 | whole dataset. Thus, when predicting, the classification model is first used 107 | to infer classes and then the specialized regression model for each 108 | class is used. 109 | 110 | Parameters 111 | ---------- 112 | clf : an instance of `sklearn.base.ClassifierMixin` 113 | this is the classifier to be used. 
Pass a grid search object when 114 | searching for best parameters is needed 115 | regr : an instance of `sklearn.base.RegressorMixin` 116 | the regression model to be used; it is cloned and fitted once per class. Pass a grid 117 | search object when searching for best parameters is needed 118 | ''' 119 | 120 | def __init__(self, clf, regr, n_jobs=1, verbose=0, pre_dispatch='2*n_jobs'): 121 | super(MultiClassRegression, self).__init__() 122 | 123 | self.clf = clf 124 | self.regr = regr 125 | self.n_jobs = n_jobs 126 | self.verbose = verbose 127 | self.pre_dispatch = pre_dispatch 128 | 129 | self.clf_model = None 130 | self.regression_models = None 131 | 132 | def fit(self, X, y_clf, y_regression): 133 | """ 134 | Fit the multiclass model. 135 | 136 | Parameters 137 | ---------- 138 | X : numpy array of shape [n_samples,n_features] 139 | Training data 140 | y_clf : numpy array of shape [n_samples] 141 | Target classes for classification model 142 | y_regression: numpy array of shape [n_samples] 143 | Target values for regression model 144 | 145 | Returns 146 | ------- 147 | self : returns an instance of self. 148 | """ 149 | 150 | X = safe_asarray(X) 151 | y_clf = np.asarray(y_clf) 152 | y_regression = np.asarray(y_regression) 153 | 154 | self.clf_model = self.clf.fit(X, y_clf) 155 | 156 | classes = set(y_clf) 157 | regr = self.regr 158 | 159 | def _generator(): 160 | for class_ in classes: 161 | examples = y_clf == class_ 162 | yield class_, X[examples], y_regression[examples], regr 163 | 164 | out = Parallel(self.n_jobs, self.verbose, self.pre_dispatch)(\ 165 | delayed(_fit_helper)(*params) for params in _generator()) 166 | 167 | self.regression_models = {} 168 | for class_, regr_model in out: 169 | self.regression_models[class_] = regr_model 170 | 171 | return self 172 | 173 | def predict(self, X, return_class_prediction=False): 174 | """ 175 | Predict using the multiclass regression model 176 | 177 | Parameters 178 | ---------- 179 | X : numpy array of shape [n_samples, n_features] 180 | 181 | Returns 182 | ------- 183 | C : array, shape = [n_samples] 184 | Returns predicted values. 
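When `return_class_prediction` is True, a tuple with the predicted classes and the predicted regression values is returned instead.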
185 | """ 186 | 187 | X = safe_asarray(X) 188 | y_clf_predicted = np.asarray(self.clf_model.predict(X)) 189 | classes = set(y_clf_predicted) 190 | 191 | def _generator(): 192 | for class_ in classes: 193 | examples = y_clf_predicted == class_ 194 | yield examples, X[examples], self.regression_models[class_] 195 | 196 | out = Parallel(self.n_jobs, self.verbose, self.pre_dispatch)(\ 197 | delayed(_predict_helper)(*params) for params in _generator()) 198 | 199 | y_regr_predicted = None 200 | for examples, predicted in out: 201 | if y_regr_predicted is None: 202 | y_regr_predicted = np.zeros(X.shape[0], predicted.dtype) 203 | y_regr_predicted[examples] = predicted 204 | 205 | 206 | if return_class_prediction: 207 | return y_clf_predicted, y_regr_predicted 208 | else: 209 | return y_regr_predicted 210 | -------------------------------------------------------------------------------- /src/pyksc/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flaviovdf/pyksc/6ba8988c7fad63366dc2b8d005d0779971e129c5/src/pyksc/test/__init__.py -------------------------------------------------------------------------------- /src/pyksc/test/test_dhwt.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | '''Unit tests for the dhwt module''' 3 | 4 | from __future__ import division, print_function 5 | 6 | from math import sqrt 7 | from numpy.testing import * 8 | from pyksc import dhwt 9 | 10 | import unittest 11 | 12 | import numpy as np 13 | 14 | class TestWavelets(unittest.TestCase): 15 | 16 | def test_all(self): 17 | x = np.array([]) 18 | assert_array_equal(np.array([]), dhwt.transform(x)[0]) 19 | assert_array_equal(np.array([]), dhwt.transform(x)[1]) 20 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 21 | 22 | x = np.array([1., 1]) 23 | assert_array_equal(np.array([1.]), dhwt.transform(x)[0]) 24 | assert_array_equal(np.array([0.]), dhwt.transform(x)[1]) 25 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 26 | 27 | x = np.array([1., 2, 3, 0]) 28 | assert_array_equal(np.array([1.5, 1.5]), dhwt.transform(x)[0]) 29 | assert_array_equal(np.array([-.5, 1.5]), dhwt.transform(x)[1]) 30 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 31 | 32 | x = np.array([1., 2, 3, 0, 7]) 33 | assert_array_equal(np.array([1.5, 1.5, 3.5]), dhwt.transform(x)[0]) 34 | assert_array_equal(np.array([-.5, 1.5, 3.5]), dhwt.transform(x)[1]) 35 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 36 | 37 | x = np.array([6., 12, 15, 15, 14, 12, 120, 116]) 38 | assert_array_equal(np.array([9., 15, 13, 118]), dhwt.transform(x)[0]) 39 | assert_array_equal(np.array([-3, 0, 1, 2]), dhwt.transform(x)[1]) 40 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 41 | 42 | x = np.array([6., 12, 15, 15, 14, 12, 120, 116, 2]) 43 | assert_array_equal(np.array([9., 15, 13, 118, 1]), dhwt.transform(x)[0]) 44 | assert_array_equal(np.array([-3, 0, 1, 2, 1]), dhwt.transform(x)[1]) 45 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 46 | 47 | if __name__ == "__main__": 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /src/pyksc/test/test_dist.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | '''Unit tests for the dist module''' 3 | 4 | from __future__ import division, print_function 5 | 6 | from math import sqrt 7 | from numpy.testing import * 8 | from pyksc import 
dist 9 | 10 | import unittest 11 | 12 | import numpy as np 13 | 14 | class TestDist(unittest.TestCase): 15 | 16 | def test_shift_roll(self): 17 | array = np.array([]) 18 | assert_array_equal(np.array([]), dist.shift(array, 0)) 19 | assert_array_equal(np.array([]), dist.shift(array, -1)) 20 | assert_array_equal(np.array([]), dist.shift(array, 1)) 21 | assert_array_equal(np.array([]), dist.shift(array, 10)) 22 | assert_array_equal(np.array([]), dist.shift(array, -10)) 23 | 24 | array = np.array([1.0]) 25 | assert_array_equal(np.array([1.0]), dist.shift(array, 0, True)) 26 | assert_array_equal(np.array([1.0]), dist.shift(array, 1, True)) 27 | assert_array_equal(np.array([1.0]), dist.shift(array, 1, True)) 28 | assert_array_equal(np.array([1.0]), dist.shift(array, -2, True)) 29 | assert_array_equal(np.array([1.0]), dist.shift(array, -2, True)) 30 | 31 | array = np.array([1.0, 2.0, 3.0, 4.0]) 32 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 33 | dist.shift(array, 0, True)) 34 | 35 | assert_array_equal(np.array([4.0, 1.0, 2.0, 3.0]), 36 | dist.shift(array, 1, True)) 37 | assert_array_equal(np.array([2.0, 3.0, 4.0, 1.]), 38 | dist.shift(array, -1, True)) 39 | 40 | assert_array_equal(np.array([3.0, 4.0, 1.0, 2.0]), 41 | dist.shift(array, 2, True)) 42 | assert_array_equal(np.array([3.0, 4.0, 1.0, 2.0]), 43 | dist.shift(array, -2, True)) 44 | 45 | assert_array_equal(np.array([2.0, 3.0, 4.0, 1.0]), 46 | dist.shift(array, 3, True)) 47 | assert_array_equal(np.array([4.0, 1.0, 2.0, 3.0]), 48 | dist.shift(array, -3, True)) 49 | 50 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 51 | dist.shift(array, 4, True)) 52 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 53 | dist.shift(array, -4, True)) 54 | 55 | assert_array_equal(np.array([4.0, 1.0, 2.0, 3.0]), 56 | dist.shift(array, 5, True)) 57 | assert_array_equal(np.array([2.0, 3.0, 4.0, 1.]), 58 | dist.shift(array, -5, True)) 59 | 60 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 61 | dist.shift(array, 8, True)) 62 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 63 | dist.shift(array, -8, True)) 64 | 65 | def test_shift_drop(self): 66 | array = np.array([1.0]) 67 | assert_array_equal(np.array([1.0]), dist.shift(array, 0, False)) 68 | assert_array_equal(np.array([0.0]), dist.shift(array, 1, False)) 69 | assert_array_equal(np.array([0.0]), dist.shift(array, 1, False)) 70 | assert_array_equal(np.array([0.0]), dist.shift(array, -2, False)) 71 | assert_array_equal(np.array([0.0]), dist.shift(array, -2, False)) 72 | 73 | array = np.array([1.0, 2.0, 3.0, 4.0]) 74 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 75 | dist.shift(array, 0, False)) 76 | 77 | assert_array_equal(np.array([0.0, 1.0, 2.0, 3.0]), 78 | dist.shift(array, 1, False)) 79 | assert_array_equal(np.array([2.0, 3.0, 4.0, 0.0]), 80 | dist.shift(array, -1, False)) 81 | 82 | assert_array_equal(np.array([0.0, 0.0, 1.0, 2.0]), 83 | dist.shift(array, 2, False)) 84 | assert_array_equal(np.array([3.0, 4.0, 0.0, 0.0]), 85 | dist.shift(array, -2, False)) 86 | 87 | assert_array_equal(np.array([0.0, 0.0, 0.0, 1.0]), 88 | dist.shift(array, 3, False)) 89 | assert_array_equal(np.array([4.0, 0.0, 0.0, 0.0]), 90 | dist.shift(array, -3, False)) 91 | 92 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 93 | dist.shift(array, 4, False)) 94 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 95 | dist.shift(array, -4, False)) 96 | 97 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 98 | dist.shift(array, 5, False)) 99 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 100 | 
dist.shift(array, -5, False)) 101 | 102 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 103 | dist.shift(array, 50, False)) 104 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 105 | dist.shift(array, -50, False)) 106 | 107 | #def test_shift_all(self): 108 | # m = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) 109 | # s = np.array([1, 2]) 110 | # 111 | # expected = np.array([[3.0, 1.0, 2.0], [5.0, 6.0, 4.0]]) 112 | # assert_array_almost_equal(expected, dist.shift_all(m, s, True)[0]) 113 | 114 | def test_inner_prod(self): 115 | array1 = np.array([]) 116 | array2 = np.array([]) 117 | self.assertEqual(0, dist.inner_prod(array1, array2)) 118 | 119 | array1 = np.array([1.0, 2.0, 3.0]) 120 | array2 = np.array([2.0, 3.0, 4.0]) 121 | self.assertEqual(sum(array1 * array2), dist.inner_prod(array1, array2)) 122 | 123 | self.assertEqual(sum(array1 ** 2), dist.inner_prod(array1, array1)) 124 | 125 | def test_sqsum(self): 126 | array = np.array([1.0, 2.0, 3.0]) 127 | self.assertAlmostEqual(sum(array ** 2), dist.sqsum(array), 4) 128 | 129 | array = np.array([2.0]) 130 | self.assertEqual(4, dist.sqsum(array)) 131 | 132 | array = np.array([]) 133 | self.assertEqual(0, dist.sqsum(array)) 134 | 135 | def test_shift_dist(self): 136 | array1 = np.array([]) 137 | array2 = np.array([]) 138 | self.assertEqual(0, dist.shift_dist(array1, array2, 0)) 139 | 140 | array1 = np.array([0., 0.]) 141 | array2 = np.array([0., 0.]) 142 | self.assertEqual(0, dist.shift_dist(array1, array2, 0)) 143 | 144 | array1 = np.array([1., 2.]) 145 | array2 = np.array([0., 0.]) 146 | self.assertEqual(1, dist.shift_dist(array1, array2, 0)) 147 | 148 | array1 = np.array([0., 0.]) 149 | array2 = np.array([1., 2.]) 150 | self.assertEqual(1, dist.shift_dist(array1, array2, 0)) 151 | 152 | array1 = np.array([2.0, 3.0, 4.0]) 153 | array2 = np.array([3.0, 4.0, 0.0]) 154 | 155 | self.assertAlmostEqual(0, dist.shift_dist(array1, array1, 0)) 156 | self.assertAlmostEqual(0, dist.shift_dist(array2, array2, 0)) 157 | 158 | expected = 2 / sqrt(29) 159 | self.assertAlmostEqual(expected, dist.shift_dist(array1, array2, 1, False)) 160 | 161 | expected = 2 / sqrt(29) 162 | self.assertAlmostEqual(expected, dist.shift_dist(array1, array2, 1, True)) 163 | 164 | def test_dist(self): 165 | array1 = np.array([]) 166 | array2 = np.array([]) 167 | self.assertEqual(0, dist.dist(array1, array2)) 168 | 169 | array1 = np.array([0., 0.]) 170 | array2 = np.array([0., 0.]) 171 | self.assertEqual(0, dist.dist(array1, array2)) 172 | 173 | array1 = np.array([1., 2.]) 174 | array2 = np.array([0., 0.]) 175 | self.assertEqual(1, dist.dist(array1, array2)) 176 | 177 | array1 = np.array([0., 0.]) 178 | array2 = np.array([1., 2.]) 179 | self.assertEqual(1, dist.dist(array1, array2)) 180 | 181 | array1 = np.array([2.0, 3.0, 4.0]) 182 | array2 = np.array([3.0, 4.0, 0.0]) 183 | 184 | self.assertAlmostEqual(0, dist.dist(array1, array1)) 185 | self.assertAlmostEqual(0, dist.dist(array2, array2)) 186 | 187 | expected = 2 / sqrt(29) 188 | self.assertAlmostEqual(expected, dist.dist(array1, array2, True)) 189 | self.assertAlmostEqual(expected, dist.dist(array2, array1, True)) 190 | 191 | def test_dist_all(self): 192 | m1 = np.array([[0.0], [0.0]]) 193 | m2 = np.array([[0.0], [0.0]]) 194 | 195 | expected = np.array([[0.0, 0.0], [0.0, 0.0]]) 196 | assert_array_equal(expected, dist.dist_all(m1, m2)[0]) 197 | assert_array_equal(expected, dist.dist_all(m1, m2)[1]) 198 | 199 | m1 = np.array([[1.0], [1.0]]) 200 | m2 = np.array([[0.0], [0.0]]) 201 | expected = np.array([[1.0, 1.0], [1.0, 
1.0]]) 202 | assert_array_equal(expected, dist.dist_all(m1, m2)[0]) 203 | 204 | m1 = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]]) 205 | m2 = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]]) 206 | expected = np.array([[0.0, 2/sqrt(29)], [2/sqrt(29), 0.0]]) 207 | assert_array_almost_equal(expected, dist.dist_all(m1, m2, True)[0]) 208 | 209 | if __name__ == "__main__": 210 | unittest.main() 211 | -------------------------------------------------------------------------------- /src/pyksc/test/test_ksc.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | '''Unit tests for the ksc module''' 3 | 4 | from __future__ import division, print_function 5 | 6 | from pyksc import dist 7 | from pyksc import ksc 8 | 9 | import unittest 10 | 11 | import numpy as np 12 | 13 | class TestKSC(unittest.TestCase): 14 | 15 | def ksc_runner(self, method): 16 | k = 2 17 | #One cluster with uniform series, another with a peak. 18 | X = np.array([[1.0,1,1], 19 | [1.1,1,1], 20 | [1.2,1,1], 21 | [1.3,1,1], 22 | [1.3,1,1], 23 | [1.3,1,1], 24 | [1.3,1,1], 25 | [1.3,1,1], 26 | [90,2000,90], 27 | [90,2001,90], 28 | [90,2002,90], 29 | [90,2003,90]]) 30 | 31 | cents, assign, shift, distc = method(X, k) 32 | del shift 33 | 34 | self.assertEqual(len(set(assign)), k) 35 | self.assertEqual(sum(assign == assign[0]), 8) 36 | self.assertEqual(sum(assign == assign[-1]), 4) 37 | 38 | self.assertEqual(len(set(assign[:8])), 1) 39 | self.assertEqual(len(set(assign[8:])), 1) 40 | self.assertFalse(set(assign[:8]) == set(assign[8:])) 41 | 42 | cluster_one = assign[0] 43 | cluster_two = assign[-1] 44 | 45 | self.assertTrue(dist.dist(X[0], cents[cluster_one]) < \ 46 | dist.dist(X[0], cents[cluster_two])) 47 | self.assertTrue(dist.dist(cents[cluster_one], cents[cluster_two]) > 0) 48 | 49 | for i in xrange(X.shape[0]): 50 | self.assertAlmostEqual(dist.dist(X[i], cents[0], True), 51 | distc.T[i, 0], 5) 52 | self.assertAlmostEqual(dist.dist(X[i], cents[1], True), 53 | distc.T[i, 1], 5) 54 | 55 | def test_clustering(self): 56 | self.ksc_runner(ksc.ksc) 57 | 58 | def test_incremental_cluster(self): 59 | self.ksc_runner(ksc.inc_ksc) 60 | 61 | if __name__ == "__main__": 62 | unittest.main() -------------------------------------------------------------------------------- /src/pyksc/test/test_regression.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | '''Unit tests for the regression module''' 3 | 4 | from __future__ import division, print_function 5 | 6 | from numpy.testing import * 7 | from pyksc import regression 8 | 9 | from sklearn import linear_model 10 | from sklearn.grid_search import GridSearchCV 11 | 12 | import numpy as np 13 | import unittest 14 | 15 | 16 | class TestRSELinearRegression(unittest.TestCase): 17 | 18 | def test_rse(self): 19 | assert_equal(regression.mean_relative_square_error([1, 1, 1], 20 | [0, 0, 0]), 1) 21 | 22 | assert_almost_equal(regression.mean_relative_square_error([10, 10, 10],\ 23 | [1, 2, 3]), 0.6466, 3) 24 | 25 | assert_equal(regression.mean_relative_square_error([1, 0.5, 0.8], 26 | [1, 0.5, 0.8]), 0) 27 | 28 | def test_rse_fit_one_attr(self): 29 | 30 | X = [[1], 31 | [4]] 32 | 33 | X_conv = [[1], 34 | [2]] 35 | y = [1, 2] 36 | 37 | rse_lsq = regression.RSELinearRegression(fit_intercept=False) 38 | lsq = linear_model.LinearRegression(fit_intercept=False) 39 | 40 | model_rse = rse_lsq.fit(X, y) 41 | model_lsq = lsq.fit(X_conv, np.ones(len(y))) 42 | 43 | assert_array_equal(model_lsq.coef_,
model_rse.coef_) 44 | assert_equal(model_lsq.intercept_, model_rse.intercept_) 45 | 46 | assert_array_almost_equal(model_rse.predict([[1], [4]]), 47 | model_lsq.predict([[1], [4]])) 48 | 49 | def test_rse_fit(self): 50 | 51 | X = [[1.0, 2], 52 | [4, 8]] 53 | 54 | X_conv = [[1.0, 2], 55 | [2, 4]] 56 | y = [1, 2] 57 | 58 | rse_lsq = regression.RSELinearRegression(fit_intercept=False) 59 | lsq = linear_model.LinearRegression(fit_intercept=False) 60 | 61 | model_rse = rse_lsq.fit(X, y) 62 | model_lsq = lsq.fit(X_conv, np.ones(len(y))) 63 | 64 | assert_array_equal(model_lsq.coef_, model_rse.coef_) 65 | assert_equal(model_lsq.intercept_, model_rse.intercept_) 66 | 67 | assert_array_almost_equal(model_rse.predict([[1, 2], [1, 2]]), 68 | model_lsq.predict([[1, 2], [1, 2]])) 69 | 70 | class TestMultiClassRegression(unittest.TestCase): 71 | 72 | def test_multiclass(self): 73 | 74 | X = [[1, 2], 75 | [1, 2], 76 | [4, 8], 77 | [4, 8], 78 | [200, 200], 79 | [199, 200.1], 80 | [200.2, 198]] 81 | 82 | y_clf = [0, 0, 0, 0, 1, 1, 1] 83 | y_regression = [1, 1, 2, 2, 100, 100, 100] 84 | 85 | regr_class = regression.RSELinearRegression(fit_intercept=False) 86 | clf_class = linear_model.LogisticRegression() 87 | 88 | multi_class = regression.MultiClassRegression(clf_class, regr_class) 89 | 90 | model = multi_class.fit(X, y_clf, y_regression) 91 | p = model.predict([[1, 2], 92 | [200, 200], 93 | [1, 2], 94 | [200, 200]]) 95 | assert_equal(p[0], p[2]) 96 | assert_equal(p[1], p[3]) 97 | self.assertTrue(p[0] != p[1]) 98 | 99 | def test_multiclass_parallel(self): 100 | X = [[1, 2], 101 | [1, 2], 102 | [4, 8], 103 | [4, 8], 104 | [200, 200], 105 | [199, 200.1], 106 | [200.2, 198]] 107 | 108 | y_clf = [0, 0, 0, 0, 1, 1, 1] 109 | y_regression = [1, 1, 2, 2, 100, 100, 100] 110 | 111 | regr_class = regression.RSELinearRegression(fit_intercept=False) 112 | clf_class = linear_model.LogisticRegression() 113 | 114 | multi_class = regression.MultiClassRegression(clf_class, regr_class, 115 | n_jobs=2) 116 | 117 | model = multi_class.fit(X, y_clf, y_regression) 118 | p = model.predict([[1, 2], 119 | [200, 200], 120 | [1, 2], 121 | [200, 200]]) 122 | assert_equal(p[0], p[2]) 123 | assert_equal(p[1], p[3]) 124 | self.assertTrue(p[0] != p[1]) 125 | 126 | def test_with_grid_search(self): 127 | X = [[1, 2], 128 | [1, 2], 129 | [4, 8], 130 | [4, 8], 131 | [200, 200], 132 | [199, 200.1], 133 | [200.2, 198]] 134 | 135 | y_clf = [0, 0, 0, 0, 1, 1, 1] 136 | y_regression = [1, 1, 2, 2, 100, 100, 100] 137 | 138 | regr_class = GridSearchCV(regression.RSELinearRegression(), 139 | {'normalize':[0,1]}) 140 | clf_class = GridSearchCV(linear_model.LogisticRegression(), {'C':[1,2]}) 141 | 142 | multi_class = regression.MultiClassRegression(clf_class, regr_class) 143 | 144 | model = multi_class.fit(X, y_clf, y_regression) 145 | p = model.predict([[1, 2], 146 | [200, 200], 147 | [1, 2], 148 | [200, 200]]) 149 | 150 | assert_equal(p[0], p[2]) 151 | assert_equal(p[1], p[3]) 152 | self.assertTrue(p[0] != p[1]) -------------------------------------------------------------------------------- /src/pyksc/test/test_trend.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | '''Unit tests for the trend module''' 3 | 4 | from __future__ import division, print_function 5 | 6 | from pyksc.trend import TrendLearner 7 | 8 | import unittest 9 | import numpy as np 10 | 11 | class TestTrend(unittest.TestCase): 12 | 13 | def addnoise(self, base): 14 | return np.array(base) + np.random.random(len(base)) 15 | 
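    # base_one below is a flat series and base_two a strongly oscillating one, so
    # the two classes are easy to separate; note that TrendLearner only compares
    # the first num_steps points of each series when predicting.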
16 | def test_predict_good(self): 17 | 18 | base_one = np.ones(10) 19 | base_two = np.array([90, 2000, 90, 2000, 90, 2000, 90, 2000, 90, 2000]) 20 | 21 | y = [] 22 | X = [] 23 | for _ in range(10): 24 | X.append(self.addnoise(base_one)) 25 | X.append(self.addnoise(base_two)) 26 | y.append(1) 27 | y.append(0) 28 | 29 | 30 | l = TrendLearner(3, 1) 31 | l.fit(X, y) 32 | 33 | P = [] 34 | for _ in range(50): 35 | P.append(self.addnoise(base_one)) 36 | P.append(self.addnoise(base_two)) 37 | 38 | predict = l.predict(P) 39 | self.assertEqual(50, sum(predict == 0)) 40 | self.assertEqual(50, sum(predict == 1)) 41 | 42 | probs = l.predict_proba(P) 43 | 44 | for i in xrange(probs.shape[0]): 45 | if i % 2 == 0: 46 | self.assertTrue(probs[i, 1] > probs[i, 0]) 47 | else: 48 | self.assertTrue(probs[i, 0] > probs[i, 1]) 49 | 50 | def test_predict_bad(self): 51 | 52 | base_one = np.ones(10) 53 | base_two = np.array([90, 2000, 90, 2000, 90, 2000, 90, 2000, 90, 2000]) 54 | 55 | y = [] 56 | X = [] 57 | for _ in range(10): 58 | X.append(self.addnoise(base_one)) 59 | X.append(self.addnoise(base_two)) 60 | y.append(1) 61 | y.append(0) 62 | 63 | 64 | l = TrendLearner(1, 1) 65 | l.fit(X, y) 66 | 67 | P = [] 68 | for _ in range(50): 69 | P.append(self.addnoise(base_one)) 70 | P.append(self.addnoise(base_two)) 71 | 72 | predict = l.predict(P) 73 | self.assertEqual(100, sum(predict == 0)) 74 | 75 | if __name__ == "__main__": 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /src/pyksc/trend.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | 3 | import _trend 4 | 5 | from sklearn.base import BaseEstimator 6 | from sklearn.base import ClassifierMixin 7 | 8 | import numpy as np 9 | 10 | class TrendLearner(BaseEstimator, ClassifierMixin): 11 | 12 | def __init__(self, num_steps, gamma=1): 13 | self.num_steps = num_steps 14 | self.gamma = gamma 15 | self.num_labels = 0 16 | self.R = None 17 | self.labels = None 18 | 19 | def fit(self, X, y): 20 | 21 | self.R = np.asanyarray(X, dtype=np.float64, order='C') 22 | 23 | y = np.asanyarray(y) 24 | unique, labels_flat = np.unique(y, return_inverse=True) 25 | self.labels = labels_flat.reshape(y.shape) 26 | self.num_labels = unique.shape[0] 27 | 28 | 29 | def predict(self, X): 30 | 31 | X = np.asanyarray(X)[:, :self.num_steps] 32 | X = np.asanyarray(X, dtype=np.float64, order='C') 33 | 34 | P = _trend.predict(X, self.R, self.labels, self.num_labels, self.gamma) 35 | 36 | return P.argmax(axis=1) 37 | 38 | def predict_proba(self, X): 39 | 40 | X = np.asanyarray(X)[:, :self.num_steps] 41 | X = np.asanyarray(X, dtype=np.float64, order='C') 42 | 43 | P = _trend.predict(X, self.R, self.labels, self.num_labels, self.gamma) 44 | P = ((P.T / P.sum(axis=1)).T) 45 | 46 | return P 47 | -------------------------------------------------------------------------------- /src/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('agg') 3 | 4 | from math import sqrt 5 | from matplotlib import rc 6 | 7 | def initialize_matplotlib(): 8 | rc('axes', labelsize=20) 9 | rc('axes', unicode_minus=False) 10 | rc('axes', grid=True) 11 | rc('grid', color='lightgrey') 12 | rc('grid', linestyle=':') 13 | rc('font', family='serif') 14 | rc('legend', fontsize=18) 15 | rc('lines', linewidth=2) 16 | rc('ps', usedistiller='xpdf') 17 | rc('text', usetex=True) 18 | rc('xtick', labelsize=20) 19 | rc('ytick',
labelsize=20) 20 | -------------------------------------------------------------------------------- /src/scripts/class_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scripts.learn_base import create_input_table 6 | from scripts.learn_base import create_grid_search 7 | from scripts.learn_base import clf_summary 8 | 9 | from sklearn.cross_validation import cross_val_score 10 | from sklearn.cross_validation import StratifiedKFold 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.metrics import precision_recall_fscore_support 13 | from sklearn.preprocessing import scale 14 | 15 | from vod.stats.ci import half_confidence_interval_size as hci 16 | 17 | import argparse 18 | import numpy as np 19 | import sys 20 | import traceback 21 | 22 | def run_classifier(clf, X, y): 23 | n_folds = 5 24 | cross_fold = StratifiedKFold(y, k=n_folds) 25 | 26 | #class_matrices has shape [n_folds, 4, n_classes] 27 | #The second dimension has 4 metrics: for precision, recall, f1, support 28 | R_cv = cross_val_score(clf, X, y, cv=cross_fold, n_jobs=1, 29 | score_func=precision_recall_fscore_support) 30 | 31 | C_cv = cross_val_score(clf, X, y, cv=cross_fold, n_jobs=1, 32 | score_func=confusion_matrix) 33 | 34 | class_matrices = [] 35 | conf_matrices = [] 36 | for i in xrange(n_folds): 37 | class_matrices.append(R_cv[i]) 38 | 39 | conf_matrix_aux = 1.0 * C_cv[i] 40 | conf_matrix_aux = (conf_matrix_aux.T / conf_matrix_aux.sum(axis=1)).T 41 | conf_matrices.append(conf_matrix_aux) 42 | 43 | return class_matrices, conf_matrices 44 | 45 | def main(features_fpath, tseries_fpath, tags_fpath, classes_fpath, clf_name): 46 | X, params = create_input_table(features_fpath, tseries_fpath, tags_fpath) 47 | y = np.loadtxt(classes_fpath) 48 | 49 | clf = create_grid_search(clf_name) 50 | class_matrices, conf_matrices = run_classifier(clf, X, y) 51 | 52 | metric_means = np.mean(class_matrices, axis=0) 53 | metric_ci = hci(class_matrices, .95, axis=0) 54 | print(clf_summary(metric_means, metric_ci)) 55 | print() 56 | 57 | conf_means = np.mean(conf_matrices, axis=0) 58 | conf_ci = hci(conf_matrices, .95, axis=0) 59 | print("Average confusion matrix with .95 confidence interval") 60 | print(" \ttrue ") 61 | print("predic") 62 | for i in xrange(conf_means.shape[0]): 63 | print(i, end="\t \t") 64 | for j in xrange(conf_means.shape[1]): 65 | print('%.3f +- %.3f' % (conf_means[i, j], conf_ci[i, j]), end="\t") 66 | print() 67 | 68 | def create_parser(prog_name): 69 | 70 | desc = __doc__ 71 | formatter = argparse.RawDescriptionHelpFormatter 72 | parser = argparse.ArgumentParser(prog_name, description=desc, 73 | formatter_class=formatter) 74 | 75 | parser.add_argument('--features_fpath', type=str, 76 | help='Input file with video features') 77 | parser.add_argument('--tseries_fpath', type=str, 78 | help='Input file with video time series') 79 | parser.add_argument('--tags_fpath', type=str, 80 | help='Input file with video tags') 81 | parser.add_argument('classes_fpath', type=str, 82 | help='Classes to predict') 83 | parser.add_argument('clf_name', type=str, choices=['rbf_svm', 84 | 'linear_svm', 85 | 'extra_trees'], 86 | help='Classifier to use') 87 | 88 | return parser 89 | 90 | def entry_point(args=None): 91 | '''Fake main used to create argparse and call real one''' 92 | 93 | if not args: 94 | args = [] 95 | 96 | parser = create_parser(args[0]) 97 | values = parser.parse_args(args[1:]) 98 | 
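    # Any exception raised by main() is reported below with a full traceback plus
    # the argparse usage message, and the script exits with status 1.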
99 | try: 100 | return main(values.features_fpath, values.tseries_fpath, 101 | values.tags_fpath, values.classes_fpath, values.clf_name) 102 | except: 103 | traceback.print_exc() 104 | parser.print_usage(file=sys.stderr) 105 | return 1 106 | 107 | if __name__ == '__main__': 108 | sys.exit(entry_point(sys.argv)) 109 | -------------------------------------------------------------------------------- /src/scripts/cluster_jaccard.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from collections import defaultdict 6 | from scripts import initialize_matplotlib 7 | 8 | import numpy as np 9 | import plac 10 | import sys 11 | 12 | def load_text_file(features_fpath, classes, user_users): 13 | 14 | to_cmp = defaultdict(set) 15 | 16 | with open(features_fpath) as features_file: 17 | for curr_line, line in enumerate(features_file): 18 | spl = line.split() 19 | 20 | if user_users: 21 | data = set([spl[1]]) 22 | else: 23 | data = set(token.strip() for token in spl[2:]) 24 | 25 | class_num = classes[curr_line] 26 | to_cmp[class_num].update(data) 27 | 28 | return to_cmp 29 | 30 | def asym_jaccard(first_set, second_set): 31 | intersect = first_set.intersection(second_set) 32 | return len(intersect) / len(first_set) 33 | 34 | @plac.annotations(features_fpath=plac.Annotation('Tags file', type=str), 35 | classes_fpath=plac.Annotation('Video classes file', type=str), 36 | user_users=plac.Annotation('Use user_names instead of tags', 37 | kind='flag', abbrev='u', 38 | type=bool)) 39 | def main(features_fpath, classes_fpath, user_users=False): 40 | 41 | initialize_matplotlib() 42 | 43 | classes = np.loadtxt(classes_fpath) 44 | num_classes = len(set(classes)) 45 | 46 | to_compare = load_text_file(features_fpath, classes, user_users) 47 | 48 | print(end='\t') 49 | for i in xrange(num_classes): 50 | print(i, end='\t') 51 | print() 52 | 53 | for j in xrange(num_classes): 54 | print(j, end='\t') 55 | for i in xrange(num_classes): 56 | 57 | first_set = to_compare[i] 58 | second_set = to_compare[j] 59 | 60 | asym_j = asym_jaccard(first_set, second_set) 61 | print('%.3f' % asym_j, end='\t') 62 | print() 63 | 64 | if __name__ == '__main__': 65 | sys.exit(plac.call(main)) -------------------------------------------------------------------------------- /src/scripts/cluster_mutualinfo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from collections import defaultdict 6 | from vod.entropy import kullback_leiber_divergence 7 | 8 | import numpy as np 9 | import plac 10 | import sys 11 | 12 | def load_text_file(features_fpath, classes, use): 13 | #TODO: stemming and category names abbrv 14 | num_classes = len(set(classes)) 15 | 16 | count_class = [0] * num_classes 17 | prob_col = defaultdict(float) 18 | count_class_col = defaultdict(lambda: defaultdict(float)) 19 | 20 | with open(features_fpath) as features_file: 21 | for curr_line, line in enumerate(features_file): 22 | spl = line.split() 23 | 24 | class_num = classes[curr_line] 25 | 26 | if use == 'user': 27 | count_class_col[spl[1]][class_num] += 1 28 | prob_col[spl[1]] += 1 29 | elif use == 'cat': 30 | if len(spl) > 2: 31 | count_class_col[spl[2]][class_num] += 1 32 | prob_col[spl[2]] += 1 33 | else: 34 | for token in spl[3:]: 35 | prob_col[token] += 1 36 | count_class_col[token][class_num] += 1 37 | 38 | count_class[int(class_num)] 
+= 1 39 | 40 | prob_class = np.array(count_class, dtype='f') 41 | prob_class /= prob_class.sum() 42 | 43 | prob_class_col = {} 44 | sum_col = sum(prob_col.values()) 45 | for token in count_class_col: 46 | prob_col[token] = prob_col[token] / sum_col 47 | 48 | aux = np.zeros(num_classes, dtype='f') 49 | for class_num in xrange(num_classes): 50 | aux[class_num] = count_class_col[token][class_num] 51 | aux /= aux.sum() 52 | 53 | prob_class_col[token] = aux 54 | 55 | return prob_class, prob_col, prob_class_col 56 | 57 | def load_svm_file(features_fpath, classes): 58 | col_dict = { 59 | 'EXTERNAL':8, 60 | 'FEATURED':9, 61 | 'INTERNAL':10, 62 | 'MOBILE':11, 63 | 'SEARCH':12, 64 | 'SOCIAL':13, 65 | 'VIRAL':14 66 | } 67 | 68 | num_classes = len(set(classes)) 69 | count_class = [0] * num_classes 70 | prob_col = defaultdict(float) 71 | count_class_col = defaultdict(lambda: defaultdict(float)) 72 | 73 | with open(features_fpath) as features_file: 74 | curr_line = 0 75 | for line in features_file: 76 | if '#' in line: 77 | continue 78 | 79 | spl = line.split() 80 | for ref_name, col_id in col_dict.items(): 81 | ref_abbrv = ref_name 82 | class_num = classes[curr_line] 83 | 84 | weight = float(spl[col_id]) 85 | 86 | prob_col[ref_abbrv] += weight 87 | count_class[int(class_num)] += 1 88 | count_class_col[ref_abbrv][class_num] += weight 89 | 90 | curr_line += 1 91 | 92 | prob_class = np.array(count_class, dtype='f') 93 | prob_class /= prob_class.sum() 94 | 95 | prob_class_col = {} 96 | sum_col = sum(prob_col.values()) 97 | for token in count_class_col: 98 | prob_col[token] = prob_col[token] / sum_col 99 | 100 | aux = np.zeros(num_classes, dtype='f') 101 | for class_num in xrange(num_classes): 102 | aux[class_num] = count_class_col[token][class_num] 103 | aux /= aux.sum() 104 | 105 | prob_class_col[token] = aux 106 | 107 | return prob_class, prob_col, prob_class_col 108 | 109 | @plac.annotations(features_fpath=plac.Annotation('Input file', type=str), 110 | classes_fpath=plac.Annotation('Video classes file', type=str), 111 | use=plac.Annotation('Indicates which information to use', 112 | type=str, 113 | choices=['user', 'tags', 'cat', 'ref'])) 114 | def main(features_fpath, classes_fpath, use): 115 | 116 | classes = np.loadtxt(classes_fpath) 117 | 118 | if use in {'user', 'tags', 'cat'}: 119 | prob_class, prob_col, prob_class_col = load_text_file(features_fpath, 120 | classes, use) 121 | else: 122 | prob_class, prob_col, prob_class_col = load_svm_file(features_fpath, 123 | classes) 124 | info_gains = [] 125 | mutual_info = 0 126 | 127 | for token in prob_class_col: 128 | dkl = kullback_leiber_divergence(prob_class_col[token], prob_class) 129 | 130 | mutual_info += prob_col[token] * dkl 131 | info_gains.append((dkl, token)) 132 | 133 | print('Mutual info: ', mutual_info) 134 | for dkl, token in sorted(info_gains, reverse=True): 135 | print(dkl, token) 136 | 137 | if __name__ == '__main__': 138 | sys.exit(plac.call(main)) -------------------------------------------------------------------------------- /src/scripts/cluster_vol.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scipy import stats 6 | 7 | from collections import defaultdict 8 | from matplotlib import pyplot as plt 9 | from scripts import initialize_matplotlib 10 | 11 | import numpy as np 12 | import plac 13 | import sys 14 | 15 | cols = {'PEAK_VIEWS':3, 'SUM_VIEWS':-1} 16 | 17 | 
@plac.annotations(features_fpath=plac.Annotation('Features file', type=str), 18 | classes_fpath=plac.Annotation('Video classes file', type=str), 19 | tseries_fpath=plac.Annotation('Time Series file', type=str)) 20 | def main(features_fpath, classes_fpath, tseries_fpath): 21 | X = np.genfromtxt(features_fpath)[:,1:].copy() 22 | y = np.loadtxt(classes_fpath) 23 | T = np.genfromtxt(tseries_fpath)[:,1:].copy() 24 | 25 | bah = T.sum(axis=1) / X[:,-1] 26 | print(np.mean(bah)) 27 | print(np.median(bah)) 28 | print(np.std(bah)) 29 | print(stats.scoreatpercentile(bah, 25)) 30 | 31 | num_clusters = len(set(y)) 32 | 33 | 34 | for k in xrange(num_clusters): 35 | print(k, end='\t') 36 | M = X[y == k] 37 | 38 | for column, col_num in sorted(cols.items()): 39 | data = M[:,col_num] 40 | mean = np.mean(data) 41 | print(mean, end='\t') 42 | print() 43 | 44 | print('Tot.', end='\t') 45 | for column, col_num in sorted(cols.items()): 46 | data = X[:,col_num] 47 | 48 | mean = np.mean(data) 49 | print(mean, end='\t') 50 | print() 51 | 52 | if __name__ == '__main__': 53 | sys.exit(plac.call(main)) 54 | -------------------------------------------------------------------------------- /src/scripts/col_to_cluster.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from collections import defaultdict 6 | from matplotlib import pyplot as plt 7 | 8 | from radar import radar_factory 9 | from scipy import stats 10 | 11 | from scripts import initialize_matplotlib 12 | 13 | import numpy as np 14 | import plac 15 | import sys 16 | 17 | REFERRER_ABBRV = { 18 | 'EXTERNAL':'EXT.', 19 | 'FEATURED':'FEAT.', 20 | 'INTERNAL':'INT.', 21 | 'MOBILE':'MOBI.', 22 | 'SEARCH':'SEAR.', 23 | 'SOCIAL':'SOC.', 24 | 'VIRAL':'VIR.'} 25 | 26 | CATEG_ABBRV = { 27 | 'Autos&Vehicles':'Vehi.', 28 | 'Autos':'Vehi.', 29 | 'Comedy':'Com.', 30 | 'Education':'Edu.', 31 | 'Entertainment':'Ent.', 32 | 'Film':'Film', 33 | 'Film&Animation':'Film', 34 | 'Games':'Game', 35 | 'Gaming':'Game', 36 | 'Howto':'Howto', 37 | 'Howto&Style':'Howto', 38 | 'Movies':'Film', 39 | 'Music':'Music', 40 | 'NULL':'-', 41 | 'News':'News', 42 | 'News&Politics':'News', 43 | 'Nonprofit':'Nonprof.', 44 | 'Nonprofits&Activism':'Nonprof.', 45 | 'People&Blogs':'People', 46 | 'People':'People', 47 | 'Pets&Animals':'Pets', 48 | 'Pets':'Pets', 49 | 'Animals':'Pets', 50 | 'Science&Technology':'Sci.', 51 | 'Science':'Sci.', 52 | 'Tech':'Sci.', 53 | 'Shows':'Show', 54 | 'Sports':'Sport', 55 | 'Trailers':'Film', 56 | 'Travel&Events':'Travel', 57 | 'Travel':'Travel'} 58 | 59 | def load_text_file(features_fpath, col_to_use, classes): 60 | 61 | to_plot = defaultdict(lambda: defaultdict(float)) 62 | sum_classes = defaultdict(float) 63 | labels = set() 64 | with open(features_fpath) as features_file: 65 | for curr_line, line in enumerate(features_file): 66 | spl = line.split() 67 | if col_to_use >= len(spl): 68 | continue 69 | 70 | data = CATEG_ABBRV[line.split()[col_to_use].strip()] 71 | class_num = classes[curr_line] 72 | 73 | labels.add(data) 74 | sum_classes[class_num] += 1 75 | to_plot[class_num][data] += 1 76 | 77 | return to_plot, sum_classes, sorted(labels) 78 | 79 | def load_svm_file(features_fpath, classes): 80 | 81 | col_dict = { 82 | 'EXTERNAL':13, 83 | 'FEATURED':14, 84 | 'INTERNAL':15, 85 | 'MOBILE':16, 86 | 'SEARCH':17, 87 | 'SOCIAL':18, 88 | 'VIRAL':19 89 | } 90 | 91 | to_plot = defaultdict(lambda: defaultdict(float)) 92 | sum_classes = defaultdict(float) 93 | 
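    # to_plot accumulates, per cluster, the value of each referrer column;
    # sum_classes keeps the per-cluster totals that generate_data_plot later
    # uses to normalize them.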
labels = set() 94 | with open(features_fpath) as features_file: 95 | curr_line = 0 96 | for line in features_file: 97 | if '#' in line: 98 | for key, id_ in col_dict.items(): 99 | print(id_, key, line.split()[id_]) 100 | continue 101 | 102 | class_num = classes[curr_line] 103 | sum_classes[class_num] += float(line.split()[-1]) 104 | for ref_name, col_id in col_dict.items(): 105 | ref_abbrv = REFERRER_ABBRV[ref_name] 106 | 107 | val = float(line.split()[col_id]) 108 | present = val > 0 109 | if present: 110 | labels.add(ref_abbrv) 111 | to_plot[class_num][ref_abbrv] += val 112 | 113 | curr_line += 1 114 | 115 | return to_plot, sum_classes, sorted(labels) 116 | 117 | def generate_data_plot(to_plot, sum_classes, labels, classes): 118 | num_classes = len(set(classes)) 119 | colors = ['b', 'g', 'm', 'y'] 120 | 121 | total = 0 122 | for class_num in xrange(num_classes): 123 | color = colors[class_num] 124 | 125 | data_plot = [] 126 | for label in labels: 127 | total += to_plot[class_num][label] 128 | data_plot.append(to_plot[class_num][label] / sum_classes[class_num]) 129 | 130 | yield data_plot, color, class_num 131 | 132 | def radar_plot(labels, data_plots, out_fpath): 133 | 134 | theta = radar_factory(len(labels)) 135 | 136 | fig = plt.figure() 137 | ax = fig.add_subplot(1, 1, 1, projection='radar') 138 | 139 | for data_plot, color, class_num in data_plots: 140 | ax.plot(theta, data_plot, color=color, label='C%d'%class_num) 141 | ax.fill(theta, data_plot, facecolor=color, alpha=0.25) 142 | 143 | ax.set_varlabels(labels) 144 | plt.legend(frameon=False, ncol=4, bbox_to_anchor=(0.5, -0.15), 145 | loc='lower center') 146 | plt.savefig(out_fpath) 147 | 148 | def chisq(counts, expected_prob): 149 | counts = np.array(counts) 150 | expected = np.array(expected_prob) * counts.sum() 151 | 152 | return stats.chisquare(counts, expected)[1] 153 | 154 | def allchisq(to_plot, sum_classes, labels, classes): 155 | num_classes = len(set(classes)) 156 | 157 | totals = [] 158 | for label in labels: 159 | sum_ = 0 160 | for class_num in xrange(num_classes): 161 | sum_ += to_plot[class_num][label] 162 | totals.append(sum_) 163 | 164 | probs = [] 165 | sum_totals = sum(totals) 166 | for i, t in enumerate(totals): 167 | probs.append( t / sum_totals) 168 | 169 | for class_num in xrange(num_classes): 170 | counts = [] 171 | for label in labels: 172 | counts.append(to_plot[class_num][label]) 173 | 174 | chisq(counts, probs) 175 | 176 | def stacked_bars(labels, data_plots, out_fpath, label_translation, ref=True): 177 | 178 | x_locations = [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19] 179 | 180 | data_class = {} 181 | data_label = {} 182 | for data, _, class_num in data_plots: 183 | 184 | best_idx = np.argsort(data)[::-1][:4] 185 | best_cls = np.array(data)[best_idx] 186 | best_lbl = np.array(labels)[best_idx] 187 | 188 | data_class[label_translation[class_num]] = best_cls 189 | data_label[label_translation[class_num]] = best_lbl 190 | 191 | bar_data = [] 192 | bar_labels = [] 193 | for cls in sorted(data_class): 194 | bar_data.extend(data_class[cls]) 195 | bar_labels.extend(data_label[cls]) 196 | 197 | colors = ['b', 'g', 'm', 'r', 'y', 'c', '#A617A1', '#2B5700', 'w', 198 | '#FF7300', 'k'] * 3 199 | 200 | colored={} 201 | if ref: 202 | to_use = set(REFERRER_ABBRV.values()) 203 | else: 204 | to_use = set(CATEG_ABBRV.values()) 205 | 206 | for i, l in enumerate(to_use): 207 | colored[l] = colors[i] 208 | 209 | for x, y, l in zip(x_locations, bar_data, bar_labels): 210 | 211 | c = colored[l] 212 | plt.bar(left=x, 
height=y, color=c, width=1, alpha=0.5) 213 | plt.text(x + .75, y, l, va='bottom', ha='center', rotation=45) 214 | 215 | plt.xlim(xmin=0, xmax=21) 216 | plt.xlabel('Cluster') 217 | if ref: 218 | plt.ylim(ymin=0, ymax=.31) 219 | plt.ylabel('Fraction of Views in Cluster') 220 | else: 221 | plt.ylim(ymin=0, ymax=.4) 222 | plt.ylabel('Fraction of Videos in Cluster') 223 | 224 | plt.xticks([3, 8, 13, 18], ['$C0$', '$C1$', '$C2$', '$C3$']) 225 | plt.savefig(out_fpath) 226 | 227 | @plac.annotations(features_fpath=plac.Annotation('Features file', type=str), 228 | classes_fpath=plac.Annotation('Video classes file', type=str), 229 | out_fpath=plac.Annotation('Plot file', type=str), 230 | trans_fpath=plac.Annotation('Translation of cluster num to label', 231 | type=str), 232 | col_to_use=plac.Annotation('Column number to use', type=int, 233 | kind='option', abbrev='c'), 234 | is_text_features=plac.Annotation('Indicates file type', 235 | kind='flag', abbrev='t', 236 | type=bool)) 237 | def main(features_fpath, classes_fpath, out_fpath, 238 | trans_fpath, col_to_use=2, is_text_features=False): 239 | initialize_matplotlib() 240 | 241 | classes = np.loadtxt(classes_fpath) 242 | 243 | if is_text_features: 244 | to_plot, sum_classes, labels = \ 245 | load_text_file(features_fpath, col_to_use, classes) 246 | ref=False 247 | else: 248 | to_plot, sum_classes, labels = \ 249 | load_svm_file(features_fpath, classes) 250 | ref=True 251 | 252 | trans = {} 253 | with open(trans_fpath) as f: 254 | for l in f: 255 | spl = l.split() 256 | trans[int(spl[0])] = int(spl[1]) 257 | 258 | data = generate_data_plot(to_plot, sum_classes, labels, classes) 259 | stacked_bars(labels, data, out_fpath, trans, ref) 260 | #allchisq(to_plot, sum_classes, labels, classes) 261 | 262 | if __name__ == '__main__': 263 | sys.exit(plac.call(main)) 264 | -------------------------------------------------------------------------------- /src/scripts/create_mic_input.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scripts.learn_base import create_input_table, hstack_if_possible 6 | 7 | import numpy as np 8 | import plac 9 | import sys 10 | 11 | @plac.annotations(features_fpath=plac.Annotation('Partial Features', 12 | type=str), 13 | tag_categ_fpath=plac.Annotation('Tags file', type=str), 14 | tseries_fpath=plac.Annotation('Time series file', type=str), 15 | assign_fpath=plac.Annotation('Series assignment file', 16 | type=str)) 17 | def main(features_fpath, tag_categ_fpath, tseries_fpath, assign_fpath): 18 | 19 | X, feature_ids, _ = \ 20 | create_input_table(features_fpath, None, tag_categ_fpath,-1) 21 | 22 | y_clf = np.genfromtxt(assign_fpath) 23 | y_rgr = np.genfromtxt(tseries_fpath)[:,1:].sum(axis=1) 24 | 25 | for feat_id in range(len(feature_ids)): 26 | print(feature_ids[feat_id], end=',') 27 | 28 | print('TREND', end=',') 29 | print('FINAL_VIEWS') 30 | 31 | M = np.column_stack((X, y_clf, y_rgr)) 32 | np.savetxt(sys.stdout, M, '%d', delimiter=',') 33 | 34 | if __name__ == '__main__': 35 | sys.exit(plac.call(main)) -------------------------------------------------------------------------------- /src/scripts/learn_base.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | ''' 3 | Common functions for creating classifiers and regressors for machine learning 4 | tasks 5 | ''' 6 | from __future__ import division, print_function 7 | 8 | from scripts.col_to_cluster
import CATEG_ABBRV 9 | 10 | from scipy import sparse 11 | 12 | from sklearn import ensemble 13 | from sklearn import grid_search 14 | from sklearn import svm 15 | 16 | import cStringIO 17 | import numpy as np 18 | 19 | #Params 20 | SVM_C_RANGE = [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3] 21 | SVM_GAMMA_RANGE = [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3] 22 | 23 | TREE_SPLIT_RANGE = [1, 2, 4, 8, 16] 24 | 25 | PARAMS = {'rbf_svm':{'C':SVM_C_RANGE, 'gamma':SVM_GAMMA_RANGE}, 26 | 'linear_svm':{'C':SVM_C_RANGE}, 27 | 'extra_trees':{'min_samples_split':TREE_SPLIT_RANGE}} 28 | 29 | #Classifiers 30 | CACHE_SIZE = 1024 * 4 31 | CLFS = {'rbf_svm':svm.SVC(kernel='rbf', cache_size=CACHE_SIZE), 32 | 'linear_svm':svm.LinearSVC(), 33 | 'extra_trees':ensemble.ExtraTreesClassifier(n_estimators=20, 34 | compute_importances=True, 35 | criterion='gini', 36 | n_jobs=1)} 37 | 38 | CLFS_SPARSE = {'rbf_svm':svm.sparse.SVC(kernel='rbf', cache_size=CACHE_SIZE), 39 | 'linear_svm':svm.sparse.LinearSVC(), 40 | 'extra_trees':CLFS['extra_trees']} 41 | 42 | #Regressors 43 | RGRS = {'rbf_svm':svm.SVR(kernel='rbf', cache_size=CACHE_SIZE), 44 | 'linear_svm':svm.SVR(kernel='linear'), 45 | 'extra_trees':ensemble.ExtraTreesRegressor(n_estimators=20, 46 | compute_importances=True)} 47 | 48 | RGRS_SPARSE = {'rbf_svm':svm.sparse.SVR(kernel='rbf', cache_size=CACHE_SIZE), 49 | 'linear_svm':svm.sparse.SVR(kernel='linear'), 50 | 'extra_trees':CLFS['extra_trees']} 51 | 52 | #Category Parsing Utilities 53 | CAT_COL = 2 54 | CAT_IDS = dict((abbrv, i) \ 55 | for i, abbrv in enumerate(sorted(set(CATEG_ABBRV.values())))) 56 | 57 | def _get_classifier_and_params(name, sparse = False): 58 | if sparse: 59 | dict_to_use = CLFS_SPARSE 60 | else: 61 | dict_to_use = CLFS 62 | 63 | return dict_to_use[name], PARAMS[name] 64 | 65 | def _get_regressor_and_params(name, sparse = False): 66 | if sparse: 67 | dict_to_use = RGRS_SPARSE 68 | else: 69 | dict_to_use = RGRS 70 | 71 | return dict_to_use[name], PARAMS[name] 72 | 73 | def create_grid_search(name, sparse=False, regressor=False, n_jobs=1): 74 | if regressor: 75 | learner, params = _get_regressor_and_params(name, sparse) 76 | else: 77 | learner, params = _get_classifier_and_params(name, sparse) 78 | 79 | return grid_search.GridSearchCV(learner, params, cv=3, refit=True, 80 | n_jobs=n_jobs) 81 | 82 | def hstack_if_possible(X, Y): 83 | if X is not None: 84 | return np.hstack((X, Y)) 85 | else: 86 | return Y 87 | 88 | def update_col_ids(ids_to_insert, column_ids=None): 89 | if not column_ids: 90 | column_ids = {} 91 | 92 | base = len(column_ids) 93 | column_ids.update((pnt + base, name) for pnt, name in ids_to_insert.items()) 94 | 95 | return column_ids 96 | 97 | def load_referrers(referrers_fpath, X = None, column_ids=None): 98 | X_ref = np.genfromtxt(referrers_fpath)[:,1:].copy() 99 | 100 | new_col_ids = {} 101 | with open(referrers_fpath) as referrers_file: 102 | for line in referrers_file: 103 | if '#' in line: 104 | spl = line.split()[1:] 105 | new_col_ids = dict((k, v) for k, v in enumerate(spl)) 106 | 107 | return hstack_if_possible(X, X_ref), \ 108 | update_col_ids(new_col_ids, column_ids) 109 | 110 | def load_time_series(tseries_fpath, num_pts = -1, X = None, column_ids=None): 111 | X_series = np.genfromtxt(tseries_fpath)[:,1:][:,range(num_pts)] 112 | 113 | new_col_ids = dict((i, 'POINT_%d'%pnt) \ 114 | for i, pnt in enumerate(range(num_pts))) 115 | 116 | return hstack_if_possible(X, X_series), \ 117 | update_col_ids(new_col_ids, column_ids) 118 | 119 | def load_categories(tags_cat_fpath, X = None, 
column_ids=None): 120 | with open(tags_cat_fpath) as tags_cat_file: 121 | data = [] 122 | row = [] 123 | col = [] 124 | new_col_ids = {} 125 | for i, line in enumerate(tags_cat_file): 126 | spl = line.split() 127 | category = 'NULL' 128 | if len(spl) > CAT_COL: 129 | category = line.split()[CAT_COL] 130 | 131 | abbrv = CATEG_ABBRV[category] 132 | categ_id = CAT_IDS[abbrv] 133 | 134 | data.append(1) 135 | row.append(i) 136 | col.append(categ_id) 137 | 138 | new_col_ids[categ_id] = 'CAT_%s' % abbrv 139 | 140 | X_categ = np.asarray(sparse.coo_matrix((data, (row, col))).todense()) 141 | return hstack_if_possible(X, X_categ), \ 142 | update_col_ids(new_col_ids, column_ids) 143 | 144 | def create_input_table(referrers_fpath = None, tseries_fpath = None, 145 | tags_cat_fpath = None, num_pts = 3): 146 | 147 | X = None 148 | column_ids = None 149 | 150 | if referrers_fpath: 151 | X, column_ids = load_referrers(referrers_fpath) 152 | 153 | if tseries_fpath and num_pts > 0: 154 | X, column_ids = load_time_series(tseries_fpath, num_pts, X, column_ids) 155 | 156 | if tags_cat_fpath: 157 | X, column_ids = load_categories(tags_cat_fpath, X, column_ids) 158 | 159 | inverse_names = dict((v, k) for k, v in column_ids.items()) 160 | return X, column_ids, inverse_names 161 | 162 | def clf_summary(mean_scores, ci_scores): 163 | 164 | buff = cStringIO.StringIO() 165 | try: 166 | print('class \tprecision \trecall \tf1 score \tsupport', file=buff) 167 | for j in xrange(mean_scores.shape[1]): 168 | print(j, end="\t", file=buff) 169 | for i in xrange(mean_scores.shape[0]): 170 | print('%.3f +- %.3f' % (mean_scores[i, j], ci_scores[i, j]), 171 | end="\t", file=buff) 172 | print(file=buff) 173 | print(file=buff) 174 | 175 | return buff.getvalue() 176 | finally: 177 | buff.close() 178 | -------------------------------------------------------------------------------- /src/scripts/leave_k.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scripts.learn_base import create_input_table 6 | from scripts.learn_base import create_grid_search 7 | 8 | from sklearn.metrics import f1_score 9 | from sklearn.metrics import mean_square_error as mse 10 | from sklearn.metrics import r2_score 11 | 12 | import plac 13 | import numpy as np 14 | import os 15 | import sys 16 | 17 | def create_learners(learner_name='extra_trees'): 18 | clf = create_grid_search(learner_name, n_jobs=-1) 19 | rgr = create_grid_search(learner_name, regressor=True, n_jobs=-1) 20 | 21 | return clf, rgr 22 | 23 | def print_importance(feature_ids, importance_clf, importance_rgr): 24 | print() 25 | print('Classification Importance') 26 | for key in importance_clf.argsort()[::-1]: 27 | print(feature_ids[key], importance_clf[key]) 28 | 29 | print() 30 | print('Regression Importance') 31 | for key in importance_rgr.argsort()[::-1]: 32 | print(feature_ids[key], importance_rgr[key]) 33 | 34 | def mae(y_true, y_pred): 35 | y_true = np.asanyarray(y_true) 36 | y_pred = np.asanyarray(y_pred) 37 | 38 | return np.mean(np.abs(y_true - y_pred)) 39 | 40 | def run_experiment(X, y_clf, y_rgr, feature_ids, out_foldpath, k=500): 41 | clf, rgr = create_learners() 42 | 43 | n = len(y_clf) 44 | train_index = np.ones(n, dtype=np.bool) 45 | train_index[-k:] = False 46 | test_index = np.logical_not(train_index) 47 | 48 | clf_model = clf.fit(X[train_index], y_clf[train_index]) 49 | rgr_model = rgr.fit(X[train_index], y_rgr[train_index]) 50 | 51 | clf_true = 
y_clf[test_index] 52 | clf_pred = clf_model.predict(X[test_index]) 53 | 54 | rgr_true = y_rgr[test_index] 55 | rgr_pred = rgr_model.predict(X[test_index]) 56 | 57 | clf_pred_fpath = os.path.join(out_foldpath, '%clf.pred') 58 | clf_true_fpath = os.path.join(out_foldpath, '%clf.true') 59 | 60 | rgr_pred_fpath = os.path.join(out_foldpath, '%rgr.pred') 61 | rgr_true_fpath = os.path.join(out_foldpath, '%rgr.true') 62 | 63 | np.savetxt(clf_pred_fpath, clf_pred, fmt="%d") 64 | np.savetxt(clf_true_fpath, clf_true, fmt="%d") 65 | 66 | np.savetxt(rgr_pred_fpath, rgr_pred) 67 | np.savetxt(rgr_true_fpath, rgr_true) 68 | 69 | print('Micro F1: ', f1_score(clf_true, clf_pred, average='micro')) 70 | print('Macro F1: ', f1_score(clf_true, clf_pred, average='macro')) 71 | print() 72 | print('R2: ', r2_score(rgr_true, rgr_pred)) 73 | print('MAE: ', mae(rgr_true, rgr_pred)) 74 | print('MSE: ', mse(rgr_true, rgr_pred)) 75 | print() 76 | print_importance(feature_ids, 77 | clf_model.best_estimator_.feature_importances_, 78 | rgr_model.best_estimator_.feature_importances_) 79 | 80 | @plac.annotations(partial_features_fpath=plac.Annotation('Partial Features', 81 | type=str), 82 | tag_categ_fpath=plac.Annotation('Tags file', type=str), 83 | tseries_fpath=plac.Annotation('Time series file', type=str), 84 | num_days_to_use=plac.Annotation('Num Days Series', type=int), 85 | assign_fpath=plac.Annotation('Series assignment file', 86 | type=str), 87 | out_foldpath=plac.Annotation('Output folder', type=str)) 88 | def main(partial_features_fpath, tag_categ_fpath, tseries_fpath, 89 | num_days_to_use, assign_fpath, out_foldpath): 90 | 91 | X, feature_ids, feature_names = \ 92 | create_input_table(partial_features_fpath, tseries_fpath, 93 | tag_categ_fpath, num_pts = num_days_to_use) 94 | 95 | #Sort X by upload date 96 | up_date_col = feature_names['A_UPLOAD_DATE'] 97 | sort_by_date = X[:,up_date_col].argsort() 98 | X = X[sort_by_date].copy() 99 | 100 | y_clf = np.genfromtxt(assign_fpath)[sort_by_date] 101 | y_regr = np.genfromtxt(tseries_fpath)[:,1:].sum(axis=1)[sort_by_date] 102 | run_experiment(X, y_clf, y_regr, feature_ids, out_foldpath) 103 | 104 | if __name__ == '__main__': 105 | sys.exit(plac.call(main)) -------------------------------------------------------------------------------- /src/scripts/plot_centroids.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from matplotlib import pyplot as plt 6 | 7 | from pyksc import dist 8 | from pyksc import ksc 9 | 10 | from scripts import initialize_matplotlib 11 | 12 | import argparse 13 | import numpy as np 14 | import os 15 | import sys 16 | import traceback 17 | 18 | def main(tseries_fpath, k, plot_foldpath): 19 | import mkl 20 | mkl.set_num_threads(16) 21 | 22 | initialize_matplotlib() 23 | 24 | X = np.genfromtxt(tseries_fpath)[:,1:] 25 | aux = X.sum(axis=1) 26 | fix = np.where(aux == 0)[0] 27 | X[fix] += .001 #fixing zero only rows 28 | X = X.copy() 29 | 30 | cent, assign, shift, dists_cent = ksc.inc_ksc(X, k) 31 | 32 | for i in xrange(cent.shape[0]): 33 | t_series = cent[i] 34 | 35 | plt.plot(t_series, '-k') 36 | plt.gca().get_xaxis().set_visible(False) 37 | plt.gca().get_yaxis().set_visible(False) 38 | #plt.ylabel('Views') 39 | #plt.xlabel('Time') 40 | plt.savefig(os.path.join(plot_foldpath, '%d.pdf' % i)) 41 | plt.close() 42 | 43 | half = t_series.shape[0] // 2 44 | to_shift = half - np.argmax(t_series) 45 | to_plot_peak_center = 
dist.shift(t_series, to_shift, rolling=True) 46 | plt.plot(to_plot_peak_center, '-k') 47 | plt.gca().get_xaxis().set_visible(False) 48 | plt.gca().get_yaxis().set_visible(False) 49 | #plt.ylabel('Views') 50 | #plt.xlabel('Time') 51 | plt.savefig(os.path.join(plot_foldpath, '%d-peak-center.pdf' % i)) 52 | plt.close() 53 | 54 | to_shift = 0 - np.argmin(t_series) 55 | to_plot_min_first = dist.shift(t_series, to_shift, rolling=True) 56 | plt.plot(to_plot_min_first, '-k') 57 | plt.gca().get_xaxis().set_visible(False) 58 | plt.gca().get_yaxis().set_visible(False) 59 | #plt.ylabel('Views') 60 | #plt.xlabel('Time') 61 | plt.savefig(os.path.join(plot_foldpath, '%d-min-first.pdf' % i)) 62 | plt.close() 63 | 64 | np.savetxt(os.path.join(plot_foldpath, 'cents.dat'), cent, fmt='%.5f') 65 | np.savetxt(os.path.join(plot_foldpath, 'assign.dat'), assign, fmt='%d') 66 | np.savetxt(os.path.join(plot_foldpath, 'shift.dat'), shift, fmt='%d') 67 | np.savetxt(os.path.join(plot_foldpath, 'dists_cent.dat'), dists_cent, 68 | fmt='%.5f') 69 | 70 | def create_parser(prog_name): 71 | 72 | desc = __doc__ 73 | formatter = argparse.RawDescriptionHelpFormatter 74 | parser = argparse.ArgumentParser(prog_name, description=desc, 75 | formatter_class=formatter) 76 | 77 | parser.add_argument('tseries_fpath', type=str, help='Time series file') 78 | parser.add_argument('k', type=int, help='Number of clusters') 79 | parser.add_argument('plot_foldpath', type=str, help='Folder to store plots') 80 | 81 | return parser 82 | 83 | def entry_point(args=None): 84 | '''Fake main used to create argparse and call real one''' 85 | 86 | if not args: 87 | args = [] 88 | 89 | parser = create_parser(args[0]) 90 | values = parser.parse_args(args[1:]) 91 | 92 | try: 93 | return main(values.tseries_fpath, values.k, values.plot_foldpath) 94 | except: 95 | traceback.print_exc() 96 | parser.print_usage(file=sys.stderr) 97 | return 1 98 | 99 | if __name__ == '__main__': 100 | sys.exit(entry_point(sys.argv)) 101 | -------------------------------------------------------------------------------- /src/scripts/plot_members.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from matplotlib import pyplot as plt 6 | 7 | from pyksc import dist 8 | from scripts import initialize_matplotlib 9 | 10 | import plac 11 | import numpy as np 12 | import os 13 | import sys 14 | 15 | def plot_series(t_series, plot_foldpath, name, shift=False): 16 | 17 | to_plot = t_series 18 | if shift: 19 | to_shift = 0 - np.argmin(t_series) 20 | to_plot = dist.shift(t_series, to_shift, rolling=True) 21 | 22 | plt.plot(to_plot, '-k') 23 | plt.ylabel('Views') 24 | plt.xlabel('Time') 25 | plt.savefig(os.path.join(plot_foldpath, '%s.png' % name)) 26 | plt.close() 27 | 28 | @plac.annotations(tseries_fpath=plac.Annotation('Input file', type=str), 29 | assign_fpath=plac.Annotation('Series assignment file', 30 | type=str), 31 | centroids_fpath=plac.Annotation('Cluster centroids file', 32 | type=str), 33 | plot_foldpath=plac.Annotation('Output folder', type=str)) 34 | def main(tseries_fpath, assign_fpath, centroids_fpath, plot_foldpath): 35 | initialize_matplotlib() 36 | 37 | X = np.genfromtxt(tseries_fpath)[:,1:].copy() 38 | y = np.genfromtxt(assign_fpath) 39 | centroids = np.genfromtxt(centroids_fpath) 40 | 41 | num_classes = len(set(y)) 42 | 43 | for k in xrange(num_classes): 44 | centroid_plot_foldpath = os.path.join(plot_foldpath, str(k)) 45 | 
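        # One sub-directory per cluster; os.mkdir raises OSError if the directory
        # already exists, so the cluster folders must not exist beforehand.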
os.mkdir(centroid_plot_foldpath) 46 | 47 | centroid = centroids[k] 48 | plot_series(centroid, centroid_plot_foldpath, 'centroid', True) 49 | 50 | members = X[y == k] 51 | n_samples = members.shape[0] 52 | sample_rows = np.arange(n_samples) 53 | np.random.shuffle(sample_rows) 54 | 55 | members_to_plot = members[sample_rows[:10]] 56 | for i in xrange(members_to_plot.shape[0]): 57 | print(k, i) 58 | plot_series(members_to_plot[i], centroid_plot_foldpath, 'ex-%d' % i) 59 | 60 | if __name__ == '__main__': 61 | sys.exit(plac.call(main)) -------------------------------------------------------------------------------- /src/scripts/plot_quality.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from matplotlib import pyplot as plt 6 | 7 | from pyksc import dist 8 | from pyksc import metrics 9 | from pyksc import ksc 10 | 11 | from scripts import initialize_matplotlib 12 | 13 | from vod.stats.ci import half_confidence_interval_size as hci 14 | 15 | import argparse 16 | import numpy as np 17 | import os 18 | import sys 19 | import traceback 20 | 21 | def run_clustering(X, k, dists_all): 22 | 23 | cent, assign, shift, dists_cent = ksc.inc_ksc(X, k) 24 | 25 | intra = metrics.avg_intra_dist(X, assign, dists_all)[0] 26 | inter = metrics.avg_inter_dist(X, assign, dists_all)[0] 27 | bcv = metrics.beta_cv(X, assign, dists_all) 28 | cost = metrics.cost(X, assign, None, dists_cent) 29 | 30 | return intra, inter, bcv, cost 31 | 32 | def main(tseries_fpath, plot_foldpath): 33 | assert os.path.isdir(plot_foldpath) 34 | initialize_matplotlib() 35 | 36 | X = np.genfromtxt(tseries_fpath)[:,1:].copy() 37 | 38 | n_samples = X.shape[0] 39 | sample_rows = np.arange(n_samples) 40 | 41 | clust_range = range(2, 16) 42 | n_clustering_vals = len(clust_range) 43 | 44 | intra_array = np.zeros(shape=(25, n_clustering_vals)) 45 | inter_array = np.zeros(shape=(25, n_clustering_vals)) 46 | bcvs_array = np.zeros(shape=(25, n_clustering_vals)) 47 | costs_array = np.zeros(shape=(25, n_clustering_vals)) 48 | 49 | r = 0 50 | for i in xrange(5): 51 | np.random.shuffle(sample_rows) 52 | rand_sample = sample_rows[:200] 53 | 54 | X_new = X[rand_sample] 55 | D_new = dist.dist_all(X_new, X_new, rolling=True)[0] 56 | 57 | for j in xrange(5): 58 | for k in clust_range: 59 | intra, inter, bcv, cost = run_clustering(X_new, k, D_new) 60 | 61 | intra_array[r, k - 2] = intra 62 | inter_array[r, k - 2] = inter 63 | bcvs_array[r, k - 2] = bcv 64 | costs_array[r, k - 2] = cost 65 | 66 | r += 1 67 | print(r) 68 | 69 | intra_err = np.zeros(n_clustering_vals) 70 | inter_err = np.zeros(n_clustering_vals) 71 | bcvs_err = np.zeros(n_clustering_vals) 72 | costs_err = np.zeros(n_clustering_vals) 73 | 74 | for k in clust_range: 75 | j = k - 2 76 | intra_err[j] = hci(intra_array[:,j], .95) 77 | inter_err[j] = hci(inter_array[:,j], .95) 78 | bcvs_err[j] = hci(bcvs_array[:,j], .95) 79 | costs_err[j] = hci(costs_array[:,j], .95) 80 | 81 | plt.errorbar(clust_range, np.mean(inter_array, axis=0), fmt='gD', 82 | label='Inter Cluster', yerr=inter_err) 83 | plt.errorbar(clust_range, np.mean(bcvs_array, axis=0), fmt='bo', 84 | label='BetaCV', yerr=bcvs_err) 85 | plt.errorbar(clust_range, np.mean(intra_array, axis=0), fmt='rs', 86 | label='Intra Cluster', yerr=intra_err) 87 | plt.ylabel('Average Distance') 88 | plt.xlabel('Number of clusters') 89 | plt.xlim((0., 16)) 90 | plt.ylim((0., 1.)) 91 | plt.legend(frameon=False, loc='lower left') 92 | 
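    # First figure: mean intra-cluster, inter-cluster and BetaCV values (with .95
    # confidence intervals) as a function of the number of clusters; the cost
    # curve is saved separately as cost.pdf below.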
plt.savefig(os.path.join(plot_foldpath, 'bcv.pdf')) 93 | plt.close() 94 | 95 | plt.errorbar(clust_range, np.mean(costs_array, axis=0), fmt='bo', 96 | label='Cost', yerr=costs_err) 97 | plt.ylabel('Cost (F)') 98 | plt.xlabel('Number of clusters') 99 | plt.xlim((0., 16)) 100 | plt.ylim((0., 1.)) 101 | plt.legend(frameon=False, loc='lower left') 102 | plt.savefig(os.path.join(plot_foldpath, 'cost.pdf')) 103 | plt.close() 104 | 105 | def create_parser(prog_name): 106 | 107 | desc = __doc__ 108 | formatter = argparse.RawDescriptionHelpFormatter 109 | parser = argparse.ArgumentParser(prog_name, description=desc, 110 | formatter_class=formatter) 111 | 112 | parser.add_argument('tseries_fpath', type=str, help='Time series file') 113 | parser.add_argument('plot_foldpath', type=str, help='Folder to store plots') 114 | return parser 115 | 116 | def entry_point(args=None): 117 | '''Fake main used to create argparse and call real one''' 118 | 119 | if not args: 120 | args = [] 121 | 122 | parser = create_parser(args[0]) 123 | values = parser.parse_args(args[1:]) 124 | 125 | try: 126 | return main(values.tseries_fpath, values.plot_foldpath) 127 | except: 128 | traceback.print_exc() 129 | parser.print_usage(file=sys.stderr) 130 | return 1 131 | 132 | if __name__ == '__main__': 133 | sys.exit(entry_point(sys.argv)) 134 | -------------------------------------------------------------------------------- /src/scripts/plot_time_to_peak.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from matplotlib import dates 6 | from matplotlib import pyplot as plt 7 | 8 | from scripts import initialize_matplotlib 9 | 10 | import argparse 11 | import numpy as np 12 | import os 13 | import sys 14 | import traceback 15 | 16 | refs = { 17 | 'G_EXTERNAL_EVENT_DATE':0, 18 | 'G_FEATURED_EVENT_DATE':1, 19 | 'G_INTERNAL_EVENT_DATE':2, 20 | 'G_MOBILE_EVENT_DATE':3, 21 | 'G_SEARCH_EVENT_DATE':4, 22 | 'G_SOCIAL_EVENT_DATE':5, 23 | 'G_VIRAL_EVENT_DATE':6 24 | } 25 | 26 | UP_DATE = -1 27 | 28 | def main(features_fpath): 29 | initialize_matplotlib() 30 | 31 | X = np.genfromtxt(features_fpath)[:,1:] 32 | 33 | for r, k in sorted(refs.items()): 34 | idxs = X[:,k] > 0 35 | time_to_ref = (X[:,UP_DATE][idxs] - X[:,k][idxs]) 36 | print(r, np.mean(time_to_ref), np.std(time_to_ref)) 37 | 38 | print('peak_frac', np.mean(X[:,-3]), np.std(X[:,-3])) 39 | 40 | time_to_peak = (X[:,-4] - X[:,UP_DATE]) / 7 41 | print('peak_date', np.mean(time_to_peak), np.std(time_to_peak)) 42 | 43 | import time 44 | plt.hist(X[:,UP_DATE], bins=20) 45 | ticks, labels = plt.xticks() 46 | plt.xticks(ticks, [time.strftime('%m/%y', time.localtime(x)) for x in ticks]) 47 | plt.ylabel('\# Videos') 48 | plt.xlabel('Month/Year') 49 | plt.savefig('hist.pdf') 50 | 51 | 52 | # plt.plot(t_series, '-k') 53 | # plt.ylabel('Views') 54 | # plt.xlabel('Time') 55 | # plt.savefig(os.path.join(plot_foldpath, '%d.pdf' % i)) 56 | # plt.close() 57 | # 58 | # half = t_series.shape[0] // 2 59 | # to_shift = half - np.argmax(t_series) 60 | # to_plot_peak_center = dist.shift(t_series, to_shift, rolling=True) 61 | # plt.plot(to_plot_peak_center, '-k') 62 | # plt.ylabel('Views') 63 | # plt.xlabel('Time') 64 | # plt.savefig(os.path.join(plot_foldpath, '%d-peak-center.pdf' % i)) 65 | # plt.close() 66 | # 67 | # to_shift = 0 - np.argmin(t_series) 68 | # to_plot_min_first = dist.shift(t_series, to_shift, rolling=True) 69 | # plt.plot(to_plot_min_first, '-k') 70 | # 
plt.ylabel('Views') 71 | # plt.xlabel('Time') 72 | # plt.savefig(os.path.join(plot_foldpath, '%d-min-first.pdf' % i)) 73 | # plt.close() 74 | # 75 | # np.savetxt(os.path.join(plot_foldpath, 'cents.dat'), cent, fmt='%.5f') 76 | # np.savetxt(os.path.join(plot_foldpath, 'assign.dat'), assign, fmt='%d') 77 | # np.savetxt(os.path.join(plot_foldpath, 'shift.dat'), shift, fmt='%d') 78 | # np.savetxt(os.path.join(plot_foldpath, 'dists_cent.dat'), dists_cent, 79 | # fmt='%.5f') 80 | 81 | def create_parser(prog_name): 82 | 83 | desc = __doc__ 84 | formatter = argparse.RawDescriptionHelpFormatter 85 | parser = argparse.ArgumentParser(prog_name, description=desc, 86 | formatter_class=formatter) 87 | 88 | parser.add_argument('features_fpath', type=str, help='Features file') 89 | 90 | return parser 91 | 92 | def entry_point(args=None): 93 | '''Fake main used to create argparse and call real one''' 94 | 95 | if not args: 96 | args = [] 97 | 98 | parser = create_parser(args[0]) 99 | values = parser.parse_args(args[1:]) 100 | 101 | try: 102 | return main(values.features_fpath) 103 | except: 104 | traceback.print_exc() 105 | parser.print_usage(file=sys.stderr) 106 | return 1 107 | 108 | if __name__ == '__main__': 109 | sys.exit(entry_point(sys.argv)) -------------------------------------------------------------------------------- /src/scripts/pop_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scripts.learn_base import create_input_table 6 | from scripts.learn_base import create_grid_search 7 | from scripts.learn_base import clf_summary 8 | 9 | from pyksc.regression import mean_relative_square_error as mrse 10 | 11 | from sklearn.cross_validation import StratifiedKFold 12 | from sklearn.cross_validation import StratifiedShuffleSplit 13 | from sklearn.metrics import f1_score 14 | from sklearn.metrics import precision_recall_fscore_support 15 | from sklearn.metrics import mean_squared_error as mse 16 | from sklearn.metrics import r2_score 17 | from sklearn.preprocessing import scale 18 | 19 | from vod.stats.ci import half_confidence_interval_size as hci 20 | 21 | import plac 22 | import numpy as np 23 | import os 24 | import sys 25 | 26 | def create_learners(learner_name='rbf_svm'): 27 | clf = create_grid_search(learner_name, n_jobs=-1) 28 | rgr = create_grid_search(learner_name, regressor=True, n_jobs=-1) 29 | 30 | return clf, rgr 31 | 32 | def fit_and_predict(clf, rgr, X, y_clf, y_rgr, train, test, out_folder, fold): 33 | clf_model = clf.fit(X[train], y_clf[train]) 34 | 35 | y_clf_true = y_clf[test] 36 | y_rgr_true = y_rgr[test] 37 | y_clf_pred = clf_model.predict(X[test]) 38 | 39 | class_scores = np.array(precision_recall_fscore_support(y_clf_true, 40 | y_clf_pred)) 41 | micro_f1 = f1_score(y_clf_true, y_clf_pred, average='micro') 42 | macro_f1 = f1_score(y_clf_true, y_clf_pred, average='macro') 43 | 44 | rgr_model = rgr.fit(X[train], y_rgr[train]) 45 | y_rgr_pred = rgr_model.predict(X[test]) 46 | 47 | general_r2 = r2_score(y_rgr_true, y_rgr_pred) 48 | mse_score = mse(y_rgr_true, y_rgr_pred) 49 | mrse_score = mrse(y_rgr_true, y_rgr_pred) 50 | 51 | clf_pred_fpath = os.path.join(out_folder, '%d-clf.pred' % fold) 52 | clf_true_fpath = os.path.join(out_folder, '%d-clf.true' % fold) 53 | 54 | rgr_pred_fpath = os.path.join(out_folder, '%d-rgr.pred' % fold) 55 | rgr_true_fpath = os.path.join(out_folder, '%d-rgr.true' % fold) 56 | 57 | np.savetxt(clf_pred_fpath, y_clf_pred, 
fmt="%d") 58 | np.savetxt(clf_true_fpath, y_clf_true, fmt="%d") 59 | 60 | np.savetxt(rgr_pred_fpath, y_rgr_pred) 61 | np.savetxt(rgr_true_fpath, y_rgr_true) 62 | 63 | return class_scores, micro_f1, macro_f1, general_r2, mse_score,\ 64 | mrse_score 65 | 66 | def print_results(clf_scores, micro, macro, r2_all, mse_all, mrse_all): 67 | metric_means = np.mean(clf_scores, axis=0) 68 | metric_ci = hci(clf_scores, .95, axis=0) 69 | 70 | print(clf_summary(metric_means, metric_ci)) 71 | print('Micro F1 - mean: %f +- %f' % (np.mean(micro), hci(micro, .95))) 72 | print('Macro F1 - mean: %f +- %f' % (np.mean(macro), hci(macro, .95))) 73 | print('R2 all - mean: %f +- %f' % (np.mean(r2_all), hci(r2_all, .95))) 74 | print('MSE all - mean: %f +- %f' % (np.mean(mse_all), hci(mse_all, .95))) 75 | print('MRSE all - mean: %f +- %f' % (np.mean(mrse_all), 76 | hci(mrse_all, .95))) 77 | 78 | def run_experiment(X, y_clf, y_regr, feature_ids, out_folder): 79 | 80 | clf_scores = [] 81 | micro = [] 82 | macro = [] 83 | r2_all = [] 84 | mse_all = [] 85 | mrse_all = [] 86 | 87 | learner, rgr_base = create_learners() 88 | cv = StratifiedKFold(y_clf, k=5) 89 | fold_num = 1 90 | for train, test in cv: 91 | class_scores, micro_f1, macro_f1, general_r2, \ 92 | mse_score, mrse_score = \ 93 | fit_and_predict(learner, rgr_base, X, y_clf, y_regr, train, 94 | test, out_folder, fold_num) 95 | 96 | clf_scores.append(class_scores) 97 | micro.append(micro_f1) 98 | macro.append(macro_f1) 99 | 100 | r2_all.append(general_r2) 101 | mse_all.append(mse_score) 102 | mrse_all.append(mrse_score) 103 | 104 | fold_num += 1 105 | 106 | print_results(clf_scores, micro, macro, r2_all, mse_all, mrse_all) 107 | 108 | @plac.annotations(features_fpath=plac.Annotation('Partial Features', 109 | type=str), 110 | tag_categ_fpath=plac.Annotation('Tags file', type=str), 111 | tseries_fpath=plac.Annotation('Time series file', type=str), 112 | num_days_to_use=plac.Annotation('Num Days Series', type=int), 113 | assign_fpath=plac.Annotation('Series assignment file', 114 | type=str), 115 | out_foldpath=plac.Annotation('Output folder', type=str)) 116 | def main(features_fpath, tag_categ_fpath, tseries_fpath, num_days_to_use, 117 | assign_fpath, out_foldpath): 118 | 119 | X, feature_ids, _ = \ 120 | create_input_table(features_fpath, tseries_fpath, tag_categ_fpath, 121 | num_days_to_use) 122 | 123 | X = scale(X) 124 | y_clf = np.genfromtxt(assign_fpath) 125 | y_regr = scale(np.genfromtxt(tseries_fpath)[:,1:].sum(axis=1)) 126 | run_experiment(X, y_clf, y_regr, feature_ids, out_foldpath) 127 | 128 | if __name__ == '__main__': 129 | sys.exit(plac.call(main)) 130 | -------------------------------------------------------------------------------- /src/scripts/radar.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | 7 | from matplotlib import pyplot as plt 8 | from matplotlib.projections.polar import PolarAxes 9 | from matplotlib.projections import register_projection 10 | 11 | def radar_factory(num_vars, frame='circle'): 12 | """Create a radar chart with `num_vars` axes.""" 13 | # calculate evenly-spaced axis angles 14 | theta = 2*np.pi * np.linspace(0, 1-1./num_vars, num_vars) 15 | # rotate theta such that the first axis is at the top 16 | theta += np.pi/2 17 | 18 | def draw_poly_frame(self, x0, y0, r): 19 | # TODO: use transforms to convert (x, y) to (r, theta) 20 | verts = [(r*np.cos(t) + x0, r*np.sin(t) + y0) for t in 
theta] 21 | return plt.Polygon(verts, closed=True, edgecolor='k') 22 | 23 | def draw_circle_frame(self, x0, y0, r): 24 | return plt.Circle((x0, y0), r) 25 | 26 | frame_dict = {'polygon': draw_poly_frame, 'circle': draw_circle_frame} 27 | if frame not in frame_dict: 28 | raise ValueError, 'unknown value for `frame`: %s' % frame 29 | 30 | class RadarAxes(PolarAxes): 31 | """Class for creating a radar chart (a.k.a. a spider or star chart) 32 | 33 | http://en.wikipedia.org/wiki/Radar_chart 34 | """ 35 | name = 'radar' 36 | # use 1 line segment to connect specified points 37 | RESOLUTION = 1 38 | # define draw_frame method 39 | draw_frame = frame_dict[frame] 40 | 41 | def fill(self, *args, **kwargs): 42 | """Override fill so that line is closed by default""" 43 | closed = kwargs.pop('closed', True) 44 | return super(RadarAxes, self).fill(closed=closed, *args, **kwargs) 45 | 46 | def plot(self, *args, **kwargs): 47 | """Override plot so that line is closed by default""" 48 | lines = super(RadarAxes, self).plot(*args, **kwargs) 49 | for line in lines: 50 | self._close_line(line) 51 | 52 | def _close_line(self, line): 53 | x, y = line.get_data() 54 | # FIXME: markers at x[0], y[0] get doubled-up 55 | if x[0] != x[-1]: 56 | x = np.concatenate((x, [x[0]])) 57 | y = np.concatenate((y, [y[0]])) 58 | line.set_data(x, y) 59 | 60 | def set_varlabels(self, labels): 61 | self.set_thetagrids(theta * 180/np.pi, labels) 62 | 63 | def _gen_axes_patch(self): 64 | x0, y0 = (0.5, 0.5) 65 | r = 0.5 66 | return self.draw_frame(x0, y0, r) 67 | 68 | register_projection(RadarAxes) 69 | return theta 70 | -------------------------------------------------------------------------------- /src/scripts/tags_io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | '''This module contains the code used for data conversion''' 3 | from __future__ import division, print_function 4 | 5 | from collections import defaultdict 6 | 7 | from sklearn.base import BaseEstimator 8 | from sklearn.feature_extraction.text import Vectorizer 9 | 10 | import nltk 11 | 12 | class NoopAnalyzer(BaseEstimator): 13 | ''' 14 | Since we use NLTK to preprocess (more control) this 15 | class is used to bypass sklearns preprocessing 16 | ''' 17 | def analyze(self, text_document): 18 | '''Does nothing''' 19 | return text_document 20 | 21 | def __tokenize_and_stem(fpath): 22 | ''' 23 | Tokenizes and stems the file, converting each line to 24 | an array of words. 25 | 26 | Arguments 27 | --------- 28 | fpath: a path to a file 29 | Each line is a song, tags are separated by space 30 | ''' 31 | tokenizer = nltk.RegexpTokenizer(r'\w+') 32 | stopwords = set(nltk.corpus.stopwords.words('english')) 33 | stemmer = nltk.stem.PorterStemmer() 34 | 35 | docs = [] 36 | term_pops = defaultdict(int) 37 | with open(fpath) as tags_file: 38 | for line in tags_file: 39 | as_doc = [] 40 | for term in tokenizer.tokenize(line)[1:]: 41 | term = term.lower().strip() 42 | if term not in stopwords and term != '': 43 | stemmed = stemmer.stem(term) 44 | as_doc.append(stemmed) 45 | term_pops[stemmed] += 1 46 | 47 | docs.append(as_doc) 48 | 49 | return docs, term_pops 50 | 51 | def clean_up(fpath, bottom_filter=0.01): 52 | ''' 53 | Converts a YouTube tag file to a series of tokens. This code 54 | stems the tags, removes stopwords and filters infrequent 55 | tags (whose probability is bellow `bottom_filter`). 
56 | 57 | Arguments 58 | --------- 59 | fpath: a path to a file 60 | Each line is a song, tags are separated by space 61 | bottom_filter: float (defaults to 0.01, one percent) 62 | Minimum probability for tags to be considered useful 63 | ''' 64 | docs, term_pops = __tokenize_and_stem(fpath) 65 | for doc in docs: 66 | to_yield = [] 67 | for term in doc: 68 | prob_term = term_pops[term] / len(term_pops) 69 | if prob_term > bottom_filter: 70 | to_yield.append(term) 71 | 72 | yield to_yield 73 | 74 | def vectorize_videos(fpath, use_idf=False): 75 | ''' 76 | Converts a YouTube tag file to a sparse matrix pondered. We can assign 77 | weights based on IDF if specified. 78 | 79 | Arguments 80 | --------- 81 | fpath: a path to a file 82 | Each line is a song, tags are separated by space 83 | use_idf: bool (optinal, defaults to True) 84 | Indicates whether to use IDF. 85 | bottom_filter: float (defaults to 0.005, half of one percent) 86 | Minimum probability for tags to be considered useful 87 | ''' 88 | #Vectorizes to TF-IDF 89 | vectorizer = Vectorizer(analyzer=NoopAnalyzer(), use_idf = use_idf) 90 | sparse_matrix = vectorizer.fit_transform(clean_up(fpath, bottom_filter=0)) 91 | vocabulary = vectorizer.vocabulary 92 | return sparse_matrix, vocabulary -------------------------------------------------------------------------------- /src/scripts/tree_infogain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from sklearn.cross_validation import cross_val_score 6 | from sklearn.ensemble import ExtraTreesClassifier 7 | from sklearn.preprocessing import scale 8 | from sklearn.metrics import f1_score, precision_score, recall_score 9 | from sklearn.tree import DecisionTreeClassifier 10 | from sklearn.tree import export_graphviz 11 | 12 | import argparse 13 | import numpy as np 14 | import sys 15 | import traceback 16 | 17 | #def find_best_parameters(X_model, y_model, kernel): 18 | # 19 | # param_grid = { 20 | # 'C':[0.1, 0.5, 1, 5, 10, 50, 100], 21 | # 'gamma':[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1] 22 | # } 23 | # 24 | # clf = GridSearchCV(svc, param_grid, n_jobs=-1, score_func=f1_score) 25 | # clf = clf.fit(X_model, y_model) 26 | # best_clf = clf.best_estimator 27 | # 28 | # return best_clf 29 | 30 | def main(features_fpath, classes_fpath): 31 | 32 | with open(features_fpath) as features_file: 33 | for line in features_file: 34 | if '#' in line: 35 | spl = line.split() 36 | names = spl[1:] 37 | 38 | X = scale(np.genfromtxt(features_fpath)[:,1:].copy()) 39 | y = np.loadtxt(classes_fpath) 40 | 41 | forest = ExtraTreesClassifier(max_depth=4, 42 | criterion="entropy", 43 | compute_importances=True) 44 | 45 | scores = cross_val_score(forest, X, y, score_func=f1_score, cv=5) 46 | print(scores) 47 | 48 | forest.fit(X, y) 49 | 50 | importances = forest.feature_importances_ 51 | indices = np.argsort(importances)[::-1] 52 | 53 | # Print the feature ranking 54 | print("Feature ranking:") 55 | for f in xrange(len(importances[indices])): 56 | print("%d. 
feature %s (%f)" % (f + 1, names[indices[f]], 57 | importances[indices[f]])) 58 | 59 | export_graphviz(forest, 'bala.dot') 60 | 61 | def create_parser(prog_name): 62 | 63 | desc = __doc__ 64 | formatter = argparse.RawDescriptionHelpFormatter 65 | parser = argparse.ArgumentParser(prog_name, description=desc, 66 | formatter_class=formatter) 67 | 68 | parser.add_argument('features_fpath', type=str, 69 | help='Input file with video features') 70 | parser.add_argument('classes_fpath', type=str, 71 | help='Classes to predict') 72 | 73 | return parser 74 | 75 | def entry_point(args=None): 76 | '''Fake main used to create argparse and call real one''' 77 | 78 | if not args: 79 | args = [] 80 | 81 | parser = create_parser(args[0]) 82 | values = parser.parse_args(args[1:]) 83 | 84 | try: 85 | return main(values.features_fpath, values.classes_fpath) 86 | except: 87 | traceback.print_exc() 88 | parser.print_usage(file=sys.stderr) 89 | return 1 90 | 91 | if __name__ == '__main__': 92 | sys.exit(entry_point(sys.argv)) 93 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/boosting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import division, print_function 3 | 4 | from sklearn.base import clone 5 | 6 | import numpy as np 7 | 8 | EPS = 1e6 9 | 10 | def sample_with_rep(weights, num_samples): 11 | 12 | weights = np.asanyarray(weights) 13 | 14 | assert weights.sum() >= (1 - EPS) and weights.sum() <= (1 + EPS) 15 | 16 | x = np.arange(weights.shape[0]) 17 | y = np.random.multinomial(num_samples, weights) 18 | return np.repeat(x, y) 19 | 20 | def error(y_true, y_pred, weights = None): 21 | 22 | y_true = np.asanyarray(y_true) 23 | y_pred = np.asanyarray(y_pred) 24 | 25 | if weights is not None: 26 | weights = np.asanyarray(weights) 27 | else: 28 | weights = np.ones(y_true.shape) 29 | 30 | data = np.asanyarray(y_true != y_pred, dtype='i') 31 | return (data * weights).sum() / weights.sum() 32 | 33 | def comp_alpha(err, num_classes): 34 | 35 | return np.log((1 - err) / err) + np.log(num_classes - 1) 36 | 37 | def compute_weights(y_true, y_pred, old_weights, alpha): 38 | 39 | y_true = np.asanyarray(y_true) 40 | y_pred = np.asanyarray(y_pred) 41 | old_weights = np.asanyarray(old_weights) 42 | 43 | return old_weights * np.exp(alpha * (y_true != y_pred)) 44 | 45 | def get_alpha_and_weights(y_true, y_pred, old_weights): 46 | 47 | num_classes = len(set(y_true)) 48 | alpha = comp_alpha(error(y_true, y_pred), num_classes) 49 | weights = compute_weights(y_true, y_pred, old_weights, alpha) 50 | 51 | weights /= weights.sum() 52 | return alpha, weights 53 | 54 | class ClassBoost(object): 55 | 56 | def __init__(self, classifier, sample_factor = 2.5): 57 | self.classifier = clone(classifier) 58 | self.sample_factor = sample_factor 59 | self.base_w = 0 60 | self.class_w = 0 61 | 62 | def fit(self, X, y, B): 63 | 64 | X = np.asanyarray(X) 65 | y = np.asanyarray(y) 66 | B = np.asanyarray(B) 67 | 68 | ypred_base = np.asanyarray(B).argmax(axis = 1) 69 | 70 | assert X.shape[0] == y.shape[0] == ypred_base.shape[0] 71 | 72 | n = X.shape[0] 73 | 74 | uni_weights = np.ones(n) / n 75 | base_alpha, base_weights = get_alpha_and_weights(y, ypred_base, 76 | uni_weights) 77 | 78 | #Sampling with repetition 79 | num_samples = int(n * self.sample_factor) 80 | idx = sample_with_rep(base_weights, num_samples) 81 | 82 | #Fitting 83 | X_new = X[idx] 84 | y_new = y[idx] 85 | self.classifier.fit(X_new, y_new) 
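        # The block above draws a weighted resample of the training data, giving more
        # mass to series that the prior class probabilities B misclassified, and refits
        # the wrapped classifier on that resample.  get_alpha_and_weights() then turns
        # each model's weighted error into a SAMME-style vote via comp_alpha():
        # alpha = log((1 - err) / err) + log(num_classes - 1).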
86 | y_pred_new = self.classifier.predict(X) 87 | 88 | class_alpha, class_weights = get_alpha_and_weights(y, y_pred_new, 89 | base_weights) 90 | 91 | self.base_w = base_alpha 92 | self.class_w = class_alpha 93 | 94 | def predict(self, X, B): 95 | P_class = self.classifier.predict_proba(X) 96 | return (B * self.base_w + P_class * self.base_w).argmax(axis = 1) 97 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/classify_pts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from pyksc.trend import TrendLearner 6 | 7 | from sklearn.metrics import classification_report 8 | from sklearn.metrics import confusion_matrix 9 | 10 | import ioutil 11 | import numpy as np 12 | import os 13 | import plac 14 | import sys 15 | 16 | def fit(C, y_train, X, y_true, num_pts): 17 | 18 | learner = TrendLearner(num_pts, 1) 19 | learner.fit(C, y_train) 20 | 21 | probs = learner.predict_proba(X) 22 | y_pred = probs.argmax(axis=1) 23 | 24 | return y_pred, probs 25 | 26 | def main(tseries_fpath, train_fpath, centroids_fpath, classes_fpath, out_folder, 27 | gamma_max): 28 | gamma_max = int(gamma_max) 29 | 30 | X = ioutil.load_series(tseries_fpath, train_fpath) 31 | C = np.genfromtxt(centroids_fpath, dtype='f') 32 | 33 | y_train = np.arange(C.shape[0]) 34 | y_true = np.genfromtxt(classes_fpath, dtype='i') 35 | max_pts = gamma_max 36 | #max_pts = X.shape[1] 37 | 38 | best_by = np.zeros(X.shape[0]) 39 | min_conf = np.zeros(X.shape[0]) 40 | all_probs = np.zeros(shape=(X.shape[0], max_pts)) 41 | 42 | lousy_conf = 1.0 / C.shape[0] #if confidence is equal to this, classifier did nothing 43 | for num_pts in range(1, max_pts + 1): 44 | y_pred, probs = fit(C, y_train, X, y_true, num_pts) 45 | 46 | for i in xrange(X.shape[0]): 47 | p_true = probs[i, y_true[i]] 48 | if best_by[i] == 0 and y_pred[i] == y_true[i] and p_true > lousy_conf: 49 | best_by[i] = num_pts 50 | min_conf[i] = probs[i, y_true[i]] 51 | all_probs[i, num_pts - 1] = p_true 52 | 53 | summary_fpath = os.path.join(out_folder,\ 54 | 'class_summ-%d-pts.dat' % num_pts) 55 | probs_fpath = os.path.join(out_folder, 'probs-%d-pts.dat' % num_pts) 56 | 57 | with open(summary_fpath, 'w') as summary_file: 58 | print(classification_report(y_true, y_pred), file=summary_file) 59 | np.savetxt(probs_fpath, probs) 60 | 61 | best_fpath = os.path.join(out_folder, 'best-by.dat') 62 | conf_fpath = os.path.join(out_folder, 'conf.dat') 63 | all_conf_fpath = os.path.join(out_folder, 'all-conf.dat') 64 | 65 | np.savetxt(best_fpath, best_by) 66 | np.savetxt(conf_fpath, min_conf) 67 | np.savetxt(all_conf_fpath, np.asarray(all_probs)) 68 | 69 | if __name__ == '__main__': 70 | sys.exit(plac.call(main)) 71 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/classify_pts_all.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from pyksc.trend import TrendLearner 6 | 7 | from sklearn.metrics import classification_report 8 | from sklearn.metrics import confusion_matrix 9 | 10 | import ioutil 11 | import numpy as np 12 | import os 13 | import plac 14 | import sys 15 | 16 | def fit(Xtrain, y_train, Xtest, num_pts): 17 | 18 | learner = TrendLearner(num_pts, 1) 19 | learner.fit(Xtrain, y_train) 20 | 21 | probs = learner.predict_proba(Xtest) 22 | 
y_pred = probs.argmax(axis=1) 23 | 24 | return y_pred, probs 25 | 26 | def main(tseries_fpath, train_fpath, test_fpath, ytrain_fpath, ytest_fpath, out_folder): 27 | Xtrain = ioutil.load_series(tseries_fpath, train_fpath) 28 | Xtest = ioutil.load_series(tseries_fpath, test_fpath) 29 | 30 | y_train = np.genfromtxt(ytrain_fpath) 31 | y_true = np.genfromtxt(ytest_fpath) 32 | max_pts = Xtrain.shape[1] 33 | 34 | best_by = np.zeros(Xtest.shape[0]) 35 | min_conf = np.zeros(Xtest.shape[0]) 36 | all_probs = np.zeros(shape=(Xtest.shape[0], max_pts)) 37 | 38 | lousy_conf = 1.0 / len(set(y_train)) #if confidence is equal to this, classifier did nothing 39 | for num_pts in range(1, max_pts + 1): 40 | y_pred, probs = fit(Xtrain, y_train, Xtest, num_pts) 41 | 42 | for i in xrange(Xtest.shape[0]): 43 | p_true = probs[i, y_true[i]] 44 | if best_by[i] == 0 and y_pred[i] == y_true[i] and p_true > lousy_conf: 45 | best_by[i] = num_pts 46 | min_conf[i] = probs[i, y_true[i]] 47 | all_probs[i, num_pts - 1] = p_true 48 | 49 | summary_fpath = os.path.join(out_folder,\ 50 | 'class_summ-%d-pts.dat' % num_pts) 51 | probs_fpath = os.path.join(out_folder, 'probs-%d-pts.dat' % num_pts) 52 | 53 | with open(summary_fpath, 'w') as summary_file: 54 | print(classification_report(y_true, y_pred), file=summary_file) 55 | np.savetxt(probs_fpath, probs) 56 | 57 | best_fpath = os.path.join(out_folder, 'best-by.dat') 58 | conf_fpath = os.path.join(out_folder, 'conf.dat') 59 | all_conf_fpath = os.path.join(out_folder, 'all-conf.dat') 60 | 61 | np.savetxt(best_fpath, best_by) 62 | np.savetxt(conf_fpath, min_conf) 63 | np.savetxt(all_conf_fpath, np.asarray(all_probs)) 64 | 65 | if __name__ == '__main__': 66 | sys.exit(plac.call(main)) 67 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/classify_pts_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from pyksc.trend import TrendLearner 6 | 7 | from sklearn.metrics import classification_report 8 | from sklearn.metrics import confusion_matrix 9 | from sklearn.metrics import precision_score 10 | 11 | import argparse 12 | import ioutil 13 | import numpy as np 14 | import os 15 | import plac 16 | import sys 17 | 18 | def fit(Xtrain, y_train, Xtest, num_pts): 19 | 20 | learner = TrendLearner(num_pts, 1) 21 | learner.fit(Xtrain, y_train) 22 | probs = learner.predict_proba(Xtest) 23 | 24 | return probs 25 | 26 | def main(tseries_fpath, centroids_fpath, test_fpath, assign_fpath, out_folder, 27 | gamma_max): 28 | gamma_max = int(gamma_max) 29 | 30 | C = np.genfromtxt(centroids_fpath) 31 | Xtest = ioutil.load_series(tseries_fpath, test_fpath) 32 | y_train = np.arange(C.shape[0]) 33 | 34 | max_pts = gamma_max 35 | for num_pts in range(1, max_pts + 1): 36 | #for num_pts in [1, 25, 50, 75]: 37 | probs = fit(C, y_train, Xtest, num_pts) 38 | 39 | probs_fpath = os.path.join(out_folder, 'probs-%d-pts.dat' % num_pts) 40 | np.savetxt(probs_fpath, probs) 41 | 42 | if __name__ == '__main__': 43 | sys.exit(plac.call(main)) 44 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/classify_theta.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scipy.stats.mstats import mquantiles 6 | 7 | from sklearn.metrics import f1_score 8 | from 
sklearn.metrics import classification_report 9 | 10 | from pyksc import dist 11 | 12 | import argparse 13 | import glob 14 | import ioutil 15 | import multiprocessing 16 | import numpy as np 17 | import os 18 | import plac 19 | import sys 20 | 21 | FNAME = 'probs-%d-pts.dat' 22 | 23 | def pred(probs_folder, num_series, max_pts, min_pts, thetas): 24 | 25 | y_pred = np.zeros(num_series) - 1 26 | best_by = np.zeros(num_series) + np.inf 27 | confs = np.zeros(num_series) 28 | all_confs = np.zeros((num_series, len(thetas))) 29 | 30 | for num_pts in range(1, max_pts + 1): 31 | fpath = os.path.join(probs_folder, FNAME) % num_pts 32 | P = np.loadtxt(fpath) 33 | 34 | curr_pred = P.argmax(axis=1) 35 | curr_score = P.max(axis=1) 36 | 37 | for i in xrange(num_series): 38 | score = curr_score[i] 39 | curr_cls = curr_pred[i] 40 | 41 | theta = thetas[curr_cls] 42 | min_req = min_pts[curr_cls] 43 | 44 | if num_pts >= min_req and score > theta and y_pred[i] == -1: 45 | y_pred[i] = curr_cls 46 | best_by[i] = num_pts 47 | confs[i] = score 48 | all_confs[i] = P[i] 49 | 50 | #if y_pred[i] != curr_cls and score > confs[i]: 51 | # y_pred[i] = curr_cls 52 | # #best_by[i] = num_pts 53 | # confs[i] = score 54 | # all_confs[i] = P[i] 55 | 56 | assert y_pred[confs > 0].sum() == y_pred[y_pred != -1].sum() 57 | assert y_pred[best_by != np.inf].sum() == y_pred[y_pred != -1].sum() 58 | 59 | return y_pred, best_by, confs, all_confs 60 | 61 | def aux_print(X, peak_days, sum_views, best_by, y_true, y_pred, confs, 62 | idx, summ_file): 63 | 64 | X = X[idx] 65 | peak_days = peak_days[idx] 66 | sum_views = sum_views[idx] 67 | best_by = np.asanyarray(best_by[idx], dtype='i') 68 | y_true = y_true[idx] 69 | y_pred = y_pred[idx] 70 | confs = confs[idx] 71 | 72 | left_frac = np.zeros(X.shape[0]) 73 | for i in xrange(X.shape[0]): 74 | left_frac[i] = \ 75 | (sum_views[i] - X[i][:best_by[i]].sum()) / sum_views[i] 76 | 77 | dist_peak = (peak_days - best_by - 1) 78 | 79 | print('- PeakDistQuantiles (peak - best)', mquantiles(dist_peak), file=summ_file) 80 | print('- LeftViewsQuantiles', mquantiles(left_frac), file=summ_file) 81 | 82 | 83 | def save_results(X, peak_days, sum_views, pts_grid, theta_grid, best_by, all_confs, 84 | y_true, y_pred, confs, out_folder): 85 | 86 | valid = confs > 0 87 | correct = y_true == y_pred 88 | 89 | summ_fpath = os.path.join(out_folder, 'summ.dat') 90 | with open(summ_fpath, 'w') as summ_file: 91 | print('Params', file=summ_file) 92 | for cls in sorted(pts_grid): 93 | print('\t Cls = %d; min_pts = %d; theta = %.3f' \ 94 | % (cls, pts_grid[cls], theta_grid[cls]), file=summ_file) 95 | print(file=summ_file) 96 | 97 | print('All', file=summ_file) 98 | aux_print(X, peak_days, sum_views, best_by, y_true, y_pred, confs, valid, summ_file) 99 | print(file=summ_file) 100 | 101 | print('Correct Only', file=summ_file) 102 | aux_print(X, peak_days, sum_views, best_by, y_true, y_pred, confs, correct, summ_file) 103 | print(file=summ_file) 104 | 105 | print('Incorrect Only', file=summ_file) 106 | aux_print(X, peak_days, sum_views, best_by, y_true, y_pred, confs, ~correct, summ_file) 107 | print(file=summ_file) 108 | 109 | #print(classification_report(y_true[valid], y_pred[valid]), 110 | # file=summ_file) 111 | print(classification_report(y_true, y_pred), 112 | file=summ_file) 113 | print(file=summ_file) 114 | print('# invalid %d' % (~valid).sum(), file=summ_file) 115 | 116 | ypred_fpath = os.path.join(out_folder, 'pred.dat') 117 | np.savetxt(ypred_fpath, y_pred) 118 | 119 | bestby_fpath = os.path.join(out_folder, 
'best-by.dat') 120 | np.savetxt(bestby_fpath, best_by) 121 | 122 | conf_fpath = os.path.join(out_folder, 'conf.dat') 123 | np.savetxt(conf_fpath, confs) 124 | 125 | conf_fpath = os.path.join(out_folder, 'all-conf.dat') 126 | np.savetxt(conf_fpath, all_confs) 127 | 128 | def run_fold(folder, tseries_fpath, min_pts, thetas, out_folder, gamma_max): 129 | 130 | try: 131 | os.makedirs(out_folder) 132 | except: 133 | pass 134 | 135 | test_fpath = os.path.join(folder, 'test.dat') 136 | cents_fpath = os.path.join(folder, 'ksc', 'cents.dat') 137 | assign_fpath = os.path.join(folder, 'ksc', 'test_assign.dat') 138 | probs_folder = os.path.join(folder, 'probs-test') 139 | 140 | X = ioutil.load_series(tseries_fpath, test_fpath) 141 | test_idx = np.loadtxt(test_fpath, dtype='bool') 142 | y_true = np.loadtxt(assign_fpath) 143 | 144 | num_series = X.shape[0] 145 | max_pts = gamma_max 146 | 147 | peak_days = [] 148 | sum_views = [] 149 | with open(tseries_fpath) as tseries_file: 150 | for i, line in enumerate(tseries_file): 151 | if test_idx[i]: 152 | x = np.array([int(v) for v in line.split()[1:]]) 153 | peak_days.append(x.argmax()) 154 | sum_views.append(x.sum()) 155 | 156 | peak_days = np.array(peak_days) 157 | sum_views = np.array(sum_views) 158 | 159 | y_pred, best_by, confs, all_confs = \ 160 | pred(probs_folder, num_series, max_pts, min_pts, thetas) 161 | save_results(X, peak_days, sum_views, min_pts, thetas, best_by, all_confs, 162 | y_true, y_pred, confs, out_folder) 163 | 164 | 165 | def get_params(folder, threshold, max_k): 166 | 167 | assign = np.loadtxt(os.path.join(folder, 'ksc', 'assign.dat'), dtype='i') 168 | P = np.loadtxt(os.path.join(folder, 'probs', 'all-conf.dat'), dtype='f') 169 | best_by = np.loadtxt(os.path.join(folder, 'probs', 'best-by.dat'), dtype='i') 170 | 171 | thetas = {} 172 | min_pts = {} 173 | for i in xrange(2, P.shape[1]): 174 | fpath = os.path.join(folder, 'probs', 'probs-%d-pts.dat' % i) 175 | Pi = np.loadtxt(fpath, dtype='f') 176 | for k in set(assign): 177 | y_true = assign == k 178 | 179 | maxcls = Pi.argmax(axis=1) 180 | y_pred = maxcls == k 181 | score = f1_score(y_true, y_pred) 182 | if score >= threshold and k not in thetas: 183 | thetas[k] = P[assign == k][:,i].mean() 184 | min_pts[k] = i 185 | 186 | for k in xrange(max_k): 187 | if k not in thetas: 188 | thetas[k] = 1.0 / len(set(assign)) 189 | min_pts[k] = 0 190 | 191 | return thetas, min_pts 192 | 193 | def multi_proc_run(args): 194 | 195 | folder, tseries_fpath, f1_target, gamma_max, results_sub_folder, max_k = args 196 | fitted_thetas, fitted_min_pts = get_params(folder, float(f1_target), max_k) 197 | 198 | out_folder = os.path.join(folder, results_sub_folder) 199 | run_fold(folder, tseries_fpath, fitted_min_pts, fitted_thetas, 200 | out_folder, gamma_max) 201 | 202 | def main(tseries_fpath, base_folder, f1_target, results_sub_folder, gamma_max, max_k): 203 | gamma_max = int(gamma_max) 204 | max_k = int(max_k) 205 | 206 | folders = glob.glob(os.path.join(base_folder, 'fold-*/')) 207 | pool = multiprocessing.Pool() 208 | pool.map(multi_proc_run, \ 209 | [(fold, tseries_fpath, f1_target, gamma_max, results_sub_folder, max_k) for fold in folders]) 210 | pool.terminate() 211 | pool.join() 212 | 213 | if __name__ == '__main__': 214 | sys.exit(plac.call(main)) 215 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/classify_theta_train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from 
__future__ import division, print_function 4 | 5 | from classify_theta import get_params, pred, save_results 6 | 7 | from pyksc import dist 8 | 9 | import argparse 10 | import glob 11 | import ioutil 12 | import multiprocessing 13 | import numpy as np 14 | import os 15 | import plac 16 | import sys 17 | 18 | FNAME = 'probs-%d-pts.dat' 19 | 20 | def run_fold(folder, tseries_fpath, min_pts, thetas, gamma_max, out_folder): 21 | 22 | try: 23 | os.makedirs(out_folder) 24 | except: 25 | pass 26 | 27 | train_fpath = os.path.join(folder, 'train.dat') 28 | cents_fpath = os.path.join(folder, 'ksc', 'cents.dat') 29 | assign_fpath = os.path.join(folder, 'ksc', 'assign.dat') 30 | probs_folder = os.path.join(folder, 'probs') 31 | 32 | X = ioutil.load_series(tseries_fpath, train_fpath) 33 | train_idx = np.loadtxt(train_fpath, dtype='bool') 34 | y_true = np.loadtxt(assign_fpath) 35 | 36 | num_series = X.shape[0] 37 | max_pts = gamma_max 38 | 39 | #Since we prune the first 100 lines of X we need to read other info 40 | peak_days = [] 41 | sum_views = [] 42 | with open(tseries_fpath) as tseries_file: 43 | for i, line in enumerate(tseries_file): 44 | if train_idx[i]: 45 | x = np.array([int(v) for v in line.split()[1:]]) 46 | peak_days.append(x.argmax()) 47 | sum_views.append(x.sum()) 48 | 49 | peak_days = np.array(peak_days) 50 | sum_views = np.array(sum_views) 51 | 52 | y_pred, best_by, confs, all_confs = \ 53 | pred(probs_folder, num_series, max_pts, min_pts, thetas) 54 | save_results(X, peak_days, sum_views, min_pts, thetas, best_by, all_confs, 55 | y_true, y_pred, confs, out_folder) 56 | 57 | def multi_proc_run(args): 58 | 59 | folder, tseries_fpath, f1_target, gamma_max, results_sub_folder, max_k = args 60 | fitted_thetas, fitted_min_pts = get_params(folder, f1_target, max_k) 61 | 62 | out_folder = os.path.join(folder, results_sub_folder) 63 | run_fold(folder, tseries_fpath, fitted_min_pts, fitted_thetas, 64 | gamma_max, out_folder) 65 | 66 | def main(tseries_fpath, base_folder, f1_target, results_sub_folder, gamma_max, max_k): 67 | gamma_max = int(gamma_max) 68 | max_k = int(max_k) 69 | 70 | f1_target = float(f1_target) 71 | folders = glob.glob(os.path.join(base_folder, 'fold-*/')) 72 | pool = multiprocessing.Pool() 73 | pool.map(multi_proc_run, [(fold, tseries_fpath, f1_target, gamma_max, results_sub_folder, max_k) for fold in folders]) 74 | pool.terminate() 75 | pool.join() 76 | 77 | if __name__ == '__main__': 78 | sys.exit(plac.call(main)) 79 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/cluster.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from pyksc import dist 6 | from pyksc import ksc 7 | 8 | import ioutil 9 | import numpy as np 10 | import os 11 | import plac 12 | 13 | def main(tseries_fpath, base_folder, k): 14 | k = int(k) 15 | 16 | idx_fpath = os.path.join(os.path.join(base_folder, '..'), 'train.dat') 17 | X = ioutil.load_series(tseries_fpath, idx_fpath) 18 | 19 | cent, assign, shift, dists_cent = ksc.inc_ksc(X, k) 20 | np.savetxt(os.path.join(base_folder, 'cents.dat'), cent, fmt='%.5f') 21 | np.savetxt(os.path.join(base_folder, 'assign.dat'), assign, fmt='%d') 22 | np.savetxt(os.path.join(base_folder, 'shift.dat'), shift, fmt='%d') 23 | np.savetxt(os.path.join(base_folder, 'dists_cent.dat'), dists_cent, 24 | fmt='%.5f') 25 | 26 | if __name__ == '__main__': 27 | plac.call(main) 28 | 
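For readers skimming the scripts, the clustering step above boils down to a single call into pyksc. A minimal standalone sketch on toy data (not part of the repository; it assumes the Cython extension is built and mirrors what cluster.py does after ioutil.load_series):

    from __future__ import division, print_function

    import numpy as np

    from pyksc import ksc

    # Toy stand-in for ioutil.load_series: strictly positive series, C-ordered floats.
    X = np.asanyarray(np.random.rand(200, 100) + 1e-6, order='C')

    # Same call cluster.py issues: returns the centroids, one cluster id per series,
    # the time shift applied to each series, and each series' distance to its centroid.
    cent, assign, shift, dists_cent = ksc.inc_ksc(X, 2)
    print(cent.shape, np.bincount(np.asarray(assign, dtype='i')))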
-------------------------------------------------------------------------------- /src/trend-learner-scripts/cotrain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import division, print_function 3 | 4 | from scipy import stats 5 | 6 | from sklearn.base import clone 7 | 8 | import numpy as np 9 | 10 | class CoTrain(object): 11 | 12 | def __init__(self, classifier, label_fract = .25): 13 | self.classifier = clone(classifier) 14 | self.label_fract = label_fract 15 | 16 | def fit(self, X, y, P): 17 | X = np.asanyarray(X) 18 | y = np.asanyarray(y) 19 | P = np.asanyarray(P) 20 | 21 | assert X.shape[0] == y.shape[0] == P.shape[0] 22 | 23 | n = X.shape[0] 24 | idx = np.arange(n) 25 | np.random.shuffle(idx) 26 | 27 | n_classes = len(set(y)) 28 | n_labelled = int(n * self.label_fract) 29 | 30 | init_labelled = idx[:n_labelled] 31 | w_label = np.zeros(n, dtype='bool') 32 | w_label[init_labelled] = True 33 | 34 | classes = np.arange(n_classes) 35 | y_new = np.zeros(n) - 1 36 | y_new[init_labelled] = y[init_labelled] 37 | while not w_label.all(): 38 | self.classifier.fit(X[w_label], y_new[w_label]) 39 | P_cls = self.classifier.predict_proba(X[~w_label]) 40 | 41 | best_c = P_cls.argmax(axis = 0) 42 | best_p = P[~w_label].argmax(axis = 0) 43 | 44 | idx_c = np.where(~w_label)[0][best_c] 45 | idx_p = np.where(~w_label)[0][best_p] 46 | 47 | w_label[idx_c] = True 48 | w_label[idx_p] = True 49 | 50 | y_new[idx_c] = classes 51 | y_new[idx_p] = classes 52 | 53 | def predict(self, X, P): 54 | P_class = self.classifier.predict_proba(X) 55 | return (P * P_class).argmax(axis = 1) 56 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/create_test_assign.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pyksc import dist 4 | 5 | import ioutil 6 | import numpy as np 7 | import plac 8 | import sys 9 | 10 | def main(tseries_fpath, test_fpath, cents_fpath): 11 | 12 | X = ioutil.load_series(tseries_fpath, test_fpath) 13 | 14 | C = np.loadtxt(cents_fpath) 15 | dist_cents = dist.dist_all(C, X, rolling=True)[0] 16 | y_true = dist_cents.argmin(axis=0) 17 | 18 | for t in y_true: 19 | print t 20 | 21 | if __name__ == '__main__': 22 | sys.exit(plac.call(main)) 23 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/generate_cross_vals.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf8 3 | from __future__ import print_function, division 4 | 5 | from sklearn import model_selection 6 | 7 | import numpy as np 8 | import os 9 | import plac 10 | import sys 11 | 12 | def main(tseries_fpath, out_folder): 13 | X = np.genfromtxt(tseries_fpath)[:,1:] 14 | num_series = X.shape[0] 15 | 16 | curr_fold = 1 17 | cv = model_selection.KFold(5, shuffle=True) 18 | to_save_train = np.zeros(len(X), dtype='b') 19 | to_save_test = np.zeros(len(X), dtype='b') 20 | 21 | for train, test in cv.split(X): 22 | curr_out_folder = os.path.join(out_folder, 'fold-%d' % curr_fold) 23 | 24 | try: 25 | os.makedirs(curr_out_folder) 26 | except: 27 | pass 28 | 29 | to_save_train[:] = False 30 | to_save_test[:] = False 31 | to_save_train[train] = True 32 | to_save_test[test] = True 33 | 34 | np.savetxt(os.path.join(curr_out_folder, 'train.dat'), to_save_train, fmt='%i') 35 | np.savetxt(os.path.join(curr_out_folder, 
'test.dat'), to_save_test, fmt='%i') 36 | curr_fold += 1 37 | 38 | if __name__ == '__main__': 39 | sys.exit(plac.call(main)) 40 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/ioutil.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | 3 | import numpy as np 4 | 5 | EPS = 1e-6 6 | 7 | def load_series(tseries_fpath, idx_fpath): 8 | X = np.genfromtxt(tseries_fpath)[:, 1:] + EPS 9 | train_idx = np.loadtxt(idx_fpath, dtype='bool') 10 | return np.asanyarray(X[train_idx], order='C') 11 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/learn_base.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | ''' 3 | Common functions for creating classifiers and regressors for machine learning 4 | tasks 5 | ''' 6 | from __future__ import division, print_function 7 | 8 | from scipy import sparse 9 | 10 | from sklearn import neighbors 11 | from sklearn import ensemble 12 | from sklearn import model_selection 13 | from sklearn import linear_model 14 | from sklearn import svm 15 | 16 | import cStringIO 17 | import numpy as np 18 | 19 | #Params 20 | TREE_SPLIT_RANGE = [1, 2, 4, 8, 16, 32, 64, 128] 21 | KNN_K_RANGE = [5, 10, 15] 22 | 23 | PARAMS = {'lr':{'C':[1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4]}, 24 | 'knn':{'n_neighbors':KNN_K_RANGE}, 25 | 'extra_trees':{'min_samples_split':TREE_SPLIT_RANGE}} 26 | 27 | #Classifiers 28 | CLFS = {'lr':linear_model.LogisticRegression(), 29 | 'knn':neighbors.KNeighborsClassifier(), 30 | 'extra_trees':ensemble.ExtraTreesClassifier(n_estimators=100, 31 | criterion='entropy', 32 | n_jobs=1)} 33 | 34 | #Category Parsing Utilities 35 | CATEG_ABBRV = { 36 | 'Autos&Vehicles':'Vehi.', 37 | 'Autos':'Vehi.', 38 | 'Comedy':'Com.', 39 | 'Education':'Edu.', 40 | 'Entertainment':'Ent.', 41 | 'Film':'Film', 42 | 'Film&Animation':'Film', 43 | 'Games':'Game', 44 | 'Gaming':'Game', 45 | 'Howto':'Howto', 46 | 'Howto&Style':'Howto', 47 | 'Movies':'Film', 48 | 'Music':'Music', 49 | 'NULL':'-', 50 | 'News':'News', 51 | 'News&Politics':'News', 52 | 'Nonprofit':'Nonprof.', 53 | 'Nonprofits&Activism':'Nonprof.', 54 | 'People&Blogs':'People', 55 | 'People':'People', 56 | 'Pets&Animals':'Pets', 57 | 'Pets':'Pets', 58 | 'Animals':'Pets', 59 | 'Science&Technology':'Sci.', 60 | 'Science':'Sci.', 61 | 'Tech':'Sci.', 62 | 'Shows':'Show', 63 | 'Sports':'Sport', 64 | 'Trailers':'Film', 65 | 'Travel&Events':'Travel', 66 | 'Travel':'Travel'} 67 | 68 | CAT_COL = 2 69 | CAT_IDS = dict((abbrv, i) \ 70 | for i, abbrv in enumerate(sorted(set(CATEG_ABBRV.values())))) 71 | INV_CAT_IDS = dict((v, k) for k, v in CAT_IDS.items()) 72 | 73 | def _get_classifier_and_params(name): 74 | return CLFS[name], PARAMS[name] 75 | 76 | def create_grid_search(name, n_jobs=-1): 77 | learner, params = _get_classifier_and_params(name) 78 | return model_selection.GridSearchCV(learner, params, cv=3, refit=True, 79 | n_jobs=n_jobs) 80 | 81 | def hstack_if_possible(X, Y): 82 | if X is not None: 83 | return np.hstack((X, Y)) 84 | else: 85 | return Y 86 | 87 | def load_categories(tags_cat_fpath): 88 | with open(tags_cat_fpath) as tags_cat_file: 89 | 90 | data = [] 91 | for i, line in enumerate(tags_cat_file): 92 | spl = line.split() 93 | category = 'NULL' 94 | if len(spl) > CAT_COL: 95 | category = line.split()[CAT_COL] 96 | 97 | abbrv = CATEG_ABBRV[category] 98 | categ_id = CAT_IDS[abbrv] 99 | 100 | n_rows = 
len(CAT_IDS) 101 | row = np.zeros(n_rows) 102 | row[categ_id] = 1 103 | 104 | data.append(row) 105 | 106 | X_categ = np.asarray(data) 107 | return X_categ 108 | 109 | def clf_summary(mean_scores, ci_scores): 110 | 111 | buff = cStringIO.StringIO() 112 | try: 113 | print('class \tprecision \trecall \tf1 score \tsupport', file=buff) 114 | for j in xrange(mean_scores.shape[1]): 115 | print(j, end="\t", file=buff) 116 | for i in xrange(mean_scores.shape[0]): 117 | print('%.3f +- %.3f' % (mean_scores[i, j], ci_scores[i, j]), 118 | end="\t", file=buff) 119 | print(file=buff) 120 | print(file=buff) 121 | 122 | return buff.getvalue() 123 | finally: 124 | buff.close() 125 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/multimodel_class.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from sklearn.base import clone 6 | from sklearn.metrics import classification_report 7 | 8 | from learn_base import create_grid_search 9 | from learn_base import load_categories 10 | 11 | import boosting 12 | import cotrain 13 | import stacking 14 | import numpy as np 15 | import os 16 | import plac 17 | import sys 18 | 19 | def load_features(features_folder, best_by, gamma_max): 20 | 21 | F = [] 22 | matrices = {} 23 | feats_fname = 'year#%d.txt' 24 | 25 | for i in xrange(best_by.shape[0]): 26 | bby = best_by[i] 27 | 28 | if bby == np.inf: 29 | feats_file = os.path.join(features_folder, feats_fname % gamma_max) 30 | else: 31 | bby = int(bby) 32 | feats_file = os.path.join(features_folder, feats_fname % bby) 33 | 34 | if bby in matrices: 35 | Fi = matrices[bby] 36 | else: 37 | Fi = np.genfromtxt(feats_file)[:,1:] 38 | matrices[bby] = Fi 39 | 40 | feats = Fi[i] 41 | 42 | F.append(feats) 43 | 44 | return np.asanyarray(F) 45 | 46 | def save_results(out_folder, base_name, y_pred, y_true): 47 | folder = os.path.join(out_folder, base_name) 48 | 49 | try: 50 | os.mkdir(folder) 51 | except: 52 | pass 53 | 54 | out_file = os.path.join(folder, 'pred.dat') 55 | np.savetxt(out_file, y_pred) 56 | 57 | with open(os.path.join(folder, 'summ.dat'), 'w') as summ_file: 58 | print(classification_report(y_true, y_pred), file=summ_file) 59 | 60 | def run_classifier(out_folder, trend_probs, referrers, y, train, test): 61 | 62 | F = referrers #static features 63 | etree = create_grid_search('lr', n_jobs = 1) 64 | 65 | y_pred = trend_probs[test].argmax(axis=1) 66 | save_results(out_folder, 'tl-base-lr', y_pred, y[test]) 67 | 68 | aux = clone(etree) 69 | aux.fit(F[train], y[train]) 70 | y_pred = aux.predict(F[test]) 71 | save_results(out_folder, 'tree-feats', y_pred, y[test]) 72 | 73 | aux = clone(etree) 74 | aux.fit(trend_probs[train], y[train]) 75 | y_pred = aux.predict(trend_probs[test]) 76 | save_results(out_folder, 'tree-probs', y_pred, y[test]) 77 | 78 | C = np.hstack((F, trend_probs)) 79 | aux = clone(etree) 80 | aux.fit(C[train], y[train]) 81 | y_pred = aux.predict(C[test]) 82 | save_results(out_folder, 'meta-combine', y_pred, y[test]) 83 | 84 | #stack_clf = stacking.Stacking(3, [etree], 'tree') 85 | #stack_clf.fit(F[train], y[train], trend_probs[train]) 86 | #y_pred = stack_clf.predict(F[test], trend_probs[test]) 87 | #save_results(out_folder, 'meta-stack-tree', y_pred) 88 | 89 | stack_clf = stacking.Stacking(3, [etree], 'linear') 90 | stack_clf.fit(F[train], y[train], trend_probs[train]) 91 | y_pred = stack_clf.predict(F[test], trend_probs[test]) 92 | 
save_results(out_folder, 'meta-stack-linear', y_pred, y[test]) 93 | 94 | #stack_clf = stacking.Stacking(3, [etree], 'deco') 95 | #stack_clf.fit(F[train], y[train], trend_probs[train]) 96 | #y_pred = stack_clf.predict(F[test], trend_probs[test]) 97 | #save_results(out_folder, 'meta-stack-svm', y_pred) 98 | 99 | def run_one_folder(features_folder, fold_folder, results_name, gamma_max): 100 | 101 | #File paths 102 | best_by_test_fpath = os.path.join(fold_folder, results_name, 103 | 'best-by.dat') 104 | best_by_train_fpath = os.path.join(fold_folder, results_name + '-train', 105 | 'best-by.dat') 106 | 107 | all_conf_test_fpath = os.path.join(fold_folder, results_name, 108 | 'all-conf.dat') 109 | all_conf_train_fpath = os.path.join(fold_folder, results_name + '-train', 110 | 'all-conf.dat') 111 | 112 | ytest_fpath = os.path.join(fold_folder, 'ksc', 'test_assign.dat') 113 | ytrain_fpath = os.path.join(fold_folder, 'ksc', 'assign.dat') 114 | 115 | test_fpath = os.path.join(fold_folder, 'test.dat') 116 | train_fpath = os.path.join(fold_folder, 'train.dat') 117 | tags_fpath = os.path.join(features_folder, 'tags.dat') 118 | 119 | #Loading Matrices 120 | best_by_test = np.genfromtxt(best_by_test_fpath) 121 | best_by_train = np.genfromtxt(best_by_train_fpath) 122 | 123 | test = np.loadtxt(test_fpath, dtype='bool') 124 | train = np.loadtxt(train_fpath, dtype='bool') 125 | 126 | assert np.logical_xor(train, test).all() 127 | assert best_by_train.shape == train.sum() 128 | assert best_by_test.shape == test.sum() 129 | 130 | best_by = np.zeros(best_by_test.shape[0] + best_by_train.shape[0]) 131 | best_by[test] = best_by_test 132 | best_by[train] = best_by_train 133 | 134 | trend_probs_test = np.genfromtxt(all_conf_test_fpath) 135 | trend_probs_train = np.genfromtxt(all_conf_train_fpath) 136 | 137 | assert trend_probs_train.shape[0] == train.sum() 138 | assert trend_probs_test.shape[0] == test.sum() 139 | 140 | shape = (trend_probs_test.shape[0] + trend_probs_train.shape[0], 141 | trend_probs_test.shape[1]) 142 | trend_probs = np.zeros(shape) 143 | trend_probs[test] = trend_probs_test 144 | trend_probs[train] = trend_probs_train 145 | 146 | y_true_test = np.loadtxt(ytest_fpath, dtype='i') 147 | y_true_train = np.loadtxt(ytrain_fpath, dtype='i') 148 | 149 | assert y_true_train.shape[0] == train.sum() 150 | assert y_true_test.shape[0] == test.sum() 151 | 152 | y_true = np.zeros(y_true_train.shape[0] + y_true_test.shape[0]) 153 | y_true[test] = y_true_test 154 | y_true[train] = y_true_train 155 | 156 | referrers = load_features(features_folder, best_by, gamma_max) 157 | 158 | #Actual test, ufa 159 | run_classifier(os.path.join(fold_folder, results_name), 160 | trend_probs, referrers, y_true, train, test) 161 | 162 | @plac.annotations( 163 | features_folder=plac.Annotation('Folder with features', type=str), 164 | fold_folder=plac.Annotation('Folder with the train and test data', type=str), 165 | results_name=plac.Annotation('Base name of the results folder', type=str), 166 | gamma_max=plac.Annotation('Gamma Max', type=int)) 167 | def main(features_folder, fold_folder, results_name, gamma_max): 168 | run_one_folder(features_folder, fold_folder, results_name, gamma_max) 169 | 170 | if __name__ == '__main__': 171 | sys.exit(plac.call(main)) 172 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 3 ]; then 4 | echo "Please 
provide me with a time series file, an output folder and a features folder" 5 | exit 1 6 | fi 7 | 8 | IN=$1 9 | BASE_FOLD=$2 10 | FEATURES_FOLDER=$3 11 | 12 | K=2 13 | F1=0.5 14 | GAMMA_MAX=20 15 | 16 | #Creates output folder 17 | mkdir -p $BASE_FOLD 2> /dev/null 18 | 19 | #Generate cross-val 20 | python generate_cross_vals.py $IN $BASE_FOLD 21 | 22 | #Cluster dataset 23 | for fold in $BASE_FOLD/*/; do 24 | mkdir -p $fold/ksc 2> /dev/null 25 | python cluster.py $IN $fold/ksc $K 26 | done 27 | 28 | #Compute agreement between folds 29 | python sim_folds.py $IN $BASE_FOLD 30 | 31 | #Precompute probabilities train 32 | for fold in $BASE_FOLD/*/; do 33 | mkdir -p $fold/probs/ 2> /dev/null 34 | python classify_pts.py $IN $fold/train.dat $fold/ksc/cents.dat \ 35 | $fold/ksc/assign.dat $fold/probs/ $GAMMA_MAX 36 | done 37 | 38 | #Precompute probabilities test 39 | for fold in $BASE_FOLD/*/; do 40 | mkdir -p $fold/probs-test/ 2> /dev/null 41 | python classify_pts_test.py $IN $fold/ksc/cents.dat $fold/test.dat \ 42 | $fold/ksc/assign.dat $fold/probs-test/ $GAMMA_MAX 43 | done 44 | 45 | #Create the assign for the test 46 | for fold in $BASE_FOLD/*/; do 47 | python create_test_assign.py $IN $fold/test.dat \ 48 | $fold/ksc/cents.dat > $fold/ksc/test_assign.dat 49 | done 50 | 51 | #Learn parameters train 52 | for fold in $BASE_FOLD/*/; do 53 | mkdir -p $fold/cls-res-fitted-$F1-$GAMMA_MAX-train 2> /dev/null 54 | done 55 | python classify_theta_train.py $IN $BASE_FOLD $F1 cls-res-fitted-$F1-$GAMMA_MAX-train $GAMMA_MAX $K 56 | 57 | #Learn parameters test 58 | for fold in $BASE_FOLD/*/; do 59 | mkdir -p $fold/cls-res-fitted-$F1-$GAMMA_MAX 2> /dev/null 60 | done 61 | python classify_theta.py $IN $BASE_FOLD $F1 cls-res-fitted-$F1-$GAMMA_MAX $GAMMA_MAX $K 62 | 63 | #Adding static features 64 | for fold in $BASE_FOLD/*/; do 65 | python multimodel_class.py $FEATURES_FOLDER $fold cls-res-fitted-$F1-$GAMMA_MAX $GAMMA_MAX 66 | done 67 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import division, print_function 3 | 4 | from collections import defaultdict 5 | 6 | from vod.stats.ci import half_confidence_interval_size as hci 7 | 8 | import numpy as np 9 | import plac 10 | import sys 11 | 12 | class OLS(object): 13 | 14 | def __init__(self): 15 | 16 | self.coeffs = None 17 | self.residuals = None 18 | self.gcv_sqerrors = None 19 | 20 | def fit(self, X, y): 21 | 22 | assert X.shape[0] == y.shape[0] 23 | 24 | X = np.asanyarray(X, dtype='f', order='C') 25 | y = np.asanyarray(y, dtype='f', order='C') 26 | 27 | n = y.shape[0] 28 | 29 | PI = np.linalg.pinv(X) 30 | H = np.dot(X, PI) 31 | self.coeffs = np.dot(PI, y) 32 | 33 | y_hat = np.dot(X, self.coeffs) 34 | 35 | self.residuals = y_hat - y 36 | 37 | aux = self.residuals / (1 - np.diag(H)) 38 | self.gcv_sqerrors = np.power(aux, 2) 39 | 40 | def predict(X): 41 | return np.dot(X, self.coeffs) 42 | 43 | def fit(X, tr, tt): 44 | ols = OLS() 45 | 46 | y = X[:, :tt].sum(axis=1) 47 | XR = (X[:, :tr].T / y).T 48 | ols.fit(XR, np.ones(XR.shape[0])) 49 | 50 | return ols 51 | 52 | def main(tseries_fpath, predict_fpath, bestby_fpath): 53 | 54 | X = np.genfromtxt(tseries_fpath)[:,1:] + 0.0001 55 | cls_pred = np.loadtxt(predict_fpath, dtype='i') 56 | rgr_true = X.sum(axis=1) 57 | bestby = np.genfromtxt(bestby_fpath) 58 | 59 | cls_labels = set(cls_pred[cls_pred != -1]) 60 | 61 | tt = 
X.shape[1] 62 | models = {} 63 | models_per_clust = {} 64 | ref_time = np.arange(1, tt + 1) 65 | 66 | #tr = 7 67 | #ref_time = np.array([tr]) 68 | #bestby = np.zeros(bestby.shape[0]) + tr 69 | 70 | for tr in ref_time: 71 | models[tr] = fit(X, tr, tt) 72 | 73 | for k in sorted(cls_labels): 74 | Xk = X[cls_pred == k] 75 | models_per_clust[tr, k] = fit(Xk, tr, tt) 76 | 77 | errors_all = [] 78 | errors_cls = [] 79 | errors_per_cls = defaultdict(list) 80 | for tr in ref_time: 81 | idx = bestby == tr 82 | ols = models[tr] 83 | 84 | errors_all.extend(ols.gcv_sqerrors[idx]) 85 | classes = cls_pred[idx] 86 | 87 | for cls in set(classes): 88 | bestby_for_cls = bestby[cls_pred == cls] 89 | idx_cls = bestby_for_cls == tr 90 | 91 | ols = models_per_clust[tr, cls] 92 | errors_cls.extend(ols.gcv_sqerrors[idx_cls]) 93 | errors_per_cls[cls].extend(ols.gcv_sqerrors[idx_cls]) 94 | 95 | print('Glob model:', np.mean(errors_all), '+-', hci(errors_all, .95)) 96 | print('Spec model:', np.mean(errors_cls), '+-', hci(errors_cls, .95)) 97 | print() 98 | print('Per class') 99 | for cls in cls_labels: 100 | err = errors_per_cls[cls] 101 | print('Cls = ', cls, np.mean(err), '+-', hci(err, .95)) 102 | 103 | 104 | if __name__ == '__main__': 105 | sys.exit(plac.call(main)) 106 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/sim_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf8 3 | from __future__ import print_function, division 4 | 5 | from pyksc import dist 6 | 7 | import glob 8 | import numpy as np 9 | import os 10 | import plac 11 | import sys 12 | 13 | def main(tseries_fpath, in_folder): 14 | 15 | ids = [] 16 | with open(tseries_fpath) as tseries_file: 17 | for l in tseries_file: 18 | ids.append(l.split()[0]) 19 | 20 | ids = np.array(ids) 21 | folders = glob.glob(os.path.join(in_folder, 'fold-*/ksc')) 22 | num_folders = len(folders) 23 | 24 | agree = 0 25 | diff = 0 26 | 27 | for i in xrange(num_folders): 28 | 29 | base_i = os.path.dirname(folders[i]) 30 | Ci = np.loadtxt(os.path.join(folders[i], 'cents.dat')) 31 | 32 | train_i = np.loadtxt(os.path.join(base_i, 'train.dat'), dtype='bool') 33 | assign_i = np.loadtxt(os.path.join(folders[i], 'assign.dat')) 34 | 35 | for j in xrange(i, num_folders): 36 | 37 | base_j = os.path.dirname(folders[j]) 38 | Cj = np.loadtxt(os.path.join(folders[j], 'cents.dat')) 39 | 40 | dists = dist.dist_all(Ci, Cj, rolling=True)[0] 41 | argsrt = dists.argsort(axis=1) 42 | 43 | train_j = np.loadtxt(os.path.join(base_j, 'train.dat'), dtype='bool') 44 | assign_j = np.loadtxt(os.path.join(folders[j], 'assign.dat')) 45 | 46 | for k in xrange(argsrt.shape[0]): 47 | first = True 48 | for o in argsrt[k]: 49 | ids_k = set(ids[train_i][assign_i == k]) 50 | ids_o = set(ids[train_j][assign_j == o]) 51 | n_inter = len(ids_k.intersection(ids_o)) 52 | 53 | if first: 54 | first = False 55 | agree += n_inter 56 | else: 57 | diff += n_inter 58 | 59 | print('AgreedProb = ', agree / (agree + diff)) 60 | print('DisagreeProb = ', diff / (agree + diff)) 61 | 62 | if __name__ == '__main__': 63 | sys.exit(plac.call(main)) 64 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/stacking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf8 3 | 4 | from __future__ import division, print_function 5 | 6 | from learn_base import 
create_grid_search 7 | 8 | from sklearn import base 9 | from sklearn import model_selection 10 | from sklearn import linear_model 11 | from sklearn import tree 12 | 13 | import numpy as np 14 | 15 | class StackingException(Exception): pass 16 | 17 | class Stacking(object): 18 | '''Implements a stacking classifier''' 19 | 20 | def __init__(self, num_splits, base_models, stacker_name='linear'): 21 | 22 | STACKERS = {'tree':_TreeStacking, 23 | 'linear':_MLRStacking, 24 | 'deco':_DecoStacking} 25 | 26 | self.num_splits = num_splits 27 | self.base_classifiers = [] 28 | 29 | for base_model in base_models: 30 | clone = base.clone(base_model) 31 | self.base_classifiers.append(clone) 32 | 33 | if stacker_name not in STACKERS: 34 | names = STACKERS.keys() 35 | raise StackingException('Unknown combiner, choose from: %s' % names) 36 | 37 | self.stacker = STACKERS[stacker_name]() 38 | self.P_fit = None 39 | self.y_fit = None 40 | self.model = None 41 | self.num_classes = 0 42 | 43 | def fit(self, X, y, B): 44 | X = np.asanyarray(X) 45 | y = np.asanyarray(y) 46 | 47 | assert X.shape[0] == y.shape[0] 48 | assert y.ndim == 1 49 | 50 | self.num_classes = len(set(y)) 51 | num_base_models = len(self.base_classifiers) 52 | 53 | P = np.zeros((X.shape[0], self.num_classes * num_base_models)) 54 | 55 | kfold = model_selection.StratifiedKFold(self.num_splits) 56 | for train, test in kfold.split(X, y): 57 | for i, base_model in enumerate(self.base_classifiers): 58 | base_model.fit(X[train], y[train]) 59 | 60 | base_probs = base_model.predict_proba(X[test]) 61 | cols_min = i * self.num_classes 62 | cols_max = self.num_classes * (i + 1) 63 | P[test, cols_min:cols_max] = base_probs 64 | 65 | #a = P.max(axis=1) 66 | #b = P.argmax(axis=1) 67 | #c = B.max(axis=1) 68 | #d = B.argmax(axis=1) 69 | #self.stacker.fit(np.vstack((a, b, c, d)).T, y) 70 | self.stacker.fit(np.hstack((P, B)), y) 71 | 72 | def predict(self, X, B): 73 | X = np.asanyarray(X) 74 | 75 | num_features = len(self.base_classifiers) * self.num_classes 76 | P = np.zeros((X.shape[0], num_features)) 77 | for i, base_model in enumerate(self.base_classifiers): 78 | base_probs = base_model.predict_proba(X) 79 | cols_min = i * self.num_classes 80 | cols_max = self.num_classes * (i + 1) 81 | P[:, cols_min:cols_max] = base_probs 82 | 83 | #a = P.max(axis=1) 84 | #b = P.argmax(axis=1) 85 | #c = B.max(axis=1) 86 | #d = B.argmax(axis=1) 87 | #P = np.vstack((a, b, c, d)).T 88 | P = np.hstack((P, B)) 89 | return self.stacker.predict(P) 90 | 91 | class _MLRStacking(base.BaseEstimator, base.ClassifierMixin): 92 | """Implements a multi-response linear regression classifier""" 93 | 94 | def __init__(self): 95 | self.regressors = dict() 96 | 97 | def fit(self, X, y): 98 | X = np.asanyarray(X) 99 | y = np.asanyarray(y) 100 | 101 | for yi in set(y): 102 | self.regressors[yi] = linear_model.LinearRegression() 103 | specific_y = np.asanyarray(y == yi, dtype='i') 104 | self.regressors[yi].fit(X, specific_y) 105 | 106 | def predict(self, X): 107 | X = np.asanyarray(X) 108 | 109 | prediction = np.zeros(X.shape[0]) 110 | best_value = np.zeros_like(prediction) 111 | for yi, regressor in self.regressors.items(): 112 | value = regressor.predict(X) 113 | for index, vindex in enumerate(value): 114 | if vindex > best_value[index]: 115 | best_value[index] = vindex 116 | prediction[index] = yi 117 | return prediction 118 | 119 | class _TreeStacking(base.BaseEstimator, base.ClassifierMixin): 120 | '''Implements stacking with a multiresponse regression tree''' 121 | 122 | def __init__(self): 
123 | self.model = None 124 | 125 | def _y_to_one_zero_mat(self, y): 126 | y = np.asanyarray(y) 127 | 128 | #Guarantees that y is 0 to n - 1 129 | unique_y, labels_flat = np.unique(y, return_inverse=True) 130 | y = labels_flat.reshape(y.shape) 131 | 132 | Y = np.zeros(shape=(len(y), len(unique_y)), dtype='f', order='C') 133 | for yi in unique_y: 134 | Y[:, yi] = (y == yi) 135 | 136 | return Y 137 | 138 | def fit(self, X, y): 139 | X = np.asanyarray(X, dtype='f', order='C') 140 | Y = self._y_to_one_zero_mat(y) 141 | 142 | self.model = tree.DecisionTreeRegressor() 143 | self.model.fit(X, Y) 144 | 145 | def predict(self, X): 146 | X = np.asanyarray(X, dtype='f', order='C') 147 | P = self.model.predict(X) 148 | return P.argmax(axis=1) 149 | 150 | class _DecoStacking(base.BaseEstimator, base.ClassifierMixin): 151 | 152 | def __init__(self): 153 | self.model = etree = create_grid_search('extra_trees', n_jobs = 1) 154 | 155 | def fit(self, X, y): 156 | X = np.asanyarray(X, dtype='f', order='C') 157 | y = np.asanyarray(y, dtype='f', order='C') 158 | 159 | self.model.fit(X, y) 160 | 161 | def predict(self, X): 162 | return self.model.predict(X) 163 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/summarize_results.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | from __future__ import division, print_function 3 | 4 | from pyksc import dist 5 | 6 | from sklearn.metrics import f1_score 7 | from sklearn.metrics import classification_report 8 | 9 | import glob 10 | import numpy as np 11 | import os 12 | import plac 13 | 14 | def main(tseries_fpath, base_folder): 15 | 16 | folders = glob.glob(os.path.join(base_folder, 'fold-*')) 17 | num_folders = len(folders) 18 | 19 | cluster_mapping = [] 20 | C_base = np.loadtxt(os.path.join(folders[0], 'ksc/cents.dat')) 21 | 22 | for i in xrange(num_folders): 23 | Ci = np.loadtxt(os.path.join(folders[i], 'ksc/cents.dat')) 24 | 25 | dists = dist.dist_all(Ci, C_base, rolling=True)[0] 26 | closest = dists.argmin(axis=1) 27 | 28 | cluster_mapping.append({}) 29 | for k in xrange(Ci.shape[0]): 30 | cluster_mapping[i][k] = closest[k] 31 | 32 | y_true_all = [] 33 | y_pred_all = [] 34 | for i in xrange(num_folders): 35 | y_true = np.loadtxt(os.path.join(folders[i], 'ksc/test_assign.dat')) 36 | y_pred = np.loadtxt(os.path.join(folders[i], \ 37 | 'cls-res-fitted-50/pred.dat')) 38 | 39 | for j in xrange(y_true.shape[0]): 40 | y_true[j] = cluster_mapping[i][y_true[j]] 41 | if y_pred[j] != -1: 42 | y_pred[j] = cluster_mapping[i][y_pred[j]] 43 | 44 | y_true_all.extend(y_true) 45 | y_pred_all.extend(y_pred) 46 | 47 | y_pred_all = np.asarray(y_pred_all) 48 | y_true_all = np.asarray(y_true_all) 49 | 50 | report = classification_report(y_true_all, y_pred_all) 51 | valid = y_pred_all != -1 52 | print() 53 | print('Using the centroids from folder: ', folders[0]) 54 | print('Micro Aggregation of Folds:') 55 | print('%.3f fract of videos were not classified' % (sum(~valid) / y_pred_all.shape[0])) 56 | print() 57 | print(classification_report(y_true_all[valid], y_pred_all[valid])) 58 | 59 | if __name__ == '__main__': 60 | plac.call(main) 61 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/translation-final-results-to-paper-new.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/flaviovdf/pyksc/6ba8988c7fad63366dc2b8d005d0779971e129c5/src/trend-learner-scripts/translation-final-results-to-paper-new.png --------------------------------------------------------------------------------
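Taken together, the trend-learner scripts cluster the training series with KSC (cluster.py), assign each test series to its nearest centroid (create_test_assign.py), learn per-cluster minimum-points and confidence thresholds (classify_theta_train.py / classify_theta.py), and only then blend in static features (multimodel_class.py). A condensed sketch of the test-assignment step on toy arrays (illustrative names; assumes the pyksc extension is built) follows the same dist.dist_all call pattern used by create_test_assign.py and summarize_results.py:

    from __future__ import division, print_function

    import numpy as np

    from pyksc import dist

    # Toy data: C is a (k x T) matrix of centroids, Xtest an (n x T) matrix of test series.
    C = np.asanyarray(np.random.rand(2, 100) + 1e-6, order='C')
    Xtest = np.asanyarray(np.random.rand(50, 100) + 1e-6, order='C')

    # dist_all returns a tuple; its first element is the (k x n) matrix of
    # shift-invariant distances between every centroid and every test series.
    dists = dist.dist_all(C, Xtest, rolling=True)[0]
    test_assign = dists.argmin(axis=0)   # nearest centroid per test series
    print(np.bincount(test_assign))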