├── LICENSE ├── Makefile ├── README.rst ├── setup.py └── src ├── pyksc ├── __init__.py ├── _trend.pyx ├── dhwt.pyx ├── dist.pxd ├── dist.pyx ├── ksc.py ├── metrics.py ├── regression.py ├── test │ ├── __init__.py │ ├── test_dhwt.py │ ├── test_dist.py │ ├── test_ksc.py │ ├── test_regression.py │ └── test_trend.py └── trend.py ├── scripts ├── __init__.py ├── class_predict.py ├── cluster_jaccard.py ├── cluster_mutualinfo.py ├── cluster_vol.py ├── col_to_cluster.py ├── create_mic_input.py ├── learn_base.py ├── leave_k.py ├── plot_centroids.py ├── plot_members.py ├── plot_quality.py ├── plot_time_to_peak.py ├── pop_predict.py ├── radar.py ├── tags_io.py └── tree_infogain.py └── trend-learner-scripts ├── boosting.py ├── classify_pts.py ├── classify_pts_all.py ├── classify_pts_test.py ├── classify_theta.py ├── classify_theta_train.py ├── cluster.py ├── cotrain.py ├── create_test_assign.py ├── generate_cross_vals.py ├── ioutil.py ├── learn_base.py ├── multimodel_class.py ├── pipeline.sh ├── regression.py ├── sim_folds.py ├── stacking.py ├── summarize_results.py └── translation-final-results-to-paper-new.png /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2014, pyksc developers 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the pyksc nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Simple makefile 2 | 3 | PYTHON ?= python 4 | NOSETESTS ?= nosetests 5 | 6 | all: clean build 7 | 8 | build: 9 | $(PYTHON) setup.py build_ext --inplace 10 | 11 | clean: 12 | rm -rf build/ 13 | rm -rf src/build/ 14 | find . -name "*.pyc" | xargs rm -f 15 | find . -name "*.c" | xargs rm -f 16 | find . 
-name "*.so" | xargs rm -f 17 | 18 | test: clean build 19 | $(NOSETESTS) 20 | 21 | trailing-spaces: 22 | find -name "*.py" | xargs sed 's/^M$$//' 23 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PY-KSC 2 | ====== 3 | 4 | Implementation of the KSC time series clustering algorithm. 5 | See [1]_ for details: 6 | 7 | Dependencies for library 8 | ------------------------ 9 | * Numpy 10 | * Cython 11 | 12 | Dependencies for scripts 13 | ------------------------ 14 | * Scipy 15 | * Matplotlib 16 | 17 | How to install 18 | -------------- 19 | 20 | Clone the repo 21 | 22 | :: 23 | 24 | $ git clone https://github.com/flaviovdf/pyksc.git 25 | 26 | Make sure you have cython and numpy. If not run as root (or use your distros package manager) 27 | 28 | :: 29 | 30 | $ pip install numpy 31 | 32 | :: 33 | 34 | $ pip install Cython 35 | 36 | Install 37 | 38 | :: 39 | 40 | $ python setup.py install 41 | 42 | If you see the following error ``/usr/bin/ld: cannot find -lblas`` on linux, try installing the following two libraries 43 | 44 | :: 45 | 46 | $ sudo apt-get install libblas-dev liblapack-dev 47 | 48 | 49 | 50 | References 51 | ---------- 52 | .. [1] J. Yang and J. Leskovec, 53 | "Patterns of Temporal Variation in Online Media" - WSDM'11 54 | http://dl.acm.org/citation.cfm?id=1935863 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 3 | from __future__ import division, print_function 4 | '''Setup script''' 5 | 6 | import glob 7 | import numpy 8 | import os 9 | import sys 10 | 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | SOURCE = 'src/' 16 | os.chdir(SOURCE) 17 | 18 | import platform 19 | if platform.system() == 'Darwin': 20 | os.environ["CC"] = "gcc-6" 21 | os.environ["CXX"] = "gcc-6" 22 | 23 | if sys.version_info[:2] < (2, 7): 24 | print('Requires Python version 2.7 or later (%d.%d detected).' 
% 25 | sys.version_info[:2]) 26 | sys.exit(-1) 27 | 28 | def get_packages(): 29 | '''Appends all packages (based on recursive sub dirs)''' 30 | 31 | packages = ['pyksc'] 32 | 33 | for package in packages: 34 | base = os.path.join(package, '**/') 35 | sub_dirs = glob.glob(base) 36 | while len(sub_dirs) != 0: 37 | for sub_dir in sub_dirs: 38 | package_name = sub_dir.replace('/', '.') 39 | if package_name.endswith('.'): 40 | package_name = package_name[:-1] 41 | 42 | packages.append(package_name) 43 | 44 | base = os.path.join(base, '**/') 45 | sub_dirs = glob.glob(base) 46 | 47 | return packages 48 | 49 | def get_extensions(): 50 | '''Get's all .pyx and.pxd files''' 51 | 52 | extensions = [] 53 | for base in ['pyksc']: 54 | pyx_files = glob.glob(os.path.join(base, '*.pyx')) 55 | 56 | for pyx in pyx_files: 57 | pxd = pyx.replace('pyx', 'pxd') 58 | module = pyx.replace('.pyx', '').replace('/', '.') 59 | 60 | if os.path.exists(pxd): 61 | ext_files = [pyx, pxd] 62 | else: 63 | ext_files = [pyx] 64 | 65 | extension = Extension(module, ext_files, 66 | include_dirs=[numpy.get_include()], 67 | libraries=['blas'], 68 | extra_compile_args=['-fopenmp', 69 | '-msse', '-msse2', '-mfpmath=sse'], 70 | extra_link_args=['-fopenmp']) 71 | 72 | extensions.append(extension) 73 | 74 | return extensions 75 | 76 | if __name__ == "__main__": 77 | packages = get_packages() 78 | extensions = get_extensions() 79 | 80 | setup( 81 | cmdclass = {'build_ext': build_ext}, 82 | name = 'pyksc', 83 | packages = packages, 84 | ext_modules = extensions 85 | ) 86 | -------------------------------------------------------------------------------- /src/pyksc/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Implementation of the KSC algorithm. See [1] for details: 3 | 4 | References 5 | ---------- 6 | .. [1] J. Yang and J. 
Leskovec, 7 | "Patterns of Temporal Variation in Online Media" - WSDM'11 8 | http://dl.acm.org/citation.cfm?id=1935863 9 | ''' -------------------------------------------------------------------------------- /src/pyksc/_trend.pyx: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | # cython: boundscheck = False 3 | # cython: wraparound = False 4 | 5 | from cython.parallel import prange 6 | from cython.view cimport array as cvarray 7 | 8 | from libc.stdlib cimport free 9 | from libc.stdio cimport printf 10 | 11 | from pyksc cimport dist 12 | 13 | cimport cython 14 | cimport numpy as np 15 | 16 | import numpy as np 17 | np.import_array() 18 | 19 | #Basic math functions 20 | cdef extern from "math.h" nogil: 21 | double exp(double) 22 | 23 | cdef inline double dmin(double a, double b) nogil: return a if a < b else b 24 | 25 | cdef double dist_to_reference(double[::1] s, double[::1] r) nogil: 26 | cdef Py_ssize_t n_obs = s.shape[0] 27 | cdef Py_ssize_t n_ref = r.shape[0] 28 | 29 | cdef double min_dist = 1 30 | cdef Py_ssize_t i 31 | cdef dist.dist_struct_t *d 32 | 33 | for i in range(n_ref - n_obs + 1): 34 | d = dist.cdist(r[i:i + n_obs], s, 1) 35 | min_dist = dmin(min_dist, d.dist) 36 | free(d) 37 | 38 | return min_dist 39 | 40 | cdef void predict_one(double[::1] s, double[:, ::1] R_pos, 41 | double gamma, double[:, ::1] probs, 42 | int store_at_row, int store_at_col) nogil: 43 | 44 | cdef Py_ssize_t num_windows = s.shape[0] + 1 45 | cdef Py_ssize_t num_pos = R_pos.shape[0] 46 | 47 | cdef double prob = 0 48 | cdef Py_ssize_t i = 0 49 | for i in range(num_pos): 50 | prob += exp(-gamma * dist_to_reference(s, R_pos[i])) 51 | 52 | probs[store_at_row, store_at_col] = prob 53 | 54 | def predict(np.ndarray[double, ndim=2, mode='c'] X not None, 55 | np.ndarray[double, ndim=2, mode='c'] R not None, 56 | np.ndarray[long, ndim=1, mode='c'] labels not None, 57 | int num_labels, double gamma): 58 | 59 | cdef Py_ssize_t num_samples = X.shape[0] 60 | cdef Py_ssize_t num_points = X.shape[1] 61 | 62 | cdef double[::1] s 63 | cdef double[:, ::1] R_pos 64 | 65 | cdef double[:, ::1] probs = \ 66 | np.zeros(shape=(num_samples, num_labels), dtype=np.float64, 67 | order='C') 68 | 69 | cdef double[:, ::1] Xview = X #For nogil 70 | 71 | cdef Py_ssize_t i = 0 72 | cdef Py_ssize_t l = 0 73 | for l from 0 <= l < num_labels: 74 | #TODO: Maybe this copy is not necessary, need to check. 75 | R_pos = np.asanyarray(R[labels == l], dtype=np.float64, order='C') 76 | 77 | for i in prange(num_samples, schedule='static', nogil=True): 78 | predict_one(Xview[i], R_pos, gamma, probs, i, l) 79 | 80 | return probs.base 81 | -------------------------------------------------------------------------------- /src/pyksc/dhwt.pyx: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | ''' 3 | Implements Discrete Harr Wavelet Transform (also inverse) for a time series. 4 | This is simply done by computing the average of consecutive elements in the 5 | vector that correspond to the time series. See [1] and [2] for details. 6 | 7 | References 8 | ---------- 9 | .. [1] P. Van Fleet 10 | "The Discrete Haar Wavelet Tranformation" 11 | http://goo.gl/IPz25 12 | (last access December 2011) 13 | 14 | .. [2] I. 
Kaplan 15 | "Applying the Haar Wavelet Transform to Time Series Information" 16 | http://www.bearcave.com/misl/misl_tech/wavelets/haar.html 17 | (last access December 2011) 18 | ''' 19 | from __future__ import division, print_function 20 | 21 | cimport cython 22 | cimport numpy as np 23 | 24 | import numpy as np 25 | np.import_array() 26 | 27 | @cython.boundscheck(False) 28 | @cython.wraparound(False) 29 | def transform(np.ndarray[double, ndim=1] array): 30 | ''' 31 | Transform the array to a new form using the discrete Haar 32 | transform operation. This is done by computing the average of consecutive 33 | elements in the array. 34 | 35 | Arguments 36 | --------- 37 | array: np.ndarray[double, ndim=1] 38 | the array to transform 39 | 40 | Returns 41 | ------- 42 | This method returns a tuple whose first element is the wavelet and 43 | whose second element holds the coefficients needed to transform the wavelet back 44 | to the original array. 45 | ''' 46 | cdef Py_ssize_t n = array.shape[0] 47 | cdef Py_ssize_t new_dim 48 | 49 | if n % 2 == 0: 50 | new_dim = n // 2 51 | else: 52 | new_dim = (n // 2) + 1 53 | 54 | cdef np.ndarray[double, ndim=1] wavelet = np.zeros(new_dim) 55 | cdef np.ndarray[double, ndim=1] coefficient = np.zeros(new_dim) 56 | 57 | cdef double first 58 | cdef double second 59 | cdef Py_ssize_t i = 0 60 | cdef Py_ssize_t j = 0 61 | 62 | for i in range(0, n, 2): 63 | first = array[i] 64 | if i < n - 1: 65 | second = array[i + 1] 66 | else: 67 | second = 0 68 | 69 | wavelet[j] = (first + second) / 2 70 | coefficient[j] = (first - second) / 2 71 | j += 1 72 | 73 | return wavelet, coefficient 74 | 75 | @cython.boundscheck(False) 76 | @cython.wraparound(False) 77 | def inverse(np.ndarray[double, ndim=1] wavelet, 78 | np.ndarray[double, ndim=1] coefficient): 79 | ''' 80 | Given a wavelet and its coefficients this method can be used to 81 | transform the wavelet to the original array. 
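For example (mirroring the behaviour asserted in the unit tests), `transform(np.array([1., 2, 3, 0]))` yields the wavelet `[1.5, 1.5]` and the coefficients `[-0.5, 1.5]`; feeding that pair back into `inverse` recovers the original array.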
82 | 83 | Arguments 84 | --------- 85 | wavelet: np.ndarray[np.float_t, ndim=1] 86 | the wavelet to transform back 87 | coefficient: np.ndarray[np.float_t, ndim=1] 88 | the coefficients needed for the transform 89 | ''' 90 | cdef Py_ssize_t n = wavelet.shape[0] 91 | 92 | #sanity check 93 | if n != coefficient.shape[0]: 94 | return None 95 | 96 | cdef Py_ssize_t new_dim 97 | if n % 2 == 0 or n == 1: 98 | new_dim = n * 2 99 | else: 100 | new_dim = n * 2 - 1 101 | 102 | cdef np.ndarray[np.float_t, ndim=1] array = np.zeros(new_dim) 103 | 104 | cdef double first 105 | cdef double second 106 | cdef Py_ssize_t i = 0 107 | cdef Py_ssize_t j = 0 108 | for i in range(n): 109 | first = wavelet[i] + coefficient[i] 110 | second = wavelet[i] - coefficient[i] 111 | 112 | if j < new_dim: 113 | array[j] = first 114 | 115 | if j + 1 < new_dim: 116 | array[j + 1] = second 117 | 118 | j += 2 119 | 120 | return array 121 | -------------------------------------------------------------------------------- /src/pyksc/dist.pxd: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | 3 | #A basic structure for the return value of the distance func 4 | cdef struct dist_struct_t: 5 | double dist 6 | double alpha 7 | int shift 8 | 9 | #Distance function 10 | cdef dist_struct_t* cdist(double[::1] array1, double[::1] array2, int rolling)\ 11 | nogil 12 | 13 | cdef dist_struct_t* cshift_dist(double[::1] array1, double[::1] array2,\ 14 | int shift_amount, int rolling) nogil 15 | 16 | -------------------------------------------------------------------------------- /src/pyksc/dist.pyx: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | # cython: cdivision = True 3 | # cython: boundscheck = False 4 | # cython: wraparound = False 5 | 6 | ''' 7 | Basic array functions are kept here. Also, in this module 8 | we implement the time series distance metric defined in [1]. 9 | 10 | References 11 | ---------- 12 | .. [1] J. Yang and J. Leskovec, 13 | "Patterns of Temporal Variation in Online Media" - WSDM'11 14 | http://dl.acm.org/citation.cfm?id=1935863 15 | ''' 16 | from __future__ import division, print_function 17 | 18 | from cpython cimport bool 19 | from libc.stdlib cimport abort 20 | from libc.stdlib cimport free 21 | from libc.stdlib cimport malloc 22 | from libc.stdio cimport printf 23 | 24 | from cython.parallel import parallel 25 | from cython.parallel import prange 26 | 27 | cimport cython 28 | cimport numpy as np 29 | import numpy as np 30 | 31 | np.import_array() 32 | 33 | #Basic math functions 34 | cdef extern from "math.h" nogil: 35 | double sqrt(double) 36 | 37 | cdef extern from "cblas.h" nogil: 38 | double cblas_dnrm2(int N, double *X, int incX) 39 | double cblas_ddot(int N, double *X, int incX, double *Y, int incY) 40 | 41 | #Inlines, some basic blas vector stuff renamed for legacy and disabling gil 42 | cdef inline double cinner_prod(double *array1, double *array2, \ 43 | Py_ssize_t size) nogil: \ 44 | return cblas_ddot(size, array1, 1, array2, 1) 45 | 46 | cdef inline double csqsum(double *array1, Py_ssize_t size) nogil: \ 47 | return cblas_dnrm2(size, array1, 1) ** 2 48 | 49 | cdef inline double cnorm(double *array1, Py_ssize_t size) nogil: \ 50 | return cblas_dnrm2(size, array1, 1) 51 | 52 | #CDEF functions 53 | cdef double* cshift_drop(double[::1] array, int amount) nogil: 54 | ''' 55 | Shifts the array by N positions. This is similar to a binary shift where 56 | the element's fall of at the ends. 
57 | ''' 58 | cdef Py_ssize_t size = array.shape[0] 59 | 60 | cdef double *shifted 61 | shifted = malloc(size * sizeof(double)) 62 | if shifted == NULL: 63 | abort() 64 | 65 | cdef Py_ssize_t delta_shifted = 0 66 | cdef Py_ssize_t delta_array = 0 67 | if amount > 0: 68 | delta_shifted = amount 69 | else: 70 | delta_array = -amount 71 | amount = -amount 72 | 73 | cdef Py_ssize_t i = 0 74 | for i in range(size): 75 | shifted[i] = 0 76 | 77 | i = 0 78 | for i in range(size - amount): 79 | shifted[i + delta_shifted] = array[i + delta_array] 80 | 81 | return shifted 82 | 83 | cdef double* cshift_roll(double[::1] array, int amount) nogil: 84 | ''' 85 | Shifts the array by N positions. This is a rolling shifts, where elements 86 | come back at the other side of the array. 87 | ''' 88 | cdef Py_ssize_t size = array.shape[0] 89 | 90 | cdef Py_ssize_t delta_shifted = 0 91 | cdef Py_ssize_t delta_array = 0 92 | if amount > 0: 93 | delta_shifted = amount 94 | else: 95 | delta_array = -amount 96 | 97 | cdef double *shifted 98 | shifted = malloc(size * sizeof(double)) 99 | if shifted == NULL: 100 | abort() 101 | 102 | cdef Py_ssize_t i = 0 103 | for i in range(size): 104 | shifted[(i + delta_shifted) % size] = array[(i + delta_array) % size] 105 | 106 | return shifted 107 | 108 | cdef dist_struct_t* cshift_dist(double[::1] array1, double[::1] array2, \ 109 | int shift_amount, int rolling) nogil: 110 | ''' 111 | Computes the distance between two time series using a given shift. 112 | ''' 113 | cdef Py_ssize_t size = array1.shape[0] 114 | 115 | #return val 116 | cdef dist_struct_t* rv = malloc(sizeof(dist_struct_t)) 117 | if rv == NULL: 118 | abort() 119 | rv.shift = shift_amount 120 | 121 | if size == 0: 122 | rv.dist = 0 123 | rv.alpha = 0 124 | return rv 125 | 126 | cdef double *shifted 127 | if rolling: 128 | shifted = cshift_roll(array2, shift_amount) 129 | else: 130 | shifted = cshift_drop(array2, shift_amount) 131 | 132 | #computing scaling 133 | cdef double alpha 134 | cdef double sqsum_shift = csqsum(shifted, size) 135 | if sqsum_shift != 0: 136 | alpha = cinner_prod(&array1[0], shifted, size) / sqsum_shift 137 | else: 138 | alpha = 0 139 | 140 | rv.alpha = alpha 141 | 142 | #actual distance 143 | cdef Py_ssize_t i = 0 144 | cdef double dist = 0 145 | for i in range(size): 146 | dist += (array1[i] - alpha * shifted[i]) ** 2 147 | 148 | free(shifted) 149 | 150 | cdef double norm1 = cnorm(&array1[0], size) 151 | if norm1 != 0: 152 | rv.dist = sqrt(dist) / norm1 153 | elif sqsum_shift != 0: #array one is all zeros, but 2 is not 154 | rv.dist = 1 155 | else: #both are all zeros 156 | rv.dist = 0 157 | 158 | return rv 159 | 160 | cdef dist_struct_t* cdist(double[::1] array1, double[::1] array2, int rolling) \ 161 | nogil: 162 | ''' 163 | Computes the distance between two time series by searching for the optimal 164 | shifting parameter. 
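Every integer shift in the range (-size + 1, size) is tried via `cshift_dist` and the candidate with the smallest distance is kept.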
165 | ''' 166 | 167 | cdef Py_ssize_t size = array1.shape[0] 168 | cdef dist_struct_t* rv = malloc(sizeof(dist_struct_t)) 169 | if rv == NULL: 170 | abort() 171 | 172 | rv.dist = 1 173 | rv.shift = 0 174 | rv.alpha = 0 175 | if size == 0: 176 | rv.dist = 0 177 | return rv 178 | 179 | cdef double best_distance = 1 180 | cdef Py_ssize_t best_shift = 0 181 | 182 | cdef dist_struct_t* curr_dist 183 | cdef Py_ssize_t i 184 | for i in range(-size + 1, size): 185 | curr_dist = cshift_dist(array1, array2, i, rolling) 186 | if curr_dist.dist < best_distance: 187 | free(rv) 188 | rv = curr_dist 189 | rv.shift = i 190 | best_distance = rv.dist 191 | else: 192 | free(curr_dist) 193 | 194 | return rv 195 | 196 | cdef tuple cdist_all(double[:, ::1] matrix1, double[:, ::1] matrix2, int rolling): 197 | ''' 198 | Computes the distance between all pairs of rows in the given matrices. 199 | The elements of the first matrix are the ones which will be shifted. 200 | ''' 201 | 202 | cdef Py_ssize_t n_rows1 = matrix1.shape[0] 203 | cdef Py_ssize_t n_rows2 = matrix2.shape[0] 204 | cdef Py_ssize_t n_cols = matrix1.shape[1] 205 | 206 | cdef np.ndarray[double, ndim=2] rv_dist = np.ndarray((n_rows1, n_rows2)) 207 | cdef np.ndarray[int, ndim=2] rv_shifts = np.ndarray((n_rows1, n_rows2), 208 | dtype='i') 209 | 210 | cdef dist_struct_t*** aux = \ 211 | malloc(n_rows1 * sizeof(dist_struct_t**)) 212 | if aux == NULL: 213 | abort() 214 | 215 | cdef Py_ssize_t i 216 | cdef Py_ssize_t j 217 | for i in prange(n_rows1, nogil=True, schedule='static'): 218 | aux[i] = malloc(n_rows2 * sizeof(dist_struct_t*)) 219 | if aux[i] == NULL: 220 | abort() 221 | 222 | for j in range(n_rows2): 223 | aux[i][j] = cdist(matrix1[i], matrix2[j], rolling) 224 | rv_dist[i, j] = aux[i][j].dist 225 | rv_shifts[i, j] = aux[i][j].shift 226 | 227 | free(aux[i][j]) 228 | free(aux[i]) 229 | free(aux) 230 | 231 | return (rv_dist, rv_shifts) 232 | 233 | #Python wrappers 234 | def shift(np.ndarray[double, ndim=1, mode='c'] array not None, int amount, 235 | bool rolling=False): 236 | ''' 237 | Shifts the array by N positions. The shift can be rolling, where elements 238 | come back at the other side of the array, or dropping. This method returns a new array, 239 | it does not do inplace shifts. 240 | 241 | Arguments 242 | --------- 243 | array: np.ndarray[np.float_t, ndim=1] 244 | The array to shift 245 | amount: int 246 | The amount to shift by; positive integers signal right shifts while 247 | negative ones signal left shifts 248 | rolling: bool (default `False`) 249 | indicates whether we should use a rolling distance (i.e. elements at 250 | one end reappear at another) or a drop distance (i.e. elements fall 251 | and zeroes take their place, similar to a binary shift) 252 | ''' 253 | 254 | cdef Py_ssize_t size = array.shape[0] 255 | cdef double *shift_buff 256 | if rolling: 257 | shift_buff = cshift_roll(array, amount) 258 | else: 259 | shift_buff = cshift_drop(array, amount) 260 | 261 | cdef np.ndarray[double, ndim=1] rv = np.ndarray(size) 262 | free(rv.data) 263 | rv.data = shift_buff 264 | return rv 265 | 266 | def inner_prod(np.ndarray[double, ndim=1, mode='c'] array1 not None, 267 | np.ndarray[double, ndim=1, mode='c'] array2 not None): 268 | ''' 269 | Returns the inner product between two arrays. Both 270 | arrays must have the same shape. 
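For example, the inner product of `[1., 2, 3]` and `[2., 3, 4]` is `1*2 + 2*3 + 3*4 = 20`.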
271 | 272 | Arguments 273 | --------- 274 | array1: np.ndarray[np.float_t, ndim=1] 275 | First array 276 | array2: np.ndarray[np.float_t, ndim=1] 277 | Second array 278 | ''' 279 | 280 | assert array1.shape[0] == array2.shape[0] 281 | cdef Py_ssize_t size = array1.shape[0] 282 | return cinner_prod(&array1[0], &array2[0], size) 283 | 284 | def sqsum(np.ndarray[double, ndim=1, mode='c'] array not None): 285 | ''' 286 | Returns the sum of the squared elements in the given array. 287 | 288 | Arguments 289 | --------- 290 | array: np.ndarray[np.float_t, ndim=1] 291 | The array whose elements will be summed 292 | ''' 293 | 294 | return csqsum(&array[0], array.shape[0]) 295 | 296 | def shift_dist(np.ndarray[double, ndim=1, mode='c'] array1 not None, 297 | np.ndarray[double, ndim=1, mode='c'] array2 not None, 298 | int shift_amount, bool rolling=False): 299 | ''' 300 | Computes the distance between two time series. This is an implementation 301 | of the distance metric defined in Section 2.2 of [1]. This is the distance 302 | metric for a fixed shifting parameter, where the scaling can be easily 303 | computed. 304 | 305 | Arguments 306 | --------- 307 | array1: np.ndarray[np.float_t, ndim=1] 308 | First time series 309 | array2: np.ndarray[np.float_t, ndim=1] 310 | Second time series 311 | shift_amount: int 312 | the shifting parameter 313 | rolling: bool (default `False`) 314 | indicates whether we should use a rolling distance (i.e. elements at 315 | one end reappear at another) or a drop distance (i.e. elements fall 316 | and zeroes take their place, similar to a binary shift) 317 | 318 | References 319 | ---------- 320 | .. [1] J. Yang and J. Leskovec, 321 | "Patterns of Temporal Variation in Online Media" - WSDM'11 322 | http://dl.acm.org/citation.cfm?id=1935863 323 | ''' 324 | assert array1.shape[0] == array2.shape[0] 325 | 326 | cdef dist_struct_t* rv 327 | cdef double dist 328 | try: 329 | if rolling: 330 | rv = cshift_dist(array1, array2, shift_amount, 1) 331 | else: 332 | rv = cshift_dist(array1, array2, shift_amount, 0) 333 | 334 | dist = rv.dist 335 | return dist 336 | finally: 337 | free(rv) 338 | 339 | def dist(np.ndarray[double, ndim=1, mode='c'] array1 not None, 340 | np.ndarray[double, ndim=1, mode='c'] array2 not None, 341 | bool rolling=False): 342 | ''' 343 | Computes the distance between two time series. This is an implementation 344 | of the distance metric defined in Section 2.2 of [1]. It searches for optimal 345 | scaling and shifting parameters to align both series and compare similarity 346 | mostly based on *shape*. 347 | 348 | This is a symmetric measure *only* when using rolling shifts. 349 | 350 | Arguments 351 | --------- 352 | array1: np.ndarray[np.float_t, ndim=1, mode='c'] 353 | First time series 354 | array2: np.ndarray[np.float_t, ndim=1, mode='c'] 355 | Second time series 356 | rolling: bool (default `False`) 357 | indicates whether we should use a rolling distance (i.e. elements at 358 | one end reappear at another) or a drop distance (i.e. elements fall 359 | and zeroes take their place, similar to a binary shift) 360 | 361 | References 362 | ---------- 363 | .. [1] J. Yang and J. 
Leskovec, 364 | "Patterns of Temporal Variation in Online Media" - WSDM'11 365 | http://dl.acm.org/citation.cfm?id=1935863 366 | ''' 367 | assert array1.shape[0] == array2.shape[0] 368 | 369 | cdef dist_struct_t *rv 370 | cdef int roll = 0 371 | if rolling: 372 | roll = 1 373 | 374 | try: 375 | rv = cdist(array1, array2, roll) 376 | return rv.dist 377 | finally: 378 | free(rv) 379 | 380 | def dist_all(np.ndarray[double, ndim=2, mode='c'] matrix1 not None, 381 | np.ndarray[double, ndim=2, mode='c'] matrix2 not None, 382 | bool rolling=False): 383 | 384 | ''' 385 | Computes the distance between all of examples (rows) from the first 386 | matrix to all other examples in the second matrix. The return value 387 | is a matrix of n_rows1, n_rows2 containing the distances. 388 | 389 | The elements of the first matrix are the ones which will be shifted. 390 | 391 | Both matrices must have the same number of columns. 392 | 393 | Arguments 394 | --------- 395 | matrix1: np.ndarray[np.float_t, ndim=2, mode='c'] 396 | A matrix of time series 397 | matrix2: np.ndarray[np.float_t, ndim=2, mode='c'] 398 | A matrix of time series 399 | rolling: bool (default `False`) 400 | indicates whether we should use a rolling distance (i.e. elements at 401 | one end reappear at another) or a drop distance (i.e. elements fall 402 | and zeroes take their place, similar to a binary shift) 403 | ''' 404 | 405 | assert matrix1.shape[1] == matrix2.shape[1] 406 | cdef int roll = 0 407 | if rolling: 408 | roll = 1 409 | 410 | return cdist_all(matrix1, matrix2, roll) 411 | -------------------------------------------------------------------------------- /src/pyksc/ksc.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | ''' 3 | Implementation of the KSC and IncrementalKSC algorithms. See [1] for details. 4 | Both algorithms can be used for clustering time series data, the second 5 | (IncrementalKSC) being an optimization of the initial clusters heuristic to 6 | be used by the first. 7 | 8 | References 9 | ---------- 10 | .. [1] J. Yang and J. Leskovec, 11 | "Patterns of Temporal Variation in Online Media" - WSDM'11 12 | http://dl.acm.org/citation.cfm?id=1935863 13 | ''' 14 | from __future__ import division, print_function 15 | 16 | from pyksc.dhwt import transform 17 | from pyksc.dist import dist_all 18 | from pyksc.dist import shift 19 | 20 | from pyksc.metrics import cost 21 | 22 | import numpy as np 23 | import scipy.linalg as LA 24 | 25 | def _compute_centroids(tseries, assign, num_clusters, to_shift=None): 26 | ''' 27 | Given a time series matrix and cluster assignments, this method will 28 | compute the spectral centroids for each cluster. 29 | 30 | Arguments 31 | --------- 32 | tseries: matrix (n_series, n_points) 33 | Time series beng clustered 34 | assign: array of ints (size = n_series) 35 | The cluster assignment for each time series 36 | num_clusters: int 37 | The number of clusters being searched for 38 | to_shift (optional): array of ints (size = n_series) 39 | Determines if time series should be shifted, if different from `None`. 40 | In this case, each series will be shifted by the corresponding amount 41 | in the array. 
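This method returns a matrix of shape (num_clusters, series_size) with one spectral centroid per row; clusters with no members receive an all-zeros centroid.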
42 | ''' 43 | 44 | series_size = tseries.shape[1] 45 | centroids = np.ndarray((num_clusters, series_size)) 46 | 47 | #shift series for best centroid distance 48 | #TODO: this method can be cythonized and done in parallel 49 | shifted = tseries 50 | if to_shift is not None: 51 | for i in xrange(tseries.shape[0]): 52 | shifted[i] = shift(tseries[i], to_shift[i], rolling=True) 53 | 54 | #compute centroids 55 | for k in xrange(num_clusters): 56 | members = shifted[assign == k] 57 | if members.any(): 58 | num_members = 0 59 | if members.ndim == 2: 60 | axis = 1 61 | num_members = members.shape[0] 62 | else: 63 | axis = 0 64 | num_members = 1 65 | 66 | ssqs = np.tile(np.sum(members**2, axis=axis), (series_size, 1)) 67 | #the original paper divides by ssqs only, while the author's 68 | #example implementation uses sqrt. We chose sqrt because it appears 69 | #to yield better centroids. 70 | aux = members / np.sqrt(ssqs.T) 71 | 72 | x_mat = np.dot(aux.T, aux) 73 | i_mat = num_members * np.eye(series_size) 74 | m_mat = i_mat - x_mat 75 | 76 | #compute eigenvalues and choose the vector for the smallest one 77 | #TODO: Check if using scipy's linalg is faster (has more options 78 | # such as finding only the smallest eigval) 79 | _, eig_vectors = LA.eigh(m_mat, eigvals=(0, 0)) 80 | centroids[k] = eig_vectors[:,0] 81 | 82 | if centroids[k].sum() < 0: 83 | centroids[k] = -centroids[k] 84 | else: 85 | centroids[k] = np.zeros(series_size) 86 | 87 | return centroids 88 | 89 | def _base_ksc(tseries, initial_centroids, n_iters=-1): 90 | ''' 91 | This is the base of the KSC algorithm. It follows the same idea as the K-Means 92 | algorithm. Firstly, we assign time series to a new cluster based on the 93 | distance to the centroids. For each time series, the best 94 | shift to minimize the distance to the closest centroid is computed. 95 | 96 | The assignment step is followed by an update step where new centroids are 97 | computed based on the new clustering. 98 | 99 | Both steps above are repeated `n_iters` times. If this parameter is negative 100 | then the steps are repeated until convergence, that is, until no time series 101 | changes cluster between consecutive steps. 102 | 103 | Arguments 104 | --------- 105 | tseries: a matrix of shape (number of time series, size of each series) 106 | The time series to cluster 107 | initial_centroids: a matrix of shape (num. of clusters, size of time series) 108 | The initial centroid estimates 109 | n_iters: int 110 | The number of iterations which the algorithm will run 111 | 112 | Returns 113 | ------- 114 | centroids: a matrix of shape (num. of clusters, size of time series) 115 | The final centroids found by the algorithm 116 | assign: an array of num. series size 117 | The cluster id which each time series belongs to 118 | best_shift: an array of num. series size 119 | The shift amount applied to each time series 120 | cent_dists: a matrix of shape (num. centroids, num. series) 121 | The distance of each centroid to each time series 122 | 123 | References 124 | ---------- 125 | .. [1] J. Yang and J. Leskovec, 126 | "Patterns of Temporal Variation in Online Media" - WSDM'11 127 | http://dl.acm.org/citation.cfm?id=1935863 128 | .. 
[2] Wikipedia, 129 | "K-means clustering" 130 | http://en.wikipedia.org/wiki/K-means_clustering 131 | 132 | 133 | 134 | 135 | ''' 136 | 137 | num_clusters = initial_centroids.shape[0] 138 | num_series = tseries.shape[0] 139 | 140 | centroids = initial_centroids 141 | 142 | #KSC algorithm 143 | cent_dists = None 144 | assign = None 145 | prev_assign = None 146 | best_shift = None 147 | 148 | iters = n_iters 149 | converged = False 150 | 151 | while iters != 0 and not converged: 152 | #assign elements to new clusters 153 | cent_dists, shifts = dist_all(centroids, tseries, rolling=True) 154 | 155 | assign = cent_dists.argmin(axis=0) 156 | best_shift = np.ndarray(num_series, dtype='i') 157 | for i in xrange(shifts.shape[1]): 158 | best_shift[i] = shifts[assign[i], i] 159 | 160 | #check if converged, if not compute new centroids 161 | if prev_assign is not None and not (prev_assign - assign).any(): 162 | converged = True 163 | else: 164 | centroids = _compute_centroids(tseries, assign, num_clusters, 165 | best_shift) 166 | 167 | prev_assign = assign 168 | iters -= 1 169 | 170 | return centroids, assign, best_shift, cent_dists 171 | 172 | def ksc(tseries, num_clusters, n_iters=-1, n_runs=10): 173 | ''' 174 | This method will make `n_runs` calls to `_base_ksc`, returning the results 175 | from the run with the lowest overall clustering cost. In each run, 176 | a random initialization of centroids is performed. This is done by assigning 177 | time series to clusters in a uniform random manner and then computing the 178 | centroid of each cluster. 179 | 180 | Please refer to the documentation of `_base_ksc` for a detailed summary 181 | of the KSC algorithm. 182 | 183 | Arguments 184 | --------- 185 | tseries: a matrix of shape (number of time series, size of each series) 186 | The time series to cluster 187 | n_iters: int 188 | The number of iterations which the algorithm will run 189 | n_runs: int 190 | The number of times to run the KSC algorithm 191 | 192 | Returns 193 | ------- 194 | centroids: a matrix of shape (num. of clusters, size of time series) 195 | The final centroids found by the algorithm 196 | assign: an array of num. series size 197 | The cluster id which each time series belongs to 198 | best_shift: an array of num. series size 199 | The shift amount applied to each time series 200 | cent_dists: a matrix of shape (num. centroids, num. series) 201 | The distance of each centroid to each time series 202 | 203 | References 204 | ---------- 205 | .. [1] J. Yang and J. 
Leskovec, 206 | "Patterns of Temporal Variation in Online Media" - WSDM'11 207 | http://dl.acm.org/citation.cfm?id=1935863 208 | ''' 209 | 210 | min_cost = float('+inf') 211 | 212 | best_cents = None 213 | best_assign = None 214 | best_shift = None 215 | best_dist = None 216 | 217 | for _ in xrange(n_runs): 218 | assign = np.random.randint(0, num_clusters, tseries.shape[0]) 219 | cents = _compute_centroids(tseries, assign, num_clusters) 220 | 221 | cents, assign, series_shift, dists = _base_ksc(tseries, cents, n_iters) 222 | clust_cost = cost(tseries, assign, cents, dists) 223 | 224 | if clust_cost < min_cost: 225 | min_cost = clust_cost 226 | best_cents = cents 227 | best_assign = assign 228 | best_shift = series_shift 229 | best_dist = dists 230 | 231 | return best_cents, best_assign, best_shift, best_dist 232 | 233 | def inc_ksc(tseries, num_clusters, n_iters=-1, num_wavelets=2): 234 | ''' 235 | Given the number `num_wavelets`, this method will compute subsequent 236 | Discrete Haar Wavelet Transforms of the time series to be clustered. At 237 | each transform the number of points of the time series is decreased, thus 238 | we say that we are viewing the time series at a higher resolution. 239 | 240 | Clustering will begin at the highest resolution (last transform), and the 241 | results from the previous resolution are used to initialize the current one. 242 | Only the highest resolution is initialized randomly. This technique can 243 | improve the run-time of the KSC algorithm, since it is faster to cluster 244 | at higher resolutions (fewer data points), and for subsequent resolutions 245 | the centroids from the previous resolution are already a close approximation of 246 | the actual centroids. See [1] for details. 247 | 248 | Please refer to the documentation of `_base_ksc` for a detailed summary 249 | of the KSC algorithm. 250 | 251 | Arguments 252 | --------- 253 | tseries: a matrix of shape (number of time series, size of each series) 254 | The time series to cluster 255 | n_iters: int 256 | The number of iterations which the algorithm will run 257 | num_wavelets: int 258 | The number of wavelets to use 259 | 260 | Returns 261 | ------- 262 | centroids: a matrix of shape (num. of clusters, size of time series) 263 | The final centroids found by the algorithm 264 | assign: an array of num. series size 265 | The cluster id which each time series belongs to 266 | best_shift: an array of num. series size 267 | The shift amount applied to each time series 268 | cent_dists: a matrix of shape (num. centroids, num. series) 269 | The distance of each centroid to each time series 270 | 271 | References 272 | ---------- 273 | .. [1] J. Yang and J. 
Leskovec, 274 | "Patterns of Temporal Variation in Online Media" - WSDM'11 275 | http://dl.acm.org/citation.cfm?id=1935863 276 | ''' 277 | 278 | dhw_series = [] 279 | dhw_series.append(tseries) 280 | previous = tseries 281 | for _ in xrange(num_wavelets): 282 | new_series = [] 283 | for j in xrange(tseries.shape[0]): 284 | wave = transform(previous[j])[0] 285 | new_series.append(wave) 286 | 287 | previous = np.array(new_series) 288 | dhw_series.append(previous) 289 | 290 | assign = np.random.randint(0, num_clusters, tseries.shape[0]) 291 | cents = None 292 | series_shift = None 293 | for dhw in reversed(dhw_series): 294 | cents = _compute_centroids(dhw, assign, num_clusters, series_shift) 295 | cents, assign, series_shift, dists = _base_ksc(dhw, cents, n_iters) 296 | 297 | return cents, assign, series_shift, dists 298 | -------------------------------------------------------------------------------- /src/pyksc/metrics.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from pyksc.dist import dist_all 6 | 7 | import numpy as np 8 | 9 | def cost(tseries, assign, centroids, dist_centroids=None): 10 | 11 | num_series = tseries.shape[0] 12 | if dist_centroids is None: 13 | dist_centroids = dist_all(centroids, tseries) 14 | 15 | cost_f = 0.0 16 | for i in xrange(num_series): 17 | k = assign[i] 18 | cost_f += dist_centroids[k, i] ** 2 19 | 20 | return cost_f / num_series 21 | 22 | def avg_intra_dist(tseries, assign, dists_all_pairs=None): 23 | 24 | num_series = tseries.shape[0] 25 | 26 | if dists_all_pairs is None: 27 | dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0] 28 | 29 | dists = [] 30 | for i in xrange(num_series): 31 | k = assign[i] 32 | members = assign == k 33 | dists_i = dists_all_pairs[i] 34 | dists.extend(dists_i[members]) 35 | 36 | return np.mean(dists), np.std(dists) 37 | 38 | def avg_inter_dist(tseries, assign, dists_all_pairs=None): 39 | 40 | num_series = tseries.shape[0] 41 | 42 | if dists_all_pairs is None: 43 | dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0] 44 | 45 | dists = [] 46 | for i in xrange(num_series): 47 | k = assign[i] 48 | non_members = assign != k 49 | dists_i = dists_all_pairs[i] 50 | dists.extend(dists_i[non_members]) 51 | 52 | return np.mean(dists), np.std(dists) 53 | 54 | def beta_cv(tseries, assign, dists_all_pairs=None): 55 | 56 | intra_mean, intra_std = avg_intra_dist(tseries, assign, dists_all_pairs) 57 | inter_mean, inter_std = avg_inter_dist(tseries, assign, dists_all_pairs) 58 | 59 | return (inter_std / inter_mean) / (intra_std / intra_mean) 60 | 61 | def silhouette(tseries, assign, dists_all_pairs=None): 62 | 63 | if dists_all_pairs is None: 64 | dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0] 65 | 66 | num_series = tseries.shape[0] 67 | sils = np.zeros(num_series, dtype='f') 68 | labels = set(assign) 69 | for i in xrange(num_series): 70 | 71 | k = assign[i] 72 | dists_i = dists_all_pairs[i] 73 | intra = np.mean(dists_i[assign == k]) 74 | 75 | min_inter = float('inf') 76 | for o in labels: 77 | if o != k: 78 | inter = np.mean(dists_i[assign == o]) 79 | if inter < min_inter: 80 | min_inter = inter 81 | 82 | sils[i] = (min_inter - intra) / max(intra, min_inter) 83 | 84 | return np.mean(sils) 85 | -------------------------------------------------------------------------------- /src/pyksc/regression.py: -------------------------------------------------------------------------------- 1 | #-*- 
coding: utf8 2 | ''' 3 | Implementation of some Machine Learning regression models. Basically, we 4 | implement simple wrappers around the scikit-learn library which perform 5 | the transformations and fit the specific models we need. 6 | ''' 7 | from __future__ import division, print_function 8 | 9 | from sklearn.base import clone 10 | from sklearn.base import BaseEstimator 11 | from sklearn.base import RegressorMixin 12 | from sklearn.externals.joblib.parallel import Parallel, delayed 13 | from sklearn.linear_model.base import LinearRegression 14 | from sklearn.utils.validation import safe_asarray 15 | 16 | import numpy as np 17 | 18 | def mean_absolute_error(y_true, y_pred): 19 | """ 20 | Mean absolute error regression loss 21 | 22 | Positive floating point value: the best value is 0.0. 23 | 24 | Parameters 25 | ---------- 26 | y_true : array-like 27 | 28 | y_pred : array-like 29 | 30 | Returns 31 | ------- 32 | mae : float 33 | """ 34 | 35 | y_true = np.asarray(y_true) 36 | y_pred = np.asarray(y_pred) 37 | 38 | return np.mean(np.abs(y_true - y_pred)) 39 | 40 | def mean_relative_square_error(y_true, y_pred): 41 | """ 42 | Mean relative square error regression loss 43 | 44 | Positive floating point value: the best value is 0.0. 45 | 46 | Parameters 47 | ---------- 48 | y_true : array-like 49 | 50 | y_pred : array-like 51 | 52 | Returns 53 | ------- 54 | mrse : float 55 | """ 56 | y_true = np.asarray(y_true) 57 | y_pred = np.asarray(y_pred) 58 | return np.mean(((y_pred / y_true) - 1) ** 2) 59 | 60 | class RSELinearRegression(LinearRegression): 61 | ''' 62 | Implements an ordinary least squares (OLS) linear regression in which 63 | the objective function is the relative squared error (RSE) and not the 64 | absolute error. 65 | 66 | This class will use the same parameters and arguments as: 67 | sklearn.linear_model.LinearRegression. Unlike the linear 68 | regression, we set `fit_intercept` to False by default. 69 | 70 | Parameters 71 | ---------- 72 | fit_intercept : boolean, optional 73 | whether to calculate the intercept for this model. If set 74 | to false, no intercept will be used in calculations 75 | (e.g. data is expected to be already centered). 76 | normalize : boolean, optional 77 | If True, the regressors X are normalized 78 | 79 | See 80 | --- 81 | sklearn.linear_model.LinearRegression 82 | ''' 83 | 84 | def __init__(self, fit_intercept=False, normalize=False, copy_X=True): 85 | super(RSELinearRegression, self).__init__(fit_intercept, normalize, 86 | copy_X) 87 | 88 | def fit(self, X, y): 89 | X = safe_asarray(X) 90 | y = np.asarray(y) 91 | 92 | X = (X.T / y).T 93 | return super(RSELinearRegression, self).fit(X, y / y) 94 | 95 | def _fit_helper(class_, X, y, learner): 96 | return class_, clone(learner).fit(X, y) 97 | 98 | def _predict_helper(examples, X, learner): 99 | return examples, learner.predict(X) 100 | 101 | class MultiClassRegression(BaseEstimator, RegressorMixin): 102 | ''' 103 | This class implements what we call a multi-class regression. In simple 104 | terms, for a dataset with class labels one specialized regression model 105 | is learned for each label. Also, a classification model is learned for the 106 | whole dataset. Thus, when predicting, the classification model is first used 107 | to infer classes and then the specialized regression model for each 108 | class is used. 109 | 110 | Parameters 111 | ---------- 112 | clf : an instance of `sklearn.base.ClassifierMixin` 113 | this is the classifier to be used. 
Pass a grid search object when 114 | searching for best parameters is needed 115 | regr : an instance of `sklearn.base.RegressorMixin` 116 | the regression model to be used; it is cloned and fitted once per class. Pass a grid 117 | search object when searching for best parameters is needed 118 | ''' 119 | 120 | def __init__(self, clf, regr, n_jobs=1, verbose=0, pre_dispatch='2*n_jobs'): 121 | super(MultiClassRegression, self).__init__() 122 | 123 | self.clf = clf 124 | self.regr = regr 125 | self.n_jobs = n_jobs 126 | self.verbose = verbose 127 | self.pre_dispatch = pre_dispatch 128 | 129 | self.clf_model = None 130 | self.regression_models = None 131 | 132 | def fit(self, X, y_clf, y_regression): 133 | """ 134 | Fit the multiclass model. 135 | 136 | Parameters 137 | ---------- 138 | X : numpy array of shape [n_samples,n_features] 139 | Training data 140 | y_clf : numpy array of shape [n_samples] 141 | Target classes for classification model 142 | y_regression: numpy array of shape [n_samples] 143 | Target values for regression model 144 | 145 | Returns 146 | ------- 147 | self : returns an instance of self. 148 | """ 149 | 150 | X = safe_asarray(X) 151 | y_clf = np.asarray(y_clf) 152 | y_regression = np.asarray(y_regression) 153 | 154 | self.clf_model = self.clf.fit(X, y_clf) 155 | 156 | classes = set(y_clf) 157 | regr = self.regr 158 | 159 | def _generator(): 160 | for class_ in classes: 161 | examples = y_clf == class_ 162 | yield class_, X[examples], y_regression[examples], regr 163 | 164 | out = Parallel(self.n_jobs, self.verbose, self.pre_dispatch)(\ 165 | delayed(_fit_helper)(*params) for params in _generator()) 166 | 167 | self.regression_models = {} 168 | for class_, regr_model in out: 169 | self.regression_models[class_] = regr_model 170 | 171 | return self 172 | 173 | def predict(self, X, return_class_prediction=False): 174 | """ 175 | Predict using the multiclass regression model 176 | 177 | Parameters 178 | ---------- 179 | X : numpy array of shape [n_samples, n_features] 180 | 181 | Returns 182 | ------- 183 | C : array, shape = [n_samples] 184 | Returns predicted values. 
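When `return_class_prediction` is True, a tuple with the predicted classes and the predicted regression values is returned instead.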
185 | """ 186 | 187 | X = safe_asarray(X) 188 | y_clf_predicted = np.asarray(self.clf_model.predict(X)) 189 | classes = set(y_clf_predicted) 190 | 191 | def _generator(): 192 | for class_ in classes: 193 | examples = y_clf_predicted == class_ 194 | yield examples, X[examples], self.regression_models[class_] 195 | 196 | out = Parallel(self.n_jobs, self.verbose, self.pre_dispatch)(\ 197 | delayed(_predict_helper)(*params) for params in _generator()) 198 | 199 | y_regr_predicted = None 200 | for examples, predicted in out: 201 | if y_regr_predicted is None: 202 | y_regr_predicted = np.zeros(X.shape[0], predicted.dtype) 203 | y_regr_predicted[examples] = predicted 204 | 205 | 206 | if return_class_prediction: 207 | return y_clf_predicted, y_regr_predicted 208 | else: 209 | return y_regr_predicted 210 | -------------------------------------------------------------------------------- /src/pyksc/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flaviovdf/pyksc/6ba8988c7fad63366dc2b8d005d0779971e129c5/src/pyksc/test/__init__.py -------------------------------------------------------------------------------- /src/pyksc/test/test_dhwt.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | '''Unit tests for the dhwt module''' 3 | 4 | from __future__ import division, print_function 5 | 6 | from math import sqrt 7 | from numpy.testing import * 8 | from pyksc import dhwt 9 | 10 | import unittest 11 | 12 | import numpy as np 13 | 14 | class TestWavelets(unittest.TestCase): 15 | 16 | def test_all(self): 17 | x = np.array([]) 18 | assert_array_equal(np.array([]), dhwt.transform(x)[0]) 19 | assert_array_equal(np.array([]), dhwt.transform(x)[1]) 20 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 21 | 22 | x = np.array([1., 1]) 23 | assert_array_equal(np.array([1.]), dhwt.transform(x)[0]) 24 | assert_array_equal(np.array([0.]), dhwt.transform(x)[1]) 25 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 26 | 27 | x = np.array([1., 2, 3, 0]) 28 | assert_array_equal(np.array([1.5, 1.5]), dhwt.transform(x)[0]) 29 | assert_array_equal(np.array([-.5, 1.5]), dhwt.transform(x)[1]) 30 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 31 | 32 | x = np.array([1., 2, 3, 0, 7]) 33 | assert_array_equal(np.array([1.5, 1.5, 3.5]), dhwt.transform(x)[0]) 34 | assert_array_equal(np.array([-.5, 1.5, 3.5]), dhwt.transform(x)[1]) 35 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 36 | 37 | x = np.array([6., 12, 15, 15, 14, 12, 120, 116]) 38 | assert_array_equal(np.array([9., 15, 13, 118]), dhwt.transform(x)[0]) 39 | assert_array_equal(np.array([-3, 0, 1, 2]), dhwt.transform(x)[1]) 40 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 41 | 42 | x = np.array([6., 12, 15, 15, 14, 12, 120, 116, 2]) 43 | assert_array_equal(np.array([9., 15, 13, 118, 1]), dhwt.transform(x)[0]) 44 | assert_array_equal(np.array([-3, 0, 1, 2, 1]), dhwt.transform(x)[1]) 45 | assert_array_equal(x, dhwt.inverse(*dhwt.transform(x))) 46 | 47 | if __name__ == "__main__": 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /src/pyksc/test/test_dist.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | '''Unit tests for the dist module''' 3 | 4 | from __future__ import division, print_function 5 | 6 | from math import sqrt 7 | from numpy.testing import * 8 | from pyksc import 
dist 9 | 10 | import unittest 11 | 12 | import numpy as np 13 | 14 | class TestDist(unittest.TestCase): 15 | 16 | def test_shift_roll(self): 17 | array = np.array([]) 18 | assert_array_equal(np.array([]), dist.shift(array, 0)) 19 | assert_array_equal(np.array([]), dist.shift(array, -1)) 20 | assert_array_equal(np.array([]), dist.shift(array, 1)) 21 | assert_array_equal(np.array([]), dist.shift(array, 10)) 22 | assert_array_equal(np.array([]), dist.shift(array, -10)) 23 | 24 | array = np.array([1.0]) 25 | assert_array_equal(np.array([1.0]), dist.shift(array, 0, True)) 26 | assert_array_equal(np.array([1.0]), dist.shift(array, 1, True)) 27 | assert_array_equal(np.array([1.0]), dist.shift(array, 1, True)) 28 | assert_array_equal(np.array([1.0]), dist.shift(array, -2, True)) 29 | assert_array_equal(np.array([1.0]), dist.shift(array, -2, True)) 30 | 31 | array = np.array([1.0, 2.0, 3.0, 4.0]) 32 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 33 | dist.shift(array, 0, True)) 34 | 35 | assert_array_equal(np.array([4.0, 1.0, 2.0, 3.0]), 36 | dist.shift(array, 1, True)) 37 | assert_array_equal(np.array([2.0, 3.0, 4.0, 1.]), 38 | dist.shift(array, -1, True)) 39 | 40 | assert_array_equal(np.array([3.0, 4.0, 1.0, 2.0]), 41 | dist.shift(array, 2, True)) 42 | assert_array_equal(np.array([3.0, 4.0, 1.0, 2.0]), 43 | dist.shift(array, -2, True)) 44 | 45 | assert_array_equal(np.array([2.0, 3.0, 4.0, 1.0]), 46 | dist.shift(array, 3, True)) 47 | assert_array_equal(np.array([4.0, 1.0, 2.0, 3.0]), 48 | dist.shift(array, -3, True)) 49 | 50 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 51 | dist.shift(array, 4, True)) 52 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 53 | dist.shift(array, -4, True)) 54 | 55 | assert_array_equal(np.array([4.0, 1.0, 2.0, 3.0]), 56 | dist.shift(array, 5, True)) 57 | assert_array_equal(np.array([2.0, 3.0, 4.0, 1.]), 58 | dist.shift(array, -5, True)) 59 | 60 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 61 | dist.shift(array, 8, True)) 62 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 63 | dist.shift(array, -8, True)) 64 | 65 | def test_shift_drop(self): 66 | array = np.array([1.0]) 67 | assert_array_equal(np.array([1.0]), dist.shift(array, 0, False)) 68 | assert_array_equal(np.array([0.0]), dist.shift(array, 1, False)) 69 | assert_array_equal(np.array([0.0]), dist.shift(array, 1, False)) 70 | assert_array_equal(np.array([0.0]), dist.shift(array, -2, False)) 71 | assert_array_equal(np.array([0.0]), dist.shift(array, -2, False)) 72 | 73 | array = np.array([1.0, 2.0, 3.0, 4.0]) 74 | assert_array_equal(np.array([1.0, 2.0, 3.0, 4.0]), 75 | dist.shift(array, 0, False)) 76 | 77 | assert_array_equal(np.array([0.0, 1.0, 2.0, 3.0]), 78 | dist.shift(array, 1, False)) 79 | assert_array_equal(np.array([2.0, 3.0, 4.0, 0.0]), 80 | dist.shift(array, -1, False)) 81 | 82 | assert_array_equal(np.array([0.0, 0.0, 1.0, 2.0]), 83 | dist.shift(array, 2, False)) 84 | assert_array_equal(np.array([3.0, 4.0, 0.0, 0.0]), 85 | dist.shift(array, -2, False)) 86 | 87 | assert_array_equal(np.array([0.0, 0.0, 0.0, 1.0]), 88 | dist.shift(array, 3, False)) 89 | assert_array_equal(np.array([4.0, 0.0, 0.0, 0.0]), 90 | dist.shift(array, -3, False)) 91 | 92 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 93 | dist.shift(array, 4, False)) 94 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 95 | dist.shift(array, -4, False)) 96 | 97 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 98 | dist.shift(array, 5, False)) 99 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 100 | 
dist.shift(array, -5, False)) 101 | 102 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 103 | dist.shift(array, 50, False)) 104 | assert_array_equal(np.array([0.0, 0.0, 0.0, 0.0]), 105 | dist.shift(array, -50, False)) 106 | 107 | #def test_shift_all(self): 108 | # m = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) 109 | # s = np.array([1, 2]) 110 | # 111 | # expected = np.array([[3.0, 1.0, 2.0], [5.0, 6.0, 4.0]]) 112 | # assert_array_almost_equal(expected, dist.shift_all(m, s, True)[0]) 113 | 114 | def test_inner_prod(self): 115 | array1 = np.array([]) 116 | array2 = np.array([]) 117 | self.assertEqual(0, dist.inner_prod(array1, array2)) 118 | 119 | array1 = np.array([1.0, 2.0, 3.0]) 120 | array2 = np.array([2.0, 3.0, 4.0]) 121 | self.assertEqual(sum(array1 * array2), dist.inner_prod(array1, array2)) 122 | 123 | self.assertEqual(sum(array1 ** 2), dist.inner_prod(array1, array1)) 124 | 125 | def test_sqsum(self): 126 | array = np.array([1.0, 2.0, 3.0]) 127 | self.assertAlmostEqual(sum(array ** 2), dist.sqsum(array), 4) 128 | 129 | array = np.array([2.0]) 130 | self.assertEqual(4, dist.sqsum(array)) 131 | 132 | array = np.array([]) 133 | self.assertEqual(0, dist.sqsum(array)) 134 | 135 | def test_shift_dist(self): 136 | array1 = np.array([]) 137 | array2 = np.array([]) 138 | self.assertEqual(0, dist.shift_dist(array1, array2, 0)) 139 | 140 | array1 = np.array([0., 0.]) 141 | array2 = np.array([0., 0.]) 142 | self.assertEqual(0, dist.shift_dist(array1, array2, 0)) 143 | 144 | array1 = np.array([1., 2.]) 145 | array2 = np.array([0., 0.]) 146 | self.assertEqual(1, dist.shift_dist(array1, array2, 0)) 147 | 148 | array1 = np.array([0., 0.]) 149 | array2 = np.array([1., 2.]) 150 | self.assertEqual(1, dist.shift_dist(array1, array2, 0)) 151 | 152 | array1 = np.array([2.0, 3.0, 4.0]) 153 | array2 = np.array([3.0, 4.0, 0.0]) 154 | 155 | self.assertAlmostEqual(0, dist.shift_dist(array1, array1, 0)) 156 | self.assertAlmostEqual(0, dist.shift_dist(array2, array2, 0)) 157 | 158 | expected = 2 / sqrt(29) 159 | self.assertAlmostEqual(expected, dist.shift_dist(array1, array2, 1, False)) 160 | 161 | expected = 2 / sqrt(29) 162 | self.assertAlmostEqual(expected, dist.shift_dist(array1, array2, 1, True)) 163 | 164 | def test_dist(self): 165 | array1 = np.array([]) 166 | array2 = np.array([]) 167 | self.assertEqual(0, dist.dist(array1, array2)) 168 | 169 | array1 = np.array([0., 0.]) 170 | array2 = np.array([0., 0.]) 171 | self.assertEqual(0, dist.dist(array1, array2)) 172 | 173 | array1 = np.array([1., 2.]) 174 | array2 = np.array([0., 0.]) 175 | self.assertEqual(1, dist.dist(array1, array2)) 176 | 177 | array1 = np.array([0., 0.]) 178 | array2 = np.array([1., 2.]) 179 | self.assertEqual(1, dist.dist(array1, array2)) 180 | 181 | array1 = np.array([2.0, 3.0, 4.0]) 182 | array2 = np.array([3.0, 4.0, 0.0]) 183 | 184 | self.assertAlmostEqual(0, dist.dist(array1, array1)) 185 | self.assertAlmostEqual(0, dist.dist(array2, array2)) 186 | 187 | expected = 2 / sqrt(29) 188 | self.assertAlmostEqual(expected, dist.dist(array1, array2, True)) 189 | self.assertAlmostEqual(expected, dist.dist(array2, array1, True)) 190 | 191 | def test_dist_all(self): 192 | m1 = np.array([[0.0], [0.0]]) 193 | m2 = np.array([[0.0], [0.0]]) 194 | 195 | expected = np.array([[0.0, 0.0], [0.0, 0.0]]) 196 | assert_array_equal(expected, dist.dist_all(m1, m2)[0]) 197 | assert_array_equal(expected, dist.dist_all(m1, m2)[1]) 198 | 199 | m1 = np.array([[1.0], [1.0]]) 200 | m2 = np.array([[0.0], [0.0]]) 201 | expected = np.array([[1.0, 1.0], [1.0, 
1.0]]) 202 | assert_array_equal(expected, dist.dist_all(m1, m2)[0]) 203 | 204 | m1 = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]]) 205 | m2 = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]]) 206 | expected = np.array([[0.0, 2/sqrt(29)], [2/sqrt(29), 0.0]]) 207 | assert_array_almost_equal(expected, dist.dist_all(m1, m2, True)[0]) 208 | 209 | if __name__ == "__main__": 210 | unittest.main() 211 | -------------------------------------------------------------------------------- /src/pyksc/test/test_ksc.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | '''Unit tests for the ksc module''' 3 | 4 | from __future__ import division, print_function 5 | 6 | from pyksc import dist 7 | from pyksc import ksc 8 | 9 | import unittest 10 | 11 | import numpy as np 12 | 13 | class TestKSC(unittest.TestCase): 14 | 15 | def ksc_runner(self, method): 16 | k = 2 17 | #One cluster with uniform series, another with a peak. 18 | X = np.array([[1.0,1,1], 19 | [1.1,1,1], 20 | [1.2,1,1], 21 | [1.3,1,1], 22 | [1.3,1,1], 23 | [1.3,1,1], 24 | [1.3,1,1], 25 | [1.3,1,1], 26 | [90,2000,90], 27 | [90,2001,90], 28 | [90,2002,90], 29 | [90,2003,90]]) 30 | 31 | cents, assign, shift, distc = method(X, k) 32 | del shift 33 | 34 | self.assertEqual(len(set(assign)), k) 35 | self.assertEqual(sum(assign == assign[0]), 8) 36 | self.assertEqual(sum(assign == assign[-1]), 4) 37 | 38 | self.assertEqual(len(set(assign[:8])), 1) 39 | self.assertEqual(len(set(assign[8:])), 1) 40 | self.assertFalse(set(assign[:8]) == set(assign[8:])) 41 | 42 | cluster_one = assign[0] 43 | cluster_two = assign[-1] 44 | 45 | self.assertTrue(dist.dist(X[0], cents[cluster_one]) < \ 46 | dist.dist(X[0], cents[cluster_two])) 47 | self.assertTrue(dist.dist(cents[cluster_one], cents[cluster_two]) > 0) 48 | 49 | for i in xrange(X.shape[0]): 50 | self.assertAlmostEqual(dist.dist(X[i], cents[0], True), 51 | distc.T[i, 0], 5) 52 | self.assertAlmostEqual(dist.dist(X[i], cents[1], True), 53 | distc.T[i, 1], 5) 54 | 55 | def test_clustering(self): 56 | self.ksc_runner(ksc.ksc) 57 | 58 | def test_incremental_cluster(self): 59 | self.ksc_runner(ksc.inc_ksc) 60 | 61 | if __name__ == "__main__": 62 | unittest.main() -------------------------------------------------------------------------------- /src/pyksc/test/test_regression.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | '''Unit tests for the regression module''' 3 | 4 | from __future__ import division, print_function 5 | 6 | from numpy.testing import * 7 | from pyksc import regression 8 | 9 | from sklearn import linear_model 10 | from sklearn.grid_search import GridSearchCV 11 | 12 | import numpy as np 13 | import unittest 14 | 15 | 16 | class TestRSELinearRegression(unittest.TestCase): 17 | 18 | def test_rse(self): 19 | assert_equal(regression.mean_relative_square_error([1, 1, 1], 20 | [0, 0, 0]), 1) 21 | 22 | assert_almost_equal(regression.mean_relative_square_error([10, 10, 10],\ 23 | [1, 2, 3]), 0.6466, 3) 24 | 25 | assert_equal(regression.mean_relative_square_error([1, 0.5, 0.8], 26 | [1, 0.5, 0.8]), 0) 27 | 28 | def test_rse_fit_one_attr(self): 29 | 30 | X = [[1], 31 | [4]] 32 | 33 | X_conv = [[1], 34 | [2]] 35 | y = [1, 2] 36 | 37 | rse_lsq = regression.RSELinearRegression(fit_intercept=False) 38 | lsq = linear_model.LinearRegression(fit_intercept=False) 39 | 40 | model_rse = rse_lsq.fit(X, y) 41 | model_lsq = lsq.fit(X_conv, np.ones(len(y))) 42 | 43 | assert_array_equal(model_lsq.coef_,
model_rse.coef_) 44 | assert_equal(model_lsq.intercept_, model_rse.intercept_) 45 | 46 | assert_array_almost_equal(model_rse.predict([[1], [4]]), 47 | model_lsq.predict([[1], [4]])) 48 | 49 | def test_rse_fit(self): 50 | 51 | X = [[1.0, 2], 52 | [4, 8]] 53 | 54 | X_conv = [[1.0, 2], 55 | [2, 4]] 56 | y = [1, 2] 57 | 58 | rse_lsq = regression.RSELinearRegression(fit_intercept=False) 59 | lsq = linear_model.LinearRegression(fit_intercept=False) 60 | 61 | model_rse = rse_lsq.fit(X, y) 62 | model_lsq = lsq.fit(X_conv, np.ones(len(y))) 63 | 64 | assert_array_equal(model_lsq.coef_, model_rse.coef_) 65 | assert_equal(model_lsq.intercept_, model_rse.intercept_) 66 | 67 | assert_array_almost_equal(model_rse.predict([[1, 2], [1, 2]]), 68 | model_lsq.predict([[1, 2], [1, 2]])) 69 | 70 | class TestMultiClassRegression(unittest.TestCase): 71 | 72 | def test_multiclass(self): 73 | 74 | X = [[1, 2], 75 | [1, 2], 76 | [4, 8], 77 | [4, 8], 78 | [200, 200], 79 | [199, 200.1], 80 | [200.2, 198]] 81 | 82 | y_clf = [0, 0, 0, 0, 1, 1, 1] 83 | y_regression = [1, 1, 2, 2, 100, 100, 100] 84 | 85 | regr_class = regression.RSELinearRegression(fit_intercept=False) 86 | clf_class = linear_model.LogisticRegression() 87 | 88 | multi_class = regression.MultiClassRegression(clf_class, regr_class) 89 | 90 | model = multi_class.fit(X, y_clf, y_regression) 91 | p = model.predict([[1, 2], 92 | [200, 200], 93 | [1, 2], 94 | [200, 200]]) 95 | assert_equal(p[0], p[2]) 96 | assert_equal(p[1], p[3]) 97 | self.assertTrue(p[0] != p[1]) 98 | 99 | def test_multiclass_parallel(self): 100 | X = [[1, 2], 101 | [1, 2], 102 | [4, 8], 103 | [4, 8], 104 | [200, 200], 105 | [199, 200.1], 106 | [200.2, 198]] 107 | 108 | y_clf = [0, 0, 0, 0, 1, 1, 1] 109 | y_regression = [1, 1, 2, 2, 100, 100, 100] 110 | 111 | regr_class = regression.RSELinearRegression(fit_intercept=False) 112 | clf_class = linear_model.LogisticRegression() 113 | 114 | multi_class = regression.MultiClassRegression(clf_class, regr_class, 115 | n_jobs=2) 116 | 117 | model = multi_class.fit(X, y_clf, y_regression) 118 | p = model.predict([[1, 2], 119 | [200, 200], 120 | [1, 2], 121 | [200, 200]]) 122 | assert_equal(p[0], p[2]) 123 | assert_equal(p[1], p[3]) 124 | self.assertTrue(p[0] != p[1]) 125 | 126 | def test_with_grid_search(self): 127 | X = [[1, 2], 128 | [1, 2], 129 | [4, 8], 130 | [4, 8], 131 | [200, 200], 132 | [199, 200.1], 133 | [200.2, 198]] 134 | 135 | y_clf = [0, 0, 0, 0, 1, 1, 1] 136 | y_regression = [1, 1, 2, 2, 100, 100, 100] 137 | 138 | regr_class = GridSearchCV(regression.RSELinearRegression(), 139 | {'normalize':[0,1]}) 140 | clf_class = GridSearchCV(linear_model.LogisticRegression(), {'C':[1,2]}) 141 | 142 | multi_class = regression.MultiClassRegression(clf_class, regr_class) 143 | 144 | model = multi_class.fit(X, y_clf, y_regression) 145 | p = model.predict([[1, 2], 146 | [200, 200], 147 | [1, 2], 148 | [200, 200]]) 149 | 150 | assert_equal(p[0], p[2]) 151 | assert_equal(p[1], p[3]) 152 | self.assertTrue(p[0] != p[1]) -------------------------------------------------------------------------------- /src/pyksc/test/test_trend.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | '''Unit tests for the trend module''' 3 | 4 | from __future__ import division, print_function 5 | 6 | from pyksc.trend import TrendLearner 7 | 8 | import unittest 9 | import numpy as np 10 | 11 | class TestTrend(unittest.TestCase): 12 | 13 | def addnoise(self, base): 14 | return np.array(base) + np.random.random(len(base)) 15 | 
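    # base_one below is a flat series and base_two a strongly oscillating one, so
    # the two classes are easy to separate; note that TrendLearner only compares
    # the first num_steps points of each series when predicting.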
16 | def test_predict_good(self): 17 | 18 | base_one = np.ones(10) 19 | base_two = np.array([90, 2000, 90, 2000, 90, 2000, 90, 2000, 90, 2000]) 20 | 21 | y = [] 22 | X = [] 23 | for _ in range(10): 24 | X.append(self.addnoise(base_one)) 25 | X.append(self.addnoise(base_two)) 26 | y.append(1) 27 | y.append(0) 28 | 29 | 30 | l = TrendLearner(3, 1) 31 | l.fit(X, y) 32 | 33 | P = [] 34 | for _ in range(50): 35 | P.append(self.addnoise(base_one)) 36 | P.append(self.addnoise(base_two)) 37 | 38 | predict = l.predict(P) 39 | self.assertEqual(50, sum(predict == 0)) 40 | self.assertEqual(50, sum(predict == 1)) 41 | 42 | probs = l.predict_proba(P) 43 | 44 | for i in xrange(probs.shape[0]): 45 | if i % 2 == 0: 46 | self.assertTrue(probs[i, 1] > probs[i, 0]) 47 | else: 48 | self.assertTrue(probs[i, 0] > probs[i, 1]) 49 | 50 | def test_predict_bad(self): 51 | 52 | base_one = np.ones(10) 53 | base_two = np.array([90, 2000, 90, 2000, 90, 2000, 90, 2000, 90, 2000]) 54 | 55 | y = [] 56 | X = [] 57 | for _ in range(10): 58 | X.append(self.addnoise(base_one)) 59 | X.append(self.addnoise(base_two)) 60 | y.append(1) 61 | y.append(0) 62 | 63 | 64 | l = TrendLearner(1, 1) 65 | l.fit(X, y) 66 | 67 | P = [] 68 | for _ in range(50): 69 | P.append(self.addnoise(base_one)) 70 | P.append(self.addnoise(base_two)) 71 | 72 | predict = l.predict(P) 73 | self.assertEqual(100, sum(predict == 0)) 74 | 75 | if __name__ == "__main__": 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /src/pyksc/trend.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | 3 | import _trend 4 | 5 | from sklearn.base import BaseEstimator 6 | from sklearn.base import ClassifierMixin 7 | 8 | import numpy as np 9 | 10 | class TrendLearner(BaseEstimator, ClassifierMixin): 11 | 12 | def __init__(self, num_steps, gamma=1): 13 | self.num_steps = num_steps 14 | self.gamma = gamma 15 | self.num_labels = 0 16 | self.R = None 17 | self.labels = None 18 | 19 | def fit(self, X, y): 20 | 21 | self.R = np.asanyarray(X, dtype=np.float64, order='C') 22 | 23 | y = np.asanyarray(y) 24 | unique, labels_flat = np.unique(y, return_inverse=True) 25 | self.labels = labels_flat.reshape(y.shape) 26 | self.num_labels = unique.shape[0] 27 | 28 | 29 | def predict(self, X): 30 | 31 | X = np.asanyarray(X)[:, :self.num_steps] 32 | X = np.asanyarray(X, dtype=np.float64, order='C') 33 | 34 | P = _trend.predict(X, self.R, self.labels, self.num_labels, self.gamma) 35 | 36 | return P.argmax(axis=1) 37 | 38 | def predict_proba(self, X): 39 | 40 | X = np.asanyarray(X)[:, :self.num_steps] 41 | X = np.asanyarray(X, dtype=np.float64, order='C') 42 | 43 | P = _trend.predict(X, self.R, self.labels, self.num_labels, self.gamma) 44 | P = ((P.T / P.sum(axis=1)).T) 45 | 46 | return P 47 | -------------------------------------------------------------------------------- /src/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('agg') 3 | 4 | from math import sqrt 5 | from matplotlib import rc 6 | 7 | def initialize_matplotlib(): 8 | rc('axes', labelsize=20) 9 | rc('axes', unicode_minus=False) 10 | rc('axes', grid=True) 11 | rc('grid', color='lightgrey') 12 | rc('grid', linestyle=':') 13 | rc('font', family='serif') 14 | rc('legend', fontsize=18) 15 | rc('lines', linewidth=2) 16 | rc('ps', usedistiller='xpdf') 17 | rc('text', usetex=True) 18 | rc('xtick', labelsize=20) 19 | rc('ytick',
labelsize=20) 20 | -------------------------------------------------------------------------------- /src/scripts/class_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scripts.learn_base import create_input_table 6 | from scripts.learn_base import create_grid_search 7 | from scripts.learn_base import clf_summary 8 | 9 | from sklearn.cross_validation import cross_val_score 10 | from sklearn.cross_validation import StratifiedKFold 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.metrics import precision_recall_fscore_support 13 | from sklearn.preprocessing import scale 14 | 15 | from vod.stats.ci import half_confidence_interval_size as hci 16 | 17 | import argparse 18 | import numpy as np 19 | import sys 20 | import traceback 21 | 22 | def run_classifier(clf, X, y): 23 | n_folds = 5 24 | cross_fold = StratifiedKFold(y, k=n_folds) 25 | 26 | #class_matrices has shape [n_folds, 4, n_classes] 27 | #The second dimension has 4 metrics: for precision, recall, f1, support 28 | R_cv = cross_val_score(clf, X, y, cv=cross_fold, n_jobs=1, 29 | score_func=precision_recall_fscore_support) 30 | 31 | C_cv = cross_val_score(clf, X, y, cv=cross_fold, n_jobs=1, 32 | score_func=confusion_matrix) 33 | 34 | class_matrices = [] 35 | conf_matrices = [] 36 | for i in xrange(n_folds): 37 | class_matrices.append(R_cv[i]) 38 | 39 | conf_matrix_aux = 1.0 * C_cv[i] 40 | conf_matrix_aux = (conf_matrix_aux.T / conf_matrix_aux.sum(axis=1)).T 41 | conf_matrices.append(conf_matrix_aux) 42 | 43 | return class_matrices, conf_matrices 44 | 45 | def main(features_fpath, tseries_fpath, tags_fpath, classes_fpath, clf_name): 46 | X, params = create_input_table(features_fpath, tseries_fpath, tags_fpath) 47 | y = np.loadtxt(classes_fpath) 48 | 49 | clf = create_grid_search(clf_name) 50 | class_matrices, conf_matrices = run_classifier(clf, X, y) 51 | 52 | metric_means = np.mean(class_matrices, axis=0) 53 | metric_ci = hci(class_matrices, .95, axis=0) 54 | print(clf_summary(metric_means, metric_ci)) 55 | print() 56 | 57 | conf_means = np.mean(conf_matrices, axis=0) 58 | conf_ci = hci(conf_matrices, .95, axis=0) 59 | print("Average confusion matrix with .95 confidence interval") 60 | print(" \ttrue ") 61 | print("predic") 62 | for i in xrange(conf_means.shape[0]): 63 | print(i, end="\t \t") 64 | for j in xrange(conf_means.shape[1]): 65 | print('%.3f +- %.3f' % (conf_means[i, j], conf_ci[i, j]), end="\t") 66 | print() 67 | 68 | def create_parser(prog_name): 69 | 70 | desc = __doc__ 71 | formatter = argparse.RawDescriptionHelpFormatter 72 | parser = argparse.ArgumentParser(prog_name, description=desc, 73 | formatter_class=formatter) 74 | 75 | parser.add_argument('--features_fpath', type=str, 76 | help='Input file with video features') 77 | parser.add_argument('--tseries_fpath', type=str, 78 | help='Input file with video time series') 79 | parser.add_argument('--tags_fpath', type=str, 80 | help='Input file with video tags') 81 | parser.add_argument('classes_fpath', type=str, 82 | help='Classes to predict') 83 | parser.add_argument('clf_name', type=str, choices=['rbf_svm', 84 | 'linear_svm', 85 | 'extra_trees'], 86 | help='Classifier to use') 87 | 88 | return parser 89 | 90 | def entry_point(args=None): 91 | '''Fake main used to create argparse and call real one''' 92 | 93 | if not args: 94 | args = [] 95 | 96 | parser = create_parser(args[0]) 97 | values = parser.parse_args(args[1:]) 98 | 
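    # Any exception raised by main() is reported below with a full traceback plus
    # the argparse usage message, and the script exits with status 1.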
99 | try: 100 | return main(values.features_fpath, values.tseries_fpath, 101 | values.tags_fpath, values.classes_fpath, values.clf_name) 102 | except: 103 | traceback.print_exc() 104 | parser.print_usage(file=sys.stderr) 105 | return 1 106 | 107 | if __name__ == '__main__': 108 | sys.exit(entry_point(sys.argv)) 109 | -------------------------------------------------------------------------------- /src/scripts/cluster_jaccard.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from collections import defaultdict 6 | from scripts import initialize_matplotlib 7 | 8 | import numpy as np 9 | import plac 10 | import sys 11 | 12 | def load_text_file(features_fpath, classes, user_users): 13 | 14 | to_cmp = defaultdict(set) 15 | 16 | with open(features_fpath) as features_file: 17 | for curr_line, line in enumerate(features_file): 18 | spl = line.split() 19 | 20 | if user_users: 21 | data = set([spl[1]]) 22 | else: 23 | data = set(token.strip() for token in spl[2:]) 24 | 25 | class_num = classes[curr_line] 26 | to_cmp[class_num].update(data) 27 | 28 | return to_cmp 29 | 30 | def asym_jaccard(first_set, second_set): 31 | intersect = first_set.intersection(second_set) 32 | return len(intersect) / len(first_set) 33 | 34 | @plac.annotations(features_fpath=plac.Annotation('Tags file', type=str), 35 | classes_fpath=plac.Annotation('Video classes file', type=str), 36 | user_users=plac.Annotation('Use user_names instead of tags', 37 | kind='flag', abbrev='u', 38 | type=bool)) 39 | def main(features_fpath, classes_fpath, user_users=False): 40 | 41 | initialize_matplotlib() 42 | 43 | classes = np.loadtxt(classes_fpath) 44 | num_classes = len(set(classes)) 45 | 46 | to_compare = load_text_file(features_fpath, classes, user_users) 47 | 48 | print(end='\t') 49 | for i in xrange(num_classes): 50 | print(i, end='\t') 51 | print() 52 | 53 | for j in xrange(num_classes): 54 | print(j, end='\t') 55 | for i in xrange(num_classes): 56 | 57 | first_set = to_compare[i] 58 | second_set = to_compare[j] 59 | 60 | asym_j = asym_jaccard(first_set, second_set) 61 | print('%.3f' % asym_j, end='\t') 62 | print() 63 | 64 | if __name__ == '__main__': 65 | sys.exit(plac.call(main)) -------------------------------------------------------------------------------- /src/scripts/cluster_mutualinfo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from collections import defaultdict 6 | from vod.entropy import kullback_leiber_divergence 7 | 8 | import numpy as np 9 | import plac 10 | import sys 11 | 12 | def load_text_file(features_fpath, classes, use): 13 | #TODO: stemming and category names abbrv 14 | num_classes = len(set(classes)) 15 | 16 | count_class = [0] * num_classes 17 | prob_col = defaultdict(float) 18 | count_class_col = defaultdict(lambda: defaultdict(float)) 19 | 20 | with open(features_fpath) as features_file: 21 | for curr_line, line in enumerate(features_file): 22 | spl = line.split() 23 | 24 | class_num = classes[curr_line] 25 | 26 | if use == 'user': 27 | count_class_col[spl[1]][class_num] += 1 28 | prob_col[spl[1]] += 1 29 | elif use == 'cat': 30 | if len(spl) > 2: 31 | count_class_col[spl[2]][class_num] += 1 32 | prob_col[spl[2]] += 1 33 | else: 34 | for token in spl[3:]: 35 | prob_col[token] += 1 36 | count_class_col[token][class_num] += 1 37 | 38 | count_class[int(class_num)] 
+= 1 39 | 40 | prob_class = np.array(count_class, dtype='f') 41 | prob_class /= prob_class.sum() 42 | 43 | prob_class_col = {} 44 | sum_col = sum(prob_col.values()) 45 | for token in count_class_col: 46 | prob_col[token] = prob_col[token] / sum_col 47 | 48 | aux = np.zeros(num_classes, dtype='f') 49 | for class_num in xrange(num_classes): 50 | aux[class_num] = count_class_col[token][class_num] 51 | aux /= aux.sum() 52 | 53 | prob_class_col[token] = aux 54 | 55 | return prob_class, prob_col, prob_class_col 56 | 57 | def load_svm_file(features_fpath, classes): 58 | col_dict = { 59 | 'EXTERNAL':8, 60 | 'FEATURED':9, 61 | 'INTERNAL':10, 62 | 'MOBILE':11, 63 | 'SEARCH':12, 64 | 'SOCIAL':13, 65 | 'VIRAL':14 66 | } 67 | 68 | num_classes = len(set(classes)) 69 | count_class = [0] * num_classes 70 | prob_col = defaultdict(float) 71 | count_class_col = defaultdict(lambda: defaultdict(float)) 72 | 73 | with open(features_fpath) as features_file: 74 | curr_line = 0 75 | for line in features_file: 76 | if '#' in line: 77 | continue 78 | 79 | spl = line.split() 80 | for ref_name, col_id in col_dict.items(): 81 | ref_abbrv = ref_name 82 | class_num = classes[curr_line] 83 | 84 | weight = float(spl[col_id]) 85 | 86 | prob_col[ref_abbrv] += weight 87 | count_class[int(class_num)] += 1 88 | count_class_col[ref_abbrv][class_num] += weight 89 | 90 | curr_line += 1 91 | 92 | prob_class = np.array(count_class, dtype='f') 93 | prob_class /= prob_class.sum() 94 | 95 | prob_class_col = {} 96 | sum_col = sum(prob_col.values()) 97 | for token in count_class_col: 98 | prob_col[token] = prob_col[token] / sum_col 99 | 100 | aux = np.zeros(num_classes, dtype='f') 101 | for class_num in xrange(num_classes): 102 | aux[class_num] = count_class_col[token][class_num] 103 | aux /= aux.sum() 104 | 105 | prob_class_col[token] = aux 106 | 107 | return prob_class, prob_col, prob_class_col 108 | 109 | @plac.annotations(features_fpath=plac.Annotation('Input file', type=str), 110 | classes_fpath=plac.Annotation('Video classes file', type=str), 111 | use=plac.Annotation('Indicates which information to use', 112 | type=str, 113 | choices=['user', 'tags', 'cat', 'ref'])) 114 | def main(features_fpath, classes_fpath, use): 115 | 116 | classes = np.loadtxt(classes_fpath) 117 | 118 | if use in {'user', 'tags', 'cat'}: 119 | prob_class, prob_col, prob_class_col = load_text_file(features_fpath, 120 | classes, use) 121 | else: 122 | prob_class, prob_col, prob_class_col = load_svm_file(features_fpath, 123 | classes) 124 | info_gains = [] 125 | mutual_info = 0 126 | 127 | for token in prob_class_col: 128 | dkl = kullback_leiber_divergence(prob_class_col[token], prob_class) 129 | 130 | mutual_info += prob_col[token] * dkl 131 | info_gains.append((dkl, token)) 132 | 133 | print('Mutual info: ', mutual_info) 134 | for dkl, token in sorted(info_gains, reverse=True): 135 | print(dkl, token) 136 | 137 | if __name__ == '__main__': 138 | sys.exit(plac.call(main)) -------------------------------------------------------------------------------- /src/scripts/cluster_vol.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scipy import stats 6 | 7 | from collections import defaultdict 8 | from matplotlib import pyplot as plt 9 | from scripts import initialize_matplotlib 10 | 11 | import numpy as np 12 | import plac 13 | import sys 14 | 15 | cols = {'PEAK_VIEWS':3, 'SUM_VIEWS':-1} 16 | 17 | 
@plac.annotations(features_fpath=plac.Annotation('Features file', type=str), 18 | classes_fpath=plac.Annotation('Video classes file', type=str), 19 | tseries_fpath=plac.Annotation('Time Series file', type=str)) 20 | def main(features_fpath, classes_fpath, tseries_fpath): 21 | X = np.genfromtxt(features_fpath)[:,1:].copy() 22 | y = np.loadtxt(classes_fpath) 23 | T = np.genfromtxt(tseries_fpath)[:,1:].copy() 24 | 25 | bah = T.sum(axis=1) / X[:,-1] 26 | print(np.mean(bah)) 27 | print(np.median(bah)) 28 | print(np.std(bah)) 29 | print(stats.scoreatpercentile(bah, 25)) 30 | 31 | num_clusters = len(set(y)) 32 | 33 | 34 | for k in xrange(num_clusters): 35 | print(k, end='\t') 36 | M = X[y == k] 37 | 38 | for column, col_num in sorted(cols.items()): 39 | data = M[:,col_num] 40 | mean = np.mean(data) 41 | print(mean, end='\t') 42 | print() 43 | 44 | print('Tot.', end='\t') 45 | for column, col_num in sorted(cols.items()): 46 | data = X[:,col_num] 47 | 48 | mean = np.mean(data) 49 | print(mean, end='\t') 50 | print() 51 | 52 | if __name__ == '__main__': 53 | sys.exit(plac.call(main)) 54 | -------------------------------------------------------------------------------- /src/scripts/col_to_cluster.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from collections import defaultdict 6 | from matplotlib import pyplot as plt 7 | 8 | from radar import radar_factory 9 | from scipy import stats 10 | 11 | from scripts import initialize_matplotlib 12 | 13 | import numpy as np 14 | import plac 15 | import sys 16 | 17 | REFERRER_ABBRV = { 18 | 'EXTERNAL':'EXT.', 19 | 'FEATURED':'FEAT.', 20 | 'INTERNAL':'INT.', 21 | 'MOBILE':'MOBI.', 22 | 'SEARCH':'SEAR.', 23 | 'SOCIAL':'SOC.', 24 | 'VIRAL':'VIR.'} 25 | 26 | CATEG_ABBRV = { 27 | 'Autos&Vehicles':'Vehi.', 28 | 'Autos':'Vehi.', 29 | 'Comedy':'Com.', 30 | 'Education':'Edu.', 31 | 'Entertainment':'Ent.', 32 | 'Film':'Film', 33 | 'Film&Animation':'Film', 34 | 'Games':'Game', 35 | 'Gaming':'Game', 36 | 'Howto':'Howto', 37 | 'Howto&Style':'Howto', 38 | 'Movies':'Film', 39 | 'Music':'Music', 40 | 'NULL':'-', 41 | 'News':'News', 42 | 'News&Politics':'News', 43 | 'Nonprofit':'Nonprof.', 44 | 'Nonprofits&Activism':'Nonprof.', 45 | 'People&Blogs':'People', 46 | 'People':'People', 47 | 'Pets&Animals':'Pets', 48 | 'Pets':'Pets', 49 | 'Animals':'Pets', 50 | 'Science&Technology':'Sci.', 51 | 'Science':'Sci.', 52 | 'Tech':'Sci.', 53 | 'Shows':'Show', 54 | 'Sports':'Sport', 55 | 'Trailers':'Film', 56 | 'Travel&Events':'Travel', 57 | 'Travel':'Travel'} 58 | 59 | def load_text_file(features_fpath, col_to_use, classes): 60 | 61 | to_plot = defaultdict(lambda: defaultdict(float)) 62 | sum_classes = defaultdict(float) 63 | labels = set() 64 | with open(features_fpath) as features_file: 65 | for curr_line, line in enumerate(features_file): 66 | spl = line.split() 67 | if col_to_use >= len(spl): 68 | continue 69 | 70 | data = CATEG_ABBRV[line.split()[col_to_use].strip()] 71 | class_num = classes[curr_line] 72 | 73 | labels.add(data) 74 | sum_classes[class_num] += 1 75 | to_plot[class_num][data] += 1 76 | 77 | return to_plot, sum_classes, sorted(labels) 78 | 79 | def load_svm_file(features_fpath, classes): 80 | 81 | col_dict = { 82 | 'EXTERNAL':13, 83 | 'FEATURED':14, 84 | 'INTERNAL':15, 85 | 'MOBILE':16, 86 | 'SEARCH':17, 87 | 'SOCIAL':18, 88 | 'VIRAL':19 89 | } 90 | 91 | to_plot = defaultdict(lambda: defaultdict(float)) 92 | sum_classes = defaultdict(float) 93 | 
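    # to_plot accumulates, per cluster, the value of each referrer column;
    # sum_classes keeps the per-cluster totals that generate_data_plot later
    # uses to normalize them.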
labels = set() 94 | with open(features_fpath) as features_file: 95 | curr_line = 0 96 | for line in features_file: 97 | if '#' in line: 98 | for key, id_ in col_dict.items(): 99 | print(id_, key, line.split()[id_]) 100 | continue 101 | 102 | class_num = classes[curr_line] 103 | sum_classes[class_num] += float(line.split()[-1]) 104 | for ref_name, col_id in col_dict.items(): 105 | ref_abbrv = REFERRER_ABBRV[ref_name] 106 | 107 | val = float(line.split()[col_id]) 108 | present = val > 0 109 | if present: 110 | labels.add(ref_abbrv) 111 | to_plot[class_num][ref_abbrv] += val 112 | 113 | curr_line += 1 114 | 115 | return to_plot, sum_classes, sorted(labels) 116 | 117 | def generate_data_plot(to_plot, sum_classes, labels, classes): 118 | num_classes = len(set(classes)) 119 | colors = ['b', 'g', 'm', 'y'] 120 | 121 | total = 0 122 | for class_num in xrange(num_classes): 123 | color = colors[class_num] 124 | 125 | data_plot = [] 126 | for label in labels: 127 | total += to_plot[class_num][label] 128 | data_plot.append(to_plot[class_num][label] / sum_classes[class_num]) 129 | 130 | yield data_plot, color, class_num 131 | 132 | def radar_plot(labels, data_plots, out_fpath): 133 | 134 | theta = radar_factory(len(labels)) 135 | 136 | fig = plt.figure() 137 | ax = fig.add_subplot(1, 1, 1, projection='radar') 138 | 139 | for data_plot, color, class_num in data_plots: 140 | ax.plot(theta, data_plot, color=color, label='C%d'%class_num) 141 | ax.fill(theta, data_plot, facecolor=color, alpha=0.25) 142 | 143 | ax.set_varlabels(labels) 144 | plt.legend(frameon=False, ncol=4, bbox_to_anchor=(0.5, -0.15), 145 | loc='lower center') 146 | plt.savefig(out_fpath) 147 | 148 | def chisq(counts, expected_prob): 149 | counts = np.array(counts) 150 | expected = np.array(expected_prob) * counts.sum() 151 | 152 | return stats.chisquare(counts, expected)[1] 153 | 154 | def allchisq(to_plot, sum_classes, labels, classes): 155 | num_classes = len(set(classes)) 156 | 157 | totals = [] 158 | for label in labels: 159 | sum_ = 0 160 | for class_num in xrange(num_classes): 161 | sum_ += to_plot[class_num][label] 162 | totals.append(sum_) 163 | 164 | probs = [] 165 | sum_totals = sum(totals) 166 | for i, t in enumerate(totals): 167 | probs.append( t / sum_totals) 168 | 169 | for class_num in xrange(num_classes): 170 | counts = [] 171 | for label in labels: 172 | counts.append(to_plot[class_num][label]) 173 | 174 | chisq(counts, probs) 175 | 176 | def stacked_bars(labels, data_plots, out_fpath, label_translation, ref=True): 177 | 178 | x_locations = [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19] 179 | 180 | data_class = {} 181 | data_label = {} 182 | for data, _, class_num in data_plots: 183 | 184 | best_idx = np.argsort(data)[::-1][:4] 185 | best_cls = np.array(data)[best_idx] 186 | best_lbl = np.array(labels)[best_idx] 187 | 188 | data_class[label_translation[class_num]] = best_cls 189 | data_label[label_translation[class_num]] = best_lbl 190 | 191 | bar_data = [] 192 | bar_labels = [] 193 | for cls in sorted(data_class): 194 | bar_data.extend(data_class[cls]) 195 | bar_labels.extend(data_label[cls]) 196 | 197 | colors = ['b', 'g', 'm', 'r', 'y', 'c', '#A617A1', '#2B5700', 'w', 198 | '#FF7300', 'k'] * 3 199 | 200 | colored={} 201 | if ref: 202 | to_use = set(REFERRER_ABBRV.values()) 203 | else: 204 | to_use = set(CATEG_ABBRV.values()) 205 | 206 | for i, l in enumerate(to_use): 207 | colored[l] = colors[i] 208 | 209 | for x, y, l in zip(x_locations, bar_data, bar_labels): 210 | 211 | c = colored[l] 212 | plt.bar(left=x, 
height=y, color=c, width=1, alpha=0.5) 213 | plt.text(x + .75, y, l, va='bottom', ha='center', rotation=45) 214 | 215 | plt.xlim(xmin=0, xmax=21) 216 | plt.xlabel('Cluster') 217 | if ref: 218 | plt.ylim(ymin=0, ymax=.31) 219 | plt.ylabel('Fraction of Views in Cluster') 220 | else: 221 | plt.ylim(ymin=0, ymax=.4) 222 | plt.ylabel('Fraction of Videos in Cluster') 223 | 224 | plt.xticks([3, 8, 13, 18], ['$C0$', '$C1$', '$C2$', '$C3$']) 225 | plt.savefig(out_fpath) 226 | 227 | @plac.annotations(features_fpath=plac.Annotation('Features file', type=str), 228 | classes_fpath=plac.Annotation('Video classes file', type=str), 229 | out_fpath=plac.Annotation('Plot file', type=str), 230 | trans_fpath=plac.Annotation('Translation of cluster num to label', 231 | type=str), 232 | col_to_use=plac.Annotation('Column number to use', type=int, 233 | kind='option', abbrev='c'), 234 | is_text_features=plac.Annotation('Indicates file type', 235 | kind='flag', abbrev='t', 236 | type=bool)) 237 | def main(features_fpath, classes_fpath, out_fpath, 238 | trans_fpath, col_to_use=2, is_text_features=False): 239 | initialize_matplotlib() 240 | 241 | classes = np.loadtxt(classes_fpath) 242 | 243 | if is_text_features: 244 | to_plot, sum_classes, labels = \ 245 | load_text_file(features_fpath, col_to_use, classes) 246 | ref=False 247 | else: 248 | to_plot, sum_classes, labels = \ 249 | load_svm_file(features_fpath, classes) 250 | ref=True 251 | 252 | trans = {} 253 | with open(trans_fpath) as f: 254 | for l in f: 255 | spl = l.split() 256 | trans[int(spl[0])] = int(spl[1]) 257 | 258 | data = generate_data_plot(to_plot, sum_classes, labels, classes) 259 | stacked_bars(labels, data, out_fpath, trans, ref) 260 | #allchisq(to_plot, sum_classes, labels, classes) 261 | 262 | if __name__ == '__main__': 263 | sys.exit(plac.call(main)) 264 | -------------------------------------------------------------------------------- /src/scripts/create_mic_input.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scripts.learn_base import create_input_table, hstack_if_possible 6 | 7 | import numpy as np 8 | import plac 9 | import sys 10 | 11 | @plac.annotations(features_fpath=plac.Annotation('Partial Features', 12 | type=str), 13 | tag_categ_fpath=plac.Annotation('Tags file', type=str), 14 | tseries_fpath=plac.Annotation('Time series file', type=str), 15 | assign_fpath=plac.Annotation('Series assignment file', 16 | type=str)) 17 | def main(features_fpath, tag_categ_fpath, tseries_fpath, assign_fpath): 18 | 19 | X, feature_ids, _ = \ 20 | create_input_table(features_fpath, None, tag_categ_fpath,-1) 21 | 22 | y_clf = np.genfromtxt(assign_fpath) 23 | y_rgr = np.genfromtxt(tseries_fpath)[:,1:].sum(axis=1) 24 | 25 | for feat_id in range(len(feature_ids)): 26 | print(feature_ids[feat_id], end=',') 27 | 28 | print('TREND', end=',') 29 | print('FINAL_VIEWS') 30 | 31 | M = np.column_stack((X, y_clf, y_rgr)) 32 | np.savetxt(sys.stdout, M, '%d', delimiter=',') 33 | 34 | if __name__ == '__main__': 35 | sys.exit(plac.call(main)) -------------------------------------------------------------------------------- /src/scripts/learn_base.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | ''' 3 | Common functions for creating classifiers and regressors for machine learning 4 | tasks 5 | ''' 6 | from __future__ import division, print_function 7 | 8 | from scripts.col_to_cluster
import CATEG_ABBRV 9 | 10 | from scipy import sparse 11 | 12 | from sklearn import ensemble 13 | from sklearn import grid_search 14 | from sklearn import svm 15 | 16 | import cStringIO 17 | import numpy as np 18 | 19 | #Params 20 | SVM_C_RANGE = [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3] 21 | SVM_GAMMA_RANGE = [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3] 22 | 23 | TREE_SPLIT_RANGE = [1, 2, 4, 8, 16] 24 | 25 | PARAMS = {'rbf_svm':{'C':SVM_C_RANGE, 'gamma':SVM_GAMMA_RANGE}, 26 | 'linear_svm':{'C':SVM_C_RANGE}, 27 | 'extra_trees':{'min_samples_split':TREE_SPLIT_RANGE}} 28 | 29 | #Classifiers 30 | CACHE_SIZE = 1024 * 4 31 | CLFS = {'rbf_svm':svm.SVC(kernel='rbf', cache_size=CACHE_SIZE), 32 | 'linear_svm':svm.LinearSVC(), 33 | 'extra_trees':ensemble.ExtraTreesClassifier(n_estimators=20, 34 | compute_importances=True, 35 | criterion='gini', 36 | n_jobs=1)} 37 | 38 | CLFS_SPARSE = {'rbf_svm':svm.sparse.SVC(kernel='rbf', cache_size=CACHE_SIZE), 39 | 'linear_svm':svm.sparse.LinearSVC(), 40 | 'extra_trees':CLFS['extra_trees']} 41 | 42 | #Regressors 43 | RGRS = {'rbf_svm':svm.SVR(kernel='rbf', cache_size=CACHE_SIZE), 44 | 'linear_svm':svm.SVR(kernel='linear'), 45 | 'extra_trees':ensemble.ExtraTreesRegressor(n_estimators=20, 46 | compute_importances=True)} 47 | 48 | RGRS_SPARSE = {'rbf_svm':svm.sparse.SVR(kernel='rbf', cache_size=CACHE_SIZE), 49 | 'linear_svm':svm.sparse.SVR(kernel='linear'), 50 | 'extra_trees':CLFS['extra_trees']} 51 | 52 | #Category Parsing Utilities 53 | CAT_COL = 2 54 | CAT_IDS = dict((abbrv, i) \ 55 | for i, abbrv in enumerate(sorted(set(CATEG_ABBRV.values())))) 56 | 57 | def _get_classifier_and_params(name, sparse = False): 58 | if sparse: 59 | dict_to_use = CLFS_SPARSE 60 | else: 61 | dict_to_use = CLFS 62 | 63 | return dict_to_use[name], PARAMS[name] 64 | 65 | def _get_regressor_and_params(name, sparse = False): 66 | if sparse: 67 | dict_to_use = RGRS_SPARSE 68 | else: 69 | dict_to_use = RGRS 70 | 71 | return dict_to_use[name], PARAMS[name] 72 | 73 | def create_grid_search(name, sparse=False, regressor=False, n_jobs=1): 74 | if regressor: 75 | learner, params = _get_regressor_and_params(name, sparse) 76 | else: 77 | learner, params = _get_classifier_and_params(name, sparse) 78 | 79 | return grid_search.GridSearchCV(learner, params, cv=3, refit=True, 80 | n_jobs=n_jobs) 81 | 82 | def hstack_if_possible(X, Y): 83 | if X is not None: 84 | return np.hstack((X, Y)) 85 | else: 86 | return Y 87 | 88 | def update_col_ids(ids_to_insert, column_ids=None): 89 | if not column_ids: 90 | column_ids = {} 91 | 92 | base = len(column_ids) 93 | column_ids.update((pnt + base, name) for pnt, name in ids_to_insert.items()) 94 | 95 | return column_ids 96 | 97 | def load_referrers(referrers_fpath, X = None, column_ids=None): 98 | X_ref = np.genfromtxt(referrers_fpath)[:,1:].copy() 99 | 100 | new_col_ids = {} 101 | with open(referrers_fpath) as referrers_file: 102 | for line in referrers_file: 103 | if '#' in line: 104 | spl = line.split()[1:] 105 | new_col_ids = dict((k, v) for k, v in enumerate(spl)) 106 | 107 | return hstack_if_possible(X, X_ref), \ 108 | update_col_ids(new_col_ids, column_ids) 109 | 110 | def load_time_series(tseries_fpath, num_pts = -1, X = None, column_ids=None): 111 | X_series = np.genfromtxt(tseries_fpath)[:,1:][:,range(num_pts)] 112 | 113 | new_col_ids = dict((i, 'POINT_%d'%pnt) \ 114 | for i, pnt in enumerate(range(num_pts))) 115 | 116 | return hstack_if_possible(X, X_series), \ 117 | update_col_ids(new_col_ids, column_ids) 118 | 119 | def load_categories(tags_cat_fpath, X = None, 
column_ids=None): 120 | with open(tags_cat_fpath) as tags_cat_file: 121 | data = [] 122 | row = [] 123 | col = [] 124 | new_col_ids = {} 125 | for i, line in enumerate(tags_cat_file): 126 | spl = line.split() 127 | category = 'NULL' 128 | if len(spl) > CAT_COL: 129 | category = line.split()[CAT_COL] 130 | 131 | abbrv = CATEG_ABBRV[category] 132 | categ_id = CAT_IDS[abbrv] 133 | 134 | data.append(1) 135 | row.append(i) 136 | col.append(categ_id) 137 | 138 | new_col_ids[categ_id] = 'CAT_%s' % abbrv 139 | 140 | X_categ = np.asarray(sparse.coo_matrix((data, (row, col))).todense()) 141 | return hstack_if_possible(X, X_categ), \ 142 | update_col_ids(new_col_ids, column_ids) 143 | 144 | def create_input_table(referrers_fpath = None, tseries_fpath = None, 145 | tags_cat_fpath = None, num_pts = 3): 146 | 147 | X = None 148 | column_ids = None 149 | 150 | if referrers_fpath: 151 | X, column_ids = load_referrers(referrers_fpath) 152 | 153 | if tseries_fpath and num_pts > 0: 154 | X, column_ids = load_time_series(tseries_fpath, num_pts, X, column_ids) 155 | 156 | if tags_cat_fpath: 157 | X, column_ids = load_categories(tags_cat_fpath, X, column_ids) 158 | 159 | inverse_names = dict((v, k) for k, v in column_ids.items()) 160 | return X, column_ids, inverse_names 161 | 162 | def clf_summary(mean_scores, ci_scores): 163 | 164 | buff = cStringIO.StringIO() 165 | try: 166 | print('class \tprecision \trecall \tf1 score \tsupport', file=buff) 167 | for j in xrange(mean_scores.shape[1]): 168 | print(j, end="\t", file=buff) 169 | for i in xrange(mean_scores.shape[0]): 170 | print('%.3f +- %.3f' % (mean_scores[i, j], ci_scores[i, j]), 171 | end="\t", file=buff) 172 | print(file=buff) 173 | print(file=buff) 174 | 175 | return buff.getvalue() 176 | finally: 177 | buff.close() 178 | -------------------------------------------------------------------------------- /src/scripts/leave_k.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scripts.learn_base import create_input_table 6 | from scripts.learn_base import create_grid_search 7 | 8 | from sklearn.metrics import f1_score 9 | from sklearn.metrics import mean_square_error as mse 10 | from sklearn.metrics import r2_score 11 | 12 | import plac 13 | import numpy as np 14 | import os 15 | import sys 16 | 17 | def create_learners(learner_name='extra_trees'): 18 | clf = create_grid_search(learner_name, n_jobs=-1) 19 | rgr = create_grid_search(learner_name, regressor=True, n_jobs=-1) 20 | 21 | return clf, rgr 22 | 23 | def print_importance(feature_ids, importance_clf, importance_rgr): 24 | print() 25 | print('Classification Importance') 26 | for key in importance_clf.argsort()[::-1]: 27 | print(feature_ids[key], importance_clf[key]) 28 | 29 | print() 30 | print('Regression Importance') 31 | for key in importance_rgr.argsort()[::-1]: 32 | print(feature_ids[key], importance_rgr[key]) 33 | 34 | def mae(y_true, y_pred): 35 | y_true = np.asanyarray(y_true) 36 | y_pred = np.asanyarray(y_pred) 37 | 38 | return np.mean(np.abs(y_true - y_pred)) 39 | 40 | def run_experiment(X, y_clf, y_rgr, feature_ids, out_foldpath, k=500): 41 | clf, rgr = create_learners() 42 | 43 | n = len(y_clf) 44 | train_index = np.ones(n, dtype=np.bool) 45 | train_index[-k:] = False 46 | test_index = np.logical_not(train_index) 47 | 48 | clf_model = clf.fit(X[train_index], y_clf[train_index]) 49 | rgr_model = rgr.fit(X[train_index], y_rgr[train_index]) 50 | 51 | clf_true = 
y_clf[test_index] 52 | clf_pred = clf_model.predict(X[test_index]) 53 | 54 | rgr_true = y_rgr[test_index] 55 | rgr_pred = rgr_model.predict(X[test_index]) 56 | 57 | clf_pred_fpath = os.path.join(out_foldpath, '%clf.pred') 58 | clf_true_fpath = os.path.join(out_foldpath, '%clf.true') 59 | 60 | rgr_pred_fpath = os.path.join(out_foldpath, '%rgr.pred') 61 | rgr_true_fpath = os.path.join(out_foldpath, '%rgr.true') 62 | 63 | np.savetxt(clf_pred_fpath, clf_pred, fmt="%d") 64 | np.savetxt(clf_true_fpath, clf_true, fmt="%d") 65 | 66 | np.savetxt(rgr_pred_fpath, rgr_pred) 67 | np.savetxt(rgr_true_fpath, rgr_true) 68 | 69 | print('Micro F1: ', f1_score(clf_true, clf_pred, average='micro')) 70 | print('Macro F1: ', f1_score(clf_true, clf_pred, average='macro')) 71 | print() 72 | print('R2: ', r2_score(rgr_true, rgr_pred)) 73 | print('MAE: ', mae(rgr_true, rgr_pred)) 74 | print('MSE: ', mse(rgr_true, rgr_pred)) 75 | print() 76 | print_importance(feature_ids, 77 | clf_model.best_estimator_.feature_importances_, 78 | rgr_model.best_estimator_.feature_importances_) 79 | 80 | @plac.annotations(partial_features_fpath=plac.Annotation('Partial Features', 81 | type=str), 82 | tag_categ_fpath=plac.Annotation('Tags file', type=str), 83 | tseries_fpath=plac.Annotation('Time series file', type=str), 84 | num_days_to_use=plac.Annotation('Num Days Series', type=int), 85 | assign_fpath=plac.Annotation('Series assignment file', 86 | type=str), 87 | out_foldpath=plac.Annotation('Output folder', type=str)) 88 | def main(partial_features_fpath, tag_categ_fpath, tseries_fpath, 89 | num_days_to_use, assign_fpath, out_foldpath): 90 | 91 | X, feature_ids, feature_names = \ 92 | create_input_table(partial_features_fpath, tseries_fpath, 93 | tag_categ_fpath, num_pts = num_days_to_use) 94 | 95 | #Sort X by upload date 96 | up_date_col = feature_names['A_UPLOAD_DATE'] 97 | sort_by_date = X[:,up_date_col].argsort() 98 | X = X[sort_by_date].copy() 99 | 100 | y_clf = np.genfromtxt(assign_fpath)[sort_by_date] 101 | y_regr = np.genfromtxt(tseries_fpath)[:,1:].sum(axis=1)[sort_by_date] 102 | run_experiment(X, y_clf, y_regr, feature_ids, out_foldpath) 103 | 104 | if __name__ == '__main__': 105 | sys.exit(plac.call(main)) -------------------------------------------------------------------------------- /src/scripts/plot_centroids.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from matplotlib import pyplot as plt 6 | 7 | from pyksc import dist 8 | from pyksc import ksc 9 | 10 | from scripts import initialize_matplotlib 11 | 12 | import argparse 13 | import numpy as np 14 | import os 15 | import sys 16 | import traceback 17 | 18 | def main(tseries_fpath, k, plot_foldpath): 19 | import mkl 20 | mkl.set_num_threads(16) 21 | 22 | initialize_matplotlib() 23 | 24 | X = np.genfromtxt(tseries_fpath)[:,1:] 25 | aux = X.sum(axis=1) 26 | fix = np.where(aux == 0)[0] 27 | X[fix] += .001 #fixing zero only rows 28 | X = X.copy() 29 | 30 | cent, assign, shift, dists_cent = ksc.inc_ksc(X, k) 31 | 32 | for i in xrange(cent.shape[0]): 33 | t_series = cent[i] 34 | 35 | plt.plot(t_series, '-k') 36 | plt.gca().get_xaxis().set_visible(False) 37 | plt.gca().get_yaxis().set_visible(False) 38 | #plt.ylabel('Views') 39 | #plt.xlabel('Time') 40 | plt.savefig(os.path.join(plot_foldpath, '%d.pdf' % i)) 41 | plt.close() 42 | 43 | half = t_series.shape[0] // 2 44 | to_shift = half - np.argmax(t_series) 45 | to_plot_peak_center = 
dist.shift(t_series, to_shift, rolling=True) 46 | plt.plot(to_plot_peak_center, '-k') 47 | plt.gca().get_xaxis().set_visible(False) 48 | plt.gca().get_yaxis().set_visible(False) 49 | #plt.ylabel('Views') 50 | #plt.xlabel('Time') 51 | plt.savefig(os.path.join(plot_foldpath, '%d-peak-center.pdf' % i)) 52 | plt.close() 53 | 54 | to_shift = 0 - np.argmin(t_series) 55 | to_plot_min_first = dist.shift(t_series, to_shift, rolling=True) 56 | plt.plot(to_plot_min_first, '-k') 57 | plt.gca().get_xaxis().set_visible(False) 58 | plt.gca().get_yaxis().set_visible(False) 59 | #plt.ylabel('Views') 60 | #plt.xlabel('Time') 61 | plt.savefig(os.path.join(plot_foldpath, '%d-min-first.pdf' % i)) 62 | plt.close() 63 | 64 | np.savetxt(os.path.join(plot_foldpath, 'cents.dat'), cent, fmt='%.5f') 65 | np.savetxt(os.path.join(plot_foldpath, 'assign.dat'), assign, fmt='%d') 66 | np.savetxt(os.path.join(plot_foldpath, 'shift.dat'), shift, fmt='%d') 67 | np.savetxt(os.path.join(plot_foldpath, 'dists_cent.dat'), dists_cent, 68 | fmt='%.5f') 69 | 70 | def create_parser(prog_name): 71 | 72 | desc = __doc__ 73 | formatter = argparse.RawDescriptionHelpFormatter 74 | parser = argparse.ArgumentParser(prog_name, description=desc, 75 | formatter_class=formatter) 76 | 77 | parser.add_argument('tseries_fpath', type=str, help='Time series file') 78 | parser.add_argument('k', type=int, help='Number of clusters') 79 | parser.add_argument('plot_foldpath', type=str, help='Folder to store plots') 80 | 81 | return parser 82 | 83 | def entry_point(args=None): 84 | '''Fake main used to create argparse and call real one''' 85 | 86 | if not args: 87 | args = [] 88 | 89 | parser = create_parser(args[0]) 90 | values = parser.parse_args(args[1:]) 91 | 92 | try: 93 | return main(values.tseries_fpath, values.k, values.plot_foldpath) 94 | except: 95 | traceback.print_exc() 96 | parser.print_usage(file=sys.stderr) 97 | return 1 98 | 99 | if __name__ == '__main__': 100 | sys.exit(entry_point(sys.argv)) 101 | -------------------------------------------------------------------------------- /src/scripts/plot_members.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from matplotlib import pyplot as plt 6 | 7 | from pyksc import dist 8 | from scripts import initialize_matplotlib 9 | 10 | import plac 11 | import numpy as np 12 | import os 13 | import sys 14 | 15 | def plot_series(t_series, plot_foldpath, name, shift=False): 16 | 17 | to_plot = t_series 18 | if shift: 19 | to_shift = 0 - np.argmin(t_series) 20 | to_plot = dist.shift(t_series, to_shift, rolling=True) 21 | 22 | plt.plot(to_plot, '-k') 23 | plt.ylabel('Views') 24 | plt.xlabel('Time') 25 | plt.savefig(os.path.join(plot_foldpath, '%s.png' % name)) 26 | plt.close() 27 | 28 | @plac.annotations(tseries_fpath=plac.Annotation('Input file', type=str), 29 | assign_fpath=plac.Annotation('Series assignment file', 30 | type=str), 31 | centroids_fpath=plac.Annotation('Cluster centroids file', 32 | type=str), 33 | plot_foldpath=plac.Annotation('Output folder', type=str)) 34 | def main(tseries_fpath, assign_fpath, centroids_fpath, plot_foldpath): 35 | initialize_matplotlib() 36 | 37 | X = np.genfromtxt(tseries_fpath)[:,1:].copy() 38 | y = np.genfromtxt(assign_fpath) 39 | centroids = np.genfromtxt(centroids_fpath) 40 | 41 | num_classes = len(set(y)) 42 | 43 | for k in xrange(num_classes): 44 | centroid_plot_foldpath = os.path.join(plot_foldpath, str(k)) 45 | 
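        # One sub-directory per cluster; os.mkdir raises OSError if the directory
        # already exists, so the cluster folders must not exist beforehand.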
os.mkdir(centroid_plot_foldpath) 46 | 47 | centroid = centroids[k] 48 | plot_series(centroid, centroid_plot_foldpath, 'centroid', True) 49 | 50 | members = X[y == k] 51 | n_samples = members.shape[0] 52 | sample_rows = np.arange(n_samples) 53 | np.random.shuffle(sample_rows) 54 | 55 | members_to_plot = members[sample_rows[:10]] 56 | for i in xrange(members_to_plot.shape[0]): 57 | print(k, i) 58 | plot_series(members_to_plot[i], centroid_plot_foldpath, 'ex-%d' % i) 59 | 60 | if __name__ == '__main__': 61 | sys.exit(plac.call(main)) -------------------------------------------------------------------------------- /src/scripts/plot_quality.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from matplotlib import pyplot as plt 6 | 7 | from pyksc import dist 8 | from pyksc import metrics 9 | from pyksc import ksc 10 | 11 | from scripts import initialize_matplotlib 12 | 13 | from vod.stats.ci import half_confidence_interval_size as hci 14 | 15 | import argparse 16 | import numpy as np 17 | import os 18 | import sys 19 | import traceback 20 | 21 | def run_clustering(X, k, dists_all): 22 | 23 | cent, assign, shift, dists_cent = ksc.inc_ksc(X, k) 24 | 25 | intra = metrics.avg_intra_dist(X, assign, dists_all)[0] 26 | inter = metrics.avg_inter_dist(X, assign, dists_all)[0] 27 | bcv = metrics.beta_cv(X, assign, dists_all) 28 | cost = metrics.cost(X, assign, None, dists_cent) 29 | 30 | return intra, inter, bcv, cost 31 | 32 | def main(tseries_fpath, plot_foldpath): 33 | assert os.path.isdir(plot_foldpath) 34 | initialize_matplotlib() 35 | 36 | X = np.genfromtxt(tseries_fpath)[:,1:].copy() 37 | 38 | n_samples = X.shape[0] 39 | sample_rows = np.arange(n_samples) 40 | 41 | clust_range = range(2, 16) 42 | n_clustering_vals = len(clust_range) 43 | 44 | intra_array = np.zeros(shape=(25, n_clustering_vals)) 45 | inter_array = np.zeros(shape=(25, n_clustering_vals)) 46 | bcvs_array = np.zeros(shape=(25, n_clustering_vals)) 47 | costs_array = np.zeros(shape=(25, n_clustering_vals)) 48 | 49 | r = 0 50 | for i in xrange(5): 51 | np.random.shuffle(sample_rows) 52 | rand_sample = sample_rows[:200] 53 | 54 | X_new = X[rand_sample] 55 | D_new = dist.dist_all(X_new, X_new, rolling=True)[0] 56 | 57 | for j in xrange(5): 58 | for k in clust_range: 59 | intra, inter, bcv, cost = run_clustering(X_new, k, D_new) 60 | 61 | intra_array[r, k - 2] = intra 62 | inter_array[r, k - 2] = inter 63 | bcvs_array[r, k - 2] = bcv 64 | costs_array[r, k - 2] = cost 65 | 66 | r += 1 67 | print(r) 68 | 69 | intra_err = np.zeros(n_clustering_vals) 70 | inter_err = np.zeros(n_clustering_vals) 71 | bcvs_err = np.zeros(n_clustering_vals) 72 | costs_err = np.zeros(n_clustering_vals) 73 | 74 | for k in clust_range: 75 | j = k - 2 76 | intra_err[j] = hci(intra_array[:,j], .95) 77 | inter_err[j] = hci(inter_array[:,j], .95) 78 | bcvs_err[j] = hci(bcvs_array[:,j], .95) 79 | costs_err[j] = hci(costs_array[:,j], .95) 80 | 81 | plt.errorbar(clust_range, np.mean(inter_array, axis=0), fmt='gD', 82 | label='Inter Cluster', yerr=inter_err) 83 | plt.errorbar(clust_range, np.mean(bcvs_array, axis=0), fmt='bo', 84 | label='BetaCV', yerr=bcvs_err) 85 | plt.errorbar(clust_range, np.mean(intra_array, axis=0), fmt='rs', 86 | label='Intra Cluster', yerr=intra_err) 87 | plt.ylabel('Average Distance') 88 | plt.xlabel('Number of clusters') 89 | plt.xlim((0., 16)) 90 | plt.ylim((0., 1.)) 91 | plt.legend(frameon=False, loc='lower left') 92 | 
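    # First figure: mean intra-cluster, inter-cluster and BetaCV values (with .95
    # confidence intervals) as a function of the number of clusters; the cost
    # curve is saved separately as cost.pdf below.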
plt.savefig(os.path.join(plot_foldpath, 'bcv.pdf')) 93 | plt.close() 94 | 95 | plt.errorbar(clust_range, np.mean(costs_array, axis=0), fmt='bo', 96 | label='Cost', yerr=costs_err) 97 | plt.ylabel('Cost (F)') 98 | plt.xlabel('Number of clusters') 99 | plt.xlim((0., 16)) 100 | plt.ylim((0., 1.)) 101 | plt.legend(frameon=False, loc='lower left') 102 | plt.savefig(os.path.join(plot_foldpath, 'cost.pdf')) 103 | plt.close() 104 | 105 | def create_parser(prog_name): 106 | 107 | desc = __doc__ 108 | formatter = argparse.RawDescriptionHelpFormatter 109 | parser = argparse.ArgumentParser(prog_name, description=desc, 110 | formatter_class=formatter) 111 | 112 | parser.add_argument('tseries_fpath', type=str, help='Time series file') 113 | parser.add_argument('plot_foldpath', type=str, help='Folder to store plots') 114 | return parser 115 | 116 | def entry_point(args=None): 117 | '''Fake main used to create argparse and call real one''' 118 | 119 | if not args: 120 | args = [] 121 | 122 | parser = create_parser(args[0]) 123 | values = parser.parse_args(args[1:]) 124 | 125 | try: 126 | return main(values.tseries_fpath, values.plot_foldpath) 127 | except: 128 | traceback.print_exc() 129 | parser.print_usage(file=sys.stderr) 130 | return 1 131 | 132 | if __name__ == '__main__': 133 | sys.exit(entry_point(sys.argv)) 134 | -------------------------------------------------------------------------------- /src/scripts/plot_time_to_peak.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from matplotlib import dates 6 | from matplotlib import pyplot as plt 7 | 8 | from scripts import initialize_matplotlib 9 | 10 | import argparse 11 | import numpy as np 12 | import os 13 | import sys 14 | import traceback 15 | 16 | refs = { 17 | 'G_EXTERNAL_EVENT_DATE':0, 18 | 'G_FEATURED_EVENT_DATE':1, 19 | 'G_INTERNAL_EVENT_DATE':2, 20 | 'G_MOBILE_EVENT_DATE':3, 21 | 'G_SEARCH_EVENT_DATE':4, 22 | 'G_SOCIAL_EVENT_DATE':5, 23 | 'G_VIRAL_EVENT_DATE':6 24 | } 25 | 26 | UP_DATE = -1 27 | 28 | def main(features_fpath): 29 | initialize_matplotlib() 30 | 31 | X = np.genfromtxt(features_fpath)[:,1:] 32 | 33 | for r, k in sorted(refs.items()): 34 | idxs = X[:,k] > 0 35 | time_to_ref = (X[:,UP_DATE][idxs] - X[:,k][idxs]) 36 | print(r, np.mean(time_to_ref), np.std(time_to_ref)) 37 | 38 | print('peak_frac', np.mean(X[:,-3]), np.std(X[:,-3])) 39 | 40 | time_to_peak = (X[:,-4] - X[:,UP_DATE]) / 7 41 | print('peak_date', np.mean(time_to_peak), np.std(time_to_peak)) 42 | 43 | import time 44 | plt.hist(X[:,UP_DATE], bins=20) 45 | ticks, labels = plt.xticks() 46 | plt.xticks(ticks, [time.strftime('%m/%y', time.localtime(x)) for x in ticks]) 47 | plt.ylabel('\# Videos') 48 | plt.xlabel('Month/Year') 49 | plt.savefig('hist.pdf') 50 | 51 | 52 | # plt.plot(t_series, '-k') 53 | # plt.ylabel('Views') 54 | # plt.xlabel('Time') 55 | # plt.savefig(os.path.join(plot_foldpath, '%d.pdf' % i)) 56 | # plt.close() 57 | # 58 | # half = t_series.shape[0] // 2 59 | # to_shift = half - np.argmax(t_series) 60 | # to_plot_peak_center = dist.shift(t_series, to_shift, rolling=True) 61 | # plt.plot(to_plot_peak_center, '-k') 62 | # plt.ylabel('Views') 63 | # plt.xlabel('Time') 64 | # plt.savefig(os.path.join(plot_foldpath, '%d-peak-center.pdf' % i)) 65 | # plt.close() 66 | # 67 | # to_shift = 0 - np.argmin(t_series) 68 | # to_plot_min_first = dist.shift(t_series, to_shift, rolling=True) 69 | # plt.plot(to_plot_min_first, '-k') 70 | # 
plt.ylabel('Views') 71 | # plt.xlabel('Time') 72 | # plt.savefig(os.path.join(plot_foldpath, '%d-min-first.pdf' % i)) 73 | # plt.close() 74 | # 75 | # np.savetxt(os.path.join(plot_foldpath, 'cents.dat'), cent, fmt='%.5f') 76 | # np.savetxt(os.path.join(plot_foldpath, 'assign.dat'), assign, fmt='%d') 77 | # np.savetxt(os.path.join(plot_foldpath, 'shift.dat'), shift, fmt='%d') 78 | # np.savetxt(os.path.join(plot_foldpath, 'dists_cent.dat'), dists_cent, 79 | # fmt='%.5f') 80 | 81 | def create_parser(prog_name): 82 | 83 | desc = __doc__ 84 | formatter = argparse.RawDescriptionHelpFormatter 85 | parser = argparse.ArgumentParser(prog_name, description=desc, 86 | formatter_class=formatter) 87 | 88 | parser.add_argument('features_fpath', type=str, help='Features file') 89 | 90 | return parser 91 | 92 | def entry_point(args=None): 93 | '''Fake main used to create argparse and call real one''' 94 | 95 | if not args: 96 | args = [] 97 | 98 | parser = create_parser(args[0]) 99 | values = parser.parse_args(args[1:]) 100 | 101 | try: 102 | return main(values.features_fpath) 103 | except: 104 | traceback.print_exc() 105 | parser.print_usage(file=sys.stderr) 106 | return 1 107 | 108 | if __name__ == '__main__': 109 | sys.exit(entry_point(sys.argv)) -------------------------------------------------------------------------------- /src/scripts/pop_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scripts.learn_base import create_input_table 6 | from scripts.learn_base import create_grid_search 7 | from scripts.learn_base import clf_summary 8 | 9 | from pyksc.regression import mean_relative_square_error as mrse 10 | 11 | from sklearn.cross_validation import StratifiedKFold 12 | from sklearn.cross_validation import StratifiedShuffleSplit 13 | from sklearn.metrics import f1_score 14 | from sklearn.metrics import precision_recall_fscore_support 15 | from sklearn.metrics import mean_squared_error as mse 16 | from sklearn.metrics import r2_score 17 | from sklearn.preprocessing import scale 18 | 19 | from vod.stats.ci import half_confidence_interval_size as hci 20 | 21 | import plac 22 | import numpy as np 23 | import os 24 | import sys 25 | 26 | def create_learners(learner_name='rbf_svm'): 27 | clf = create_grid_search(learner_name, n_jobs=-1) 28 | rgr = create_grid_search(learner_name, regressor=True, n_jobs=-1) 29 | 30 | return clf, rgr 31 | 32 | def fit_and_predict(clf, rgr, X, y_clf, y_rgr, train, test, out_folder, fold): 33 | clf_model = clf.fit(X[train], y_clf[train]) 34 | 35 | y_clf_true = y_clf[test] 36 | y_rgr_true = y_rgr[test] 37 | y_clf_pred = clf_model.predict(X[test]) 38 | 39 | class_scores = np.array(precision_recall_fscore_support(y_clf_true, 40 | y_clf_pred)) 41 | micro_f1 = f1_score(y_clf_true, y_clf_pred, average='micro') 42 | macro_f1 = f1_score(y_clf_true, y_clf_pred, average='macro') 43 | 44 | rgr_model = rgr.fit(X[train], y_rgr[train]) 45 | y_rgr_pred = rgr_model.predict(X[test]) 46 | 47 | general_r2 = r2_score(y_rgr_true, y_rgr_pred) 48 | mse_score = mse(y_rgr_true, y_rgr_pred) 49 | mrse_score = mrse(y_rgr_true, y_rgr_pred) 50 | 51 | clf_pred_fpath = os.path.join(out_folder, '%d-clf.pred' % fold) 52 | clf_true_fpath = os.path.join(out_folder, '%d-clf.true' % fold) 53 | 54 | rgr_pred_fpath = os.path.join(out_folder, '%d-rgr.pred' % fold) 55 | rgr_true_fpath = os.path.join(out_folder, '%d-rgr.true' % fold) 56 | 57 | np.savetxt(clf_pred_fpath, y_clf_pred, 
fmt="%d") 58 | np.savetxt(clf_true_fpath, y_clf_true, fmt="%d") 59 | 60 | np.savetxt(rgr_pred_fpath, y_rgr_pred) 61 | np.savetxt(rgr_true_fpath, y_rgr_true) 62 | 63 | return class_scores, micro_f1, macro_f1, general_r2, mse_score,\ 64 | mrse_score 65 | 66 | def print_results(clf_scores, micro, macro, r2_all, mse_all, mrse_all): 67 | metric_means = np.mean(clf_scores, axis=0) 68 | metric_ci = hci(clf_scores, .95, axis=0) 69 | 70 | print(clf_summary(metric_means, metric_ci)) 71 | print('Micro F1 - mean: %f +- %f' % (np.mean(micro), hci(micro, .95))) 72 | print('Macro F1 - mean: %f +- %f' % (np.mean(macro), hci(macro, .95))) 73 | print('R2 all - mean: %f +- %f' % (np.mean(r2_all), hci(r2_all, .95))) 74 | print('MSE all - mean: %f +- %f' % (np.mean(mse_all), hci(mse_all, .95))) 75 | print('MRSE all - mean: %f +- %f' % (np.mean(mrse_all), 76 | hci(mrse_all, .95))) 77 | 78 | def run_experiment(X, y_clf, y_regr, feature_ids, out_folder): 79 | 80 | clf_scores = [] 81 | micro = [] 82 | macro = [] 83 | r2_all = [] 84 | mse_all = [] 85 | mrse_all = [] 86 | 87 | learner, rgr_base = create_learners() 88 | cv = StratifiedKFold(y_clf, k=5) 89 | fold_num = 1 90 | for train, test in cv: 91 | class_scores, micro_f1, macro_f1, general_r2, \ 92 | mse_score, mrse_score = \ 93 | fit_and_predict(learner, rgr_base, X, y_clf, y_regr, train, 94 | test, out_folder, fold_num) 95 | 96 | clf_scores.append(class_scores) 97 | micro.append(micro_f1) 98 | macro.append(macro_f1) 99 | 100 | r2_all.append(general_r2) 101 | mse_all.append(mse_score) 102 | mrse_all.append(mrse_score) 103 | 104 | fold_num += 1 105 | 106 | print_results(clf_scores, micro, macro, r2_all, mse_all, mrse_all) 107 | 108 | @plac.annotations(features_fpath=plac.Annotation('Partial Features', 109 | type=str), 110 | tag_categ_fpath=plac.Annotation('Tags file', type=str), 111 | tseries_fpath=plac.Annotation('Time series file', type=str), 112 | num_days_to_use=plac.Annotation('Num Days Series', type=int), 113 | assign_fpath=plac.Annotation('Series assignment file', 114 | type=str), 115 | out_foldpath=plac.Annotation('Output folder', type=str)) 116 | def main(features_fpath, tag_categ_fpath, tseries_fpath, num_days_to_use, 117 | assign_fpath, out_foldpath): 118 | 119 | X, feature_ids, _ = \ 120 | create_input_table(features_fpath, tseries_fpath, tag_categ_fpath, 121 | num_days_to_use) 122 | 123 | X = scale(X) 124 | y_clf = np.genfromtxt(assign_fpath) 125 | y_regr = scale(np.genfromtxt(tseries_fpath)[:,1:].sum(axis=1)) 126 | run_experiment(X, y_clf, y_regr, feature_ids, out_foldpath) 127 | 128 | if __name__ == '__main__': 129 | sys.exit(plac.call(main)) 130 | -------------------------------------------------------------------------------- /src/scripts/radar.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | 7 | from matplotlib import pyplot as plt 8 | from matplotlib.projections.polar import PolarAxes 9 | from matplotlib.projections import register_projection 10 | 11 | def radar_factory(num_vars, frame='circle'): 12 | """Create a radar chart with `num_vars` axes.""" 13 | # calculate evenly-spaced axis angles 14 | theta = 2*np.pi * np.linspace(0, 1-1./num_vars, num_vars) 15 | # rotate theta such that the first axis is at the top 16 | theta += np.pi/2 17 | 18 | def draw_poly_frame(self, x0, y0, r): 19 | # TODO: use transforms to convert (x, y) to (r, theta) 20 | verts = [(r*np.cos(t) + x0, r*np.sin(t) + y0) for t in 
theta] 21 | return plt.Polygon(verts, closed=True, edgecolor='k') 22 | 23 | def draw_circle_frame(self, x0, y0, r): 24 | return plt.Circle((x0, y0), r) 25 | 26 | frame_dict = {'polygon': draw_poly_frame, 'circle': draw_circle_frame} 27 | if frame not in frame_dict: 28 | raise ValueError, 'unknown value for `frame`: %s' % frame 29 | 30 | class RadarAxes(PolarAxes): 31 | """Class for creating a radar chart (a.k.a. a spider or star chart) 32 | 33 | http://en.wikipedia.org/wiki/Radar_chart 34 | """ 35 | name = 'radar' 36 | # use 1 line segment to connect specified points 37 | RESOLUTION = 1 38 | # define draw_frame method 39 | draw_frame = frame_dict[frame] 40 | 41 | def fill(self, *args, **kwargs): 42 | """Override fill so that line is closed by default""" 43 | closed = kwargs.pop('closed', True) 44 | return super(RadarAxes, self).fill(closed=closed, *args, **kwargs) 45 | 46 | def plot(self, *args, **kwargs): 47 | """Override plot so that line is closed by default""" 48 | lines = super(RadarAxes, self).plot(*args, **kwargs) 49 | for line in lines: 50 | self._close_line(line) 51 | 52 | def _close_line(self, line): 53 | x, y = line.get_data() 54 | # FIXME: markers at x[0], y[0] get doubled-up 55 | if x[0] != x[-1]: 56 | x = np.concatenate((x, [x[0]])) 57 | y = np.concatenate((y, [y[0]])) 58 | line.set_data(x, y) 59 | 60 | def set_varlabels(self, labels): 61 | self.set_thetagrids(theta * 180/np.pi, labels) 62 | 63 | def _gen_axes_patch(self): 64 | x0, y0 = (0.5, 0.5) 65 | r = 0.5 66 | return self.draw_frame(x0, y0, r) 67 | 68 | register_projection(RadarAxes) 69 | return theta 70 | -------------------------------------------------------------------------------- /src/scripts/tags_io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | '''This module contains the code used for data conversion''' 3 | from __future__ import division, print_function 4 | 5 | from collections import defaultdict 6 | 7 | from sklearn.base import BaseEstimator 8 | from sklearn.feature_extraction.text import Vectorizer 9 | 10 | import nltk 11 | 12 | class NoopAnalyzer(BaseEstimator): 13 | ''' 14 | Since we use NLTK to preprocess (more control) this 15 | class is used to bypass sklearns preprocessing 16 | ''' 17 | def analyze(self, text_document): 18 | '''Does nothing''' 19 | return text_document 20 | 21 | def __tokenize_and_stem(fpath): 22 | ''' 23 | Tokenizes and stems the file, converting each line to 24 | an array of words. 25 | 26 | Arguments 27 | --------- 28 | fpath: a path to a file 29 | Each line is a song, tags are separated by space 30 | ''' 31 | tokenizer = nltk.RegexpTokenizer(r'\w+') 32 | stopwords = set(nltk.corpus.stopwords.words('english')) 33 | stemmer = nltk.stem.PorterStemmer() 34 | 35 | docs = [] 36 | term_pops = defaultdict(int) 37 | with open(fpath) as tags_file: 38 | for line in tags_file: 39 | as_doc = [] 40 | for term in tokenizer.tokenize(line)[1:]: 41 | term = term.lower().strip() 42 | if term not in stopwords and term != '': 43 | stemmed = stemmer.stem(term) 44 | as_doc.append(stemmed) 45 | term_pops[stemmed] += 1 46 | 47 | docs.append(as_doc) 48 | 49 | return docs, term_pops 50 | 51 | def clean_up(fpath, bottom_filter=0.01): 52 | ''' 53 | Converts a YouTube tag file to a series of tokens. This code 54 | stems the tags, removes stopwords and filters infrequent 55 | tags (whose probability is bellow `bottom_filter`). 
56 | 57 | Arguments 58 | --------- 59 | fpath: a path to a file 60 | Each line is a song, tags are separated by space 61 | bottom_filter: float (defaults to 0.01, one percent) 62 | Minimum probability for tags to be considered useful 63 | ''' 64 | docs, term_pops = __tokenize_and_stem(fpath) 65 | for doc in docs: 66 | to_yield = [] 67 | for term in doc: 68 | prob_term = term_pops[term] / len(term_pops) 69 | if prob_term > bottom_filter: 70 | to_yield.append(term) 71 | 72 | yield to_yield 73 | 74 | def vectorize_videos(fpath, use_idf=False): 75 | ''' 76 | Converts a YouTube tag file to a sparse matrix pondered. We can assign 77 | weights based on IDF if specified. 78 | 79 | Arguments 80 | --------- 81 | fpath: a path to a file 82 | Each line is a song, tags are separated by space 83 | use_idf: bool (optinal, defaults to True) 84 | Indicates whether to use IDF. 85 | bottom_filter: float (defaults to 0.005, half of one percent) 86 | Minimum probability for tags to be considered useful 87 | ''' 88 | #Vectorizes to TF-IDF 89 | vectorizer = Vectorizer(analyzer=NoopAnalyzer(), use_idf = use_idf) 90 | sparse_matrix = vectorizer.fit_transform(clean_up(fpath, bottom_filter=0)) 91 | vocabulary = vectorizer.vocabulary 92 | return sparse_matrix, vocabulary -------------------------------------------------------------------------------- /src/scripts/tree_infogain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from sklearn.cross_validation import cross_val_score 6 | from sklearn.ensemble import ExtraTreesClassifier 7 | from sklearn.preprocessing import scale 8 | from sklearn.metrics import f1_score, precision_score, recall_score 9 | from sklearn.tree import DecisionTreeClassifier 10 | from sklearn.tree import export_graphviz 11 | 12 | import argparse 13 | import numpy as np 14 | import sys 15 | import traceback 16 | 17 | #def find_best_parameters(X_model, y_model, kernel): 18 | # 19 | # param_grid = { 20 | # 'C':[0.1, 0.5, 1, 5, 10, 50, 100], 21 | # 'gamma':[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1] 22 | # } 23 | # 24 | # clf = GridSearchCV(svc, param_grid, n_jobs=-1, score_func=f1_score) 25 | # clf = clf.fit(X_model, y_model) 26 | # best_clf = clf.best_estimator 27 | # 28 | # return best_clf 29 | 30 | def main(features_fpath, classes_fpath): 31 | 32 | with open(features_fpath) as features_file: 33 | for line in features_file: 34 | if '#' in line: 35 | spl = line.split() 36 | names = spl[1:] 37 | 38 | X = scale(np.genfromtxt(features_fpath)[:,1:].copy()) 39 | y = np.loadtxt(classes_fpath) 40 | 41 | forest = ExtraTreesClassifier(max_depth=4, 42 | criterion="entropy", 43 | compute_importances=True) 44 | 45 | scores = cross_val_score(forest, X, y, score_func=f1_score, cv=5) 46 | print(scores) 47 | 48 | forest.fit(X, y) 49 | 50 | importances = forest.feature_importances_ 51 | indices = np.argsort(importances)[::-1] 52 | 53 | # Print the feature ranking 54 | print("Feature ranking:") 55 | for f in xrange(len(importances[indices])): 56 | print("%d. 
feature %s (%f)" % (f + 1, names[indices[f]], 57 | importances[indices[f]])) 58 | 59 | export_graphviz(forest, 'bala.dot') 60 | 61 | def create_parser(prog_name): 62 | 63 | desc = __doc__ 64 | formatter = argparse.RawDescriptionHelpFormatter 65 | parser = argparse.ArgumentParser(prog_name, description=desc, 66 | formatter_class=formatter) 67 | 68 | parser.add_argument('features_fpath', type=str, 69 | help='Input file with video features') 70 | parser.add_argument('classes_fpath', type=str, 71 | help='Classes to predict') 72 | 73 | return parser 74 | 75 | def entry_point(args=None): 76 | '''Fake main used to create argparse and call real one''' 77 | 78 | if not args: 79 | args = [] 80 | 81 | parser = create_parser(args[0]) 82 | values = parser.parse_args(args[1:]) 83 | 84 | try: 85 | return main(values.features_fpath, values.classes_fpath) 86 | except: 87 | traceback.print_exc() 88 | parser.print_usage(file=sys.stderr) 89 | return 1 90 | 91 | if __name__ == '__main__': 92 | sys.exit(entry_point(sys.argv)) 93 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/boosting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import division, print_function 3 | 4 | from sklearn.base import clone 5 | 6 | import numpy as np 7 | 8 | EPS = 1e6 9 | 10 | def sample_with_rep(weights, num_samples): 11 | 12 | weights = np.asanyarray(weights) 13 | 14 | assert weights.sum() >= (1 - EPS) and weights.sum() <= (1 + EPS) 15 | 16 | x = np.arange(weights.shape[0]) 17 | y = np.random.multinomial(num_samples, weights) 18 | return np.repeat(x, y) 19 | 20 | def error(y_true, y_pred, weights = None): 21 | 22 | y_true = np.asanyarray(y_true) 23 | y_pred = np.asanyarray(y_pred) 24 | 25 | if weights is not None: 26 | weights = np.asanyarray(weights) 27 | else: 28 | weights = np.ones(y_true.shape) 29 | 30 | data = np.asanyarray(y_true != y_pred, dtype='i') 31 | return (data * weights).sum() / weights.sum() 32 | 33 | def comp_alpha(err, num_classes): 34 | 35 | return np.log((1 - err) / err) + np.log(num_classes - 1) 36 | 37 | def compute_weights(y_true, y_pred, old_weights, alpha): 38 | 39 | y_true = np.asanyarray(y_true) 40 | y_pred = np.asanyarray(y_pred) 41 | old_weights = np.asanyarray(old_weights) 42 | 43 | return old_weights * np.exp(alpha * (y_true != y_pred)) 44 | 45 | def get_alpha_and_weights(y_true, y_pred, old_weights): 46 | 47 | num_classes = len(set(y_true)) 48 | alpha = comp_alpha(error(y_true, y_pred), num_classes) 49 | weights = compute_weights(y_true, y_pred, old_weights, alpha) 50 | 51 | weights /= weights.sum() 52 | return alpha, weights 53 | 54 | class ClassBoost(object): 55 | 56 | def __init__(self, classifier, sample_factor = 2.5): 57 | self.classifier = clone(classifier) 58 | self.sample_factor = sample_factor 59 | self.base_w = 0 60 | self.class_w = 0 61 | 62 | def fit(self, X, y, B): 63 | 64 | X = np.asanyarray(X) 65 | y = np.asanyarray(y) 66 | B = np.asanyarray(B) 67 | 68 | ypred_base = np.asanyarray(B).argmax(axis = 1) 69 | 70 | assert X.shape[0] == y.shape[0] == ypred_base.shape[0] 71 | 72 | n = X.shape[0] 73 | 74 | uni_weights = np.ones(n) / n 75 | base_alpha, base_weights = get_alpha_and_weights(y, ypred_base, 76 | uni_weights) 77 | 78 | #Sampling with repetition 79 | num_samples = int(n * self.sample_factor) 80 | idx = sample_with_rep(base_weights, num_samples) 81 | 82 | #Fitting 83 | X_new = X[idx] 84 | y_new = y[idx] 85 | self.classifier.fit(X_new, y_new) 
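        # The block above draws a weighted resample of the training data, giving more
        # mass to series that the prior class probabilities B misclassified, and refits
        # the wrapped classifier on that resample.  get_alpha_and_weights() then turns
        # each model's weighted error into a SAMME-style vote via comp_alpha():
        # alpha = log((1 - err) / err) + log(num_classes - 1).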
86 | y_pred_new = self.classifier.predict(X) 87 | 88 | class_alpha, class_weights = get_alpha_and_weights(y, y_pred_new, 89 | base_weights) 90 | 91 | self.base_w = base_alpha 92 | self.class_w = class_alpha 93 | 94 | def predict(self, X, B): 95 | P_class = self.classifier.predict_proba(X) 96 | return (B * self.base_w + P_class * self.base_w).argmax(axis = 1) 97 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/classify_pts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from pyksc.trend import TrendLearner 6 | 7 | from sklearn.metrics import classification_report 8 | from sklearn.metrics import confusion_matrix 9 | 10 | import ioutil 11 | import numpy as np 12 | import os 13 | import plac 14 | import sys 15 | 16 | def fit(C, y_train, X, y_true, num_pts): 17 | 18 | learner = TrendLearner(num_pts, 1) 19 | learner.fit(C, y_train) 20 | 21 | probs = learner.predict_proba(X) 22 | y_pred = probs.argmax(axis=1) 23 | 24 | return y_pred, probs 25 | 26 | def main(tseries_fpath, train_fpath, centroids_fpath, classes_fpath, out_folder, 27 | gamma_max): 28 | gamma_max = int(gamma_max) 29 | 30 | X = ioutil.load_series(tseries_fpath, train_fpath) 31 | C = np.genfromtxt(centroids_fpath, dtype='f') 32 | 33 | y_train = np.arange(C.shape[0]) 34 | y_true = np.genfromtxt(classes_fpath, dtype='i') 35 | max_pts = gamma_max 36 | #max_pts = X.shape[1] 37 | 38 | best_by = np.zeros(X.shape[0]) 39 | min_conf = np.zeros(X.shape[0]) 40 | all_probs = np.zeros(shape=(X.shape[0], max_pts)) 41 | 42 | lousy_conf = 1.0 / C.shape[0] #if confidence is equal to this, classifier did nothing 43 | for num_pts in range(1, max_pts + 1): 44 | y_pred, probs = fit(C, y_train, X, y_true, num_pts) 45 | 46 | for i in xrange(X.shape[0]): 47 | p_true = probs[i, y_true[i]] 48 | if best_by[i] == 0 and y_pred[i] == y_true[i] and p_true > lousy_conf: 49 | best_by[i] = num_pts 50 | min_conf[i] = probs[i, y_true[i]] 51 | all_probs[i, num_pts - 1] = p_true 52 | 53 | summary_fpath = os.path.join(out_folder,\ 54 | 'class_summ-%d-pts.dat' % num_pts) 55 | probs_fpath = os.path.join(out_folder, 'probs-%d-pts.dat' % num_pts) 56 | 57 | with open(summary_fpath, 'w') as summary_file: 58 | print(classification_report(y_true, y_pred), file=summary_file) 59 | np.savetxt(probs_fpath, probs) 60 | 61 | best_fpath = os.path.join(out_folder, 'best-by.dat') 62 | conf_fpath = os.path.join(out_folder, 'conf.dat') 63 | all_conf_fpath = os.path.join(out_folder, 'all-conf.dat') 64 | 65 | np.savetxt(best_fpath, best_by) 66 | np.savetxt(conf_fpath, min_conf) 67 | np.savetxt(all_conf_fpath, np.asarray(all_probs)) 68 | 69 | if __name__ == '__main__': 70 | sys.exit(plac.call(main)) 71 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/classify_pts_all.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from pyksc.trend import TrendLearner 6 | 7 | from sklearn.metrics import classification_report 8 | from sklearn.metrics import confusion_matrix 9 | 10 | import ioutil 11 | import numpy as np 12 | import os 13 | import plac 14 | import sys 15 | 16 | def fit(Xtrain, y_train, Xtest, num_pts): 17 | 18 | learner = TrendLearner(num_pts, 1) 19 | learner.fit(Xtrain, y_train) 20 | 21 | probs = learner.predict_proba(Xtest) 22 | 
y_pred = probs.argmax(axis=1) 23 | 24 | return y_pred, probs 25 | 26 | def main(tseries_fpath, train_fpath, test_fpath, ytrain_fpath, ytest_fpath, out_folder): 27 | Xtrain = ioutil.load_series(tseries_fpath, train_fpath) 28 | Xtest = ioutil.load_series(tseries_fpath, test_fpath) 29 | 30 | y_train = np.genfromtxt(ytrain_fpath) 31 | y_true = np.genfromtxt(ytest_fpath) 32 | max_pts = Xtrain.shape[1] 33 | 34 | best_by = np.zeros(Xtest.shape[0]) 35 | min_conf = np.zeros(Xtest.shape[0]) 36 | all_probs = np.zeros(shape=(Xtest.shape[0], max_pts)) 37 | 38 | lousy_conf = 1.0 / len(set(y_train)) #if confidence is equal to this, classifier did nothing 39 | for num_pts in range(1, max_pts + 1): 40 | y_pred, probs = fit(Xtrain, y_train, Xtest, num_pts) 41 | 42 | for i in xrange(Xtest.shape[0]): 43 | p_true = probs[i, y_true[i]] 44 | if best_by[i] == 0 and y_pred[i] == y_true[i] and p_true > lousy_conf: 45 | best_by[i] = num_pts 46 | min_conf[i] = probs[i, y_true[i]] 47 | all_probs[i, num_pts - 1] = p_true 48 | 49 | summary_fpath = os.path.join(out_folder,\ 50 | 'class_summ-%d-pts.dat' % num_pts) 51 | probs_fpath = os.path.join(out_folder, 'probs-%d-pts.dat' % num_pts) 52 | 53 | with open(summary_fpath, 'w') as summary_file: 54 | print(classification_report(y_true, y_pred), file=summary_file) 55 | np.savetxt(probs_fpath, probs) 56 | 57 | best_fpath = os.path.join(out_folder, 'best-by.dat') 58 | conf_fpath = os.path.join(out_folder, 'conf.dat') 59 | all_conf_fpath = os.path.join(out_folder, 'all-conf.dat') 60 | 61 | np.savetxt(best_fpath, best_by) 62 | np.savetxt(conf_fpath, min_conf) 63 | np.savetxt(all_conf_fpath, np.asarray(all_probs)) 64 | 65 | if __name__ == '__main__': 66 | sys.exit(plac.call(main)) 67 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/classify_pts_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from pyksc.trend import TrendLearner 6 | 7 | from sklearn.metrics import classification_report 8 | from sklearn.metrics import confusion_matrix 9 | from sklearn.metrics import precision_score 10 | 11 | import argparse 12 | import ioutil 13 | import numpy as np 14 | import os 15 | import plac 16 | import sys 17 | 18 | def fit(Xtrain, y_train, Xtest, num_pts): 19 | 20 | learner = TrendLearner(num_pts, 1) 21 | learner.fit(Xtrain, y_train) 22 | probs = learner.predict_proba(Xtest) 23 | 24 | return probs 25 | 26 | def main(tseries_fpath, centroids_fpath, test_fpath, assign_fpath, out_folder, 27 | gamma_max): 28 | gamma_max = int(gamma_max) 29 | 30 | C = np.genfromtxt(centroids_fpath) 31 | Xtest = ioutil.load_series(tseries_fpath, test_fpath) 32 | y_train = np.arange(C.shape[0]) 33 | 34 | max_pts = gamma_max 35 | for num_pts in range(1, max_pts + 1): 36 | #for num_pts in [1, 25, 50, 75]: 37 | probs = fit(C, y_train, Xtest, num_pts) 38 | 39 | probs_fpath = os.path.join(out_folder, 'probs-%d-pts.dat' % num_pts) 40 | np.savetxt(probs_fpath, probs) 41 | 42 | if __name__ == '__main__': 43 | sys.exit(plac.call(main)) 44 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/classify_theta.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from scipy.stats.mstats import mquantiles 6 | 7 | from sklearn.metrics import f1_score 8 | from 
sklearn.metrics import classification_report 9 | 10 | from pyksc import dist 11 | 12 | import argparse 13 | import glob 14 | import ioutil 15 | import multiprocessing 16 | import numpy as np 17 | import os 18 | import plac 19 | import sys 20 | 21 | FNAME = 'probs-%d-pts.dat' 22 | 23 | def pred(probs_folder, num_series, max_pts, min_pts, thetas): 24 | 25 | y_pred = np.zeros(num_series) - 1 26 | best_by = np.zeros(num_series) + np.inf 27 | confs = np.zeros(num_series) 28 | all_confs = np.zeros((num_series, len(thetas))) 29 | 30 | for num_pts in range(1, max_pts + 1): 31 | fpath = os.path.join(probs_folder, FNAME) % num_pts 32 | P = np.loadtxt(fpath) 33 | 34 | curr_pred = P.argmax(axis=1) 35 | curr_score = P.max(axis=1) 36 | 37 | for i in xrange(num_series): 38 | score = curr_score[i] 39 | curr_cls = curr_pred[i] 40 | 41 | theta = thetas[curr_cls] 42 | min_req = min_pts[curr_cls] 43 | 44 | if num_pts >= min_req and score > theta and y_pred[i] == -1: 45 | y_pred[i] = curr_cls 46 | best_by[i] = num_pts 47 | confs[i] = score 48 | all_confs[i] = P[i] 49 | 50 | #if y_pred[i] != curr_cls and score > confs[i]: 51 | # y_pred[i] = curr_cls 52 | # #best_by[i] = num_pts 53 | # confs[i] = score 54 | # all_confs[i] = P[i] 55 | 56 | assert y_pred[confs > 0].sum() == y_pred[y_pred != -1].sum() 57 | assert y_pred[best_by != np.inf].sum() == y_pred[y_pred != -1].sum() 58 | 59 | return y_pred, best_by, confs, all_confs 60 | 61 | def aux_print(X, peak_days, sum_views, best_by, y_true, y_pred, confs, 62 | idx, summ_file): 63 | 64 | X = X[idx] 65 | peak_days = peak_days[idx] 66 | sum_views = sum_views[idx] 67 | best_by = np.asanyarray(best_by[idx], dtype='i') 68 | y_true = y_true[idx] 69 | y_pred = y_pred[idx] 70 | confs = confs[idx] 71 | 72 | left_frac = np.zeros(X.shape[0]) 73 | for i in xrange(X.shape[0]): 74 | left_frac[i] = \ 75 | (sum_views[i] - X[i][:best_by[i]].sum()) / sum_views[i] 76 | 77 | dist_peak = (peak_days - best_by - 1) 78 | 79 | print('- PeakDistQuantiles (peak - best)', mquantiles(dist_peak), file=summ_file) 80 | print('- LeftViewsQuantiles', mquantiles(left_frac), file=summ_file) 81 | 82 | 83 | def save_results(X, peak_days, sum_views, pts_grid, theta_grid, best_by, all_confs, 84 | y_true, y_pred, confs, out_folder): 85 | 86 | valid = confs > 0 87 | correct = y_true == y_pred 88 | 89 | summ_fpath = os.path.join(out_folder, 'summ.dat') 90 | with open(summ_fpath, 'w') as summ_file: 91 | print('Params', file=summ_file) 92 | for cls in sorted(pts_grid): 93 | print('\t Cls = %d; min_pts = %d; theta = %.3f' \ 94 | % (cls, pts_grid[cls], theta_grid[cls]), file=summ_file) 95 | print(file=summ_file) 96 | 97 | print('All', file=summ_file) 98 | aux_print(X, peak_days, sum_views, best_by, y_true, y_pred, confs, valid, summ_file) 99 | print(file=summ_file) 100 | 101 | print('Correct Only', file=summ_file) 102 | aux_print(X, peak_days, sum_views, best_by, y_true, y_pred, confs, correct, summ_file) 103 | print(file=summ_file) 104 | 105 | print('Incorrect Only', file=summ_file) 106 | aux_print(X, peak_days, sum_views, best_by, y_true, y_pred, confs, ~correct, summ_file) 107 | print(file=summ_file) 108 | 109 | #print(classification_report(y_true[valid], y_pred[valid]), 110 | # file=summ_file) 111 | print(classification_report(y_true, y_pred), 112 | file=summ_file) 113 | print(file=summ_file) 114 | print('# invalid %d' % (~valid).sum(), file=summ_file) 115 | 116 | ypred_fpath = os.path.join(out_folder, 'pred.dat') 117 | np.savetxt(ypred_fpath, y_pred) 118 | 119 | bestby_fpath = os.path.join(out_folder, 
'best-by.dat') 120 | np.savetxt(bestby_fpath, best_by) 121 | 122 | conf_fpath = os.path.join(out_folder, 'conf.dat') 123 | np.savetxt(conf_fpath, confs) 124 | 125 | conf_fpath = os.path.join(out_folder, 'all-conf.dat') 126 | np.savetxt(conf_fpath, all_confs) 127 | 128 | def run_fold(folder, tseries_fpath, min_pts, thetas, out_folder, gamma_max): 129 | 130 | try: 131 | os.makedirs(out_folder) 132 | except: 133 | pass 134 | 135 | test_fpath = os.path.join(folder, 'test.dat') 136 | cents_fpath = os.path.join(folder, 'ksc', 'cents.dat') 137 | assign_fpath = os.path.join(folder, 'ksc', 'test_assign.dat') 138 | probs_folder = os.path.join(folder, 'probs-test') 139 | 140 | X = ioutil.load_series(tseries_fpath, test_fpath) 141 | test_idx = np.loadtxt(test_fpath, dtype='bool') 142 | y_true = np.loadtxt(assign_fpath) 143 | 144 | num_series = X.shape[0] 145 | max_pts = gamma_max 146 | 147 | peak_days = [] 148 | sum_views = [] 149 | with open(tseries_fpath) as tseries_file: 150 | for i, line in enumerate(tseries_file): 151 | if test_idx[i]: 152 | x = np.array([int(v) for v in line.split()[1:]]) 153 | peak_days.append(x.argmax()) 154 | sum_views.append(x.sum()) 155 | 156 | peak_days = np.array(peak_days) 157 | sum_views = np.array(sum_views) 158 | 159 | y_pred, best_by, confs, all_confs = \ 160 | pred(probs_folder, num_series, max_pts, min_pts, thetas) 161 | save_results(X, peak_days, sum_views, min_pts, thetas, best_by, all_confs, 162 | y_true, y_pred, confs, out_folder) 163 | 164 | 165 | def get_params(folder, threshold, max_k): 166 | 167 | assign = np.loadtxt(os.path.join(folder, 'ksc', 'assign.dat'), dtype='i') 168 | P = np.loadtxt(os.path.join(folder, 'probs', 'all-conf.dat'), dtype='f') 169 | best_by = np.loadtxt(os.path.join(folder, 'probs', 'best-by.dat'), dtype='i') 170 | 171 | thetas = {} 172 | min_pts = {} 173 | for i in xrange(2, P.shape[1]): 174 | fpath = os.path.join(folder, 'probs', 'probs-%d-pts.dat' % i) 175 | Pi = np.loadtxt(fpath, dtype='f') 176 | for k in set(assign): 177 | y_true = assign == k 178 | 179 | maxcls = Pi.argmax(axis=1) 180 | y_pred = maxcls == k 181 | score = f1_score(y_true, y_pred) 182 | if score >= threshold and k not in thetas: 183 | thetas[k] = P[assign == k][:,i].mean() 184 | min_pts[k] = i 185 | 186 | for k in xrange(max_k): 187 | if k not in thetas: 188 | thetas[k] = 1.0 / len(set(assign)) 189 | min_pts[k] = 0 190 | 191 | return thetas, min_pts 192 | 193 | def multi_proc_run(args): 194 | 195 | folder, tseries_fpath, f1_target, gamma_max, results_sub_folder, max_k = args 196 | fitted_thetas, fitted_min_pts = get_params(folder, float(f1_target), max_k) 197 | 198 | out_folder = os.path.join(folder, results_sub_folder) 199 | run_fold(folder, tseries_fpath, fitted_min_pts, fitted_thetas, 200 | out_folder, gamma_max) 201 | 202 | def main(tseries_fpath, base_folder, f1_target, results_sub_folder, gamma_max, max_k): 203 | gamma_max = int(gamma_max) 204 | max_k = int(max_k) 205 | 206 | folders = glob.glob(os.path.join(base_folder, 'fold-*/')) 207 | pool = multiprocessing.Pool() 208 | pool.map(multi_proc_run, \ 209 | [(fold, tseries_fpath, f1_target, gamma_max, results_sub_folder, max_k) for fold in folders]) 210 | pool.terminate() 211 | pool.join() 212 | 213 | if __name__ == '__main__': 214 | sys.exit(plac.call(main)) 215 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/classify_theta_train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from 
__future__ import division, print_function 4 | 5 | from classify_theta import get_params, pred, save_results 6 | 7 | from pyksc import dist 8 | 9 | import argparse 10 | import glob 11 | import ioutil 12 | import multiprocessing 13 | import numpy as np 14 | import os 15 | import plac 16 | import sys 17 | 18 | FNAME = 'probs-%d-pts.dat' 19 | 20 | def run_fold(folder, tseries_fpath, min_pts, thetas, gamma_max, out_folder): 21 | 22 | try: 23 | os.makedirs(out_folder) 24 | except: 25 | pass 26 | 27 | train_fpath = os.path.join(folder, 'train.dat') 28 | cents_fpath = os.path.join(folder, 'ksc', 'cents.dat') 29 | assign_fpath = os.path.join(folder, 'ksc', 'assign.dat') 30 | probs_folder = os.path.join(folder, 'probs') 31 | 32 | X = ioutil.load_series(tseries_fpath, train_fpath) 33 | train_idx = np.loadtxt(train_fpath, dtype='bool') 34 | y_true = np.loadtxt(assign_fpath) 35 | 36 | num_series = X.shape[0] 37 | max_pts = gamma_max 38 | 39 | #Since we prune the first 100 lines of X we need to read other info 40 | peak_days = [] 41 | sum_views = [] 42 | with open(tseries_fpath) as tseries_file: 43 | for i, line in enumerate(tseries_file): 44 | if train_idx[i]: 45 | x = np.array([int(v) for v in line.split()[1:]]) 46 | peak_days.append(x.argmax()) 47 | sum_views.append(x.sum()) 48 | 49 | peak_days = np.array(peak_days) 50 | sum_views = np.array(sum_views) 51 | 52 | y_pred, best_by, confs, all_confs = \ 53 | pred(probs_folder, num_series, max_pts, min_pts, thetas) 54 | save_results(X, peak_days, sum_views, min_pts, thetas, best_by, all_confs, 55 | y_true, y_pred, confs, out_folder) 56 | 57 | def multi_proc_run(args): 58 | 59 | folder, tseries_fpath, f1_target, gamma_max, results_sub_folder, max_k = args 60 | fitted_thetas, fitted_min_pts = get_params(folder, f1_target, max_k) 61 | 62 | out_folder = os.path.join(folder, results_sub_folder) 63 | run_fold(folder, tseries_fpath, fitted_min_pts, fitted_thetas, 64 | gamma_max, out_folder) 65 | 66 | def main(tseries_fpath, base_folder, f1_target, results_sub_folder, gamma_max, max_k): 67 | gamma_max = int(gamma_max) 68 | max_k = int(max_k) 69 | 70 | f1_target = float(f1_target) 71 | folders = glob.glob(os.path.join(base_folder, 'fold-*/')) 72 | pool = multiprocessing.Pool() 73 | pool.map(multi_proc_run, [(fold, tseries_fpath, f1_target, gamma_max, results_sub_folder, max_k) for fold in folders]) 74 | pool.terminate() 75 | pool.join() 76 | 77 | if __name__ == '__main__': 78 | sys.exit(plac.call(main)) 79 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/cluster.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from pyksc import dist 6 | from pyksc import ksc 7 | 8 | import ioutil 9 | import numpy as np 10 | import os 11 | import plac 12 | 13 | def main(tseries_fpath, base_folder, k): 14 | k = int(k) 15 | 16 | idx_fpath = os.path.join(os.path.join(base_folder, '..'), 'train.dat') 17 | X = ioutil.load_series(tseries_fpath, idx_fpath) 18 | 19 | cent, assign, shift, dists_cent = ksc.inc_ksc(X, k) 20 | np.savetxt(os.path.join(base_folder, 'cents.dat'), cent, fmt='%.5f') 21 | np.savetxt(os.path.join(base_folder, 'assign.dat'), assign, fmt='%d') 22 | np.savetxt(os.path.join(base_folder, 'shift.dat'), shift, fmt='%d') 23 | np.savetxt(os.path.join(base_folder, 'dists_cent.dat'), dists_cent, 24 | fmt='%.5f') 25 | 26 | if __name__ == '__main__': 27 | plac.call(main) 28 | 
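For readers skimming the scripts, the clustering step above boils down to a single call into pyksc. A minimal standalone sketch on toy data (not part of the repository; it assumes the Cython extension is built and mirrors what cluster.py does after ioutil.load_series):

    from __future__ import division, print_function

    import numpy as np

    from pyksc import ksc

    # Toy stand-in for ioutil.load_series: strictly positive series, C-ordered floats.
    X = np.asanyarray(np.random.rand(200, 100) + 1e-6, order='C')

    # Same call cluster.py issues: returns the centroids, one cluster id per series,
    # the time shift applied to each series, and each series' distance to its centroid.
    cent, assign, shift, dists_cent = ksc.inc_ksc(X, 2)
    print(cent.shape, np.bincount(np.asarray(assign, dtype='i')))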
-------------------------------------------------------------------------------- /src/trend-learner-scripts/cotrain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import division, print_function 3 | 4 | from scipy import stats 5 | 6 | from sklearn.base import clone 7 | 8 | import numpy as np 9 | 10 | class CoTrain(object): 11 | 12 | def __init__(self, classifier, label_fract = .25): 13 | self.classifier = clone(classifier) 14 | self.label_fract = label_fract 15 | 16 | def fit(self, X, y, P): 17 | X = np.asanyarray(X) 18 | y = np.asanyarray(y) 19 | P = np.asanyarray(P) 20 | 21 | assert X.shape[0] == y.shape[0] == P.shape[0] 22 | 23 | n = X.shape[0] 24 | idx = np.arange(n) 25 | np.random.shuffle(idx) 26 | 27 | n_classes = len(set(y)) 28 | n_labelled = int(n * self.label_fract) 29 | 30 | init_labelled = idx[:n_labelled] 31 | w_label = np.zeros(n, dtype='bool') 32 | w_label[init_labelled] = True 33 | 34 | classes = np.arange(n_classes) 35 | y_new = np.zeros(n) - 1 36 | y_new[init_labelled] = y[init_labelled] 37 | while not w_label.all(): 38 | self.classifier.fit(X[w_label], y_new[w_label]) 39 | P_cls = self.classifier.predict_proba(X[~w_label]) 40 | 41 | best_c = P_cls.argmax(axis = 0) 42 | best_p = P[~w_label].argmax(axis = 0) 43 | 44 | idx_c = np.where(~w_label)[0][best_c] 45 | idx_p = np.where(~w_label)[0][best_p] 46 | 47 | w_label[idx_c] = True 48 | w_label[idx_p] = True 49 | 50 | y_new[idx_c] = classes 51 | y_new[idx_p] = classes 52 | 53 | def predict(self, X, P): 54 | P_class = self.classifier.predict_proba(X) 55 | return (P * P_class).argmax(axis = 1) 56 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/create_test_assign.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pyksc import dist 4 | 5 | import ioutil 6 | import numpy as np 7 | import plac 8 | import sys 9 | 10 | def main(tseries_fpath, test_fpath, cents_fpath): 11 | 12 | X = ioutil.load_series(tseries_fpath, test_fpath) 13 | 14 | C = np.loadtxt(cents_fpath) 15 | dist_cents = dist.dist_all(C, X, rolling=True)[0] 16 | y_true = dist_cents.argmin(axis=0) 17 | 18 | for t in y_true: 19 | print t 20 | 21 | if __name__ == '__main__': 22 | sys.exit(plac.call(main)) 23 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/generate_cross_vals.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf8 3 | from __future__ import print_function, division 4 | 5 | from sklearn import model_selection 6 | 7 | import numpy as np 8 | import os 9 | import plac 10 | import sys 11 | 12 | def main(tseries_fpath, out_folder): 13 | X = np.genfromtxt(tseries_fpath)[:,1:] 14 | num_series = X.shape[0] 15 | 16 | curr_fold = 1 17 | cv = model_selection.KFold(5, shuffle=True) 18 | to_save_train = np.zeros(len(X), dtype='b') 19 | to_save_test = np.zeros(len(X), dtype='b') 20 | 21 | for train, test in cv.split(X): 22 | curr_out_folder = os.path.join(out_folder, 'fold-%d' % curr_fold) 23 | 24 | try: 25 | os.makedirs(curr_out_folder) 26 | except: 27 | pass 28 | 29 | to_save_train[:] = False 30 | to_save_test[:] = False 31 | to_save_train[train] = True 32 | to_save_test[test] = True 33 | 34 | np.savetxt(os.path.join(curr_out_folder, 'train.dat'), to_save_train, fmt='%i') 35 | np.savetxt(os.path.join(curr_out_folder, 
'test.dat'), to_save_test, fmt='%i') 36 | curr_fold += 1 37 | 38 | if __name__ == '__main__': 39 | sys.exit(plac.call(main)) 40 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/ioutil.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | 3 | import numpy as np 4 | 5 | EPS = 1e-6 6 | 7 | def load_series(tseries_fpath, idx_fpath): 8 | X = np.genfromtxt(tseries_fpath)[:, 1:] + EPS 9 | train_idx = np.loadtxt(idx_fpath, dtype='bool') 10 | return np.asanyarray(X[train_idx], order='C') 11 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/learn_base.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | ''' 3 | Common functions for creating classifiers and regressors for machine learning 4 | tasks 5 | ''' 6 | from __future__ import division, print_function 7 | 8 | from scipy import sparse 9 | 10 | from sklearn import neighbors 11 | from sklearn import ensemble 12 | from sklearn import model_selection 13 | from sklearn import linear_model 14 | from sklearn import svm 15 | 16 | import cStringIO 17 | import numpy as np 18 | 19 | #Params 20 | TREE_SPLIT_RANGE = [1, 2, 4, 8, 16, 32, 64, 128] 21 | KNN_K_RANGE = [5, 10, 15] 22 | 23 | PARAMS = {'lr':{'C':[1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4]}, 24 | 'knn':{'n_neighbors':KNN_K_RANGE}, 25 | 'extra_trees':{'min_samples_split':TREE_SPLIT_RANGE}} 26 | 27 | #Classifiers 28 | CLFS = {'lr':linear_model.LogisticRegression(), 29 | 'knn':neighbors.KNeighborsClassifier(), 30 | 'extra_trees':ensemble.ExtraTreesClassifier(n_estimators=100, 31 | criterion='entropy', 32 | n_jobs=1)} 33 | 34 | #Category Parsing Utilities 35 | CATEG_ABBRV = { 36 | 'Autos&Vehicles':'Vehi.', 37 | 'Autos':'Vehi.', 38 | 'Comedy':'Com.', 39 | 'Education':'Edu.', 40 | 'Entertainment':'Ent.', 41 | 'Film':'Film', 42 | 'Film&Animation':'Film', 43 | 'Games':'Game', 44 | 'Gaming':'Game', 45 | 'Howto':'Howto', 46 | 'Howto&Style':'Howto', 47 | 'Movies':'Film', 48 | 'Music':'Music', 49 | 'NULL':'-', 50 | 'News':'News', 51 | 'News&Politics':'News', 52 | 'Nonprofit':'Nonprof.', 53 | 'Nonprofits&Activism':'Nonprof.', 54 | 'People&Blogs':'People', 55 | 'People':'People', 56 | 'Pets&Animals':'Pets', 57 | 'Pets':'Pets', 58 | 'Animals':'Pets', 59 | 'Science&Technology':'Sci.', 60 | 'Science':'Sci.', 61 | 'Tech':'Sci.', 62 | 'Shows':'Show', 63 | 'Sports':'Sport', 64 | 'Trailers':'Film', 65 | 'Travel&Events':'Travel', 66 | 'Travel':'Travel'} 67 | 68 | CAT_COL = 2 69 | CAT_IDS = dict((abbrv, i) \ 70 | for i, abbrv in enumerate(sorted(set(CATEG_ABBRV.values())))) 71 | INV_CAT_IDS = dict((v, k) for k, v in CAT_IDS.items()) 72 | 73 | def _get_classifier_and_params(name): 74 | return CLFS[name], PARAMS[name] 75 | 76 | def create_grid_search(name, n_jobs=-1): 77 | learner, params = _get_classifier_and_params(name) 78 | return model_selection.GridSearchCV(learner, params, cv=3, refit=True, 79 | n_jobs=n_jobs) 80 | 81 | def hstack_if_possible(X, Y): 82 | if X is not None: 83 | return np.hstack((X, Y)) 84 | else: 85 | return Y 86 | 87 | def load_categories(tags_cat_fpath): 88 | with open(tags_cat_fpath) as tags_cat_file: 89 | 90 | data = [] 91 | for i, line in enumerate(tags_cat_file): 92 | spl = line.split() 93 | category = 'NULL' 94 | if len(spl) > CAT_COL: 95 | category = line.split()[CAT_COL] 96 | 97 | abbrv = CATEG_ABBRV[category] 98 | categ_id = CAT_IDS[abbrv] 99 | 100 | n_rows = 
len(CAT_IDS) 101 | row = np.zeros(n_rows) 102 | row[categ_id] = 1 103 | 104 | data.append(row) 105 | 106 | X_categ = np.asarray(data) 107 | return X_categ 108 | 109 | def clf_summary(mean_scores, ci_scores): 110 | 111 | buff = cStringIO.StringIO() 112 | try: 113 | print('class \tprecision \trecall \tf1 score \tsupport', file=buff) 114 | for j in xrange(mean_scores.shape[1]): 115 | print(j, end="\t", file=buff) 116 | for i in xrange(mean_scores.shape[0]): 117 | print('%.3f +- %.3f' % (mean_scores[i, j], ci_scores[i, j]), 118 | end="\t", file=buff) 119 | print(file=buff) 120 | print(file=buff) 121 | 122 | return buff.getvalue() 123 | finally: 124 | buff.close() 125 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/multimodel_class.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 2 | 3 | from __future__ import division, print_function 4 | 5 | from sklearn.base import clone 6 | from sklearn.metrics import classification_report 7 | 8 | from learn_base import create_grid_search 9 | from learn_base import load_categories 10 | 11 | import boosting 12 | import cotrain 13 | import stacking 14 | import numpy as np 15 | import os 16 | import plac 17 | import sys 18 | 19 | def load_features(features_folder, best_by, gamma_max): 20 | 21 | F = [] 22 | matrices = {} 23 | feats_fname = 'year#%d.txt' 24 | 25 | for i in xrange(best_by.shape[0]): 26 | bby = best_by[i] 27 | 28 | if bby == np.inf: 29 | feats_file = os.path.join(features_folder, feats_fname % gamma_max) 30 | else: 31 | bby = int(bby) 32 | feats_file = os.path.join(features_folder, feats_fname % bby) 33 | 34 | if bby in matrices: 35 | Fi = matrices[bby] 36 | else: 37 | Fi = np.genfromtxt(feats_file)[:,1:] 38 | matrices[bby] = Fi 39 | 40 | feats = Fi[i] 41 | 42 | F.append(feats) 43 | 44 | return np.asanyarray(F) 45 | 46 | def save_results(out_folder, base_name, y_pred, y_true): 47 | folder = os.path.join(out_folder, base_name) 48 | 49 | try: 50 | os.mkdir(folder) 51 | except: 52 | pass 53 | 54 | out_file = os.path.join(folder, 'pred.dat') 55 | np.savetxt(out_file, y_pred) 56 | 57 | with open(os.path.join(folder, 'summ.dat'), 'w') as summ_file: 58 | print(classification_report(y_true, y_pred), file=summ_file) 59 | 60 | def run_classifier(out_folder, trend_probs, referrers, y, train, test): 61 | 62 | F = referrers #static features 63 | etree = create_grid_search('lr', n_jobs = 1) 64 | 65 | y_pred = trend_probs[test].argmax(axis=1) 66 | save_results(out_folder, 'tl-base-lr', y_pred, y[test]) 67 | 68 | aux = clone(etree) 69 | aux.fit(F[train], y[train]) 70 | y_pred = aux.predict(F[test]) 71 | save_results(out_folder, 'tree-feats', y_pred, y[test]) 72 | 73 | aux = clone(etree) 74 | aux.fit(trend_probs[train], y[train]) 75 | y_pred = aux.predict(trend_probs[test]) 76 | save_results(out_folder, 'tree-probs', y_pred, y[test]) 77 | 78 | C = np.hstack((F, trend_probs)) 79 | aux = clone(etree) 80 | aux.fit(C[train], y[train]) 81 | y_pred = aux.predict(C[test]) 82 | save_results(out_folder, 'meta-combine', y_pred, y[test]) 83 | 84 | #stack_clf = stacking.Stacking(3, [etree], 'tree') 85 | #stack_clf.fit(F[train], y[train], trend_probs[train]) 86 | #y_pred = stack_clf.predict(F[test], trend_probs[test]) 87 | #save_results(out_folder, 'meta-stack-tree', y_pred) 88 | 89 | stack_clf = stacking.Stacking(3, [etree], 'linear') 90 | stack_clf.fit(F[train], y[train], trend_probs[train]) 91 | y_pred = stack_clf.predict(F[test], trend_probs[test]) 92 | 
save_results(out_folder, 'meta-stack-linear', y_pred, y[test]) 93 | 94 | #stack_clf = stacking.Stacking(3, [etree], 'deco') 95 | #stack_clf.fit(F[train], y[train], trend_probs[train]) 96 | #y_pred = stack_clf.predict(F[test], trend_probs[test]) 97 | #save_results(out_folder, 'meta-stack-svm', y_pred) 98 | 99 | def run_one_folder(features_folder, fold_folder, results_name, gamma_max): 100 | 101 | #File paths 102 | best_by_test_fpath = os.path.join(fold_folder, results_name, 103 | 'best-by.dat') 104 | best_by_train_fpath = os.path.join(fold_folder, results_name + '-train', 105 | 'best-by.dat') 106 | 107 | all_conf_test_fpath = os.path.join(fold_folder, results_name, 108 | 'all-conf.dat') 109 | all_conf_train_fpath = os.path.join(fold_folder, results_name + '-train', 110 | 'all-conf.dat') 111 | 112 | ytest_fpath = os.path.join(fold_folder, 'ksc', 'test_assign.dat') 113 | ytrain_fpath = os.path.join(fold_folder, 'ksc', 'assign.dat') 114 | 115 | test_fpath = os.path.join(fold_folder, 'test.dat') 116 | train_fpath = os.path.join(fold_folder, 'train.dat') 117 | tags_fpath = os.path.join(features_folder, 'tags.dat') 118 | 119 | #Loading Matrices 120 | best_by_test = np.genfromtxt(best_by_test_fpath) 121 | best_by_train = np.genfromtxt(best_by_train_fpath) 122 | 123 | test = np.loadtxt(test_fpath, dtype='bool') 124 | train = np.loadtxt(train_fpath, dtype='bool') 125 | 126 | assert np.logical_xor(train, test).all() 127 | assert best_by_train.shape == train.sum() 128 | assert best_by_test.shape == test.sum() 129 | 130 | best_by = np.zeros(best_by_test.shape[0] + best_by_train.shape[0]) 131 | best_by[test] = best_by_test 132 | best_by[train] = best_by_train 133 | 134 | trend_probs_test = np.genfromtxt(all_conf_test_fpath) 135 | trend_probs_train = np.genfromtxt(all_conf_train_fpath) 136 | 137 | assert trend_probs_train.shape[0] == train.sum() 138 | assert trend_probs_test.shape[0] == test.sum() 139 | 140 | shape = (trend_probs_test.shape[0] + trend_probs_train.shape[0], 141 | trend_probs_test.shape[1]) 142 | trend_probs = np.zeros(shape) 143 | trend_probs[test] = trend_probs_test 144 | trend_probs[train] = trend_probs_train 145 | 146 | y_true_test = np.loadtxt(ytest_fpath, dtype='i') 147 | y_true_train = np.loadtxt(ytrain_fpath, dtype='i') 148 | 149 | assert y_true_train.shape[0] == train.sum() 150 | assert y_true_test.shape[0] == test.sum() 151 | 152 | y_true = np.zeros(y_true_train.shape[0] + y_true_test.shape[0]) 153 | y_true[test] = y_true_test 154 | y_true[train] = y_true_train 155 | 156 | referrers = load_features(features_folder, best_by, gamma_max) 157 | 158 | #Actual test, ufa 159 | run_classifier(os.path.join(fold_folder, results_name), 160 | trend_probs, referrers, y_true, train, test) 161 | 162 | @plac.annotations( 163 | features_folder=plac.Annotation('Folder with features', type=str), 164 | fold_folder=plac.Annotation('Folder with the train and test data', type=str), 165 | results_name=plac.Annotation('Base name of the results folder', type=str), 166 | gamma_max=plac.Annotation('Gamma Max', type=int)) 167 | def main(features_folder, fold_folder, results_name, gamma_max): 168 | run_one_folder(features_folder, fold_folder, results_name, gamma_max) 169 | 170 | if __name__ == '__main__': 171 | sys.exit(plac.call(main)) 172 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 3 ]; then 4 | echo "Please 
provide me with a time series file, an output folder and a features folder" 5 | exit 1 6 | fi 7 | 8 | IN=$1 9 | BASE_FOLD=$2 10 | FEATURES_FOLDER=$3 11 | 12 | K=2 13 | F1=0.5 14 | GAMMA_MAX=20 15 | 16 | #Creates output folder 17 | mkdir -p $BASE_FOLD 2> /dev/null 18 | 19 | #Generate cross-val 20 | python generate_cross_vals.py $IN $BASE_FOLD 21 | 22 | #Cluster dataset 23 | for fold in $BASE_FOLD/*/; do 24 | mkdir -p $fold/ksc 2> /dev/null 25 | python cluster.py $IN $fold/ksc $K 26 | done 27 | 28 | #Compute agreement between folds 29 | python sim_folds.py $IN $BASE_FOLD 30 | 31 | #Precompute probabilities train 32 | for fold in $BASE_FOLD/*/; do 33 | mkdir -p $fold/probs/ 2> /dev/null 34 | python classify_pts.py $IN $fold/train.dat $fold/ksc/cents.dat \ 35 | $fold/ksc/assign.dat $fold/probs/ $GAMMA_MAX 36 | done 37 | 38 | #Precompute probabilities test 39 | for fold in $BASE_FOLD/*/; do 40 | mkdir -p $fold/probs-test/ 2> /dev/null 41 | python classify_pts_test.py $IN $fold/ksc/cents.dat $fold/test.dat \ 42 | $fold/ksc/assign.dat $fold/probs-test/ $GAMMA_MAX 43 | done 44 | 45 | #Create the assign for the test 46 | for fold in $BASE_FOLD/*/; do 47 | python create_test_assign.py $IN $fold/test.dat \ 48 | $fold/ksc/cents.dat > $fold/ksc/test_assign.dat 49 | done 50 | 51 | #Learn parameters train 52 | for fold in $BASE_FOLD/*/; do 53 | mkdir -p $fold/cls-res-fitted-$F1-$GAMMA_MAX-train 2> /dev/null 54 | done 55 | python classify_theta_train.py $IN $BASE_FOLD $F1 cls-res-fitted-$F1-$GAMMA_MAX-train $GAMMA_MAX $K 56 | 57 | #Learn parameters test 58 | for fold in $BASE_FOLD/*/; do 59 | mkdir -p $fold/cls-res-fitted-$F1-$GAMMA_MAX 2> /dev/null 60 | done 61 | python classify_theta.py $IN $BASE_FOLD $F1 cls-res-fitted-$F1-$GAMMA_MAX $GAMMA_MAX $K 62 | 63 | #Adding static features 64 | for fold in $BASE_FOLD/*/; do 65 | python multimodel_class.py $FEATURES_FOLDER $fold cls-res-fitted-$F1-$GAMMA_MAX $GAMMA_MAX 66 | done 67 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import division, print_function 3 | 4 | from collections import defaultdict 5 | 6 | from vod.stats.ci import half_confidence_interval_size as hci 7 | 8 | import numpy as np 9 | import plac 10 | import sys 11 | 12 | class OLS(object): 13 | 14 | def __init__(self): 15 | 16 | self.coeffs = None 17 | self.residuals = None 18 | self.gcv_sqerrors = None 19 | 20 | def fit(self, X, y): 21 | 22 | assert X.shape[0] == y.shape[0] 23 | 24 | X = np.asanyarray(X, dtype='f', order='C') 25 | y = np.asanyarray(y, dtype='f', order='C') 26 | 27 | n = y.shape[0] 28 | 29 | PI = np.linalg.pinv(X) 30 | H = np.dot(X, PI) 31 | self.coeffs = np.dot(PI, y) 32 | 33 | y_hat = np.dot(X, self.coeffs) 34 | 35 | self.residuals = y_hat - y 36 | 37 | aux = self.residuals / (1 - np.diag(H)) 38 | self.gcv_sqerrors = np.power(aux, 2) 39 | 40 | def predict(X): 41 | return np.dot(X, self.coeffs) 42 | 43 | def fit(X, tr, tt): 44 | ols = OLS() 45 | 46 | y = X[:, :tt].sum(axis=1) 47 | XR = (X[:, :tr].T / y).T 48 | ols.fit(XR, np.ones(XR.shape[0])) 49 | 50 | return ols 51 | 52 | def main(tseries_fpath, predict_fpath, bestby_fpath): 53 | 54 | X = np.genfromtxt(tseries_fpath)[:,1:] + 0.0001 55 | cls_pred = np.loadtxt(predict_fpath, dtype='i') 56 | rgr_true = X.sum(axis=1) 57 | bestby = np.genfromtxt(bestby_fpath) 58 | 59 | cls_labels = set(cls_pred[cls_pred != -1]) 60 | 61 | tt = 
X.shape[1] 62 | models = {} 63 | models_per_clust = {} 64 | ref_time = np.arange(1, tt + 1) 65 | 66 | #tr = 7 67 | #ref_time = np.array([tr]) 68 | #bestby = np.zeros(bestby.shape[0]) + tr 69 | 70 | for tr in ref_time: 71 | models[tr] = fit(X, tr, tt) 72 | 73 | for k in sorted(cls_labels): 74 | Xk = X[cls_pred == k] 75 | models_per_clust[tr, k] = fit(Xk, tr, tt) 76 | 77 | errors_all = [] 78 | errors_cls = [] 79 | errors_per_cls = defaultdict(list) 80 | for tr in ref_time: 81 | idx = bestby == tr 82 | ols = models[tr] 83 | 84 | errors_all.extend(ols.gcv_sqerrors[idx]) 85 | classes = cls_pred[idx] 86 | 87 | for cls in set(classes): 88 | bestby_for_cls = bestby[cls_pred == cls] 89 | idx_cls = bestby_for_cls == tr 90 | 91 | ols = models_per_clust[tr, cls] 92 | errors_cls.extend(ols.gcv_sqerrors[idx_cls]) 93 | errors_per_cls[cls].extend(ols.gcv_sqerrors[idx_cls]) 94 | 95 | print('Glob model:', np.mean(errors_all), '+-', hci(errors_all, .95)) 96 | print('Spec model:', np.mean(errors_cls), '+-', hci(errors_cls, .95)) 97 | print() 98 | print('Per class') 99 | for cls in cls_labels: 100 | err = errors_per_cls[cls] 101 | print('Cls = ', cls, np.mean(err), '+-', hci(err, .95)) 102 | 103 | 104 | if __name__ == '__main__': 105 | sys.exit(plac.call(main)) 106 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/sim_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf8 3 | from __future__ import print_function, division 4 | 5 | from pyksc import dist 6 | 7 | import glob 8 | import numpy as np 9 | import os 10 | import plac 11 | import sys 12 | 13 | def main(tseries_fpath, in_folder): 14 | 15 | ids = [] 16 | with open(tseries_fpath) as tseries_file: 17 | for l in tseries_file: 18 | ids.append(l.split()[0]) 19 | 20 | ids = np.array(ids) 21 | folders = glob.glob(os.path.join(in_folder, 'fold-*/ksc')) 22 | num_folders = len(folders) 23 | 24 | agree = 0 25 | diff = 0 26 | 27 | for i in xrange(num_folders): 28 | 29 | base_i = os.path.dirname(folders[i]) 30 | Ci = np.loadtxt(os.path.join(folders[i], 'cents.dat')) 31 | 32 | train_i = np.loadtxt(os.path.join(base_i, 'train.dat'), dtype='bool') 33 | assign_i = np.loadtxt(os.path.join(folders[i], 'assign.dat')) 34 | 35 | for j in xrange(i, num_folders): 36 | 37 | base_j = os.path.dirname(folders[j]) 38 | Cj = np.loadtxt(os.path.join(folders[j], 'cents.dat')) 39 | 40 | dists = dist.dist_all(Ci, Cj, rolling=True)[0] 41 | argsrt = dists.argsort(axis=1) 42 | 43 | train_j = np.loadtxt(os.path.join(base_j, 'train.dat'), dtype='bool') 44 | assign_j = np.loadtxt(os.path.join(folders[j], 'assign.dat')) 45 | 46 | for k in xrange(argsrt.shape[0]): 47 | first = True 48 | for o in argsrt[k]: 49 | ids_k = set(ids[train_i][assign_i == k]) 50 | ids_o = set(ids[train_j][assign_j == o]) 51 | n_inter = len(ids_k.intersection(ids_o)) 52 | 53 | if first: 54 | first = False 55 | agree += n_inter 56 | else: 57 | diff += n_inter 58 | 59 | print('AgreedProb = ', agree / (agree + diff)) 60 | print('DisagreeProb = ', diff / (agree + diff)) 61 | 62 | if __name__ == '__main__': 63 | sys.exit(plac.call(main)) 64 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/stacking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf8 3 | 4 | from __future__ import division, print_function 5 | 6 | from learn_base import 
create_grid_search 7 | 8 | from sklearn import base 9 | from sklearn import model_selection 10 | from sklearn import linear_model 11 | from sklearn import tree 12 | 13 | import numpy as np 14 | 15 | class StackingException(Exception): pass 16 | 17 | class Stacking(object): 18 | '''Implements a stacking classifier''' 19 | 20 | def __init__(self, num_splits, base_models, stacker_name='linear'): 21 | 22 | STACKERS = {'tree':_TreeStacking, 23 | 'linear':_MLRStacking, 24 | 'deco':_DecoStacking} 25 | 26 | self.num_splits = num_splits 27 | self.base_classifiers = [] 28 | 29 | for base_model in base_models: 30 | clone = base.clone(base_model) 31 | self.base_classifiers.append(clone) 32 | 33 | if stacker_name not in STACKERS: 34 | names = STACKERS.keys() 35 | raise StackingException('Unknown combiner, choose from: %s' % names) 36 | 37 | self.stacker = STACKERS[stacker_name]() 38 | self.P_fit = None 39 | self.y_fit = None 40 | self.model = None 41 | self.num_classes = 0 42 | 43 | def fit(self, X, y, B): 44 | X = np.asanyarray(X) 45 | y = np.asanyarray(y) 46 | 47 | assert X.shape[0] == y.shape[0] 48 | assert y.ndim == 1 49 | 50 | self.num_classes = len(set(y)) 51 | num_base_models = len(self.base_classifiers) 52 | 53 | P = np.zeros((X.shape[0], self.num_classes * num_base_models)) 54 | 55 | kfold = model_selection.StratifiedKFold(self.num_splits) 56 | for train, test in kfold.split(X, y): 57 | for i, base_model in enumerate(self.base_classifiers): 58 | base_model.fit(X[train], y[train]) 59 | 60 | base_probs = base_model.predict_proba(X[test]) 61 | cols_min = i * self.num_classes 62 | cols_max = self.num_classes * (i + 1) 63 | P[test, cols_min:cols_max] = base_probs 64 | 65 | #a = P.max(axis=1) 66 | #b = P.argmax(axis=1) 67 | #c = B.max(axis=1) 68 | #d = B.argmax(axis=1) 69 | #self.stacker.fit(np.vstack((a, b, c, d)).T, y) 70 | self.stacker.fit(np.hstack((P, B)), y) 71 | 72 | def predict(self, X, B): 73 | X = np.asanyarray(X) 74 | 75 | num_features = len(self.base_classifiers) * self.num_classes 76 | P = np.zeros((X.shape[0], num_features)) 77 | for i, base_model in enumerate(self.base_classifiers): 78 | base_probs = base_model.predict_proba(X) 79 | cols_min = i * self.num_classes 80 | cols_max = self.num_classes * (i + 1) 81 | P[:, cols_min:cols_max] = base_probs 82 | 83 | #a = P.max(axis=1) 84 | #b = P.argmax(axis=1) 85 | #c = B.max(axis=1) 86 | #d = B.argmax(axis=1) 87 | #P = np.vstack((a, b, c, d)).T 88 | P = np.hstack((P, B)) 89 | return self.stacker.predict(P) 90 | 91 | class _MLRStacking(base.BaseEstimator, base.ClassifierMixin): 92 | """Implements a multi-response linear regression classifier""" 93 | 94 | def __init__(self): 95 | self.regressors = dict() 96 | 97 | def fit(self, X, y): 98 | X = np.asanyarray(X) 99 | y = np.asanyarray(y) 100 | 101 | for yi in set(y): 102 | self.regressors[yi] = linear_model.LinearRegression() 103 | specific_y = np.asanyarray(y == yi, dtype='i') 104 | self.regressors[yi].fit(X, specific_y) 105 | 106 | def predict(self, X): 107 | X = np.asanyarray(X) 108 | 109 | prediction = np.zeros(X.shape[0]) 110 | best_value = np.zeros_like(prediction) 111 | for yi, regressor in self.regressors.items(): 112 | value = regressor.predict(X) 113 | for index, vindex in enumerate(value): 114 | if vindex > best_value[index]: 115 | best_value[index] = vindex 116 | prediction[index] = yi 117 | return prediction 118 | 119 | class _TreeStacking(base.BaseEstimator, base.ClassifierMixin): 120 | '''Implements stacking with a multiresponse regression tree''' 121 | 122 | def __init__(self): 
123 | self.model = None 124 | 125 | def _y_to_one_zero_mat(self, y): 126 | y = np.asanyarray(y) 127 | 128 | #Guarantees that y is 0 to n - 1 129 | unique_y, labels_flat = np.unique(y, return_inverse=True) 130 | y = labels_flat.reshape(y.shape) 131 | 132 | Y = np.zeros(shape=(len(y), len(unique_y)), dtype='f', order='C') 133 | for yi in unique_y: 134 | Y[:, yi] = (y == yi) 135 | 136 | return Y 137 | 138 | def fit(self, X, y): 139 | X = np.asanyarray(X, dtype='f', order='C') 140 | Y = self._y_to_one_zero_mat(y) 141 | 142 | self.model = tree.DecisionTreeRegressor() 143 | self.model.fit(X, Y) 144 | 145 | def predict(self, X): 146 | X = np.asanyarray(X, dtype='f', order='C') 147 | P = self.model.predict(X) 148 | return P.argmax(axis=1) 149 | 150 | class _DecoStacking(base.BaseEstimator, base.ClassifierMixin): 151 | 152 | def __init__(self): 153 | self.model = etree = create_grid_search('extra_trees', n_jobs = 1) 154 | 155 | def fit(self, X, y): 156 | X = np.asanyarray(X, dtype='f', order='C') 157 | y = np.asanyarray(y, dtype='f', order='C') 158 | 159 | self.model.fit(X, y) 160 | 161 | def predict(self, X): 162 | return self.model.predict(X) 163 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/summarize_results.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf8 2 | from __future__ import division, print_function 3 | 4 | from pyksc import dist 5 | 6 | from sklearn.metrics import f1_score 7 | from sklearn.metrics import classification_report 8 | 9 | import glob 10 | import numpy as np 11 | import os 12 | import plac 13 | 14 | def main(tseries_fpath, base_folder): 15 | 16 | folders = glob.glob(os.path.join(base_folder, 'fold-*')) 17 | num_folders = len(folders) 18 | 19 | cluster_mapping = [] 20 | C_base = np.loadtxt(os.path.join(folders[0], 'ksc/cents.dat')) 21 | 22 | for i in xrange(num_folders): 23 | Ci = np.loadtxt(os.path.join(folders[i], 'ksc/cents.dat')) 24 | 25 | dists = dist.dist_all(Ci, C_base, rolling=True)[0] 26 | closest = dists.argmin(axis=1) 27 | 28 | cluster_mapping.append({}) 29 | for k in xrange(Ci.shape[0]): 30 | cluster_mapping[i][k] = closest[k] 31 | 32 | y_true_all = [] 33 | y_pred_all = [] 34 | for i in xrange(num_folders): 35 | y_true = np.loadtxt(os.path.join(folders[i], 'ksc/test_assign.dat')) 36 | y_pred = np.loadtxt(os.path.join(folders[i], \ 37 | 'cls-res-fitted-50/pred.dat')) 38 | 39 | for j in xrange(y_true.shape[0]): 40 | y_true[j] = cluster_mapping[i][y_true[j]] 41 | if y_pred[j] != -1: 42 | y_pred[j] = cluster_mapping[i][y_pred[j]] 43 | 44 | y_true_all.extend(y_true) 45 | y_pred_all.extend(y_pred) 46 | 47 | y_pred_all = np.asarray(y_pred_all) 48 | y_true_all = np.asarray(y_true_all) 49 | 50 | report = classification_report(y_true_all, y_pred_all) 51 | valid = y_pred_all != -1 52 | print() 53 | print('Using the centroids from folder: ', folders[0]) 54 | print('Micro Aggregation of Folds:') 55 | print('%.3f fract of videos were not classified' % (sum(~valid) / y_pred_all.shape[0])) 56 | print() 57 | print(classification_report(y_true_all[valid], y_pred_all[valid])) 58 | 59 | if __name__ == '__main__': 60 | plac.call(main) 61 | -------------------------------------------------------------------------------- /src/trend-learner-scripts/translation-final-results-to-paper-new.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/flaviovdf/pyksc/6ba8988c7fad63366dc2b8d005d0779971e129c5/src/trend-learner-scripts/translation-final-results-to-paper-new.png --------------------------------------------------------------------------------
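Taken together, the trend-learner scripts cluster the training series with KSC (cluster.py), assign each test series to its nearest centroid (create_test_assign.py), learn per-cluster minimum-points and confidence thresholds (classify_theta_train.py / classify_theta.py), and only then blend in static features (multimodel_class.py). A condensed sketch of the test-assignment step on toy arrays (illustrative names; assumes the pyksc extension is built) follows the same dist.dist_all call pattern used by create_test_assign.py and summarize_results.py:

    from __future__ import division, print_function

    import numpy as np

    from pyksc import dist

    # Toy data: C is a (k x T) matrix of centroids, Xtest an (n x T) matrix of test series.
    C = np.asanyarray(np.random.rand(2, 100) + 1e-6, order='C')
    Xtest = np.asanyarray(np.random.rand(50, 100) + 1e-6, order='C')

    # dist_all returns a tuple; its first element is the (k x n) matrix of
    # shift-invariant distances between every centroid and every test series.
    dists = dist.dist_all(C, Xtest, rolling=True)[0]
    test_assign = dists.argmin(axis=0)   # nearest centroid per test series
    print(np.bincount(test_assign))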