├── .gitignore ├── .travis.yml ├── COPYING ├── README.md ├── requirements.txt ├── spylearn ├── __init__.py ├── block_rdd.py ├── blocked_math.py ├── histogram.py ├── k_means.py ├── linear_model.py └── random_permutation.py └── test ├── common.py ├── run_tests.sh ├── test_block_rdd.py ├── test_blocked_math.py ├── test_histogram.py ├── test_k_means.py ├── test_linear_model.py └── test_random_permutation.py /.gitignore: -------------------------------------------------------------------------------- 1 | ## generic files to ignore 2 | *~ 3 | *.lock 4 | *.DS_Store 5 | *.swp 6 | *.out 7 | 8 | awsdeploy/ 9 | 10 | # rails specific 11 | *.sqlite3 12 | config/database.yml 13 | log/* 14 | tmp/* 15 | 16 | # java specific 17 | *.class 18 | 19 | # python specific 20 | *.pyc 21 | 22 | # xcode/iphone specific 23 | build/* 24 | *.pbxuser 25 | *.mode2v3 26 | *.mode1v3 27 | *.perspective 28 | *.perspectivev3 29 | *~.nib 30 | 31 | # akka specific 32 | logs/* 33 | 34 | # sbt specific 35 | target/ 36 | project/boot 37 | lib_managed/* 38 | project/build/target 39 | project/build/lib_managed 40 | project/build/src_managed 41 | project/plugins/lib_managed 42 | project/plugins/target 43 | project/plugins/src_managed 44 | project/plugins/project 45 | 46 | core/lib_managed 47 | core/target 48 | pubsub/lib_managed 49 | pubsub/target 50 | 51 | # eclipse specific 52 | .metadata 53 | jrebel.lic 54 | .settings 55 | .classpath 56 | .project 57 | 58 | .ensime* 59 | *.sublime-* 60 | .cache 61 | 62 | # intellij 63 | *.eml 64 | *.iml 65 | *.ipr 66 | *.iws 67 | .*.sw? 68 | .idea 69 | 70 | # paulp script 71 | /.lib/ 72 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | jdk: 5 | - openjdk7 6 | virtualenv: 7 | system_site_packages: true 8 | install: 9 | # Install Spark 10 | - wget http://d3kbcqa49mib13.cloudfront.net/spark-0.9.0-incubating-bin-hadoop1.tgz 11 | - tar -xzf spark-0.9.0-incubating-bin-hadoop1.tgz 12 | - sudo apt-get install -qq python-numpy python-scipy python-pandas python-sklearn 13 | - pip install -r requirements.txt 14 | # Workaround for Travis issue with POSIX semaphores; see 15 | # https://github.com/travis-ci/travis-cookbooks/issues/155 16 | - "sudo rm -rf /dev/shm && sudo ln -s /run/shm /dev/shm" 17 | script: 18 | - export SPARK_HOME=`pwd`/spark-0.9.0-incubating-bin-hadoop1 19 | - cd test 20 | - ./run_tests.sh -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2007–2014 The pyspark-sklearn developers. 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the name of the Scikit-learn Developers nor the names of 16 | its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written 18 | permission. 
19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 31 | DAMAGE. 32 | 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySpark / scikit-learn glue code 2 | 3 | [![Build Status](https://travis-ci.org/ogrisel/spylearn.png?branch=master)](https://travis-ci.org/ogrisel/spylearn) 4 | 5 | Repo with experimental tools to show how to combine scikit-learn and PySpark. 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse 2 | numpy 3 | scipy 4 | scikit-learn 5 | pandas -------------------------------------------------------------------------------- /spylearn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/spylearn/8af56f44d52cd46c72877a1df80a2406bbc61d34/spylearn/__init__.py -------------------------------------------------------------------------------- /spylearn/block_rdd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | 4 | 5 | def block_rdd(data, block_size=None): 6 | """Block an RDD 7 | 8 | Parameters 9 | ---------- 10 | 11 | data : RDD 12 | RDD of data points to block into either numpy arrays, 13 | scipy sparse matrices, or pandas data frames. 14 | Type of data point will be automatically inferred 15 | and blocked accordingly. 16 | 17 | block_size : int, optional, default None 18 | Size of each block (number of elements), if None all data points 19 | from each partition will be combined in a block. 
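
    Examples
    --------
    A minimal usage sketch (illustrative only; assumes a live SparkContext
    named ``sc`` and ``import numpy as np``)::

        rdd = sc.parallelize([np.array([1., 2.]) for _ in range(100)], 4)
        blocked = block_rdd(rdd, block_size=10)
        blocked.first()  # a (10, 2) ndarray packed from part of one partition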
20 | 
21 |     """
22 | 
23 |     import pandas as pd
24 |     try:
25 |         entry = data.first()
26 |     except IndexError:
27 |         # empty RDD: do not block
28 |         return data
29 | 
30 |     # do different kinds of blocking depending on the type
31 |     if isinstance(entry, tuple):
32 |         return data.mapPartitions(_block_tuple, block_size)
33 | 
34 |     elif isinstance(entry, dict):
35 |         return data.mapPartitions(
36 |             lambda x: _block_collection(x, pd.DataFrame, block_size))
37 | 
38 |     elif sp.issparse(entry):
39 |         return data.mapPartitions(
40 |             lambda x: _block_collection(x, sp.vstack, block_size))
41 | 
42 |     else:
43 |         # Fallback to array packing
44 |         return data.mapPartitions(
45 |             lambda x: _block_collection(x, np.array, block_size))
46 | 
47 | 
48 | def _pack_accumulated(accumulated):
49 |     if len(accumulated) > 0 and sp.issparse(accumulated[0]):
50 |         return sp.vstack(accumulated)
51 |     else:
52 |         return np.array(accumulated)
53 | 
54 | 
55 | def _block_tuple(iterator, block_size=None):
56 |     """Pack an rdd of tuples as tuples of arrays or scipy.sparse matrices."""
57 |     i = 0
58 |     blocked_tuple = None
59 |     for tuple_i in iterator:
60 |         if blocked_tuple is None:
61 |             blocked_tuple = tuple([] for _ in range(len(tuple_i)))
62 | 
63 |         if block_size is not None and i >= block_size:
64 |             yield tuple(_pack_accumulated(x) for x in blocked_tuple)
65 |             blocked_tuple = tuple([] for _ in range(len(tuple_i)))
66 |             i = 0
67 |         for x_j, x in zip(tuple_i, blocked_tuple):
68 |             x.append(x_j)
69 |         i += 1
70 |     yield tuple(_pack_accumulated(x) for x in blocked_tuple)
71 | 
72 | 
73 | def _block_collection(iterator, collection_type, block_size=None):
74 |     """Pack an rdd with a specific collection constructor."""
75 |     i = 0
76 |     accumulated = []
77 |     for a in iterator:
78 |         if block_size is not None and i >= block_size:
79 |             yield collection_type(accumulated)
80 |             accumulated = []
81 |             i = 0
82 |         accumulated.append(a)
83 |         i += 1
84 |     yield collection_type(accumulated)
85 | 
-------------------------------------------------------------------------------- /spylearn/blocked_math.py: --------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import scipy.linalg as ln
 3 | from operator import add
 4 | 
 5 | 
 6 | def count(blocked_rdd):
 7 |     return blocked_rdd.map(lambda x: x.shape[0]).reduce(add)
 8 | 
 9 | 
10 | def sum(blocked_rdd, axis=None):
11 |     """
12 |     Compute the sum of a blocked RDD, either the sum of all values,
13 |     or the sum along the specified dimension (must be 0).
14 |     """
15 |     if axis is None:
16 |         return blocked_rdd.map(np.sum).reduce(add)
17 |     elif axis == 0:
18 |         return blocked_rdd.map(lambda x: np.sum(x, axis)).reduce(add)
19 |     else:
20 |         raise ValueError("axis must be 0 or None")
21 | 
22 | 
23 | def mean(blocked_rdd):
24 |     """
25 |     Compute the mean as a weighted average of per-block means, which avoids
26 |     summing everything before dividing and so reduces the risk of overflow.
27 |     """
28 |     pavgs = blocked_rdd.map(lambda b: (np.average(b, axis=0), b.shape[0]))
29 |     avgs, weights = zip(*pavgs.collect())
30 |     return np.average(np.array(avgs), axis=0, weights=weights)
31 | 
32 | 
33 | def cov(blocked_rdd):
34 |     """
35 |     Calculates the covariance matrix for the given blocked RDD.
36 |     Unlike numpy.cov, expects each row to represent an observation.
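    Note that the result is normalized by n rather than n - 1, so on the
    stacked data it should agree with ``numpy.cov(X, rowvar=0, bias=1)``.
    A comparison sketch (assumes a local array ``X`` and a blocked RDD
    ``data`` built from it with ``block_rdd``)::

        np.allclose(cov(data), np.cov(X, rowvar=0, bias=1))  # expected True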
37 |     """
38 |     avg = mean(blocked_rdd)
39 |     covs = blocked_rdd.map(lambda x: x - avg).map(lambda x: (x.T.dot(x), x.shape[0]))
40 |     prod, count = covs.reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))
41 |     return prod / count
42 | 
43 | 
44 | def svd(blocked_rdd, k):
45 |     """
46 |     Calculate the SVD of a blocked RDD directly, returning only the leading k
47 |     singular vectors. Assumes n rows and d columns; efficient when n >> d.
48 |     Requires d^2 to fit within the memory of a single machine.
49 | 
50 |     Parameters
51 |     ----------
52 | 
53 |     blocked_rdd : RDD
54 |         RDD with data points in numpy array blocks
55 | 
56 |     k : int
57 |         Number of singular vectors to return
58 | 
59 |     Returns
60 |     -------
61 | 
62 |     u : RDD of blocks
63 |         Left singular vectors
64 |     s : numpy array
65 |         Singular values
66 |     v : numpy array
67 |         Right singular vectors
68 |     """
69 | 
70 |     # compute the covariance matrix (without mean subtraction)
71 |     # TODO use one func for this (with mean subtraction as an option?)
72 |     c = blocked_rdd.map(lambda x: (x.T.dot(x), x.shape[0]))
73 |     prod, n = c.reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))
74 | 
75 |     # do local eigendecomposition
76 |     w, v = ln.eig(prod / n)
77 |     w = np.real(w)
78 |     v = np.real(v)
79 |     inds = np.argsort(w)[::-1]
80 |     s = np.sqrt(w[inds[0:k]]) * np.sqrt(n)
81 |     v = v[:, inds[0:k]].T
82 | 
83 |     # project back into data, normalize by singular values
84 |     u = blocked_rdd.map(lambda x: np.inner(x, v) / s)
85 | 
86 |     return u, s, v
87 | 
88 | 
89 | def svd_em(blocked_rdd, k, maxiter=20, tol=1e-5, seed=None):
90 |     """
91 |     Calculate the SVD of a blocked RDD using an expectation maximization
92 |     algorithm (from Roweis, NIPS, 1997) that avoids explicitly
93 |     computing the covariance matrix, returning only the leading k
94 |     singular vectors. Assumes n rows and d columns; does not require
95 |     d^2 to fit into memory on a single machine.
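
    Each pass applies the two closed-form EM updates (restating the inline
    comments below, with c the current k x d subspace estimate and y the data):

        e-step:  x = (cc')^-1 c y
        m-step:  c = y x' (xx')^-1

    so the driver only ever has to hold k x d and k x k matrices.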
 96 | 
 97 |     Parameters
 98 |     ----------
 99 | 
100 |     blocked_rdd : RDD
101 |         RDD with data points in numpy array blocks
102 | 
103 |     k : int
104 |         Number of singular vectors to return
105 | 
106 |     maxiter : int, optional, default = 20
107 |         Number of iterations to perform
108 | 
109 |     tol : float, optional, default = 1e-5
110 |         Tolerance for stopping iterative updates
111 | 
112 |     seed : int, optional, default = None
113 |         Seed for the random number generator used to initialize the subspace
114 | 
115 |     Returns
116 |     -------
117 | 
118 |     u : RDD of blocks
119 |         Left singular vectors
120 |     s : numpy array
121 |         Singular values
122 |     v : numpy array
123 |         Right singular vectors
124 |     """
125 | 
126 |     n = count(blocked_rdd)
127 |     m = len(blocked_rdd.first()[0])
128 | 
129 |     def outerprod(x):
130 |         return x.T.dot(x)
131 | 
132 |     if seed is not None:
133 |         rng = np.random.RandomState(seed)
134 |         c = rng.randn(k, m)
135 |     else:
136 |         c = np.random.randn(k, m)
137 |     iter = 0
138 |     error = 100
139 | 
140 |     # iteratively update subspace using expectation maximization
141 |     # e-step: x = (cc')^-1 c y
142 |     # m-step: c = y x' (xx')^-1
143 |     while (iter < maxiter) & (error > tol):
144 |         c_old = c
145 |         # pre compute (cc')^-1 c
146 |         c_inv = np.dot(c.T, ln.inv(np.dot(c, c.T)))
147 |         premult1 = blocked_rdd.context.broadcast(c_inv)
148 |         # compute (xx')^-1 through a map reduce
149 |         xx = blocked_rdd.map(lambda x: outerprod(np.dot(x, premult1.value))).reduce(add)
150 |         xx_inv = ln.inv(xx)
151 |         # pre compute (cc')^-1 c (xx')^-1
152 |         premult2 = blocked_rdd.context.broadcast(np.dot(c_inv, xx_inv))
153 |         # compute the new c through a map reduce
154 |         c = blocked_rdd.map(lambda x: np.dot(x.T, np.dot(x, premult2.value))).reduce(add)
155 |         c = c.T
156 | 
157 |         error = np.sum((c - c_old) ** 2)
158 |         iter += 1
159 | 
160 |     # project data into subspace spanned by columns of c
161 |     # use standard eigendecomposition to recover an orthonormal basis
162 |     c = ln.orth(c.T).T
163 |     cov = blocked_rdd.map(lambda x: np.dot(x, c.T)).map(lambda x: outerprod(x)).reduce(add)
164 |     w, v = ln.eig(cov / n)
165 |     w = np.real(w)
166 |     v = np.real(v)
167 |     inds = np.argsort(w)[::-1]
168 |     s = np.sqrt(w[inds[0:k]]) * np.sqrt(n)
169 |     v = np.dot(v[:, inds[0:k]].T, c)
170 |     u = blocked_rdd.map(lambda x: np.inner(x, v) / s)
171 | 
172 |     return u, s, v
173 | 
-------------------------------------------------------------------------------- /spylearn/histogram.py: --------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import collections
 3 | import types
 4 | 
 5 | def histogram(rdd, range=None, bins=10):
 6 |     """
 7 |     Compute the histogram of an RDD.
 8 |     """
 9 |     def _bin(num, bin_edges):
10 |         """
11 |         Given a number and a set of bins defined by edges, computes which bin the number
12 |         lies in. Lower edges are inclusive, higher are exclusive.
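        For example, with bin_edges = [0, 3, 6]: _bin(0) -> [0], _bin(3) -> [1],
        and values below 0 or at or above 6 return [] (unlike numpy.histogram,
        the rightmost edge is exclusive here).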
13 | """ 14 | if num < bin_edges[0]: 15 | return [] 16 | 17 | for i, edge in enumerate(bin_edges[1:]): 18 | if num < edge: 19 | return [i] 20 | 21 | return [] 22 | 23 | if isinstance(bins, collections.Iterable): 24 | bin_edges = bins 25 | elif type(bins) is types.IntType: 26 | if range is None: 27 | raise TypeError("range argument required when bins is an int") 28 | bin_edges = np.linspace(range[0], range[1], bins+1) 29 | else: 30 | raise TypeError("bins required to be an int or iterable") 31 | 32 | return (rdd.flatMap(lambda x: _bin(x, bin_edges)).countByValue(), bin_edges) 33 | -------------------------------------------------------------------------------- /spylearn/k_means.py: -------------------------------------------------------------------------------- 1 | from pyspark.mllib.clustering import KMeans 2 | from pyspark.rdd import RDD 3 | from operator import add 4 | from numpy.linalg import norm 5 | import itertools 6 | 7 | class ParallelKMeans: 8 | """K-Means clustering 9 | 10 | Parameters 11 | ---------- 12 | 13 | n_clusters : int, optional, default: 8 14 | The number of clusters to form as well as the number of 15 | centroids to generate. 16 | 17 | max_iter : int, optional, default: 10 18 | Maximum number of iterations of the k-means algorithm for a 19 | single run. 20 | 21 | init : string, optional, default: k-means|| 22 | Method for coming up with the initial clusters centers. Either 'k-means||' 23 | for the algorithm described by Bahmani et al. (Bahmani et al., 24 | Scalable K-Means++, VLDB 2012) or 'random' for initial centers chosen from 25 | random input points. 26 | 27 | """ 28 | def __init__(self, n_clusters=8, max_iter=10, init='k-means||'): 29 | self.n_clusters = n_clusters 30 | self.max_iter = max_iter 31 | self.init = init 32 | 33 | def fit(self, rdd): 34 | rdd.cache() 35 | self.model = KMeans.train(rdd, self.n_clusters, self.max_iter, 36 | runs=1, initializationMode=self.init) 37 | self.cluster_centers_ = self.model.centers 38 | self.inertia_ = self.score_rdd(rdd) 39 | 40 | def error(self, point): 41 | center = self.cluster_centers_[self.model.predict(point)] 42 | return norm(point - center) 43 | 44 | def predict(self, data): 45 | if isinstance(data, RDD): 46 | return self.predict_rdd(data) 47 | else: 48 | return self.predict_array(data) 49 | 50 | def predict_rdd(self, rdd): 51 | return rdd.map(lambda x: self.model.predict(x)) 52 | 53 | def predict_array(self, arr): 54 | return [self.model.predict(x) for x in arr] 55 | 56 | def score(self, data): 57 | if isinstance(data, RDD): 58 | return self.score_rdd(data) 59 | else: 60 | return self.score_array(data) 61 | 62 | def score_rdd(self, rdd): 63 | return -rdd.map(self.error).sum() 64 | 65 | def score_array(self, arr): 66 | return -sum(itertools.imap(self.error, arr)) 67 | 68 | 69 | -------------------------------------------------------------------------------- /spylearn/linear_model.py: -------------------------------------------------------------------------------- 1 | """Parallel Linear Model training with partial_fit and averaging""" 2 | 3 | 4 | def _train(iterator, model, classes): 5 | for X, y in iterator: 6 | model.partial_fit(X, y, classes=classes) 7 | yield model, 1 8 | 9 | 10 | def _model_sum(m_1, m_2): 11 | model_1, count_1 = m_1 12 | model_2, count_2 = m_2 13 | model_1.coef_ += model_2.coef_ 14 | model_1.intercept_ += model_2.intercept_ 15 | return model_1, count_1 + count_2 16 | 17 | 18 | def parallel_train(model, data, classes=None, n_iter=10): 19 | for i in range(n_iter): 20 | models = 
data.mapPartitions(lambda x: _train(x, model, classes)) 21 | model, count = models.reduce(_model_sum) 22 | model.coef_ /= count 23 | model.intercept_ /= count 24 | return model 25 | -------------------------------------------------------------------------------- /spylearn/random_permutation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def random_permutation(rdd, seed=None): 4 | """ 5 | Shuffles, i.e., randomly reorders, the elements of the given rdd. This is 6 | more efficient than sorting by a random key. It functions by assigning 7 | each element to a random partition and then ordering within each partition 8 | using the Fisher-Yates algorithm. 9 | """ 10 | num_partitions = rdd._jrdd.splits().size() 11 | def partition_partition(split_index, iterable): 12 | if seed != None: 13 | np.random.seed(seed + split_index) 14 | for el in iterable: 15 | yield (np.random.randint(num_partitions), el) 16 | 17 | rdd = rdd.mapPartitionsWithIndex(partition_partition) 18 | repartitioned = rdd.partitionBy(num_partitions, partitionFunc=lambda x: x) 19 | 20 | def fisher_yates(split_index, iterable): 21 | """ 22 | Order randomly within a partition and strip off keys 23 | """ 24 | if seed != None: 25 | np.random.seed(seed + num_partitions + split_index) 26 | 27 | out = [] 28 | for el in iterable: 29 | j = np.random.randint(len(out)+1) 30 | if j == len(out): 31 | out.append(el[1]) 32 | else: 33 | out.append(out[j]) 34 | out[j] = el[1] 35 | return out 36 | 37 | return repartitioned.mapPartitionsWithIndex(fisher_yates) 38 | 39 | -------------------------------------------------------------------------------- /test/common.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pyspark import SparkContext 3 | 4 | 5 | class SpylearnTestCase(unittest.TestCase): 6 | def setUp(self): 7 | class_name = self.__class__.__name__ 8 | self.sc = SparkContext('local', class_name) 9 | 10 | def tearDown(self): 11 | self.sc.stop() 12 | # To avoid Akka rebinding to the same port, since it doesn't unbind 13 | # immediately on shutdown 14 | self.sc._jvm.System.clearProperty("spark.driver.port") -------------------------------------------------------------------------------- /test/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ -z "$SPARK_HOME" ]; then 4 | echo 'You need to set $SPARK_HOME to run these tests.' 
>&2 5 | exit 1 6 | fi 7 | 8 | export PYTHONPATH=$PYTHONPATH:$SPARK_HOME/python:../ 9 | nosetests $@ 10 | -------------------------------------------------------------------------------- /test/test_block_rdd.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | import numpy as np 4 | import scipy.sparse as sp 5 | 6 | from common import SpylearnTestCase 7 | 8 | from spylearn.block_rdd import block_rdd 9 | 10 | from nose.tools import assert_equal 11 | from nose.tools import assert_true 12 | from numpy.testing import assert_array_almost_equal 13 | 14 | 15 | class TestUtils(SpylearnTestCase): 16 | def setUp(self): 17 | super(TestUtils, self).setUp() 18 | self.outputdir = tempfile.mkdtemp() 19 | 20 | def tearDown(self): 21 | super(TestUtils, self).tearDown() 22 | shutil.rmtree(self.outputdir) 23 | 24 | 25 | class TestBlockRDD(TestUtils): 26 | 27 | def test_block_rdd_tuple(self): 28 | n_partitions = 10 29 | n_samples = 100 30 | sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]]) 31 | data = self.sc.parallelize( 32 | [(np.array([1., 2.]), 0, sparse_row) for i in range(n_samples)], 33 | n_partitions) 34 | blocked_data = block_rdd(data) 35 | 36 | expected_first_block = np.array([[1., 2.]] * 10) 37 | expected_second_block = np.zeros(10, dtype=np.int) 38 | expected_third_block = sp.vstack([sparse_row] * 10) 39 | 40 | first_block_tuple = blocked_data.first() 41 | assert_array_almost_equal(expected_first_block, first_block_tuple[0]) 42 | assert_array_almost_equal(expected_second_block, first_block_tuple[1]) 43 | assert_array_almost_equal(expected_third_block.toarray(), 44 | first_block_tuple[2].toarray()) 45 | 46 | tuple_blocks = blocked_data.collect() 47 | assert_equal(len(tuple_blocks), n_partitions) 48 | assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples) 49 | assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples) 50 | 51 | def test_block_rdd_sp_matrix(self): 52 | n_partitions = 10 53 | n_samples = 100 54 | sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]]) 55 | data = self.sc.parallelize([sparse_row for i in range(n_samples)], 56 | n_partitions) 57 | blocked_data = block_rdd(data) 58 | assert_true(sp.issparse(blocked_data.first())) 59 | 60 | expected_block = sp.vstack([sparse_row] * 10) 61 | assert_array_almost_equal(expected_block.toarray(), 62 | blocked_data.first().toarray()) 63 | 64 | def test_block_rdd_array(self): 65 | n_partitions = 10 66 | n_samples = 100 67 | data = self.sc.parallelize([np.array([1]) for i in range(n_samples)], 68 | n_partitions) 69 | blocked_data = block_rdd(data) 70 | assert_array_almost_equal(np.ones((10, 1)), blocked_data.first()) 71 | blocks = blocked_data.collect() 72 | assert_equal(len(blocks), n_partitions) 73 | assert_array_almost_equal(np.ones((10, 1)), blocks[-1]) 74 | assert_equal(sum(len(b) for b in blocks), n_samples) 75 | 76 | n_partitions = 17 77 | data = self.sc.parallelize([np.array([1]) for i in range(n_samples)], 78 | n_partitions) 79 | blocked_data = block_rdd(data) 80 | assert_array_almost_equal(np.ones((n_samples / n_partitions, 1)), 81 | blocked_data.first()) 82 | blocks = blocked_data.collect() 83 | assert_equal(len(blocks), n_partitions) 84 | assert_equal(sum(len(b) for b in blocks), n_samples) 85 | 86 | def test_block_rdd_array_block_size(self): 87 | n_partitions = 10 88 | n_samples = 107 89 | data = self.sc.parallelize([np.array([1]) for i in range(n_samples)], 90 | n_partitions) 91 | 92 | block_data_5 = block_rdd(data, block_size=5) 93 | blocks = 
block_data_5.collect() 94 | assert_true(all(len(b) <= 5 for b in blocks)) 95 | 96 | block_data_10 = block_rdd(data, block_size=10) 97 | blocks = block_data_10.collect() 98 | assert_true(all(len(b) <= 10 for b in blocks)) 99 | 100 | def test_block_empty_rdd(self): 101 | n_partitions = 3 102 | empty_data = self.sc.parallelize([], n_partitions) 103 | blocks = block_rdd(empty_data).collect() 104 | assert_equal(len(blocks), 0) 105 | 106 | def test_block_rdd_dict(self): 107 | n_partitions = 3 108 | n_samples = 57 109 | dicts = [{'a': i, 'b': float(i) ** 2} for i in range(n_samples)] 110 | data = self.sc.parallelize(dicts, n_partitions) 111 | 112 | block_data_5 = block_rdd(data, block_size=5) 113 | blocks = block_data_5.collect() 114 | assert_true(all(len(b) <= 5 for b in blocks)) 115 | assert_array_almost_equal(blocks[0].a, np.arange(5)) 116 | assert_array_almost_equal(blocks[0].b, 117 | np.arange(5, dtype=np.float) ** 2) 118 | -------------------------------------------------------------------------------- /test/test_blocked_math.py: -------------------------------------------------------------------------------- 1 | from common import SpylearnTestCase 2 | 3 | from spylearn.blocked_math import count, cov, svd, svd_em 4 | from spylearn.block_rdd import block_rdd 5 | 6 | import numpy as np 7 | import scipy.linalg as ln 8 | from numpy.testing import assert_array_almost_equal 9 | 10 | 11 | def match_sign(a, b): 12 | a_sign = np.sign(a) 13 | b_sign = np.sign(b) 14 | if np.array_equal(a_sign, -b_sign): 15 | return -b 16 | elif np.array_equal(a_sign, b_sign): 17 | return b 18 | else: 19 | raise AssertionError("inconsistent matching of sign") 20 | 21 | 22 | class TestUtils(SpylearnTestCase): 23 | def setUp(self): 24 | super(TestUtils, self).setUp() 25 | 26 | def tearDown(self): 27 | super(TestUtils, self).tearDown() 28 | 29 | 30 | class TestBlockedMath(TestUtils): 31 | 32 | def test_count(self): 33 | n_samples = 100 34 | n_partitions = 10 35 | mat = [np.array([1]) for i in range(n_samples)] 36 | data = block_rdd(self.sc.parallelize(mat, n_partitions)) 37 | assert_array_almost_equal(n_samples, count(data)) 38 | 39 | def test_cov(self): 40 | rng = np.random.RandomState(42) 41 | true_cov = np.array([[3., 2., 4.], [2., 2., 5.], [4., 5., 6.]]) 42 | mat = rng.multivariate_normal(np.array([1., 2., 3.]), size=int(1e3), 43 | cov=true_cov) 44 | data = block_rdd(self.sc.parallelize(mat, 4)) 45 | rdd_cov = cov(data) 46 | assert_array_almost_equal(np.cov(mat.T), rdd_cov, decimal=1) 47 | 48 | def test_svd(self): 49 | rng = np.random.RandomState(42) 50 | mat = rng.randn(1e3, 10) 51 | data = block_rdd(self.sc.parallelize(list(mat), 10)) 52 | u, s, v = svd(data, 1) 53 | u = np.squeeze(np.concatenate(np.array(u.collect()))).T 54 | u_true, s_true, v_true = ln.svd(mat) 55 | assert_array_almost_equal(v[0], match_sign(v[0], v_true[0, :])) 56 | assert_array_almost_equal(s[0], s_true[0]) 57 | assert_array_almost_equal(u, match_sign(u, u_true[:, 0])) 58 | 59 | def test_svd_em(self): 60 | rng = np.random.RandomState(42) 61 | mat = rng.randn(10, 3) 62 | data = block_rdd(self.sc.parallelize(list(mat), 2)).cache() 63 | u, s, v = svd_em(data, 1, seed=42) 64 | u = np.squeeze(np.concatenate(np.array(u.collect()))).T 65 | u_true, s_true, v_true = ln.svd(mat) 66 | tol = 1 67 | assert_array_almost_equal(v[0], match_sign(v[0], v_true[0, :]), tol) 68 | assert_array_almost_equal(s[0], s_true[0], tol) 69 | assert_array_almost_equal(u, match_sign(u, u_true[:, 0]), tol) 70 | 71 | 
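# A possible additional check for the remaining helpers, sum and mean (a
# sketch only, not part of the original suite; assumes
# ``from spylearn.blocked_math import mean, sum as blocked_sum`` at the top
# of this file):
#
#     def test_sum_and_mean(self):
#         rng = np.random.RandomState(42)
#         mat = rng.randn(100, 3)
#         data = block_rdd(self.sc.parallelize(list(mat), 4))
#         assert_array_almost_equal(blocked_sum(data, axis=0), mat.sum(axis=0))
#         assert_array_almost_equal(mean(data), mat.mean(axis=0))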
-------------------------------------------------------------------------------- /test/test_histogram.py: -------------------------------------------------------------------------------- 1 | from common import SpylearnTestCase 2 | import shutil 3 | import tempfile 4 | 5 | from spylearn.histogram import histogram 6 | import numpy as np 7 | 8 | from nose.tools import assert_equals 9 | from numpy.testing import assert_array_almost_equal 10 | 11 | 12 | class TestUtils(SpylearnTestCase): 13 | def setUp(self): 14 | super(TestUtils, self).setUp() 15 | self.outputdir = tempfile.mkdtemp() 16 | 17 | def tearDown(self): 18 | super(TestUtils, self).tearDown() 19 | shutil.rmtree(self.outputdir) 20 | 21 | 22 | class TestHistogram(TestUtils): 23 | 24 | def test_bins_as_number(self): 25 | data = self.sc.parallelize([1, 2, 3, 4, 5]) 26 | hist, bin_edges = histogram(data, range=(0, 6), bins=2) 27 | assert_equals(2, hist[0]) 28 | assert_equals(3, hist[1]) 29 | assert_array_almost_equal(np.array([0, 3, 6]), bin_edges) 30 | 31 | def test_bins_as_array(self): 32 | data = self.sc.parallelize([1, 2, 3, 4, 5]) 33 | hist, bin_edges = histogram(data, bins=[0, 3, 6]) 34 | assert_equals(2, hist[0]) 35 | assert_equals(3, hist[1]) 36 | assert_array_almost_equal(np.array([0, 3, 6]), bin_edges) 37 | 38 | def test_ignore_out_of_range(self): 39 | data = self.sc.parallelize([1, 2, 3, 4, 5]) 40 | hist, bin_edges = histogram(data, range=(2, 5), bins=2) 41 | assert_equals(2, hist[0]) 42 | assert_equals(1, hist[1]) 43 | assert_array_almost_equal(np.array([2, 3.5, 5]), bin_edges) 44 | 45 | -------------------------------------------------------------------------------- /test/test_k_means.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from spylearn.k_means import ParallelKMeans 4 | 5 | from common import SpylearnTestCase 6 | 7 | from nose.tools import assert_greater, assert_less 8 | from numpy.testing import assert_array_equal 9 | 10 | class KMeansTestCase(SpylearnTestCase): 11 | 12 | data = None 13 | n_partitions = 2 14 | 15 | def setUp(self): 16 | super(KMeansTestCase, self).setUp() 17 | if self.data is None: 18 | rng = np.random.RandomState(42) 19 | self.center1 = rng.randint(10, size=50) 20 | self.center2 = rng.randint(10, size=50) 21 | self.cluster1 = rng.normal(size=(int(1e3), 50)) + self.center1 22 | self.cluster2 = rng.normal(size=(int(1e3), 50)) + self.center2 23 | X = np.concatenate([self.cluster1, self.cluster2]) 24 | rng.shuffle(X) 25 | self.data = self.sc.parallelize(X, numSlices=self.n_partitions) 26 | self.expected_error = sum([np.linalg.norm(rng.randn(50) - 27 | rng.randn(50)) for _ in range(int(1e3))]) 28 | 29 | def test_clustering(self): 30 | model = ParallelKMeans(2, 7) 31 | model.fit(self.data) 32 | cluster1_predictions = model.predict(self.cluster1) 33 | cluster2_predictions = model.predict(self.cluster2) 34 | assert_array_equal(np.repeat(cluster1_predictions[0], len(self.cluster1)), 35 | cluster1_predictions) 36 | assert_array_equal(np.repeat(cluster2_predictions[0], len(self.cluster2)), 37 | cluster2_predictions) 38 | 39 | score1 = model.score(self.cluster1) 40 | assert_less(score1, 0) 41 | assert_greater(score1, -self.expected_error * 1.5) 42 | score2 = model.score(self.cluster2) 43 | assert_less(score2, 0) 44 | assert_greater(score2, -self.expected_error * 1.5) 45 | 46 | -------------------------------------------------------------------------------- /test/test_linear_model.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.linear_model import SGDClassifier 4 | 5 | from spylearn.linear_model import parallel_train 6 | from spylearn.block_rdd import block_rdd 7 | 8 | from common import SpylearnTestCase 9 | 10 | from nose.tools import assert_greater 11 | from nose import SkipTest 12 | from numpy.testing import assert_array_almost_equal 13 | 14 | 15 | class SumModel(object): 16 | 17 | def __init__(self): 18 | self.coef_ = 0 19 | self.intercept_ = 0 20 | 21 | def partial_fit(self, X, y, **kwargs): 22 | if X.ndim == 1: 23 | X = X.reshape(1, -1) 24 | self.coef_ += X.sum(axis=0) 25 | return self 26 | 27 | 28 | class LinearModelTestCase(SpylearnTestCase): 29 | 30 | data = None 31 | n_partitions = 2 32 | 33 | def setUp(self): 34 | super(LinearModelTestCase, self).setUp() 35 | if self.data is None: 36 | rng = np.random.RandomState(42) 37 | X = rng.normal(size=(int(1e3), 50)) 38 | coef = rng.normal(size=50) 39 | y = (np.dot(X, coef) > 0.01).astype(np.int) 40 | self.X = X 41 | self.y = y 42 | self.classes = np.unique(y) 43 | self.data = self.sc.parallelize(list(zip(X, y)), 44 | numSlices=self.n_partitions).cache() 45 | self.blocked_data = block_rdd(self.data, block_size=171) 46 | 47 | def test_parallel_train_sum_model_non_blocked(self): 48 | n_iter = 2 49 | model = parallel_train(SumModel(), self.data, self.classes, n_iter) 50 | expected_coef = self.X.sum(axis=0) * n_iter / self.n_partitions 51 | assert_array_almost_equal(model.coef_, expected_coef , 5) 52 | 53 | def test_parallel_train(self): 54 | if not hasattr(SGDClassifier, 'partial_fit'): 55 | raise SkipTest('sklearn >= 0.13 is required to run this test') 56 | model = SGDClassifier(loss='log', alpha=1e-5, random_state=2) 57 | model = parallel_train(model, self.blocked_data, self.classes) 58 | assert_greater(model.score(self.X, self.y), 0.90) 59 | -------------------------------------------------------------------------------- /test/test_random_permutation.py: -------------------------------------------------------------------------------- 1 | from common import SpylearnTestCase 2 | import shutil 3 | import tempfile 4 | 5 | from spylearn.random_permutation import random_permutation 6 | import numpy as np 7 | 8 | from nose.tools import assert_equals, assert_not_equals 9 | from numpy.testing import assert_array_almost_equal 10 | 11 | 12 | class TestUtils(SpylearnTestCase): 13 | def setUp(self): 14 | super(TestUtils, self).setUp() 15 | self.outputdir = tempfile.mkdtemp() 16 | 17 | def tearDown(self): 18 | super(TestUtils, self).tearDown() 19 | shutil.rmtree(self.outputdir) 20 | 21 | 22 | class TestRandomPermutation(TestUtils): 23 | 24 | def test_random_permutation(self): 25 | data = self.sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3) 26 | shuffled = random_permutation(data, seed=5).collect() 27 | assert_equals([10, 7, 6, 8, 2, 5, 9, 4, 3, 1], shuffled) 28 | assert_not_equals(shuffled, random_permutation(data, seed=6).collect()) 29 | 30 | --------------------------------------------------------------------------------
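
A minimal end-to-end sketch of how these pieces combine (illustrative only;
assumes a running SparkContext named ``sc``, and mirrors the flow exercised
in test/test_linear_model.py):

    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from spylearn.block_rdd import block_rdd
    from spylearn.linear_model import parallel_train

    # toy data, parallelized as (x, y) pairs and packed into per-partition blocks
    X = np.random.normal(size=(1000, 50))
    y = (X[:, 0] > 0).astype(np.int)
    data = block_rdd(sc.parallelize(list(zip(X, y)), 4), block_size=100)

    # each partition runs partial_fit on its blocks; coefficients are averaged
    model = parallel_train(SGDClassifier(loss='log'), data, classes=np.unique(y))
    print(model.score(X, y))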