├── .gitignore ├── .travis.yml ├── COPYING ├── README.md ├── requirements.txt ├── spylearn ├── __init__.py ├── block_rdd.py ├── blocked_math.py ├── histogram.py ├── k_means.py ├── linear_model.py └── random_permutation.py └── test ├── common.py ├── run_tests.sh ├── test_block_rdd.py ├── test_blocked_math.py ├── test_histogram.py ├── test_k_means.py ├── test_linear_model.py └── test_random_permutation.py /.gitignore: -------------------------------------------------------------------------------- 1 | ## generic files to ignore 2 | *~ 3 | *.lock 4 | *.DS_Store 5 | *.swp 6 | *.out 7 | 8 | awsdeploy/ 9 | 10 | # rails specific 11 | *.sqlite3 12 | config/database.yml 13 | log/* 14 | tmp/* 15 | 16 | # java specific 17 | *.class 18 | 19 | # python specific 20 | *.pyc 21 | 22 | # xcode/iphone specific 23 | build/* 24 | *.pbxuser 25 | *.mode2v3 26 | *.mode1v3 27 | *.perspective 28 | *.perspectivev3 29 | *~.nib 30 | 31 | # akka specific 32 | logs/* 33 | 34 | # sbt specific 35 | target/ 36 | project/boot 37 | lib_managed/* 38 | project/build/target 39 | project/build/lib_managed 40 | project/build/src_managed 41 | project/plugins/lib_managed 42 | project/plugins/target 43 | project/plugins/src_managed 44 | project/plugins/project 45 | 46 | core/lib_managed 47 | core/target 48 | pubsub/lib_managed 49 | pubsub/target 50 | 51 | # eclipse specific 52 | .metadata 53 | jrebel.lic 54 | .settings 55 | .classpath 56 | .project 57 | 58 | .ensime* 59 | *.sublime-* 60 | .cache 61 | 62 | # intellij 63 | *.eml 64 | *.iml 65 | *.ipr 66 | *.iws 67 | .*.sw? 68 | .idea 69 | 70 | # paulp script 71 | /.lib/ 72 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | jdk: 5 | - openjdk7 6 | virtualenv: 7 | system_site_packages: true 8 | install: 9 | # Install Spark 10 | - wget http://d3kbcqa49mib13.cloudfront.net/spark-0.9.0-incubating-bin-hadoop1.tgz 11 | - tar -xzf spark-0.9.0-incubating-bin-hadoop1.tgz 12 | - sudo apt-get install -qq python-numpy python-scipy python-pandas python-sklearn 13 | - pip install -r requirements.txt 14 | # Workaround for Travis issue with POSIX semaphores; see 15 | # https://github.com/travis-ci/travis-cookbooks/issues/155 16 | - "sudo rm -rf /dev/shm && sudo ln -s /run/shm /dev/shm" 17 | script: 18 | - export SPARK_HOME=`pwd`/spark-0.9.0-incubating-bin-hadoop1 19 | - cd test 20 | - ./run_tests.sh -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2007–2014 The pyspark-sklearn developers. 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the name of the Scikit-learn Developers nor the names of 16 | its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written 18 | permission. 
19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 31 | DAMAGE. 32 | 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySpark / scikit-learn glue code 2 | 3 | [![Build Status](https://travis-ci.org/ogrisel/spylearn.png?branch=master)](https://travis-ci.org/ogrisel/spylearn) 4 | 5 | Repo with experimental tools to show how to combine scikit-learn and PySpark. 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse 2 | numpy 3 | scipy 4 | scikit-learn 5 | pandas -------------------------------------------------------------------------------- /spylearn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/spylearn/8af56f44d52cd46c72877a1df80a2406bbc61d34/spylearn/__init__.py -------------------------------------------------------------------------------- /spylearn/block_rdd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | 4 | 5 | def block_rdd(data, block_size=None): 6 | """Block an RDD 7 | 8 | Parameters 9 | ---------- 10 | 11 | data : RDD 12 | RDD of data points to block into either numpy arrays, 13 | scipy sparse matrices, or pandas data frames. 14 | Type of data point will be automatically inferred 15 | and blocked accordingly. 16 | 17 | block_size : int, optional, default None 18 | Size of each block (number of elements), if None all data points 19 | from each partition will be combined in a block. 
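
    Examples
    --------
    A minimal usage sketch (illustrative only; assumes a live SparkContext
    named ``sc`` and ``import numpy as np``)::

        rdd = sc.parallelize([np.array([1., 2.]) for _ in range(100)], 4)
        blocked = block_rdd(rdd, block_size=10)
        blocked.first()  # a (10, 2) ndarray packed from part of one partition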
20 | 
21 |     """
22 | 
23 |     import pandas as pd
24 |     try:
25 |         entry = data.first()
26 |     except IndexError:
27 |         # empty RDD: do not block
28 |         return data
29 | 
30 |     # do different kinds of blocking depending on the type
31 |     if isinstance(entry, tuple):
32 |         return data.mapPartitions(_block_tuple, block_size)
33 | 
34 |     elif isinstance(entry, dict):
35 |         return data.mapPartitions(
36 |             lambda x: _block_collection(x, pd.DataFrame, block_size))
37 | 
38 |     elif sp.issparse(entry):
39 |         return data.mapPartitions(
40 |             lambda x: _block_collection(x, sp.vstack, block_size))
41 | 
42 |     else:
43 |         # Fallback to array packing
44 |         return data.mapPartitions(
45 |             lambda x: _block_collection(x, np.array, block_size))
46 | 
47 | 
48 | def _pack_accumulated(accumulated):
49 |     if len(accumulated) > 0 and sp.issparse(accumulated[0]):
50 |         return sp.vstack(accumulated)
51 |     else:
52 |         return np.array(accumulated)
53 | 
54 | 
55 | def _block_tuple(iterator, block_size=None):
56 |     """Pack an rdd of tuples as tuples of arrays or scipy.sparse matrices."""
57 |     i = 0
58 |     blocked_tuple = None
59 |     for tuple_i in iterator:
60 |         if blocked_tuple is None:
61 |             blocked_tuple = tuple([] for _ in range(len(tuple_i)))
62 | 
63 |         if block_size is not None and i >= block_size:
64 |             yield tuple(_pack_accumulated(x) for x in blocked_tuple)
65 |             blocked_tuple = tuple([] for _ in range(len(tuple_i)))
66 |             i = 0
67 |         for x_j, x in zip(tuple_i, blocked_tuple):
68 |             x.append(x_j)
69 |         i += 1
70 |     yield tuple(_pack_accumulated(x) for x in blocked_tuple)
71 | 
72 | 
73 | def _block_collection(iterator, collection_type, block_size=None):
74 |     """Pack an rdd with a specific collection constructor."""
75 |     i = 0
76 |     accumulated = []
77 |     for a in iterator:
78 |         if block_size is not None and i >= block_size:
79 |             yield collection_type(accumulated)
80 |             accumulated = []
81 |             i = 0
82 |         accumulated.append(a)
83 |         i += 1
84 |     yield collection_type(accumulated)
85 | 
-------------------------------------------------------------------------------- /spylearn/blocked_math.py: --------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import scipy.linalg as ln
 3 | from operator import add
 4 | 
 5 | 
 6 | def count(blocked_rdd):
 7 |     return blocked_rdd.map(lambda x: x.shape[0]).reduce(add)
 8 | 
 9 | 
10 | def sum(blocked_rdd, axis=None):
11 |     """
12 |     Compute the sum of a blocked RDD, either the sum of all values,
13 |     or the sum along the specified dimension (must be 0).
14 |     """
15 |     if axis is None:
16 |         return blocked_rdd.map(np.sum).reduce(add)
17 |     elif axis == 0:
18 |         return blocked_rdd.map(lambda x: np.sum(x, axis)).reduce(add)
19 |     else:
20 |         raise ValueError("axis must be 0 or None")
21 | 
22 | 
23 | def mean(blocked_rdd):
24 |     """
25 |     Compute the mean as a weighted average of per-block means, which avoids
26 |     summing everything before dividing and so reduces the risk of overflow.
27 |     """
28 |     pavgs = blocked_rdd.map(lambda b: (np.average(b, axis=0), b.shape[0]))
29 |     avgs, weights = zip(*pavgs.collect())
30 |     return np.average(np.array(avgs), axis=0, weights=weights)
31 | 
32 | 
33 | def cov(blocked_rdd):
34 |     """
35 |     Calculates the covariance matrix for the given blocked RDD.
36 |     Unlike numpy.cov, expects each row to represent an observation.
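    Note that the result is normalized by n rather than n - 1, so on the
    stacked data it should agree with ``numpy.cov(X, rowvar=0, bias=1)``.
    A comparison sketch (assumes a local array ``X`` and a blocked RDD
    ``data`` built from it with ``block_rdd``)::

        np.allclose(cov(data), np.cov(X, rowvar=0, bias=1))  # expected True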
37 |     """
38 |     avg = mean(blocked_rdd)
39 |     covs = blocked_rdd.map(lambda x: x - avg).map(lambda x: (x.T.dot(x), x.shape[0]))
40 |     prod, count = covs.reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))
41 |     return prod / count
42 | 
43 | 
44 | def svd(blocked_rdd, k):
45 |     """
46 |     Calculate the SVD of a blocked RDD directly, returning only the leading k
47 |     singular vectors. Assumes n rows and d columns; efficient when n >> d.
48 |     Requires d^2 to fit within the memory of a single machine.
49 | 
50 |     Parameters
51 |     ----------
52 | 
53 |     blocked_rdd : RDD
54 |         RDD with data points in numpy array blocks
55 | 
56 |     k : int
57 |         Number of singular vectors to return
58 | 
59 |     Returns
60 |     -------
61 | 
62 |     u : RDD of blocks
63 |         Left singular vectors
64 |     s : numpy array
65 |         Singular values
66 |     v : numpy array
67 |         Right singular vectors
68 |     """
69 | 
70 |     # compute the covariance matrix (without mean subtraction)
71 |     # TODO use one func for this (with mean subtraction as an option?)
72 |     c = blocked_rdd.map(lambda x: (x.T.dot(x), x.shape[0]))
73 |     prod, n = c.reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))
74 | 
75 |     # do local eigendecomposition
76 |     w, v = ln.eig(prod / n)
77 |     w = np.real(w)
78 |     v = np.real(v)
79 |     inds = np.argsort(w)[::-1]
80 |     s = np.sqrt(w[inds[0:k]]) * np.sqrt(n)
81 |     v = v[:, inds[0:k]].T
82 | 
83 |     # project back into data, normalize by singular values
84 |     u = blocked_rdd.map(lambda x: np.inner(x, v) / s)
85 | 
86 |     return u, s, v
87 | 
88 | 
89 | def svd_em(blocked_rdd, k, maxiter=20, tol=1e-5, seed=None):
90 |     """
91 |     Calculate the SVD of a blocked RDD using an expectation maximization
92 |     algorithm (from Roweis, NIPS, 1997) that avoids explicitly
93 |     computing the covariance matrix, returning only the leading k
94 |     singular vectors. Assumes n rows and d columns; does not require
95 |     d^2 to fit into memory on a single machine.
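
    Each pass applies the two closed-form EM updates (restating the inline
    comments below, with c the current k x d subspace estimate and y the data):

        e-step:  x = (cc')^-1 c y
        m-step:  c = y x' (xx')^-1

    so the driver only ever has to hold k x d and k x k matrices.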
 96 | 
 97 |     Parameters
 98 |     ----------
 99 | 
100 |     blocked_rdd : RDD
101 |         RDD with data points in numpy array blocks
102 | 
103 |     k : int
104 |         Number of singular vectors to return
105 | 
106 |     maxiter : int, optional, default = 20
107 |         Number of iterations to perform
108 | 
109 |     tol : float, optional, default = 1e-5
110 |         Tolerance for stopping iterative updates
111 | 
112 |     seed : int, optional, default = None
113 |         Seed for the random number generator used to initialize the subspace
114 | 
115 |     Returns
116 |     -------
117 | 
118 |     u : RDD of blocks
119 |         Left singular vectors
120 |     s : numpy array
121 |         Singular values
122 |     v : numpy array
123 |         Right singular vectors
124 |     """
125 | 
126 |     n = count(blocked_rdd)
127 |     m = len(blocked_rdd.first()[0])
128 | 
129 |     def outerprod(x):
130 |         return x.T.dot(x)
131 | 
132 |     if seed is not None:
133 |         rng = np.random.RandomState(seed)
134 |         c = rng.randn(k, m)
135 |     else:
136 |         c = np.random.randn(k, m)
137 |     iter = 0
138 |     error = 100
139 | 
140 |     # iteratively update subspace using expectation maximization
141 |     # e-step: x = (cc')^-1 c y
142 |     # m-step: c = y x' (xx')^-1
143 |     while (iter < maxiter) & (error > tol):
144 |         c_old = c
145 |         # pre compute (cc')^-1 c
146 |         c_inv = np.dot(c.T, ln.inv(np.dot(c, c.T)))
147 |         premult1 = blocked_rdd.context.broadcast(c_inv)
148 |         # compute (xx')^-1 through a map reduce
149 |         xx = blocked_rdd.map(lambda x: outerprod(np.dot(x, premult1.value))).reduce(add)
150 |         xx_inv = ln.inv(xx)
151 |         # pre compute (cc')^-1 c (xx')^-1
152 |         premult2 = blocked_rdd.context.broadcast(np.dot(c_inv, xx_inv))
153 |         # compute the new c through a map reduce
154 |         c = blocked_rdd.map(lambda x: np.dot(x.T, np.dot(x, premult2.value))).reduce(add)
155 |         c = c.T
156 | 
157 |         error = np.sum((c - c_old) ** 2)
158 |         iter += 1
159 | 
160 |     # project data into subspace spanned by columns of c
161 |     # use standard eigendecomposition to recover an orthonormal basis
162 |     c = ln.orth(c.T).T
163 |     cov = blocked_rdd.map(lambda x: np.dot(x, c.T)).map(lambda x: outerprod(x)).reduce(add)
164 |     w, v = ln.eig(cov / n)
165 |     w = np.real(w)
166 |     v = np.real(v)
167 |     inds = np.argsort(w)[::-1]
168 |     s = np.sqrt(w[inds[0:k]]) * np.sqrt(n)
169 |     v = np.dot(v[:, inds[0:k]].T, c)
170 |     u = blocked_rdd.map(lambda x: np.inner(x, v) / s)
171 | 
172 |     return u, s, v
173 | 
-------------------------------------------------------------------------------- /spylearn/histogram.py: --------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import collections
 3 | import types
 4 | 
 5 | def histogram(rdd, range=None, bins=10):
 6 |     """
 7 |     Compute the histogram of an RDD.
 8 |     """
 9 |     def _bin(num, bin_edges):
10 |         """
11 |         Given a number and a set of bins defined by edges, computes which bin the number
12 |         lies in. Lower edges are inclusive, higher are exclusive.
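        For example, with bin_edges = [0, 3, 6]: _bin(0) -> [0], _bin(3) -> [1],
        and values below 0 or at or above 6 return [] (unlike numpy.histogram,
        the rightmost edge is exclusive here).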
13 | """ 14 | if num < bin_edges[0]: 15 | return [] 16 | 17 | for i, edge in enumerate(bin_edges[1:]): 18 | if num < edge: 19 | return [i] 20 | 21 | return [] 22 | 23 | if isinstance(bins, collections.Iterable): 24 | bin_edges = bins 25 | elif type(bins) is types.IntType: 26 | if range is None: 27 | raise TypeError("range argument required when bins is an int") 28 | bin_edges = np.linspace(range[0], range[1], bins+1) 29 | else: 30 | raise TypeError("bins required to be an int or iterable") 31 | 32 | return (rdd.flatMap(lambda x: _bin(x, bin_edges)).countByValue(), bin_edges) 33 | -------------------------------------------------------------------------------- /spylearn/k_means.py: -------------------------------------------------------------------------------- 1 | from pyspark.mllib.clustering import KMeans 2 | from pyspark.rdd import RDD 3 | from operator import add 4 | from numpy.linalg import norm 5 | import itertools 6 | 7 | class ParallelKMeans: 8 | """K-Means clustering 9 | 10 | Parameters 11 | ---------- 12 | 13 | n_clusters : int, optional, default: 8 14 | The number of clusters to form as well as the number of 15 | centroids to generate. 16 | 17 | max_iter : int, optional, default: 10 18 | Maximum number of iterations of the k-means algorithm for a 19 | single run. 20 | 21 | init : string, optional, default: k-means|| 22 | Method for coming up with the initial clusters centers. Either 'k-means||' 23 | for the algorithm described by Bahmani et al. (Bahmani et al., 24 | Scalable K-Means++, VLDB 2012) or 'random' for initial centers chosen from 25 | random input points. 26 | 27 | """ 28 | def __init__(self, n_clusters=8, max_iter=10, init='k-means||'): 29 | self.n_clusters = n_clusters 30 | self.max_iter = max_iter 31 | self.init = init 32 | 33 | def fit(self, rdd): 34 | rdd.cache() 35 | self.model = KMeans.train(rdd, self.n_clusters, self.max_iter, 36 | runs=1, initializationMode=self.init) 37 | self.cluster_centers_ = self.model.centers 38 | self.inertia_ = self.score_rdd(rdd) 39 | 40 | def error(self, point): 41 | center = self.cluster_centers_[self.model.predict(point)] 42 | return norm(point - center) 43 | 44 | def predict(self, data): 45 | if isinstance(data, RDD): 46 | return self.predict_rdd(data) 47 | else: 48 | return self.predict_array(data) 49 | 50 | def predict_rdd(self, rdd): 51 | return rdd.map(lambda x: self.model.predict(x)) 52 | 53 | def predict_array(self, arr): 54 | return [self.model.predict(x) for x in arr] 55 | 56 | def score(self, data): 57 | if isinstance(data, RDD): 58 | return self.score_rdd(data) 59 | else: 60 | return self.score_array(data) 61 | 62 | def score_rdd(self, rdd): 63 | return -rdd.map(self.error).sum() 64 | 65 | def score_array(self, arr): 66 | return -sum(itertools.imap(self.error, arr)) 67 | 68 | 69 | -------------------------------------------------------------------------------- /spylearn/linear_model.py: -------------------------------------------------------------------------------- 1 | """Parallel Linear Model training with partial_fit and averaging""" 2 | 3 | 4 | def _train(iterator, model, classes): 5 | for X, y in iterator: 6 | model.partial_fit(X, y, classes=classes) 7 | yield model, 1 8 | 9 | 10 | def _model_sum(m_1, m_2): 11 | model_1, count_1 = m_1 12 | model_2, count_2 = m_2 13 | model_1.coef_ += model_2.coef_ 14 | model_1.intercept_ += model_2.intercept_ 15 | return model_1, count_1 + count_2 16 | 17 | 18 | def parallel_train(model, data, classes=None, n_iter=10): 19 | for i in range(n_iter): 20 | models = 
data.mapPartitions(lambda x: _train(x, model, classes)) 21 | model, count = models.reduce(_model_sum) 22 | model.coef_ /= count 23 | model.intercept_ /= count 24 | return model 25 | -------------------------------------------------------------------------------- /spylearn/random_permutation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def random_permutation(rdd, seed=None): 4 | """ 5 | Shuffles, i.e., randomly reorders, the elements of the given rdd. This is 6 | more efficient than sorting by a random key. It functions by assigning 7 | each element to a random partition and then ordering within each partition 8 | using the Fisher-Yates algorithm. 9 | """ 10 | num_partitions = rdd._jrdd.splits().size() 11 | def partition_partition(split_index, iterable): 12 | if seed != None: 13 | np.random.seed(seed + split_index) 14 | for el in iterable: 15 | yield (np.random.randint(num_partitions), el) 16 | 17 | rdd = rdd.mapPartitionsWithIndex(partition_partition) 18 | repartitioned = rdd.partitionBy(num_partitions, partitionFunc=lambda x: x) 19 | 20 | def fisher_yates(split_index, iterable): 21 | """ 22 | Order randomly within a partition and strip off keys 23 | """ 24 | if seed != None: 25 | np.random.seed(seed + num_partitions + split_index) 26 | 27 | out = [] 28 | for el in iterable: 29 | j = np.random.randint(len(out)+1) 30 | if j == len(out): 31 | out.append(el[1]) 32 | else: 33 | out.append(out[j]) 34 | out[j] = el[1] 35 | return out 36 | 37 | return repartitioned.mapPartitionsWithIndex(fisher_yates) 38 | 39 | -------------------------------------------------------------------------------- /test/common.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pyspark import SparkContext 3 | 4 | 5 | class SpylearnTestCase(unittest.TestCase): 6 | def setUp(self): 7 | class_name = self.__class__.__name__ 8 | self.sc = SparkContext('local', class_name) 9 | 10 | def tearDown(self): 11 | self.sc.stop() 12 | # To avoid Akka rebinding to the same port, since it doesn't unbind 13 | # immediately on shutdown 14 | self.sc._jvm.System.clearProperty("spark.driver.port") -------------------------------------------------------------------------------- /test/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ -z "$SPARK_HOME" ]; then 4 | echo 'You need to set $SPARK_HOME to run these tests.' 
>&2 5 | exit 1 6 | fi 7 | 8 | export PYTHONPATH=$PYTHONPATH:$SPARK_HOME/python:../ 9 | nosetests $@ 10 | -------------------------------------------------------------------------------- /test/test_block_rdd.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | import numpy as np 4 | import scipy.sparse as sp 5 | 6 | from common import SpylearnTestCase 7 | 8 | from spylearn.block_rdd import block_rdd 9 | 10 | from nose.tools import assert_equal 11 | from nose.tools import assert_true 12 | from numpy.testing import assert_array_almost_equal 13 | 14 | 15 | class TestUtils(SpylearnTestCase): 16 | def setUp(self): 17 | super(TestUtils, self).setUp() 18 | self.outputdir = tempfile.mkdtemp() 19 | 20 | def tearDown(self): 21 | super(TestUtils, self).tearDown() 22 | shutil.rmtree(self.outputdir) 23 | 24 | 25 | class TestBlockRDD(TestUtils): 26 | 27 | def test_block_rdd_tuple(self): 28 | n_partitions = 10 29 | n_samples = 100 30 | sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]]) 31 | data = self.sc.parallelize( 32 | [(np.array([1., 2.]), 0, sparse_row) for i in range(n_samples)], 33 | n_partitions) 34 | blocked_data = block_rdd(data) 35 | 36 | expected_first_block = np.array([[1., 2.]] * 10) 37 | expected_second_block = np.zeros(10, dtype=np.int) 38 | expected_third_block = sp.vstack([sparse_row] * 10) 39 | 40 | first_block_tuple = blocked_data.first() 41 | assert_array_almost_equal(expected_first_block, first_block_tuple[0]) 42 | assert_array_almost_equal(expected_second_block, first_block_tuple[1]) 43 | assert_array_almost_equal(expected_third_block.toarray(), 44 | first_block_tuple[2].toarray()) 45 | 46 | tuple_blocks = blocked_data.collect() 47 | assert_equal(len(tuple_blocks), n_partitions) 48 | assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples) 49 | assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples) 50 | 51 | def test_block_rdd_sp_matrix(self): 52 | n_partitions = 10 53 | n_samples = 100 54 | sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]]) 55 | data = self.sc.parallelize([sparse_row for i in range(n_samples)], 56 | n_partitions) 57 | blocked_data = block_rdd(data) 58 | assert_true(sp.issparse(blocked_data.first())) 59 | 60 | expected_block = sp.vstack([sparse_row] * 10) 61 | assert_array_almost_equal(expected_block.toarray(), 62 | blocked_data.first().toarray()) 63 | 64 | def test_block_rdd_array(self): 65 | n_partitions = 10 66 | n_samples = 100 67 | data = self.sc.parallelize([np.array([1]) for i in range(n_samples)], 68 | n_partitions) 69 | blocked_data = block_rdd(data) 70 | assert_array_almost_equal(np.ones((10, 1)), blocked_data.first()) 71 | blocks = blocked_data.collect() 72 | assert_equal(len(blocks), n_partitions) 73 | assert_array_almost_equal(np.ones((10, 1)), blocks[-1]) 74 | assert_equal(sum(len(b) for b in blocks), n_samples) 75 | 76 | n_partitions = 17 77 | data = self.sc.parallelize([np.array([1]) for i in range(n_samples)], 78 | n_partitions) 79 | blocked_data = block_rdd(data) 80 | assert_array_almost_equal(np.ones((n_samples / n_partitions, 1)), 81 | blocked_data.first()) 82 | blocks = blocked_data.collect() 83 | assert_equal(len(blocks), n_partitions) 84 | assert_equal(sum(len(b) for b in blocks), n_samples) 85 | 86 | def test_block_rdd_array_block_size(self): 87 | n_partitions = 10 88 | n_samples = 107 89 | data = self.sc.parallelize([np.array([1]) for i in range(n_samples)], 90 | n_partitions) 91 | 92 | block_data_5 = block_rdd(data, block_size=5) 93 | blocks = 
block_data_5.collect() 94 | assert_true(all(len(b) <= 5 for b in blocks)) 95 | 96 | block_data_10 = block_rdd(data, block_size=10) 97 | blocks = block_data_10.collect() 98 | assert_true(all(len(b) <= 10 for b in blocks)) 99 | 100 | def test_block_empty_rdd(self): 101 | n_partitions = 3 102 | empty_data = self.sc.parallelize([], n_partitions) 103 | blocks = block_rdd(empty_data).collect() 104 | assert_equal(len(blocks), 0) 105 | 106 | def test_block_rdd_dict(self): 107 | n_partitions = 3 108 | n_samples = 57 109 | dicts = [{'a': i, 'b': float(i) ** 2} for i in range(n_samples)] 110 | data = self.sc.parallelize(dicts, n_partitions) 111 | 112 | block_data_5 = block_rdd(data, block_size=5) 113 | blocks = block_data_5.collect() 114 | assert_true(all(len(b) <= 5 for b in blocks)) 115 | assert_array_almost_equal(blocks[0].a, np.arange(5)) 116 | assert_array_almost_equal(blocks[0].b, 117 | np.arange(5, dtype=np.float) ** 2) 118 | -------------------------------------------------------------------------------- /test/test_blocked_math.py: -------------------------------------------------------------------------------- 1 | from common import SpylearnTestCase 2 | 3 | from spylearn.blocked_math import count, cov, svd, svd_em 4 | from spylearn.block_rdd import block_rdd 5 | 6 | import numpy as np 7 | import scipy.linalg as ln 8 | from numpy.testing import assert_array_almost_equal 9 | 10 | 11 | def match_sign(a, b): 12 | a_sign = np.sign(a) 13 | b_sign = np.sign(b) 14 | if np.array_equal(a_sign, -b_sign): 15 | return -b 16 | elif np.array_equal(a_sign, b_sign): 17 | return b 18 | else: 19 | raise AssertionError("inconsistent matching of sign") 20 | 21 | 22 | class TestUtils(SpylearnTestCase): 23 | def setUp(self): 24 | super(TestUtils, self).setUp() 25 | 26 | def tearDown(self): 27 | super(TestUtils, self).tearDown() 28 | 29 | 30 | class TestBlockedMath(TestUtils): 31 | 32 | def test_count(self): 33 | n_samples = 100 34 | n_partitions = 10 35 | mat = [np.array([1]) for i in range(n_samples)] 36 | data = block_rdd(self.sc.parallelize(mat, n_partitions)) 37 | assert_array_almost_equal(n_samples, count(data)) 38 | 39 | def test_cov(self): 40 | rng = np.random.RandomState(42) 41 | true_cov = np.array([[3., 2., 4.], [2., 2., 5.], [4., 5., 6.]]) 42 | mat = rng.multivariate_normal(np.array([1., 2., 3.]), size=int(1e3), 43 | cov=true_cov) 44 | data = block_rdd(self.sc.parallelize(mat, 4)) 45 | rdd_cov = cov(data) 46 | assert_array_almost_equal(np.cov(mat.T), rdd_cov, decimal=1) 47 | 48 | def test_svd(self): 49 | rng = np.random.RandomState(42) 50 | mat = rng.randn(1e3, 10) 51 | data = block_rdd(self.sc.parallelize(list(mat), 10)) 52 | u, s, v = svd(data, 1) 53 | u = np.squeeze(np.concatenate(np.array(u.collect()))).T 54 | u_true, s_true, v_true = ln.svd(mat) 55 | assert_array_almost_equal(v[0], match_sign(v[0], v_true[0, :])) 56 | assert_array_almost_equal(s[0], s_true[0]) 57 | assert_array_almost_equal(u, match_sign(u, u_true[:, 0])) 58 | 59 | def test_svd_em(self): 60 | rng = np.random.RandomState(42) 61 | mat = rng.randn(10, 3) 62 | data = block_rdd(self.sc.parallelize(list(mat), 2)).cache() 63 | u, s, v = svd_em(data, 1, seed=42) 64 | u = np.squeeze(np.concatenate(np.array(u.collect()))).T 65 | u_true, s_true, v_true = ln.svd(mat) 66 | tol = 1 67 | assert_array_almost_equal(v[0], match_sign(v[0], v_true[0, :]), tol) 68 | assert_array_almost_equal(s[0], s_true[0], tol) 69 | assert_array_almost_equal(u, match_sign(u, u_true[:, 0]), tol) 70 | 71 | 
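# A possible additional check for the remaining helpers, sum and mean (a
# sketch only, not part of the original suite; assumes
# ``from spylearn.blocked_math import mean, sum as blocked_sum`` at the top
# of this file):
#
#     def test_sum_and_mean(self):
#         rng = np.random.RandomState(42)
#         mat = rng.randn(100, 3)
#         data = block_rdd(self.sc.parallelize(list(mat), 4))
#         assert_array_almost_equal(blocked_sum(data, axis=0), mat.sum(axis=0))
#         assert_array_almost_equal(mean(data), mat.mean(axis=0))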
-------------------------------------------------------------------------------- /test/test_histogram.py: -------------------------------------------------------------------------------- 1 | from common import SpylearnTestCase 2 | import shutil 3 | import tempfile 4 | 5 | from spylearn.histogram import histogram 6 | import numpy as np 7 | 8 | from nose.tools import assert_equals 9 | from numpy.testing import assert_array_almost_equal 10 | 11 | 12 | class TestUtils(SpylearnTestCase): 13 | def setUp(self): 14 | super(TestUtils, self).setUp() 15 | self.outputdir = tempfile.mkdtemp() 16 | 17 | def tearDown(self): 18 | super(TestUtils, self).tearDown() 19 | shutil.rmtree(self.outputdir) 20 | 21 | 22 | class TestHistogram(TestUtils): 23 | 24 | def test_bins_as_number(self): 25 | data = self.sc.parallelize([1, 2, 3, 4, 5]) 26 | hist, bin_edges = histogram(data, range=(0, 6), bins=2) 27 | assert_equals(2, hist[0]) 28 | assert_equals(3, hist[1]) 29 | assert_array_almost_equal(np.array([0, 3, 6]), bin_edges) 30 | 31 | def test_bins_as_array(self): 32 | data = self.sc.parallelize([1, 2, 3, 4, 5]) 33 | hist, bin_edges = histogram(data, bins=[0, 3, 6]) 34 | assert_equals(2, hist[0]) 35 | assert_equals(3, hist[1]) 36 | assert_array_almost_equal(np.array([0, 3, 6]), bin_edges) 37 | 38 | def test_ignore_out_of_range(self): 39 | data = self.sc.parallelize([1, 2, 3, 4, 5]) 40 | hist, bin_edges = histogram(data, range=(2, 5), bins=2) 41 | assert_equals(2, hist[0]) 42 | assert_equals(1, hist[1]) 43 | assert_array_almost_equal(np.array([2, 3.5, 5]), bin_edges) 44 | 45 | -------------------------------------------------------------------------------- /test/test_k_means.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from spylearn.k_means import ParallelKMeans 4 | 5 | from common import SpylearnTestCase 6 | 7 | from nose.tools import assert_greater, assert_less 8 | from numpy.testing import assert_array_equal 9 | 10 | class KMeansTestCase(SpylearnTestCase): 11 | 12 | data = None 13 | n_partitions = 2 14 | 15 | def setUp(self): 16 | super(KMeansTestCase, self).setUp() 17 | if self.data is None: 18 | rng = np.random.RandomState(42) 19 | self.center1 = rng.randint(10, size=50) 20 | self.center2 = rng.randint(10, size=50) 21 | self.cluster1 = rng.normal(size=(int(1e3), 50)) + self.center1 22 | self.cluster2 = rng.normal(size=(int(1e3), 50)) + self.center2 23 | X = np.concatenate([self.cluster1, self.cluster2]) 24 | rng.shuffle(X) 25 | self.data = self.sc.parallelize(X, numSlices=self.n_partitions) 26 | self.expected_error = sum([np.linalg.norm(rng.randn(50) - 27 | rng.randn(50)) for _ in range(int(1e3))]) 28 | 29 | def test_clustering(self): 30 | model = ParallelKMeans(2, 7) 31 | model.fit(self.data) 32 | cluster1_predictions = model.predict(self.cluster1) 33 | cluster2_predictions = model.predict(self.cluster2) 34 | assert_array_equal(np.repeat(cluster1_predictions[0], len(self.cluster1)), 35 | cluster1_predictions) 36 | assert_array_equal(np.repeat(cluster2_predictions[0], len(self.cluster2)), 37 | cluster2_predictions) 38 | 39 | score1 = model.score(self.cluster1) 40 | assert_less(score1, 0) 41 | assert_greater(score1, -self.expected_error * 1.5) 42 | score2 = model.score(self.cluster2) 43 | assert_less(score2, 0) 44 | assert_greater(score2, -self.expected_error * 1.5) 45 | 46 | -------------------------------------------------------------------------------- /test/test_linear_model.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.linear_model import SGDClassifier 4 | 5 | from spylearn.linear_model import parallel_train 6 | from spylearn.block_rdd import block_rdd 7 | 8 | from common import SpylearnTestCase 9 | 10 | from nose.tools import assert_greater 11 | from nose import SkipTest 12 | from numpy.testing import assert_array_almost_equal 13 | 14 | 15 | class SumModel(object): 16 | 17 | def __init__(self): 18 | self.coef_ = 0 19 | self.intercept_ = 0 20 | 21 | def partial_fit(self, X, y, **kwargs): 22 | if X.ndim == 1: 23 | X = X.reshape(1, -1) 24 | self.coef_ += X.sum(axis=0) 25 | return self 26 | 27 | 28 | class LinearModelTestCase(SpylearnTestCase): 29 | 30 | data = None 31 | n_partitions = 2 32 | 33 | def setUp(self): 34 | super(LinearModelTestCase, self).setUp() 35 | if self.data is None: 36 | rng = np.random.RandomState(42) 37 | X = rng.normal(size=(int(1e3), 50)) 38 | coef = rng.normal(size=50) 39 | y = (np.dot(X, coef) > 0.01).astype(np.int) 40 | self.X = X 41 | self.y = y 42 | self.classes = np.unique(y) 43 | self.data = self.sc.parallelize(list(zip(X, y)), 44 | numSlices=self.n_partitions).cache() 45 | self.blocked_data = block_rdd(self.data, block_size=171) 46 | 47 | def test_parallel_train_sum_model_non_blocked(self): 48 | n_iter = 2 49 | model = parallel_train(SumModel(), self.data, self.classes, n_iter) 50 | expected_coef = self.X.sum(axis=0) * n_iter / self.n_partitions 51 | assert_array_almost_equal(model.coef_, expected_coef , 5) 52 | 53 | def test_parallel_train(self): 54 | if not hasattr(SGDClassifier, 'partial_fit'): 55 | raise SkipTest('sklearn >= 0.13 is required to run this test') 56 | model = SGDClassifier(loss='log', alpha=1e-5, random_state=2) 57 | model = parallel_train(model, self.blocked_data, self.classes) 58 | assert_greater(model.score(self.X, self.y), 0.90) 59 | -------------------------------------------------------------------------------- /test/test_random_permutation.py: -------------------------------------------------------------------------------- 1 | from common import SpylearnTestCase 2 | import shutil 3 | import tempfile 4 | 5 | from spylearn.random_permutation import random_permutation 6 | import numpy as np 7 | 8 | from nose.tools import assert_equals, assert_not_equals 9 | from numpy.testing import assert_array_almost_equal 10 | 11 | 12 | class TestUtils(SpylearnTestCase): 13 | def setUp(self): 14 | super(TestUtils, self).setUp() 15 | self.outputdir = tempfile.mkdtemp() 16 | 17 | def tearDown(self): 18 | super(TestUtils, self).tearDown() 19 | shutil.rmtree(self.outputdir) 20 | 21 | 22 | class TestRandomPermutation(TestUtils): 23 | 24 | def test_random_permutation(self): 25 | data = self.sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3) 26 | shuffled = random_permutation(data, seed=5).collect() 27 | assert_equals([10, 7, 6, 8, 2, 5, 9, 4, 3, 1], shuffled) 28 | assert_not_equals(shuffled, random_permutation(data, seed=6).collect()) 29 | 30 | --------------------------------------------------------------------------------
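
A minimal end-to-end sketch of how these pieces combine (illustrative only;
assumes a running SparkContext named ``sc``, and mirrors the flow exercised
in test/test_linear_model.py):

    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from spylearn.block_rdd import block_rdd
    from spylearn.linear_model import parallel_train

    # toy data, parallelized as (x, y) pairs and packed into per-partition blocks
    X = np.random.normal(size=(1000, 50))
    y = (X[:, 0] > 0).astype(np.int)
    data = block_rdd(sc.parallelize(list(zip(X, y)), 4), block_size=100)

    # each partition runs partial_fit on its blocks; coefficients are averaged
    model = parallel_train(SGDClassifier(loss='log'), data, classes=np.unique(y))
    print(model.score(X, y))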