├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── LICENSE.md ├── Makefile ├── README.md ├── benchmarks ├── benchmarks_mnist.py └── benchmarks_mnist_script.py ├── cmake ├── Modules │ ├── FindDistributions.cmake │ └── FindMicroscopesCommon.cmake ├── find_in_venv_like.sh └── print_cmake_command.py ├── conda └── microscopes-mixturemodel │ ├── build.sh │ └── meta.yaml ├── include └── microscopes │ └── mixture │ └── model.hpp ├── microscopes ├── __init__.py └── mixture │ ├── __init__.py │ ├── _model.pxd │ ├── _model.pyx │ ├── _model_h.pxd │ ├── _state_h.pxd │ ├── definition.pxd │ ├── definition.pyx │ ├── model.pyx │ ├── query.py │ ├── runner.py │ └── testutil.py ├── setup.py ├── src └── mixture │ └── model.cpp └── test ├── cxx └── test_state.cpp ├── test_crp.py ├── test_cxx_imports.py ├── test_definition.py ├── test_dm.py ├── test_hp_inference.py ├── test_mixturemodel_gibbs_assign.py ├── test_models_get_set_params.py ├── test_models_mixture_dp.py ├── test_query.py ├── test_runner.py ├── test_sample.py ├── test_slice_theta.py ├── test_state.py └── test_state_stress.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | .DS_Store 4 | /config.mk 5 | /build 6 | /debug 7 | /release 8 | /relwithdebinfo 9 | *.d 10 | *.prog 11 | *.dSYM 12 | /microscopes/**/*.cpp 13 | /microscopes/**/*.so 14 | /microscopes_mixturemodel.egg-info 15 | /microscopes/mixture/githash.txt 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '2.7' 4 | notifications: 5 | email: false 6 | slack: 7 | secure: I5/FD8tMEMQCeAAtYiUrPVg0U5j6AyoIyEpWtDbS7kHDqu7eG9qNZhFjcy/qxOHmv6xKeJPyNR8iuoQm5y0OVxkvwC7iI02EHCgcH7BSvwMvXcXiYTH0JM0SwySnfW6pJkbwfs9AT06UlNrYqXDatqNUTIhLajsDQWAeSmd2wwM= 8 | before_install: 9 | - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test 10 | - sudo apt-get update -qq 11 | - sudo apt-get install -qq g++-4.8 12 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh 13 | - chmod +x miniconda.sh 14 | - ./miniconda.sh -b 15 | - export PATH=/home/travis/miniconda/bin:$PATH 16 | - conda update --yes conda 17 | - sudo rm -rf /dev/shm 18 | - sudo ln -s /run/shm /dev/shm 19 | - conda create -n build --yes python=$TRAVIS_PYTHON_VERSION numpy=1.8 scipy nose cython pip scikit-learn 20 | - source activate build 21 | - conda install --yes -c distributions distributions 22 | - conda install --yes -c datamicroscopes eigen3 23 | - export CC=gcc-4.8 24 | - export CXX=g++-4.8 25 | - printenv 26 | - conda list --export 27 | - mkdir .travis 28 | - (cd .travis && git clone https://github.com/datamicroscopes/common.git && cd common && make travis_install) 29 | - (cd .travis && git clone https://github.com/datamicroscopes/kernels.git && cd kernels && make travis_install) 30 | install: make travis_install 31 | script: make travis_script 32 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/Modules/") 3 | 4 | project(microscopes_mixturemodel) 5 | 6 | # default mode is Release 7 | if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) 8 | set(CMAKE_BUILD_TYPE RelWithDebInfo) 9 | endif() 10 | 11 | set(CMAKE_CXX_FLAGS "-Wall -Wextra -g -MD 
-std=c++0x") 12 | 13 | # since we use distributions headers, we need to inherit their 14 | # no-strict-aliasing warnings rule otherwise deal with compiler noise 15 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing") 16 | 17 | # this warning is annoying; who cares about unused parameters 18 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") 19 | 20 | if(APPLE) 21 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") 22 | # for anaconda builds 23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmacosx-version-min=10.7") 24 | # clang complains about register 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-register") 26 | endif() 27 | 28 | # taken from distributions 29 | #set(CMAKE_CXX_FLAGS_MATHOPT "-mfpmath=sse -msse4.1 -ffast-math -funsafe-math-optimizations") 30 | set(CMAKE_CXX_FLAGS_MATHOPT "-mfpmath=sse -msse4.1") 31 | 32 | set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG ${CMAKE_CXX_FLAGS_MATHOPT}") 33 | set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE} -fno-omit-frame-pointer") 34 | set(CMAKE_CXX_FLAGS_DEBUG "-DDEBUG_MODE -fno-omit-frame-pointer") 35 | 36 | # give our include dirs the most precedent 37 | include_directories(include) 38 | 39 | # followed by the EXTRA_* ones 40 | if(DEFINED EXTRA_INCLUDE_PATH) 41 | include_directories(${EXTRA_INCLUDE_PATH}) 42 | endif() 43 | if(DEFINED EXTRA_LIBRARY_PATH) 44 | link_directories(${EXTRA_LIBRARY_PATH}) 45 | endif() 46 | 47 | find_package(Protobuf REQUIRED) 48 | message(STATUS "found protobuf INC=${PROTOBUF_INCLUDE_DIRS}, LIB=${PROTOBUF_LIBRARIES}") 49 | include_directories(${PROTOBUF_INCLUDE_DIRS}) 50 | 51 | find_package(Distributions) 52 | if(DISTRIBUTIONS_FOUND) 53 | message(STATUS "found distributions INC=${DISTRIBUTIONS_INCLUDE_DIRS}, LIB=${DISTRIBUTIONS_LIBRARY_DIRS}") 54 | include_directories(${DISTRIBUTIONS_INCLUDE_DIRS}) 55 | link_directories(${DISTRIBUTIONS_LIBRARY_DIRS}) 56 | else() 57 | message(FATAL_ERROR "Could not find distributions") 58 | endif() 59 | 60 | find_package(MicroscopesCommon) 61 | if(MICROSCOPES_COMMON_FOUND) 62 | message(STATUS "found microscopes_common INC=${MICROSCOPES_COMMON_INCLUDE_DIRS}, LIB=${MICROSCOPES_COMMON_LIBRARY_DIRS}") 63 | include_directories(${MICROSCOPES_COMMON_INCLUDE_DIRS}) 64 | link_directories(${MICROSCOPES_COMMON_LIBRARY_DIRS}) 65 | else() 66 | message(FATAL_ERROR "Could not find microscopes_common") 67 | endif() 68 | 69 | install(DIRECTORY include/ DESTINATION include FILES_MATCHING PATTERN "*.h*") 70 | install(DIRECTORY microscopes DESTINATION cython FILES_MATCHING PATTERN "*.pxd" PATTERN "__init__.py") 71 | 72 | set(MICROSCOPES_MIXTUREMODEL_SOURCE_FILES src/mixture/model.cpp) 73 | add_library(microscopes_mixturemodel SHARED ${MICROSCOPES_MIXTUREMODEL_SOURCE_FILES}) 74 | target_link_libraries(microscopes_mixturemodel ${PROTOBUF_LIBRARIES} distributions_shared microscopes_common) 75 | install(TARGETS microscopes_mixturemodel LIBRARY DESTINATION lib) 76 | 77 | # test executables 78 | enable_testing() 79 | add_executable(test_state test/cxx/test_state.cpp) 80 | add_test(test_state test_state) 81 | target_link_libraries(test_state ${PROTOBUF_LIBRARIES} distributions_shared microscopes_common microscopes_mixturemodel) 82 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Qadium 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of Qadium nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | @echo "choose a valid target" 3 | 4 | .PHONY: release 5 | release: 6 | @echo "Setting up cmake (release)" 7 | @python ./cmake/print_cmake_command.py Release 8 | [ -d release ] || (mkdir release && cd release && eval `python ../cmake/print_cmake_command.py Release`) 9 | 10 | .PHONY: relwithdebinfo 11 | relwithdebinfo: 12 | @echo "Setting up cmake (relwithdebinfo)" 13 | @python ./cmake/print_cmake_command.py RelWithDebInfo 14 | [ -d relwithdebinfo ] || (mkdir relwithdebinfo && cd relwithdebinfo && eval `python ../cmake/print_cmake_command.py RelWithDebInfo`) 15 | 16 | .PHONY: debug 17 | debug: 18 | @echo "Setting up cmake (debug)" 19 | @python ./cmake/print_cmake_command.py Debug 20 | [ -d debug ] || (mkdir debug && cd debug && eval `python ../cmake/print_cmake_command.py Debug`) 21 | 22 | CPU_COUNT=$(shell python -c 'import multiprocessing as m; print m.cpu_count()') 23 | 24 | .PHONY: test 25 | test: 26 | (cd test && NOSE_PROCESSES=$(CPU_COUNT) NOSE_PROCESS_TIMEOUT=240 nosetests -a '!uses_mp,!slow' --verbose) 27 | (cd test && nosetests -a 'uses_mp,!slow' --verbose) 28 | 29 | .PHONY: travis_install 30 | travis_install: 31 | make relwithdebinfo 32 | (cd relwithdebinfo && make && make install) 33 | pip install . 
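# note: travis_script (below) assumes this target has already run, since it
# expects a populated ./relwithdebinfo build directory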
34 | 35 | .PHONY: travis_script 36 | travis_script: 37 | (cd relwithdebinfo && CTEST_OUTPUT_ON_FAILURE=true make test) 38 | (cd test && NOSE_PROCESSES=$(CPU_COUNT) NOSE_PROCESS_TIMEOUT=240 nosetests --verbose -a '!uses_mp,!slow') 39 | (cd test && nosetests --verbose -a 'uses_mp,!slow') 40 | 41 | .PHONY: lint 42 | lint: 43 | pyflakes microscopes test setup.py 44 | pep8 --filename=*.py --ignore=E265 microscopes test setup.py 45 | pep8 --filename=*.pyx --ignore=E265,E211,E225 microscopes 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf release relwithdebinfo debug microscopes_mixturemodel.egg-info 50 | find microscopes/ -name '*.cpp' -type f -print0 | xargs -0 rm -- 51 | find microscopes/ -name '*.so' -type f -print0 | xargs -0 rm -- 52 | find microscopes/ -name '*.pyc' -type f -print0 | xargs -0 rm -- 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # microscopes-mixtures [![Build Status](https://travis-ci.org/datamicroscopes/mixturemodel.svg?branch=master)](https://travis-ci.org/datamicroscopes/mixturemodel) 2 | 3 | Contains the implementation of a Dirichlet process mixture model (DPMM). Routines for doing inference on this model are found in the [kernels](https://github.com/datamicroscopes/kernels) project. 4 | 5 | ### Installation 6 | 7 | OS X and Linux builds of `microscopes-mixturemodel` are released to [Anaconda.org](https://conda.anaconda.org). Installing them requires [Conda](https://store.continuum.io/cshop/anaconda/). To install the current release version run: 8 | 9 | ``` 10 | $ conda install -c datamicroscopes -c distributions microscopes-mixturemodel 11 | ``` 12 | -------------------------------------------------------------------------------- /benchmarks/benchmarks_mnist.py: -------------------------------------------------------------------------------- 1 | from microscopes.common.recarray.dataview import numpy_dataview 2 | from microscopes.common.rng import rng 3 | from microscopes.common.scalar_functions import log_exponential 4 | from microscopes.mixture.model import initialize, bind 5 | from microscopes.kernels.gibbs import assign 6 | from microscopes.kernels.slice import hp 7 | from microscopes.common.util import mkdirp 8 | from microscopes.models import bb, dd 9 | from microscopes.mixture.definition import model_definition 10 | 11 | from sklearn.datasets import fetch_mldata 12 | from sklearn.cross_validation import train_test_split 13 | from sklearn.metrics import ( 14 | accuracy_score, 15 | roc_auc_score, 16 | confusion_matrix, 17 | #roc_curve, 18 | ) 19 | 20 | import numpy as np 21 | import numpy.ma as ma 22 | import math 23 | import time 24 | import os 25 | 26 | from nose.plugins.attrib import attr 27 | 28 | 29 | def _get_mnist_dataset(): 30 | return fetch_mldata('MNIST original') 31 | 32 | 33 | def groupcounts(s): 34 | counts = np.zeros(s.ngroups(), dtype=np.int) 35 | for i, gid in enumerate(s.groups()): 36 | counts[i] = s.groupsize(gid) 37 | return np.sort(counts)[::-1] 38 | 39 | 40 | def groupsbysize(s): 41 | """groupids by decreasing size""" 42 | counts = [(gid, s.groupsize(gid)) for gid in s.groups()] 43 | counts = sorted(counts, key=lambda x: x[1], reverse=True) 44 | return counts 45 | 46 | 47 | @attr('slow') 48 | def test_mnist_supervised(): 49 | mnist_dataset = _get_mnist_dataset() 50 | classes = range(10) 51 | classmap = {c: i for i, c in enumerate(classes)} 52 | train_data, test_data = [], [] 53 | for c in classes: 54 | Y = 
mnist_dataset['data'][ 55 | np.where(mnist_dataset['target'] == float(c))[0]] 56 | Y_train, Y_test = train_test_split(Y, test_size=0.01) 57 | train_data.append(Y_train) 58 | test_data.append(Y_test) 59 | 60 | sample_size_max = 10000 61 | 62 | def mk_class_data(c, Y): 63 | n, D = Y.shape 64 | print 'number of digit', c, 'in training is', n 65 | dtype = [('', bool)] * D + [('', int)] 66 | inds = np.random.permutation(Y.shape[0])[:sample_size_max] 67 | Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]], 68 | dtype=dtype) 69 | return Y 70 | Y_train = np.hstack([mk_class_data(c, y) 71 | for c, y in zip(classes, train_data)]) 72 | Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))] 73 | 74 | n, = Y_train.shape 75 | D = len(Y_train.dtype) 76 | print 'training data is', n, 'examples' 77 | print 'image dimension is', (D - 1), 'pixels' 78 | 79 | view = numpy_dataview(Y_train) 80 | defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))]) 81 | r = rng() 82 | s = initialize(defn, 83 | view, 84 | cluster_hp={'alpha': 0.2}, 85 | feature_hps=[{'alpha': 1., 'beta': 1.}] * 86 | (D - 1) + [{'alphas': [1. for _ in classes]}], 87 | r=r) 88 | 89 | bound_s = bind(s, view) 90 | 91 | indiv_prior_fn = log_exponential(1.2) 92 | hparams = { 93 | i: { 94 | 'alpha': (indiv_prior_fn, 1.5), 95 | 'beta': (indiv_prior_fn, 1.5), 96 | } for i in xrange(D - 1)} 97 | hparams[D - 1] = { 98 | 'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5) 99 | for idx in xrange(len(classes)) 100 | } 101 | 102 | def print_prediction_results(): 103 | results = [] 104 | for c, Y_test in zip(classes, test_data): 105 | for y in Y_test: 106 | query = ma.masked_array( 107 | np.array([tuple(y) + (0,)], 108 | dtype=[('', bool)] * (D - 1) + [('', int)]), 109 | mask=[(False,) * (D - 1) + (True,)])[0] 110 | samples = [ 111 | s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30)] 112 | samples = np.bincount(samples, minlength=len(classes)) 113 | prediction = np.argmax(samples) 114 | results.append((classmap[c], prediction, samples)) 115 | print 'finished predictions for class', c 116 | 117 | Y_actual = np.array([a for a, _, _ in results], dtype=np.int) 118 | Y_pred = np.array([b for _, b, _ in results], dtype=np.int) 119 | print 'accuracy:', accuracy_score(Y_actual, Y_pred) 120 | print 'confusion matrix:' 121 | print confusion_matrix(Y_actual, Y_pred) 122 | 123 | # AUROC for one vs all (each class) 124 | for i, clabel in enumerate(classes): 125 | Y_true = np.copy(Y_actual) 126 | 127 | # treat class c as the "positive" example 128 | positive_examples = Y_actual == i 129 | negative_examples = Y_actual != i 130 | Y_true[positive_examples] = 1 131 | Y_true[negative_examples] = 0 132 | Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results]) 133 | cls_auc = roc_auc_score(Y_true, Y_prob) 134 | print 'class', clabel, 'auc=', cls_auc 135 | 136 | #import matplotlib.pylab as plt 137 | #Y_prob = np.array([c for _, _, c in results]) 138 | #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0) 139 | #plt.plot(fpr, tpr) 140 | #plt.show() 141 | 142 | def kernel(rid): 143 | start0 = time.time() 144 | assign(bound_s, r) 145 | sec0 = time.time() - start0 146 | 147 | start1 = time.time() 148 | hp(bound_s, r, hparams=hparams) 149 | sec1 = time.time() - start1 150 | 151 | print 'rid=', rid, 'nclusters=', s.ngroups(), \ 152 | 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' 153 | 154 | sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) 155 | print ' time_per_post_pred=', sec_per_post_pred, 'sec' 156 | 
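        # rough estimate only: it assumes scoring cost is uniform across the
        # active clusters, which a DPMM Gibbs sweep only approximately satisfies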
157 | # print group size breakdown 158 | sizes = [(gid, s.groupsize(gid)) for gid in s.groups()] 159 | sizes = sorted(sizes, key=lambda x: x[1], reverse=True) 160 | print ' group_sizes=', sizes 161 | 162 | print_prediction_results() 163 | 164 | # save state 165 | mkdirp("mnist-states") 166 | fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid)) 167 | with open(fname, "w") as fp: 168 | fp.write(s.serialize()) 169 | 170 | # training 171 | iters = 30 172 | for rid in xrange(iters): 173 | kernel(rid) 174 | 175 | 176 | @attr('slow') 177 | def test_mnist(): 178 | import matplotlib.pylab as plt 179 | from PIL import Image, ImageOps 180 | mnist_dataset = _get_mnist_dataset() 181 | Y_2 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 2.)[0]] 182 | Y_3 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 3.)[0]] 183 | print 'number of twos:', Y_2.shape[0] 184 | print 'number of threes:', Y_3.shape[0] 185 | _, D = Y_2.shape 186 | W = int(math.sqrt(D)) 187 | assert W * W == D 188 | dtype = [('', bool)] * D 189 | Y = np.vstack([Y_2, Y_3]) 190 | Y = np.array( 191 | [tuple(y) for y in Y[np.random.permutation(np.arange(Y.shape[0]))]], 192 | dtype=dtype) 193 | 194 | view = numpy_dataview(Y) 195 | defn = model_definition(Y.shape[0], [bb] * D) 196 | r = rng() 197 | s = initialize( 198 | defn, 199 | view, 200 | cluster_hp={'alpha': 0.2}, 201 | feature_hps=[{'alpha': 1., 'beta': 1.}] * D, 202 | r=r) 203 | bound_s = bind(s, view) 204 | 205 | indiv_prior_fn = log_exponential(1.2) 206 | hparams = { 207 | i: { 208 | 'alpha': (indiv_prior_fn, 1.5), 209 | 'beta': (indiv_prior_fn, 1.5), 210 | } for i in xrange(D)} 211 | 212 | def plot_clusters(s, fname, scalebysize=False): 213 | hps = [s.get_feature_hp(i) for i in xrange(D)] 214 | 215 | def prior_prob(hp): 216 | return hp['alpha'] / (hp['alpha'] + hp['beta']) 217 | 218 | def data_for_group(gid): 219 | suffstats = [s.get_suffstats(gid, i) for i in xrange(D)] 220 | 221 | def prob(hp, ss): 222 | top = hp['alpha'] + ss['heads'] 223 | bot = top + hp['beta'] + ss['tails'] 224 | return top / bot 225 | probs = [prob(hp, ss) for hp, ss in zip(hps, suffstats)] 226 | return np.array(probs) 227 | 228 | def scale(d, weight): 229 | im = d.reshape((W, W)) 230 | newW = max(int(weight * W), 1) 231 | im = Image.fromarray(im) 232 | im = im.resize((newW, newW)) 233 | im = ImageOps.expand(im, border=(W - newW) / 2) 234 | im = np.array(im) 235 | a, b = im.shape 236 | #print 'a,b:', a, b 237 | if a < W: 238 | im = np.append(im, np.zeros(b)[np.newaxis, :], axis=0) 239 | elif a > W: 240 | im = im[:W, :] 241 | assert im.shape[0] == W 242 | if b < W: 243 | #print 'current:', im.shape 244 | im = np.append(im, np.zeros(W)[:, np.newaxis], axis=1) 245 | elif b > W: 246 | im = im[:, :W] 247 | assert im.shape[1] == W 248 | return im.flatten() 249 | 250 | data = [(data_for_group(g), cnt) for g, cnt in groupsbysize(s)] 251 | largest = max(cnt for _, cnt in data) 252 | data = [scale(d, cnt / float(largest)) if scalebysize else d 253 | for d, cnt in data] 254 | digits_per_row = 12 255 | rem = len(data) % digits_per_row 256 | if rem: 257 | fill = digits_per_row - rem 258 | for _ in xrange(fill): 259 | data.append(np.zeros(D)) 260 | assert not (len(data) % digits_per_row) 261 | #rows = len(data) / digits_per_row 262 | data = np.vstack([np.hstack([d.reshape((W, W)) 263 | for d in data[i:i + digits_per_row]]) 264 | for i in xrange(0, len(data), digits_per_row)]) 265 | #print 'saving figure', fname 266 | plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest') 267 | 
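        # each tile is one cluster's per-pixel posterior mean under the
        # beta-bernoulli model, with clusters laid out by decreasing size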
plt.savefig(fname) 268 | plt.close() 269 | 270 | def plot_hyperparams(s, fname): 271 | hps = [s.get_feature_hp(i) for i in xrange(D)] 272 | alphas = np.array([hp['alpha'] for hp in hps]) 273 | betas = np.array([hp['beta'] for hp in hps]) 274 | data = np.hstack([alphas.reshape((W, W)), betas.reshape((W, W))]) 275 | plt.imshow(data, interpolation='nearest') 276 | plt.colorbar() 277 | plt.savefig(fname) 278 | plt.close() 279 | 280 | def kernel(rid): 281 | start0 = time.time() 282 | assign(bound_s, r) 283 | sec0 = time.time() - start0 284 | 285 | start1 = time.time() 286 | hp(bound_s, r, hparams=hparams) 287 | sec1 = time.time() - start1 288 | 289 | print 'rid=', rid, 'nclusters=', s.ngroups(), \ 290 | 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' 291 | 292 | sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) 293 | print ' time_per_post_pred=', sec_per_post_pred, 'sec' 294 | 295 | return s.score_joint(r) 296 | 297 | # burnin 298 | burnin = 20 299 | for rid in xrange(burnin): 300 | print 'score:', kernel(rid) 301 | print 'finished burnin' 302 | plot_clusters(s, 'mnist_clusters.pdf') 303 | plot_clusters(s, 'mnist_clusters_bysize.pdf', scalebysize=True) 304 | plot_hyperparams(s, 'mnist_hyperparams.pdf') 305 | print 'groupcounts:', groupcounts(s) 306 | 307 | # posterior predictions 308 | present = D / 2 309 | absent = D - present 310 | queries = [tuple(Y_2[i]) for i in np.random.permutation(Y_2.shape[0])[:4]] + \ 311 | [tuple(Y_3[i]) for i in np.random.permutation(Y_3.shape[0])[:4]] 312 | 313 | queries_masked = ma.masked_array( 314 | np.array(queries, dtype=[('', bool)] * D), 315 | mask=[(False,) * present + (True,) * absent]) 316 | 317 | def postpred_sample(y_new): 318 | Y_samples = [s.sample_post_pred(y_new, r)[1] for _ in xrange(1000)] 319 | Y_samples = np.array([list(y) for y in np.hstack(Y_samples)]) 320 | Y_avg = Y_samples.mean(axis=0) 321 | return Y_avg 322 | 323 | queries_masked = [postpred_sample(y) for y in queries_masked] 324 | data0 = np.hstack([q.reshape((W, W)) for q in queries_masked]) 325 | data1 = np.hstack( 326 | [np.clip(np.array(q, dtype=np.float), 0., 1.).reshape((W, W)) 327 | for q in queries]) 328 | data = np.vstack([data0, data1]) 329 | plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest') 330 | plt.savefig('mnist_predict.pdf') 331 | plt.close() 332 | -------------------------------------------------------------------------------- /benchmarks/benchmarks_mnist_script.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from microscopes.common.recarray.dataview import numpy_dataview 3 | from microscopes.common.rng import rng 4 | from microscopes.common.scalar_functions import log_exponential 5 | from microscopes.mixture.model import initialize, bind 6 | from microscopes.kernels.gibbs import assign 7 | from microscopes.kernels.slice import hp 8 | from microscopes.common.util import mkdirp 9 | from microscopes.models import bb, dd 10 | from microscopes.mixture.definition import model_definition 11 | 12 | from sklearn.datasets import fetch_mldata 13 | from sklearn.cross_validation import train_test_split 14 | from sklearn.metrics import ( 15 | accuracy_score, 16 | roc_auc_score, 17 | confusion_matrix, 18 | #roc_curve, 19 | ) 20 | 21 | import numpy as np 22 | import numpy.ma as ma 23 | import math 24 | import time 25 | import os 26 | 27 | from nose.plugins.attrib import attr 28 | 29 | 30 | def _get_mnist_dataset(): 31 | return fetch_mldata('MNIST original') 32 | 33 | 34 | def groupcounts(s): 35 | 
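    """group sizes in decreasing order"""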
counts = np.zeros(s.ngroups(), dtype=np.int) 36 | for i, gid in enumerate(s.groups()): 37 | counts[i] = s.groupsize(gid) 38 | return np.sort(counts)[::-1] 39 | 40 | 41 | def groupsbysize(s): 42 | """groupids by decreasing size""" 43 | counts = [(gid, s.groupsize(gid)) for gid in s.groups()] 44 | counts = sorted(counts, key=lambda x: x[1], reverse=True) 45 | return counts 46 | 47 | 48 | @attr('slow') 49 | #@profile 50 | def test_mnist_supervised(n): 51 | mnist_dataset = _get_mnist_dataset() 52 | classes = range(10) 53 | classmap = {c: i for i, c in enumerate(classes)} 54 | train_data, test_data = [], [] 55 | for c in classes: 56 | Y = mnist_dataset['data'][ 57 | np.where(mnist_dataset['target'] == float(c))[0]] 58 | Y_train, Y_test = train_test_split(Y, test_size=0.01) 59 | train_data.append(Y_train) 60 | test_data.append(Y_test) 61 | 62 | sample_size_max = n 63 | 64 | def mk_class_data(c, Y): 65 | n, D = Y.shape 66 | print 'number of digit', c, 'in training is', n 67 | dtype = [('', bool)] * D + [('', int)] 68 | inds = np.random.permutation(Y.shape[0])[:sample_size_max] 69 | Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]], 70 | dtype=dtype) 71 | return Y 72 | Y_train = np.hstack([mk_class_data(c, y) 73 | for c, y in zip(classes, train_data)]) 74 | Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))] 75 | 76 | n, = Y_train.shape 77 | D = len(Y_train.dtype) 78 | print 'training data is', n, 'examples' 79 | print 'image dimension is', (D - 1), 'pixels' 80 | 81 | view = numpy_dataview(Y_train) 82 | defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))]) 83 | r = rng() 84 | s = initialize(defn, 85 | view, 86 | cluster_hp={'alpha': 0.2}, 87 | feature_hps=[{'alpha': 1., 'beta': 1.}] * 88 | (D - 1) + [{'alphas': [1. for _ in classes]}], 89 | r=r) 90 | 91 | bound_s = bind(s, view) 92 | 93 | indiv_prior_fn = log_exponential(1.2) 94 | hparams = { 95 | i: { 96 | 'alpha': (indiv_prior_fn, 1.5), 97 | 'beta': (indiv_prior_fn, 1.5), 98 | } for i in xrange(D - 1)} 99 | hparams[D - 1] = { 100 | 'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5) 101 | for idx in xrange(len(classes)) 102 | } 103 | 104 | def print_prediction_results(): 105 | results = [] 106 | for c, Y_test in zip(classes, test_data): 107 | for y in Y_test: 108 | query = ma.masked_array( 109 | np.array([tuple(y) + (0,)], 110 | dtype=[('', bool)] * (D - 1) + [('', int)]), 111 | mask=[(False,) * (D - 1) + (True,)])[0] 112 | samples = [ 113 | s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30)] 114 | samples = np.bincount(samples, minlength=len(classes)) 115 | prediction = np.argmax(samples) 116 | results.append((classmap[c], prediction, samples)) 117 | print 'finished predictions for class', c 118 | 119 | Y_actual = np.array([a for a, _, _ in results], dtype=np.int) 120 | Y_pred = np.array([b for _, b, _ in results], dtype=np.int) 121 | print 'accuracy:', accuracy_score(Y_actual, Y_pred) 122 | print 'confusion matrix:' 123 | print confusion_matrix(Y_actual, Y_pred) 124 | 125 | # AUROC for one vs all (each class) 126 | for i, clabel in enumerate(classes): 127 | Y_true = np.copy(Y_actual) 128 | 129 | # treat class c as the "positive" example 130 | positive_examples = Y_actual == i 131 | negative_examples = Y_actual != i 132 | Y_true[positive_examples] = 1 133 | Y_true[negative_examples] = 0 134 | Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results]) 135 | cls_auc = roc_auc_score(Y_true, Y_prob) 136 | print 'class', clabel, 'auc=', cls_auc 137 | 138 | #import matplotlib.pylab as plt 139 | 
#Y_prob = np.array([c for _, _, c in results]) 140 | #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0) 141 | #plt.plot(fpr, tpr) 142 | #plt.show() 143 | 144 | def kernel(rid): 145 | start0 = time.time() 146 | assign(bound_s, r) 147 | sec0 = time.time() - start0 148 | 149 | start1 = time.time() 150 | hp(bound_s, r, hparams=hparams) 151 | sec1 = time.time() - start1 152 | 153 | print 'rid=', rid, 'nclusters=', s.ngroups(), \ 154 | 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' 155 | 156 | sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) 157 | print ' time_per_post_pred=', sec_per_post_pred, 'sec' 158 | 159 | # training 160 | iters = 30 161 | for rid in xrange(iters): 162 | kernel(rid) 163 | 164 | # print group size breakdown 165 | sizes = [(gid, s.groupsize(gid)) for gid in s.groups()] 166 | sizes = sorted(sizes, key=lambda x: x[1], reverse=True) 167 | print ' group_sizes=', sizes 168 | 169 | #print_prediction_results() 170 | 171 | # save state 172 | mkdirp("mnist-states") 173 | fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid)) 174 | with open(fname, "w") as fp: 175 | fp.write(s.serialize()) 176 | 177 | if __name__ == "__main__": 178 | start = time.time() 179 | parser = argparse.ArgumentParser( 180 | description=globals()['__doc__'], 181 | formatter_class=argparse.RawDescriptionHelpFormatter) 182 | 183 | parser.add_argument( 184 | '-n', '--nsamples', required=True, type=int, 185 | help='Number of samples from each digit') 186 | 187 | args = parser.parse_args() 188 | test_mnist_supervised(args.nsamples) 189 | end = time.time() 190 | print 'sampler with %d samples took %.2f seconds' % \ 191 | (args.nsamples*10, end-start) 192 | -------------------------------------------------------------------------------- /cmake/Modules/FindDistributions.cmake: -------------------------------------------------------------------------------- 1 | message(STATUS "Finding distributions") 2 | 3 | execute_process( 4 | COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_in_venv_like.sh distributions_shared distributions 5 | OUTPUT_VARIABLE DISTRIBUTIONS_ROOT 6 | OUTPUT_STRIP_TRAILING_WHITESPACE) 7 | 8 | if(DISTRIBUTIONS_ROOT) 9 | set(DISTRIBUTIONS_INCLUDE_DIRS ${DISTRIBUTIONS_ROOT}/include) 10 | set(DISTRIBUTIONS_LIBRARY_DIRS ${DISTRIBUTIONS_ROOT}/lib) 11 | set(DISTRIBUTIONS_LIBRARIES distributions_shared) 12 | set(DISTRIBUTIONS_FOUND true) 13 | else() 14 | message(STATUS "could not locate distributions") 15 | set(DISTRIBUTIONS_FOUND false) 16 | endif() 17 | -------------------------------------------------------------------------------- /cmake/Modules/FindMicroscopesCommon.cmake: -------------------------------------------------------------------------------- 1 | message(STATUS "Finding microscopes-common") 2 | 3 | execute_process( 4 | COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_in_venv_like.sh microscopes_common microscopes 5 | OUTPUT_VARIABLE MICROSCOPES_COMMON_ROOT 6 | OUTPUT_STRIP_TRAILING_WHITESPACE) 7 | 8 | if(MICROSCOPES_COMMON_ROOT) 9 | set(MICROSCOPES_COMMON_INCLUDE_DIRS ${MICROSCOPES_COMMON_ROOT}/include) 10 | set(MICROSCOPES_COMMON_LIBRARY_DIRS ${MICROSCOPES_COMMON_ROOT}/lib) 11 | set(MICROSCOPES_COMMON_LIBRARIES microscopes_common) 12 | set(MICROSCOPES_COMMON_FOUND true) 13 | else() 14 | message(STATUS "could not locate microscopes_common") 15 | set(MICROSCOPES_COMMON_FOUND false) 16 | endif() 17 | -------------------------------------------------------------------------------- /cmake/find_in_venv_like.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ### currently, looks for either a virtualenv or anaconda install 4 | 5 | LIBNAME=$1 6 | INCNAME=$2 7 | 8 | UNAME=`uname` 9 | if [ "${UNAME}" = "Darwin" ]; then 10 | SOEXT=dylib 11 | else 12 | SOEXT=so 13 | fi 14 | 15 | if [ -n "${VIRTUAL_ENV}" ]; then 16 | if [ -f "${VIRTUAL_ENV}/lib/lib${LIBNAME}.${SOEXT}" ] && [ -d "${VIRTUAL_ENV}/include/${INCNAME}" ]; then 17 | echo "${VIRTUAL_ENV}" 18 | exit 0 19 | fi 20 | fi 21 | 22 | if [ "${CONDA_BUILD}" = "1" ]; then 23 | if [ -f "${PREFIX}/lib/lib${LIBNAME}.${SOEXT}" ] && [ -d "${PREFIX}/include/${INCNAME}" ]; then 24 | echo "${PREFIX}" 25 | exit 0 26 | fi 27 | fi 28 | 29 | if [ -n "${CONDA_DEFAULT_ENV}" ]; then 30 | DIR=`conda info | grep 'default environment' | awk '{print $4}'` 31 | if [ -f "${DIR}/lib/lib${LIBNAME}.${SOEXT}" ] && [ -d "${DIR}/include/${INCNAME}" ]; then 32 | echo "${DIR}" 33 | exit 0 34 | fi 35 | fi 36 | 37 | exit 1 38 | -------------------------------------------------------------------------------- /cmake/print_cmake_command.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from subprocess import check_output 4 | 5 | if __name__ == '__main__': 6 | build_type = "RelWithDebInfo" 7 | if len(sys.argv) > 1: 8 | build_type = sys.argv[1] 9 | ValidBuildTypes = ('Debug', 'Release', 'RelWithDebInfo') 10 | if not build_type in ValidBuildTypes: 11 | raise ValueError("invalid build type: {}".format(build_type)) 12 | ## XXX: handle virtualenv 13 | conda_full_path = check_output("which conda", shell=True).strip() 14 | if 'CONDA_DEFAULT_ENV' in os.environ: 15 | a, b = os.path.split(conda_full_path) 16 | assert b == 'conda' 17 | a, b = os.path.split(a) 18 | assert b == 'bin' 19 | conda_env_path = a 20 | a, b = os.path.split(a) 21 | assert b == os.environ['CONDA_DEFAULT_ENV'] 22 | else: 23 | a, b = os.path.split(conda_full_path) 24 | assert b == 'conda' 25 | a, b = os.path.split(a) 26 | assert b == 'bin' 27 | conda_env_path = a 28 | print 'cmake -DCMAKE_BUILD_TYPE={} -DCMAKE_INSTALL_PREFIX={} -DCMAKE_PREFIX_PATH={} ..'.format( 29 | build_type, 30 | conda_env_path, 31 | conda_env_path) 32 | -------------------------------------------------------------------------------- /conda/microscopes-mixturemodel/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | UNAME=`uname` 5 | if [ "${UNAME}" = "Darwin" ]; then 6 | export EXTRA_LINK_ARGS=-headerpad_max_install_names 7 | export MACOSX_DEPLOYMENT_TARGET=10.7 8 | elif [ "${UNAME}" = "Linux" ]; then 9 | if (which g++-4.8 >/dev/null 2>&1); then 10 | export CXX=g++-4.8 11 | fi 12 | if (which gcc-4.8 >/dev/null 2>&1); then 13 | export CC=gcc-4.8 14 | fi 15 | else 16 | echo "unsupported os: ${UNAME}" 17 | exit 1 18 | fi 19 | 20 | mkdir build && cd build 21 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${PREFIX} -DCMAKE_PREFIX_PATH=${PREFIX} .. 22 | make VERBOSE=1 && make install 23 | cd .. 
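# build the Cython extension against the headers and libraries that
# `make install` just placed under ${PREFIX}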
24 | OFFICIAL_BUILD=1 LIBRARY_PATH=${PREFIX}/lib EXTRA_INCLUDE_PATH=${PREFIX}/include $PYTHON setup.py install 25 | -------------------------------------------------------------------------------- /conda/microscopes-mixturemodel/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: microscopes-mixturemodel 3 | version: "0.1.0" 4 | 5 | source: 6 | git_url: https://github.com/datamicroscopes/mixturemodel.git 7 | 8 | requirements: 9 | build: 10 | - cmake 11 | - python 12 | - numpy ==1.8.2 13 | - cython >=0.20.2 14 | - distributions >=2.0.23 15 | - libprotobuf 16 | - eigen3 17 | - microscopes-common 18 | - microscopes-kernels 19 | run: 20 | - python 21 | - numpy ==1.8.2 22 | - scipy 23 | - distributions >=2.0.23 24 | - libprotobuf 25 | - protobuf 26 | - microscopes-common 27 | - microscopes-kernels 28 | 29 | test: 30 | imports: 31 | - microscopes.mixture 32 | - microscopes.mixture.definition 33 | - microscopes.mixture.model 34 | 35 | about: 36 | home: https://github.com/datamicroscopes/mixturemodel 37 | -------------------------------------------------------------------------------- /include/microscopes/mixture/model.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace microscopes { 24 | namespace mixture { 25 | 26 | namespace detail { 27 | 28 | typedef std::vector> group_type; 29 | 30 | static inline common::serialized_t 31 | group_type_to_string(const group_type &groups) 32 | { 33 | io::MixtureModelGroup m; 34 | for (auto &px : groups) 35 | m.add_suffstats(px->get_ss()); 36 | return common::util::protobuf_to_string(m); 37 | } 38 | 39 | static inline group_type 40 | group_type_from_string( 41 | const common::serialized_t &s, 42 | const std::vector> &models) 43 | { 44 | common::rng_t rng; // XXX: hack 45 | io::MixtureModelGroup m; 46 | common::util::protobuf_from_string(m, s); 47 | MICROSCOPES_DCHECK((size_t)m.suffstats_size() == models.size(), "sizes do not match"); 48 | group_type g; 49 | g.reserve(models.size()); 50 | for (size_t i = 0; i < models.size(); i++) { 51 | g.emplace_back(models[i]->create_group(rng)); 52 | g.back()->set_ss(m.suffstats(i)); 53 | } 54 | return g; 55 | } 56 | 57 | template