├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── LICENSE.md ├── Makefile ├── README.md ├── benchmarks ├── benchmarks_mnist.py └── benchmarks_mnist_script.py ├── cmake ├── Modules │ ├── FindDistributions.cmake │ └── FindMicroscopesCommon.cmake ├── find_in_venv_like.sh └── print_cmake_command.py ├── conda └── microscopes-mixturemodel │ ├── build.sh │ └── meta.yaml ├── include └── microscopes │ └── mixture │ └── model.hpp ├── microscopes ├── __init__.py └── mixture │ ├── __init__.py │ ├── _model.pxd │ ├── _model.pyx │ ├── _model_h.pxd │ ├── _state_h.pxd │ ├── definition.pxd │ ├── definition.pyx │ ├── model.pyx │ ├── query.py │ ├── runner.py │ └── testutil.py ├── setup.py ├── src └── mixture │ └── model.cpp └── test ├── cxx └── test_state.cpp ├── test_crp.py ├── test_cxx_imports.py ├── test_definition.py ├── test_dm.py ├── test_hp_inference.py ├── test_mixturemodel_gibbs_assign.py ├── test_models_get_set_params.py ├── test_models_mixture_dp.py ├── test_query.py ├── test_runner.py ├── test_sample.py ├── test_slice_theta.py ├── test_state.py └── test_state_stress.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | .DS_Store 4 | /config.mk 5 | /build 6 | /debug 7 | /release 8 | /relwithdebinfo 9 | *.d 10 | *.prog 11 | *.dSYM 12 | /microscopes/**/*.cpp 13 | /microscopes/**/*.so 14 | /microscopes_mixturemodel.egg-info 15 | /microscopes/mixture/githash.txt 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '2.7' 4 | notifications: 5 | email: false 6 | slack: 7 | secure: I5/FD8tMEMQCeAAtYiUrPVg0U5j6AyoIyEpWtDbS7kHDqu7eG9qNZhFjcy/qxOHmv6xKeJPyNR8iuoQm5y0OVxkvwC7iI02EHCgcH7BSvwMvXcXiYTH0JM0SwySnfW6pJkbwfs9AT06UlNrYqXDatqNUTIhLajsDQWAeSmd2wwM= 8 | before_install: 9 | - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test 10 | - sudo apt-get update -qq 11 | - sudo apt-get install -qq g++-4.8 12 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh 13 | - chmod +x miniconda.sh 14 | - ./miniconda.sh -b 15 | - export PATH=/home/travis/miniconda/bin:$PATH 16 | - conda update --yes conda 17 | - sudo rm -rf /dev/shm 18 | - sudo ln -s /run/shm /dev/shm 19 | - conda create -n build --yes python=$TRAVIS_PYTHON_VERSION numpy=1.8 scipy nose cython pip scikit-learn 20 | - source activate build 21 | - conda install --yes -c distributions distributions 22 | - conda install --yes -c datamicroscopes eigen3 23 | - export CC=gcc-4.8 24 | - export CXX=g++-4.8 25 | - printenv 26 | - conda list --export 27 | - mkdir .travis 28 | - (cd .travis && git clone https://github.com/datamicroscopes/common.git && cd common && make travis_install) 29 | - (cd .travis && git clone https://github.com/datamicroscopes/kernels.git && cd kernels && make travis_install) 30 | install: make travis_install 31 | script: make travis_script 32 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/Modules/") 3 | 4 | project(microscopes_mixturemodel) 5 | 6 | # default mode is Release 7 | if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) 8 | set(CMAKE_BUILD_TYPE RelWithDebInfo) 9 | endif() 10 | 11 | set(CMAKE_CXX_FLAGS "-Wall -Wextra -g -MD 
-std=c++0x") 12 | 13 | # since we use distributions headers, we need to inherit their 14 | # no-strict-aliasing warnings rule otherwise deal with compiler noise 15 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing") 16 | 17 | # this warning is annoying; who cares about unused parameters 18 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") 19 | 20 | if(APPLE) 21 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") 22 | # for anaconda builds 23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmacosx-version-min=10.7") 24 | # clang complains about register 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-register") 26 | endif() 27 | 28 | # taken from distributions 29 | #set(CMAKE_CXX_FLAGS_MATHOPT "-mfpmath=sse -msse4.1 -ffast-math -funsafe-math-optimizations") 30 | set(CMAKE_CXX_FLAGS_MATHOPT "-mfpmath=sse -msse4.1") 31 | 32 | set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG ${CMAKE_CXX_FLAGS_MATHOPT}") 33 | set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE} -fno-omit-frame-pointer") 34 | set(CMAKE_CXX_FLAGS_DEBUG "-DDEBUG_MODE -fno-omit-frame-pointer") 35 | 36 | # give our include dirs the most precedent 37 | include_directories(include) 38 | 39 | # followed by the EXTRA_* ones 40 | if(DEFINED EXTRA_INCLUDE_PATH) 41 | include_directories(${EXTRA_INCLUDE_PATH}) 42 | endif() 43 | if(DEFINED EXTRA_LIBRARY_PATH) 44 | link_directories(${EXTRA_LIBRARY_PATH}) 45 | endif() 46 | 47 | find_package(Protobuf REQUIRED) 48 | message(STATUS "found protobuf INC=${PROTOBUF_INCLUDE_DIRS}, LIB=${PROTOBUF_LIBRARIES}") 49 | include_directories(${PROTOBUF_INCLUDE_DIRS}) 50 | 51 | find_package(Distributions) 52 | if(DISTRIBUTIONS_FOUND) 53 | message(STATUS "found distributions INC=${DISTRIBUTIONS_INCLUDE_DIRS}, LIB=${DISTRIBUTIONS_LIBRARY_DIRS}") 54 | include_directories(${DISTRIBUTIONS_INCLUDE_DIRS}) 55 | link_directories(${DISTRIBUTIONS_LIBRARY_DIRS}) 56 | else() 57 | message(FATAL_ERROR "Could not find distributions") 58 | endif() 59 | 60 | find_package(MicroscopesCommon) 61 | if(MICROSCOPES_COMMON_FOUND) 62 | message(STATUS "found microscopes_common INC=${MICROSCOPES_COMMON_INCLUDE_DIRS}, LIB=${MICROSCOPES_COMMON_LIBRARY_DIRS}") 63 | include_directories(${MICROSCOPES_COMMON_INCLUDE_DIRS}) 64 | link_directories(${MICROSCOPES_COMMON_LIBRARY_DIRS}) 65 | else() 66 | message(FATAL_ERROR "Could not find microscopes_common") 67 | endif() 68 | 69 | install(DIRECTORY include/ DESTINATION include FILES_MATCHING PATTERN "*.h*") 70 | install(DIRECTORY microscopes DESTINATION cython FILES_MATCHING PATTERN "*.pxd" PATTERN "__init__.py") 71 | 72 | set(MICROSCOPES_MIXTUREMODEL_SOURCE_FILES src/mixture/model.cpp) 73 | add_library(microscopes_mixturemodel SHARED ${MICROSCOPES_MIXTUREMODEL_SOURCE_FILES}) 74 | target_link_libraries(microscopes_mixturemodel ${PROTOBUF_LIBRARIES} distributions_shared microscopes_common) 75 | install(TARGETS microscopes_mixturemodel LIBRARY DESTINATION lib) 76 | 77 | # test executables 78 | enable_testing() 79 | add_executable(test_state test/cxx/test_state.cpp) 80 | add_test(test_state test_state) 81 | target_link_libraries(test_state ${PROTOBUF_LIBRARIES} distributions_shared microscopes_common microscopes_mixturemodel) 82 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Qadium 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of Qadium nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | @echo "choose a valid target" 3 | 4 | .PHONY: release 5 | release: 6 | @echo "Setting up cmake (release)" 7 | @python ./cmake/print_cmake_command.py Release 8 | [ -d release ] || (mkdir release && cd release && eval `python ../cmake/print_cmake_command.py Release`) 9 | 10 | .PHONY: relwithdebinfo 11 | relwithdebinfo: 12 | @echo "Setting up cmake (relwithdebinfo)" 13 | @python ./cmake/print_cmake_command.py RelWithDebInfo 14 | [ -d relwithdebinfo ] || (mkdir relwithdebinfo && cd relwithdebinfo && eval `python ../cmake/print_cmake_command.py RelWithDebInfo`) 15 | 16 | .PHONY: debug 17 | debug: 18 | @echo "Setting up cmake (debug)" 19 | @python ./cmake/print_cmake_command.py Debug 20 | [ -d debug ] || (mkdir debug && cd debug && eval `python ../cmake/print_cmake_command.py Debug`) 21 | 22 | CPU_COUNT=$(shell python -c 'import multiprocessing as m; print m.cpu_count()') 23 | 24 | .PHONY: test 25 | test: 26 | (cd test && NOSE_PROCESSES=$(CPU_COUNT) NOSE_PROCESS_TIMEOUT=240 nosetests -a '!uses_mp,!slow' --verbose) 27 | (cd test && nosetests -a 'uses_mp,!slow' --verbose) 28 | 29 | .PHONY: travis_install 30 | travis_install: 31 | make relwithdebinfo 32 | (cd relwithdebinfo && make && make install) 33 | pip install . 
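# note: travis_script (below) assumes this target has already run, since it
# expects a populated ./relwithdebinfo build directory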
34 | 35 | .PHONY: travis_script 36 | travis_script: 37 | (cd relwithdebinfo && CTEST_OUTPUT_ON_FAILURE=true make test) 38 | (cd test && NOSE_PROCESSES=$(CPU_COUNT) NOSE_PROCESS_TIMEOUT=240 nosetests --verbose -a '!uses_mp,!slow') 39 | (cd test && nosetests --verbose -a 'uses_mp,!slow') 40 | 41 | .PHONY: lint 42 | lint: 43 | pyflakes microscopes test setup.py 44 | pep8 --filename=*.py --ignore=E265 microscopes test setup.py 45 | pep8 --filename=*.pyx --ignore=E265,E211,E225 microscopes 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf release relwithdebinfo debug microscopes_mixturemodel.egg-info 50 | find microscopes/ -name '*.cpp' -type f -print0 | xargs -0 rm -- 51 | find microscopes/ -name '*.so' -type f -print0 | xargs -0 rm -- 52 | find microscopes/ -name '*.pyc' -type f -print0 | xargs -0 rm -- 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # microscopes-mixtures [![Build Status](https://travis-ci.org/datamicroscopes/mixturemodel.svg?branch=master)](https://travis-ci.org/datamicroscopes/mixturemodel) 2 | 3 | Contains the implementation of a Dirichlet process mixture model (DPMM). Routines for doing inference on this model are found in the [kernels](https://github.com/datamicroscopes/kernels) project. 4 | 5 | ### Installation 6 | 7 | OS X and Linux builds of `microscopes-mixturemodel` are released to [Anaconda.org](https://conda.anaconda.org). Installing them requires [Conda](https://store.continuum.io/cshop/anaconda/). To install the current release version run: 8 | 9 | ``` 10 | $ conda install -c datamicroscopes -c distributions microscopes-mixturemodel 11 | ``` 12 | -------------------------------------------------------------------------------- /benchmarks/benchmarks_mnist.py: -------------------------------------------------------------------------------- 1 | from microscopes.common.recarray.dataview import numpy_dataview 2 | from microscopes.common.rng import rng 3 | from microscopes.common.scalar_functions import log_exponential 4 | from microscopes.mixture.model import initialize, bind 5 | from microscopes.kernels.gibbs import assign 6 | from microscopes.kernels.slice import hp 7 | from microscopes.common.util import mkdirp 8 | from microscopes.models import bb, dd 9 | from microscopes.mixture.definition import model_definition 10 | 11 | from sklearn.datasets import fetch_mldata 12 | from sklearn.cross_validation import train_test_split 13 | from sklearn.metrics import ( 14 | accuracy_score, 15 | roc_auc_score, 16 | confusion_matrix, 17 | #roc_curve, 18 | ) 19 | 20 | import numpy as np 21 | import numpy.ma as ma 22 | import math 23 | import time 24 | import os 25 | 26 | from nose.plugins.attrib import attr 27 | 28 | 29 | def _get_mnist_dataset(): 30 | return fetch_mldata('MNIST original') 31 | 32 | 33 | def groupcounts(s): 34 | counts = np.zeros(s.ngroups(), dtype=np.int) 35 | for i, gid in enumerate(s.groups()): 36 | counts[i] = s.groupsize(gid) 37 | return np.sort(counts)[::-1] 38 | 39 | 40 | def groupsbysize(s): 41 | """groupids by decreasing size""" 42 | counts = [(gid, s.groupsize(gid)) for gid in s.groups()] 43 | counts = sorted(counts, key=lambda x: x[1], reverse=True) 44 | return counts 45 | 46 | 47 | @attr('slow') 48 | def test_mnist_supervised(): 49 | mnist_dataset = _get_mnist_dataset() 50 | classes = range(10) 51 | classmap = {c: i for i, c in enumerate(classes)} 52 | train_data, test_data = [], [] 53 | for c in classes: 54 | Y = 
mnist_dataset['data'][ 55 | np.where(mnist_dataset['target'] == float(c))[0]] 56 | Y_train, Y_test = train_test_split(Y, test_size=0.01) 57 | train_data.append(Y_train) 58 | test_data.append(Y_test) 59 | 60 | sample_size_max = 10000 61 | 62 | def mk_class_data(c, Y): 63 | n, D = Y.shape 64 | print 'number of digit', c, 'in training is', n 65 | dtype = [('', bool)] * D + [('', int)] 66 | inds = np.random.permutation(Y.shape[0])[:sample_size_max] 67 | Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]], 68 | dtype=dtype) 69 | return Y 70 | Y_train = np.hstack([mk_class_data(c, y) 71 | for c, y in zip(classes, train_data)]) 72 | Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))] 73 | 74 | n, = Y_train.shape 75 | D = len(Y_train.dtype) 76 | print 'training data is', n, 'examples' 77 | print 'image dimension is', (D - 1), 'pixels' 78 | 79 | view = numpy_dataview(Y_train) 80 | defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))]) 81 | r = rng() 82 | s = initialize(defn, 83 | view, 84 | cluster_hp={'alpha': 0.2}, 85 | feature_hps=[{'alpha': 1., 'beta': 1.}] * 86 | (D - 1) + [{'alphas': [1. for _ in classes]}], 87 | r=r) 88 | 89 | bound_s = bind(s, view) 90 | 91 | indiv_prior_fn = log_exponential(1.2) 92 | hparams = { 93 | i: { 94 | 'alpha': (indiv_prior_fn, 1.5), 95 | 'beta': (indiv_prior_fn, 1.5), 96 | } for i in xrange(D - 1)} 97 | hparams[D - 1] = { 98 | 'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5) 99 | for idx in xrange(len(classes)) 100 | } 101 | 102 | def print_prediction_results(): 103 | results = [] 104 | for c, Y_test in zip(classes, test_data): 105 | for y in Y_test: 106 | query = ma.masked_array( 107 | np.array([tuple(y) + (0,)], 108 | dtype=[('', bool)] * (D - 1) + [('', int)]), 109 | mask=[(False,) * (D - 1) + (True,)])[0] 110 | samples = [ 111 | s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30)] 112 | samples = np.bincount(samples, minlength=len(classes)) 113 | prediction = np.argmax(samples) 114 | results.append((classmap[c], prediction, samples)) 115 | print 'finished predictions for class', c 116 | 117 | Y_actual = np.array([a for a, _, _ in results], dtype=np.int) 118 | Y_pred = np.array([b for _, b, _ in results], dtype=np.int) 119 | print 'accuracy:', accuracy_score(Y_actual, Y_pred) 120 | print 'confusion matrix:' 121 | print confusion_matrix(Y_actual, Y_pred) 122 | 123 | # AUROC for one vs all (each class) 124 | for i, clabel in enumerate(classes): 125 | Y_true = np.copy(Y_actual) 126 | 127 | # treat class c as the "positive" example 128 | positive_examples = Y_actual == i 129 | negative_examples = Y_actual != i 130 | Y_true[positive_examples] = 1 131 | Y_true[negative_examples] = 0 132 | Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results]) 133 | cls_auc = roc_auc_score(Y_true, Y_prob) 134 | print 'class', clabel, 'auc=', cls_auc 135 | 136 | #import matplotlib.pylab as plt 137 | #Y_prob = np.array([c for _, _, c in results]) 138 | #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0) 139 | #plt.plot(fpr, tpr) 140 | #plt.show() 141 | 142 | def kernel(rid): 143 | start0 = time.time() 144 | assign(bound_s, r) 145 | sec0 = time.time() - start0 146 | 147 | start1 = time.time() 148 | hp(bound_s, r, hparams=hparams) 149 | sec1 = time.time() - start1 150 | 151 | print 'rid=', rid, 'nclusters=', s.ngroups(), \ 152 | 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' 153 | 154 | sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) 155 | print ' time_per_post_pred=', sec_per_post_pred, 'sec' 156 | 
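        # rough estimate only: it assumes scoring cost is uniform across the
        # active clusters, which a DPMM Gibbs sweep only approximately satisfies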
157 | # print group size breakdown 158 | sizes = [(gid, s.groupsize(gid)) for gid in s.groups()] 159 | sizes = sorted(sizes, key=lambda x: x[1], reverse=True) 160 | print ' group_sizes=', sizes 161 | 162 | print_prediction_results() 163 | 164 | # save state 165 | mkdirp("mnist-states") 166 | fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid)) 167 | with open(fname, "w") as fp: 168 | fp.write(s.serialize()) 169 | 170 | # training 171 | iters = 30 172 | for rid in xrange(iters): 173 | kernel(rid) 174 | 175 | 176 | @attr('slow') 177 | def test_mnist(): 178 | import matplotlib.pylab as plt 179 | from PIL import Image, ImageOps 180 | mnist_dataset = _get_mnist_dataset() 181 | Y_2 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 2.)[0]] 182 | Y_3 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 3.)[0]] 183 | print 'number of twos:', Y_2.shape[0] 184 | print 'number of threes:', Y_3.shape[0] 185 | _, D = Y_2.shape 186 | W = int(math.sqrt(D)) 187 | assert W * W == D 188 | dtype = [('', bool)] * D 189 | Y = np.vstack([Y_2, Y_3]) 190 | Y = np.array( 191 | [tuple(y) for y in Y[np.random.permutation(np.arange(Y.shape[0]))]], 192 | dtype=dtype) 193 | 194 | view = numpy_dataview(Y) 195 | defn = model_definition(Y.shape[0], [bb] * D) 196 | r = rng() 197 | s = initialize( 198 | defn, 199 | view, 200 | cluster_hp={'alpha': 0.2}, 201 | feature_hps=[{'alpha': 1., 'beta': 1.}] * D, 202 | r=r) 203 | bound_s = bind(s, view) 204 | 205 | indiv_prior_fn = log_exponential(1.2) 206 | hparams = { 207 | i: { 208 | 'alpha': (indiv_prior_fn, 1.5), 209 | 'beta': (indiv_prior_fn, 1.5), 210 | } for i in xrange(D)} 211 | 212 | def plot_clusters(s, fname, scalebysize=False): 213 | hps = [s.get_feature_hp(i) for i in xrange(D)] 214 | 215 | def prior_prob(hp): 216 | return hp['alpha'] / (hp['alpha'] + hp['beta']) 217 | 218 | def data_for_group(gid): 219 | suffstats = [s.get_suffstats(gid, i) for i in xrange(D)] 220 | 221 | def prob(hp, ss): 222 | top = hp['alpha'] + ss['heads'] 223 | bot = top + hp['beta'] + ss['tails'] 224 | return top / bot 225 | probs = [prob(hp, ss) for hp, ss in zip(hps, suffstats)] 226 | return np.array(probs) 227 | 228 | def scale(d, weight): 229 | im = d.reshape((W, W)) 230 | newW = max(int(weight * W), 1) 231 | im = Image.fromarray(im) 232 | im = im.resize((newW, newW)) 233 | im = ImageOps.expand(im, border=(W - newW) / 2) 234 | im = np.array(im) 235 | a, b = im.shape 236 | #print 'a,b:', a, b 237 | if a < W: 238 | im = np.append(im, np.zeros(b)[np.newaxis, :], axis=0) 239 | elif a > W: 240 | im = im[:W, :] 241 | assert im.shape[0] == W 242 | if b < W: 243 | #print 'current:', im.shape 244 | im = np.append(im, np.zeros(W)[:, np.newaxis], axis=1) 245 | elif b > W: 246 | im = im[:, :W] 247 | assert im.shape[1] == W 248 | return im.flatten() 249 | 250 | data = [(data_for_group(g), cnt) for g, cnt in groupsbysize(s)] 251 | largest = max(cnt for _, cnt in data) 252 | data = [scale(d, cnt / float(largest)) if scalebysize else d 253 | for d, cnt in data] 254 | digits_per_row = 12 255 | rem = len(data) % digits_per_row 256 | if rem: 257 | fill = digits_per_row - rem 258 | for _ in xrange(fill): 259 | data.append(np.zeros(D)) 260 | assert not (len(data) % digits_per_row) 261 | #rows = len(data) / digits_per_row 262 | data = np.vstack([np.hstack([d.reshape((W, W)) 263 | for d in data[i:i + digits_per_row]]) 264 | for i in xrange(0, len(data), digits_per_row)]) 265 | #print 'saving figure', fname 266 | plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest') 267 | 
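        # each tile is one cluster's per-pixel posterior mean under the
        # beta-bernoulli model, with clusters laid out by decreasing size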
plt.savefig(fname) 268 | plt.close() 269 | 270 | def plot_hyperparams(s, fname): 271 | hps = [s.get_feature_hp(i) for i in xrange(D)] 272 | alphas = np.array([hp['alpha'] for hp in hps]) 273 | betas = np.array([hp['beta'] for hp in hps]) 274 | data = np.hstack([alphas.reshape((W, W)), betas.reshape((W, W))]) 275 | plt.imshow(data, interpolation='nearest') 276 | plt.colorbar() 277 | plt.savefig(fname) 278 | plt.close() 279 | 280 | def kernel(rid): 281 | start0 = time.time() 282 | assign(bound_s, r) 283 | sec0 = time.time() - start0 284 | 285 | start1 = time.time() 286 | hp(bound_s, r, hparams=hparams) 287 | sec1 = time.time() - start1 288 | 289 | print 'rid=', rid, 'nclusters=', s.ngroups(), \ 290 | 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' 291 | 292 | sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) 293 | print ' time_per_post_pred=', sec_per_post_pred, 'sec' 294 | 295 | return s.score_joint(r) 296 | 297 | # burnin 298 | burnin = 20 299 | for rid in xrange(burnin): 300 | print 'score:', kernel(rid) 301 | print 'finished burnin' 302 | plot_clusters(s, 'mnist_clusters.pdf') 303 | plot_clusters(s, 'mnist_clusters_bysize.pdf', scalebysize=True) 304 | plot_hyperparams(s, 'mnist_hyperparams.pdf') 305 | print 'groupcounts:', groupcounts(s) 306 | 307 | # posterior predictions 308 | present = D / 2 309 | absent = D - present 310 | queries = [tuple(Y_2[i]) for i in np.random.permutation(Y_2.shape[0])[:4]] + \ 311 | [tuple(Y_3[i]) for i in np.random.permutation(Y_3.shape[0])[:4]] 312 | 313 | queries_masked = ma.masked_array( 314 | np.array(queries, dtype=[('', bool)] * D), 315 | mask=[(False,) * present + (True,) * absent]) 316 | 317 | def postpred_sample(y_new): 318 | Y_samples = [s.sample_post_pred(y_new, r)[1] for _ in xrange(1000)] 319 | Y_samples = np.array([list(y) for y in np.hstack(Y_samples)]) 320 | Y_avg = Y_samples.mean(axis=0) 321 | return Y_avg 322 | 323 | queries_masked = [postpred_sample(y) for y in queries_masked] 324 | data0 = np.hstack([q.reshape((W, W)) for q in queries_masked]) 325 | data1 = np.hstack( 326 | [np.clip(np.array(q, dtype=np.float), 0., 1.).reshape((W, W)) 327 | for q in queries]) 328 | data = np.vstack([data0, data1]) 329 | plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest') 330 | plt.savefig('mnist_predict.pdf') 331 | plt.close() 332 | -------------------------------------------------------------------------------- /benchmarks/benchmarks_mnist_script.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from microscopes.common.recarray.dataview import numpy_dataview 3 | from microscopes.common.rng import rng 4 | from microscopes.common.scalar_functions import log_exponential 5 | from microscopes.mixture.model import initialize, bind 6 | from microscopes.kernels.gibbs import assign 7 | from microscopes.kernels.slice import hp 8 | from microscopes.common.util import mkdirp 9 | from microscopes.models import bb, dd 10 | from microscopes.mixture.definition import model_definition 11 | 12 | from sklearn.datasets import fetch_mldata 13 | from sklearn.cross_validation import train_test_split 14 | from sklearn.metrics import ( 15 | accuracy_score, 16 | roc_auc_score, 17 | confusion_matrix, 18 | #roc_curve, 19 | ) 20 | 21 | import numpy as np 22 | import numpy.ma as ma 23 | import math 24 | import time 25 | import os 26 | 27 | from nose.plugins.attrib import attr 28 | 29 | 30 | def _get_mnist_dataset(): 31 | return fetch_mldata('MNIST original') 32 | 33 | 34 | def groupcounts(s): 35 | 
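    """group sizes in decreasing order"""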
counts = np.zeros(s.ngroups(), dtype=np.int) 36 | for i, gid in enumerate(s.groups()): 37 | counts[i] = s.groupsize(gid) 38 | return np.sort(counts)[::-1] 39 | 40 | 41 | def groupsbysize(s): 42 | """groupids by decreasing size""" 43 | counts = [(gid, s.groupsize(gid)) for gid in s.groups()] 44 | counts = sorted(counts, key=lambda x: x[1], reverse=True) 45 | return counts 46 | 47 | 48 | @attr('slow') 49 | #@profile 50 | def test_mnist_supervised(n): 51 | mnist_dataset = _get_mnist_dataset() 52 | classes = range(10) 53 | classmap = {c: i for i, c in enumerate(classes)} 54 | train_data, test_data = [], [] 55 | for c in classes: 56 | Y = mnist_dataset['data'][ 57 | np.where(mnist_dataset['target'] == float(c))[0]] 58 | Y_train, Y_test = train_test_split(Y, test_size=0.01) 59 | train_data.append(Y_train) 60 | test_data.append(Y_test) 61 | 62 | sample_size_max = n 63 | 64 | def mk_class_data(c, Y): 65 | n, D = Y.shape 66 | print 'number of digit', c, 'in training is', n 67 | dtype = [('', bool)] * D + [('', int)] 68 | inds = np.random.permutation(Y.shape[0])[:sample_size_max] 69 | Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]], 70 | dtype=dtype) 71 | return Y 72 | Y_train = np.hstack([mk_class_data(c, y) 73 | for c, y in zip(classes, train_data)]) 74 | Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))] 75 | 76 | n, = Y_train.shape 77 | D = len(Y_train.dtype) 78 | print 'training data is', n, 'examples' 79 | print 'image dimension is', (D - 1), 'pixels' 80 | 81 | view = numpy_dataview(Y_train) 82 | defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))]) 83 | r = rng() 84 | s = initialize(defn, 85 | view, 86 | cluster_hp={'alpha': 0.2}, 87 | feature_hps=[{'alpha': 1., 'beta': 1.}] * 88 | (D - 1) + [{'alphas': [1. for _ in classes]}], 89 | r=r) 90 | 91 | bound_s = bind(s, view) 92 | 93 | indiv_prior_fn = log_exponential(1.2) 94 | hparams = { 95 | i: { 96 | 'alpha': (indiv_prior_fn, 1.5), 97 | 'beta': (indiv_prior_fn, 1.5), 98 | } for i in xrange(D - 1)} 99 | hparams[D - 1] = { 100 | 'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5) 101 | for idx in xrange(len(classes)) 102 | } 103 | 104 | def print_prediction_results(): 105 | results = [] 106 | for c, Y_test in zip(classes, test_data): 107 | for y in Y_test: 108 | query = ma.masked_array( 109 | np.array([tuple(y) + (0,)], 110 | dtype=[('', bool)] * (D - 1) + [('', int)]), 111 | mask=[(False,) * (D - 1) + (True,)])[0] 112 | samples = [ 113 | s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30)] 114 | samples = np.bincount(samples, minlength=len(classes)) 115 | prediction = np.argmax(samples) 116 | results.append((classmap[c], prediction, samples)) 117 | print 'finished predictions for class', c 118 | 119 | Y_actual = np.array([a for a, _, _ in results], dtype=np.int) 120 | Y_pred = np.array([b for _, b, _ in results], dtype=np.int) 121 | print 'accuracy:', accuracy_score(Y_actual, Y_pred) 122 | print 'confusion matrix:' 123 | print confusion_matrix(Y_actual, Y_pred) 124 | 125 | # AUROC for one vs all (each class) 126 | for i, clabel in enumerate(classes): 127 | Y_true = np.copy(Y_actual) 128 | 129 | # treat class c as the "positive" example 130 | positive_examples = Y_actual == i 131 | negative_examples = Y_actual != i 132 | Y_true[positive_examples] = 1 133 | Y_true[negative_examples] = 0 134 | Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results]) 135 | cls_auc = roc_auc_score(Y_true, Y_prob) 136 | print 'class', clabel, 'auc=', cls_auc 137 | 138 | #import matplotlib.pylab as plt 139 | 
#Y_prob = np.array([c for _, _, c in results]) 140 | #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0) 141 | #plt.plot(fpr, tpr) 142 | #plt.show() 143 | 144 | def kernel(rid): 145 | start0 = time.time() 146 | assign(bound_s, r) 147 | sec0 = time.time() - start0 148 | 149 | start1 = time.time() 150 | hp(bound_s, r, hparams=hparams) 151 | sec1 = time.time() - start1 152 | 153 | print 'rid=', rid, 'nclusters=', s.ngroups(), \ 154 | 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' 155 | 156 | sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) 157 | print ' time_per_post_pred=', sec_per_post_pred, 'sec' 158 | 159 | # training 160 | iters = 30 161 | for rid in xrange(iters): 162 | kernel(rid) 163 | 164 | # print group size breakdown 165 | sizes = [(gid, s.groupsize(gid)) for gid in s.groups()] 166 | sizes = sorted(sizes, key=lambda x: x[1], reverse=True) 167 | print ' group_sizes=', sizes 168 | 169 | #print_prediction_results() 170 | 171 | # save state 172 | mkdirp("mnist-states") 173 | fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid)) 174 | with open(fname, "w") as fp: 175 | fp.write(s.serialize()) 176 | 177 | if __name__ == "__main__": 178 | start = time.time() 179 | parser = argparse.ArgumentParser( 180 | description=globals()['__doc__'], 181 | formatter_class=argparse.RawDescriptionHelpFormatter) 182 | 183 | parser.add_argument( 184 | '-n', '--nsamples', required=True, type=int, 185 | help='Number of samples from each digit') 186 | 187 | args = parser.parse_args() 188 | test_mnist_supervised(args.nsamples) 189 | end = time.time() 190 | print 'sampler with %d samples took %.2f seconds' % \ 191 | (args.nsamples*10, end-start) 192 | -------------------------------------------------------------------------------- /cmake/Modules/FindDistributions.cmake: -------------------------------------------------------------------------------- 1 | message(STATUS "Finding distributions") 2 | 3 | execute_process( 4 | COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_in_venv_like.sh distributions_shared distributions 5 | OUTPUT_VARIABLE DISTRIBUTIONS_ROOT 6 | OUTPUT_STRIP_TRAILING_WHITESPACE) 7 | 8 | if(DISTRIBUTIONS_ROOT) 9 | set(DISTRIBUTIONS_INCLUDE_DIRS ${DISTRIBUTIONS_ROOT}/include) 10 | set(DISTRIBUTIONS_LIBRARY_DIRS ${DISTRIBUTIONS_ROOT}/lib) 11 | set(DISTRIBUTIONS_LIBRARIES distributions_shared) 12 | set(DISTRIBUTIONS_FOUND true) 13 | else() 14 | message(STATUS "could not locate distributions") 15 | set(DISTRIBUTIONS_FOUND false) 16 | endif() 17 | -------------------------------------------------------------------------------- /cmake/Modules/FindMicroscopesCommon.cmake: -------------------------------------------------------------------------------- 1 | message(STATUS "Finding microscopes-common") 2 | 3 | execute_process( 4 | COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/cmake/find_in_venv_like.sh microscopes_common microscopes 5 | OUTPUT_VARIABLE MICROSCOPES_COMMON_ROOT 6 | OUTPUT_STRIP_TRAILING_WHITESPACE) 7 | 8 | if(MICROSCOPES_COMMON_ROOT) 9 | set(MICROSCOPES_COMMON_INCLUDE_DIRS ${MICROSCOPES_COMMON_ROOT}/include) 10 | set(MICROSCOPES_COMMON_LIBRARY_DIRS ${MICROSCOPES_COMMON_ROOT}/lib) 11 | set(MICROSCOPES_COMMON_LIBRARIES microscopes_common) 12 | set(MICROSCOPES_COMMON_FOUND true) 13 | else() 14 | message(STATUS "could not locate microscopes_common") 15 | set(MICROSCOPES_COMMON_FOUND false) 16 | endif() 17 | -------------------------------------------------------------------------------- /cmake/find_in_venv_like.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ### currently, looks for either a virtualenv or anaconda install 4 | 5 | LIBNAME=$1 6 | INCNAME=$2 7 | 8 | UNAME=`uname` 9 | if [ "${UNAME}" = "Darwin" ]; then 10 | SOEXT=dylib 11 | else 12 | SOEXT=so 13 | fi 14 | 15 | if [ -n "${VIRTUAL_ENV}" ]; then 16 | if [ -f "${VIRTUAL_ENV}/lib/lib${LIBNAME}.${SOEXT}" ] && [ -d "${VIRTUAL_ENV}/include/${INCNAME}" ]; then 17 | echo "${VIRTUAL_ENV}" 18 | exit 0 19 | fi 20 | fi 21 | 22 | if [ "${CONDA_BUILD}" = "1" ]; then 23 | if [ -f "${PREFIX}/lib/lib${LIBNAME}.${SOEXT}" ] && [ -d "${PREFIX}/include/${INCNAME}" ]; then 24 | echo "${PREFIX}" 25 | exit 0 26 | fi 27 | fi 28 | 29 | if [ -n "${CONDA_DEFAULT_ENV}" ]; then 30 | DIR=`conda info | grep 'default environment' | awk '{print $4}'` 31 | if [ -f "${DIR}/lib/lib${LIBNAME}.${SOEXT}" ] && [ -d "${DIR}/include/${INCNAME}" ]; then 32 | echo "${DIR}" 33 | exit 0 34 | fi 35 | fi 36 | 37 | exit 1 38 | -------------------------------------------------------------------------------- /cmake/print_cmake_command.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from subprocess import check_output 4 | 5 | if __name__ == '__main__': 6 | build_type = "RelWithDebInfo" 7 | if len(sys.argv) > 1: 8 | build_type = sys.argv[1] 9 | ValidBuildTypes = ('Debug', 'Release', 'RelWithDebInfo') 10 | if not build_type in ValidBuildTypes: 11 | raise ValueError("invalid build type: {}".format(build_type)) 12 | ## XXX: handle virtualenv 13 | conda_full_path = check_output("which conda", shell=True).strip() 14 | if 'CONDA_DEFAULT_ENV' in os.environ: 15 | a, b = os.path.split(conda_full_path) 16 | assert b == 'conda' 17 | a, b = os.path.split(a) 18 | assert b == 'bin' 19 | conda_env_path = a 20 | a, b = os.path.split(a) 21 | assert b == os.environ['CONDA_DEFAULT_ENV'] 22 | else: 23 | a, b = os.path.split(conda_full_path) 24 | assert b == 'conda' 25 | a, b = os.path.split(a) 26 | assert b == 'bin' 27 | conda_env_path = a 28 | print 'cmake -DCMAKE_BUILD_TYPE={} -DCMAKE_INSTALL_PREFIX={} -DCMAKE_PREFIX_PATH={} ..'.format( 29 | build_type, 30 | conda_env_path, 31 | conda_env_path) 32 | -------------------------------------------------------------------------------- /conda/microscopes-mixturemodel/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | UNAME=`uname` 5 | if [ "${UNAME}" = "Darwin" ]; then 6 | export EXTRA_LINK_ARGS=-headerpad_max_install_names 7 | export MACOSX_DEPLOYMENT_TARGET=10.7 8 | elif [ "${UNAME}" = "Linux" ]; then 9 | if (which g++-4.8 >/dev/null 2>&1); then 10 | export CXX=g++-4.8 11 | fi 12 | if (which gcc-4.8 >/dev/null 2>&1); then 13 | export CC=gcc-4.8 14 | fi 15 | else 16 | echo "unsupported os: ${UNAME}" 17 | exit 1 18 | fi 19 | 20 | mkdir build && cd build 21 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${PREFIX} -DCMAKE_PREFIX_PATH=${PREFIX} .. 22 | make VERBOSE=1 && make install 23 | cd .. 
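# build the Cython extension against the headers and libraries that
# `make install` just placed under ${PREFIX}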
24 | OFFICIAL_BUILD=1 LIBRARY_PATH=${PREFIX}/lib EXTRA_INCLUDE_PATH=${PREFIX}/include $PYTHON setup.py install 25 | -------------------------------------------------------------------------------- /conda/microscopes-mixturemodel/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: microscopes-mixturemodel 3 | version: "0.1.0" 4 | 5 | source: 6 | git_url: https://github.com/datamicroscopes/mixturemodel.git 7 | 8 | requirements: 9 | build: 10 | - cmake 11 | - python 12 | - numpy ==1.8.2 13 | - cython >=0.20.2 14 | - distributions >=2.0.23 15 | - libprotobuf 16 | - eigen3 17 | - microscopes-common 18 | - microscopes-kernels 19 | run: 20 | - python 21 | - numpy ==1.8.2 22 | - scipy 23 | - distributions >=2.0.23 24 | - libprotobuf 25 | - protobuf 26 | - microscopes-common 27 | - microscopes-kernels 28 | 29 | test: 30 | imports: 31 | - microscopes.mixture 32 | - microscopes.mixture.definition 33 | - microscopes.mixture.model 34 | 35 | about: 36 | home: https://github.com/datamicroscopes/mixturemodel 37 | -------------------------------------------------------------------------------- /include/microscopes/mixture/model.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace microscopes { 24 | namespace mixture { 25 | 26 | namespace detail { 27 | 28 | typedef std::vector> group_type; 29 | 30 | static inline common::serialized_t 31 | group_type_to_string(const group_type &groups) 32 | { 33 | io::MixtureModelGroup m; 34 | for (auto &px : groups) 35 | m.add_suffstats(px->get_ss()); 36 | return common::util::protobuf_to_string(m); 37 | } 38 | 39 | static inline group_type 40 | group_type_from_string( 41 | const common::serialized_t &s, 42 | const std::vector> &models) 43 | { 44 | common::rng_t rng; // XXX: hack 45 | io::MixtureModelGroup m; 46 | common::util::protobuf_from_string(m, s); 47 | MICROSCOPES_DCHECK((size_t)m.suffstats_size() == models.size(), "sizes do not match"); 48 | group_type g; 49 | g.reserve(models.size()); 50 | for (size_t i = 0; i < models.size(); i++) { 51 | g.emplace_back(models[i]->create_group(rng)); 52 | g.back()->set_ss(m.suffstats(i)); 53 | } 54 | return g; 55 | } 56 | 57 | template