├── clustering ├── __init__.py ├── Makefile ├── sdml.py ├── cneighbors.c ├── radfriendsregion.py └── neighbors.py ├── .gitignore ├── pres ├── massivens.pdf ├── massivens2.pdf ├── massivens3.pdf ├── mnras.layout ├── plotjointcontour.py ├── plotcontour.py └── mnras_template.tex ├── gennothing.py ├── Makefile ├── LICENSE ├── plotmuseposterior.py ├── plotscaling.py ├── plotevidences.py ├── checkoutput.py ├── TODO.rst ├── README.rst ├── plotposterior.py ├── adaptive_progress.py ├── gensimple_horns.py ├── gensimple_bright.py ├── gen.py ├── cmuselike.c ├── gensimple.py ├── gen_realistic.py ├── gensimple_faint.py ├── clike.c ├── elldrawer.py ├── cachedconstrainer.py ├── profile_generate_subsets.py ├── musefuse_postprocess.py ├── multi_nested_integrator.py ├── hiermetriclearn.py ├── sample.py ├── whitenedmcmc.py ├── friends.py ├── multi_nested_sampler.py └── musefuse.py /clustering/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.hdf5 2 | *.pyc 3 | *.npz 4 | *.so 5 | *.json 6 | *.pdf 7 | *.png 8 | prof* 9 | -------------------------------------------------------------------------------- /pres/massivens.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JohannesBuchner/massivedatans/master/pres/massivens.pdf -------------------------------------------------------------------------------- /pres/massivens2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JohannesBuchner/massivedatans/master/pres/massivens2.pdf -------------------------------------------------------------------------------- /pres/massivens3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JohannesBuchner/massivedatans/master/pres/massivens3.pdf -------------------------------------------------------------------------------- /pres/mnras.layout: -------------------------------------------------------------------------------- 1 | #% Do not delete the line below; configure depends on this 2 | # \DeclareLaTeXClass[mnras]{article (mnras)} 3 | # Input general definitions 4 | Input stdclass.inc 5 | Style Abstract 6 | InTitle 1 7 | End 8 | -------------------------------------------------------------------------------- /clustering/Makefile: -------------------------------------------------------------------------------- 1 | 2 | CC := gcc 3 | CFLAGS += -fPIC -std=c99 -Wall -lm -Wextra 4 | CFLAGS += -O3 5 | 6 | all: cneighbors.so cneighbors-parallel.so 7 | 8 | %-parallel.so: %.c 9 | ${CC} ${CFLAGS} -fopenmp -DPARALLEL=1 $< -o $@ -shared 10 | 11 | %.so: %.c 12 | ${CC} ${CFLAGS} $< -o $@ -shared 13 | clean: 14 | rm *.so 15 | 16 | .PHONY: all clean 17 | 18 | -------------------------------------------------------------------------------- /gennothing.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy 3 | import matplotlib.pyplot as plt 4 | import h5py 5 | from numpy import exp 6 | import sys 7 | x = numpy.linspace(400, 800, 200) 8 | 9 | N = int(sys.argv[1]) 10 | noise_level = 0.01 11 | numpy.random.seed(N) 12 | y = numpy.random.normal(0, noise_level, size=(len(x),N)) 13 | 14 | with h5py.File('data_nothing_%s.hdf5' % 
sys.argv[1], 'w') as f: 15 | f.create_dataset('x', data=x, compression='gzip', shuffle=True) 16 | f.create_dataset('y', data=y, compression='gzip', shuffle=True) 17 | 18 | 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | CC := gcc 3 | CFLAGS += -fPIC -std=c99 -Wall -lm -Wextra -pedantic 4 | #CFLAGS += -Wduplicated-cond -Wduplicated-branches -Wrestrict -Wnull-dereference 5 | CFLAGS += -Wlogical-op -Wjump-misses-init -Wdouble-promotion -Wshadow -Wformat=2 6 | CFLAGS += -O3 7 | 8 | all: clike.so clike-parallel.so cmuselike.so cmuselike-parallel.so clustering 9 | 10 | clustering: 11 | $(MAKE) -C clustering/ 12 | 13 | %-parallel.so: %.c 14 | ${CC} ${CFLAGS} -fopenmp -DPARALLEL=1 $< -o $@ -shared 15 | 16 | %.so: %.c 17 | ${CC} ${CFLAGS} $< -o $@ -shared 18 | clean: 19 | rm *.so 20 | 21 | .PHONY: all clean clustering 22 | 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Johannes Buchner 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 10 | -------------------------------------------------------------------------------- /plotmuseposterior.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import json 3 | import numpy 4 | from numpy import log, log10, arctan, pi, exp 5 | import sys 6 | import matplotlib.pyplot as plt 7 | import h5py 8 | import scipy.stats 9 | import corner 10 | 11 | filename = sys.argv[1] 12 | with h5py.File(filename, 'r') as f: 13 | logZ = f['logZ'].value 14 | for i in range(len(logZ)): 15 | print(' %d ...' 
% i) 16 | w = f['w'][:,i] + f['L'][:,i] 17 | mask = numpy.isfinite(w) 18 | if mask.sum() < 4000: 19 | continue 20 | jparent = numpy.where(mask)[0] 21 | w = w[jparent] 22 | #print w, w.min(), w.max() 23 | w = numpy.exp(w - w.max()) 24 | w = w / w.sum() 25 | j = numpy.random.choice(jparent, size=100000, p=w) 26 | 27 | O = numpy.log10(f['x'][:,i,0][j]) 28 | Z = f['x'][:,i,1][j] 29 | SFtau = f['x'][:,i,2][j] 30 | SFage = numpy.log10(f['x'][:,i,3][j]) 31 | EBV = f['x'][:,i,4][j] 32 | print(w.shape, O.shape, Z.shape, SFtau.shape, SFage.shape, EBV.shape) 33 | data = numpy.transpose([O, Z, SFtau, SFage, EBV]) 34 | 35 | # make marginal plots 36 | 37 | figure = corner.corner(data, 38 | labels=[r"Continuum", r"logZ", r"SFtau", r"SFage", r'EBV'], 39 | quantiles=[0.16, 0.5, 0.84], 40 | show_titles=True, title_kwargs={"fontsize": 12}) 41 | figure.savefig('museposterior_%d.pdf' % (i+1), bbox_inches='tight') 42 | plt.close() 43 | 44 | -------------------------------------------------------------------------------- /plotscaling.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import json 3 | import numpy 4 | from numpy import log 5 | import sys 6 | import matplotlib.pyplot as plt 7 | 8 | xx = [] 9 | yy = [] 10 | 11 | for filename in sys.argv[1:]: 12 | data = json.load(open(filename)) 13 | if 'ndata' in data: 14 | x = data['ndata'] 15 | else: 16 | x = int(filename.split('.')[0].split('_')[-1]) 17 | #y = json.load(open(filename))['ndraws'] 18 | #if 'duration' not in data: 19 | # continue 20 | #y = data['duration'] 21 | y = data['ndraws'] 22 | xx.append(x) 23 | yy.append(y) 24 | 25 | i = numpy.argsort(xx) 26 | xx = numpy.array(xx)[i] 27 | yy = numpy.array(yy)[i] 28 | 29 | plt.figure(figsize=(5,5)) 30 | plt.plot(xx, xx * max(yy/xx), '-', label='linear cost', color='k') 31 | plt.plot(xx, numpy.sqrt(xx) * numpy.nanmax(yy / numpy.sqrt(xx)), ':', label='sqrt cost', color='gray') 32 | #plt.plot(xx, xx**0.333 * numpy.nanmax(yy / xx**0.333), '--', label='cubic root cost') 33 | #plt.plot(xx, log(xx) * numpy.nanmax(yy / log(xx)), '-.', label='log cost') 34 | plt.ylabel('Model Evaluations') 35 | plt.xlabel('Data Sets') 36 | plt.yscale('log') 37 | plt.xscale('log') 38 | #plt.xlim(0.9, 10000) 39 | plt.xlim(0.8, max(xx)*1.5) 40 | plt.plot(xx, yy, 'o ', label='our algorithm', color='r') 41 | plt.legend(loc='upper left', numpoints=1, prop=dict(size=10)) 42 | plt.savefig('plotscaling.pdf', bbox_inches='tight') 43 | plt.close() 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /plotevidences.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import json 3 | import numpy 4 | from numpy import log, log10 5 | import sys 6 | import matplotlib.pyplot as plt 7 | import h5py 8 | import scipy.stats 9 | 10 | xx = [] 11 | yy = [] 12 | 13 | filename_in = sys.argv[1] 14 | filename = sys.argv[2] 15 | plt.figure(figsize=(6,4)) 16 | f = h5py.File(filename_in, 'r') 17 | logZ0 = numpy.sum(-0.5 * (f['y'].value/0.01)**2, axis=0) 18 | f = h5py.File(filename, 'r') 19 | logZ1 = f['logZ'].value 20 | B = numpy.log10(numpy.exp(logZ1 - logZ0)) 21 | B[B > 4] = 4 22 | bins = numpy.linspace(B.min(), 10, 40) 23 | plt.hist(B, bins=bins, color='k', histtype='step', normed=True) 24 | 25 | filename_in = sys.argv[3] 26 | filename = sys.argv[4] 27 | f = h5py.File(filename_in, 'r') 28 | logZ0 = numpy.sum(-0.5 * 
(f['y'].value/0.01)**2, axis=0) 29 | f = h5py.File(filename, 'r') 30 | logZ1 = f['logZ'].value 31 | B = numpy.log10(numpy.exp(logZ1 - logZ0)) 32 | Blim = sorted(B)[int(len(B)*0.999)] 33 | Blim = B.max() 34 | print(10**Blim) 35 | bins = numpy.linspace(-5, 5, 100) 36 | plt.hist(B, bins=bins, color='r', histtype='step', normed=True) 37 | x = list(range(-1, 5)) 38 | plt.vlines(Blim, 0, 4, color='green', linestyles=[':']) 39 | plt.ylim(0, 4) 40 | plt.yticks([0, 1, 2, 3, 4]) 41 | y = ['${10}^{%d}$' % xi for xi in x] 42 | plt.xticks(x, y) 43 | plt.xlim(-2, 4.5) 44 | plt.xlabel('Bayes factor B') 45 | plt.ylabel('Frequency') 46 | plt.savefig('plotevidences.pdf', bbox_inches='tight') 47 | plt.close() 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /checkoutput.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import h5py 3 | import sys 4 | import numpy 5 | from numpy import log, log10, exp, pi 6 | import matplotlib.pyplot as plt 7 | 8 | for filename in sys.argv[1:]: 9 | with h5py.File(filename) as f: 10 | print(filename) 11 | logZ = f['logZ'].value 12 | logZerr = f['logZerr'].value 13 | L = f['L'].value 14 | 15 | if len(logZ.shape) > 0: 16 | logZ = logZ[0] 17 | logZerr = logZerr[0] 18 | L = L[:,0] 19 | #print f['x'][-1,0] 20 | else: 21 | #print f['x'][-1] 22 | pass 23 | ndraws = f['ndraws'].value 24 | print('logZ = %.1f +- %.1f' % (logZ, logZerr)) 25 | print('ndraws:', ndraws) 26 | #plt.plot(L) 27 | ndata = f['w'].shape[1] 28 | for d in range(ndata): 29 | w = f['w'][:,d] 30 | w = exp(w - w.max()) 31 | w.sort() 32 | w /= w.sum() 33 | i = numpy.random.choice(numpy.arange(len(w)), size=1000, replace=True, p=w) 34 | A, mu, logsigma = f['x'][:,d,:].transpose() 35 | print(numpy.isfinite(A).all(), A[~numpy.isfinite(A)]) 36 | A = log10(A[i]) 37 | #A = A[i] 38 | mu = mu[i] 39 | logsigma = logsigma[i] 40 | print('A', A.mean(), A.std()) 41 | print('mu', mu.mean(), mu.std()) 42 | print('logsigma', logsigma.mean(), logsigma.std()) 43 | plt.subplot(3, 1, 1) 44 | plt.plot(A, mu, 'x ') 45 | plt.xlabel('A') 46 | plt.ylabel('mu') 47 | plt.subplot(3, 1, 2) 48 | plt.plot(logsigma, mu, 'x ') 49 | plt.xlabel('logsigma') 50 | plt.ylabel('mu') 51 | plt.subplot(3, 1, 3) 52 | L = f['L'][:,d] 53 | L = L[numpy.isfinite(L)] 54 | plt.plot(L, '-') 55 | plt.show() 56 | print(f['w'].shape, f['x'].shape) 57 | 58 | 59 | -------------------------------------------------------------------------------- /TODO.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | TODO 3 | ============ 4 | 5 | Non-performance 6 | ----------------- 7 | 8 | * Change prints to logging 9 | 10 | Performance (single-threaded) 11 | ------------------------------ 12 | 13 | Here we discuss wall-clock time, not number of model evaluations. 14 | If the model is slow enough, there is no issue. 15 | 16 | Currently, the execution speed is limited by two functions: 17 | 18 | 1. Building the RadFriends region draw_constrained -> maxdistance 19 | 20 | maxdistance could be optimized by calling it less often. This is 21 | what sample.CachedConstrainer tries to do. The checks there could be more 22 | generous. -> Done, but maybe more optimisation possible? 23 | 24 | One could also increase the rebuild_every parameters 25 | 26 | One could modify MetricLearningFriendsConstrainer to rebuild not every n calls, 27 | but every n likelihood evaluations. 
This would improve performance when drawing 28 | is already quite efficient. See nestle, which does this. 29 | -> Done! 30 | 31 | 2. Building the graph to find independent data sets, multi_nested_sampler.generate_subsets_graph 32 | 33 | igraph could be replaced with graphtool, which supports parallelisation. 34 | 35 | One could further explore when to use 36 | generate_subsets_graph vs generate_subsets_nograph (controlled by use_graph) 37 | 38 | 39 | Performance (parallelisation) 40 | ------------------------------ 41 | 42 | * The subsets could be sampled in parallel. 43 | 44 | * The entire framework could be set up in a MapReduce/MPI way, with the 45 | MetricLearningFriendsConstrainer proposing (multiple) points, 46 | passing to multiple machines for evaluating the model, 47 | then using MapReduce to evaluate the likelihood over the Big Data set, 48 | and returning this to MetricLearningFriendsConstrainer. 49 | See MultiNest, which already parallelises the likelihood evaluations. 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ========================================================================= 2 | Big Data vs. complex physical models - a scalable inference algorithm 3 | ========================================================================= 4 | 5 | A algorithm for fitting models against many data sets, giving parameter probability distributions. 6 | The key is that model evaluations are efficiently re-used between data sets, 7 | making the algorithm scale sub-linearly. 8 | 9 | See paper for details: https://arxiv.org/abs/1707.04476 10 | 11 | How to run 12 | ============ 13 | 14 | You need to install 15 | 16 | * python-igraph 17 | * numpy, scipy 18 | * h5py 19 | * progressbar 20 | * gcc 21 | 22 | Then run:: 23 | 24 | $ # build 25 | $ make 26 | $ # simulate data set 27 | $ python gensimple_horns.py 10000 28 | $ # analyse 29 | $ OMP_NUM_THREADS=4 python sample.py data_widths_10000.hdf5 100 30 | $ # simulate no-signal data set 31 | $ python gennothing.py 10000 # simulate no-signal data set 32 | $ # analyse 33 | $ OMP_NUM_THREADS=4 python sample.py data_nothing_10000.hdf5 10000 34 | 35 | See paper draft for details. 36 | 37 | Improving Performance 38 | ======================= 39 | 40 | See TODO. 41 | 42 | Implementation notes and Code organisation 43 | ============================================ 44 | 45 | * sample.py sets up everything 46 | * Set your problem definition (parameters, model, likelihood) in sample.py 47 | * Integrator: multi_nested_integrator.py . Calls sampler repeatedly. 48 | * Joint Sampler: multi_nested_sampler.py . This deals with managing the graph and the queues and which live points to use for a new draw. Calls draw_constrained 49 | * The queues (paper) are called shelves in the code. 50 | * RadFriends: hiermetriclearn.py: Suggests new samples from live points and filters with likelihood function to return a higher point. 51 | * clustering/: Fast C implementations for checking if a point is in the neighbourhood and computing safe distances. 
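For orientation, here is a minimal NumPy sketch (not part of the code base; the function name and vectorised form are illustrative only) of the shared-evaluation idea behind clike.c: each proposed parameter set is turned into a single model prediction, which is then compared against every data set that is still active, assuming independent Gaussian noise with a known noise level::

    import numpy

    def loglike_per_dataset(x, y, data_mask, A, mu, sig, noise_level=0.01):
        # one model evaluation, shared by every active data set
        ypred = A * numpy.exp(-0.5 * ((mu - x) / sig)**2)          # shape (nx,)
        resid = (y[:, data_mask] - ypred[:, None]) / noise_level   # shape (nx, nactive)
        return -0.5 * (resid**2).sum(axis=0)                       # one logL per active data set

Note that clike.c accumulates the raw chi-square and leaves the -0.5 factor to the caller; the sketch above directly returns the Gaussian log-likelihood up to an additive constant.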
52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /plotposterior.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import json 3 | import numpy 4 | from numpy import log, log10 5 | import sys 6 | import matplotlib.pyplot as plt 7 | import h5py 8 | import scipy.stats 9 | 10 | xx = [] 11 | yy = [] 12 | 13 | filename = sys.argv[1] 14 | colors = ['yellow', 'pink', 'cyan', 'magenta'] 15 | cmap = plt.cm.gray 16 | zs = [] 17 | plt.figure(figsize=(6,4)) 18 | with h5py.File(filename, 'r') as f: 19 | logZ = f['logZ'].value 20 | for i in range(len(logZ)): 21 | w = f['w'][:,i] + f['L'][:,i] 22 | mask = numpy.isfinite(w) 23 | jparent = numpy.where(mask)[0] 24 | w = w[jparent] 25 | #print w, w.min(), w.max() 26 | w = numpy.exp(w - w.max()) 27 | w = w / w.sum() 28 | j = numpy.random.choice(jparent, size=1000, p=w) 29 | mu = f['x'][:,i,1][j] 30 | if mu.std() < 50: 31 | zs.append(mu.mean() / 440 - 1) 32 | #if mu.std() > 40: 33 | # print 'skipping unconstrained: %.1f' % mu.std() 34 | # continue 35 | #A = log10(f['x'][:,i,0][j]) 36 | A = f['x'][:,i,0][j] * 100 37 | #if i < 4: 38 | # plt.plot(mu[:100], A[:100], '. ', color='r', alpha=0.2) 39 | if i < 4: 40 | color = colors[i] 41 | else: 42 | color = cmap(0.8 * min(50, mu.std())/50.) 43 | plt.errorbar(x=numpy.mean(mu), xerr=mu.std(), 44 | y=A.mean(), yerr=A.std(), 45 | capsize=0, color=color, 46 | elinewidth=4 if i < 4 else 1) 47 | plt.xlabel('Wavelength [nm]') 48 | plt.ylabel('Line amplitude') 49 | plt.xlim(400, 800) 50 | plt.ylim(1, 20) 51 | plt.yticks([1,2,10], [1,2,10]) 52 | plt.yscale('log') 53 | plt.savefig('plotposterior.pdf', bbox_inches='tight') 54 | plt.close() 55 | 56 | plt.figure(figsize=(5,1.5)) 57 | plt.hist(zs, bins=10, histtype='step', label='Well-constrained lines', normed=True) 58 | alpha, beta, scale = 2., 7., 1 59 | x = numpy.linspace(0, 2, 1000) 60 | plt.plot(x, scipy.stats.beta(alpha, beta).pdf(x), '-', color='k', label='Input redshift distribution') 61 | plt.ylabel('Frequency') 62 | plt.xlabel('Redshift') 63 | plt.xlim(0, 1) 64 | plt.savefig('plotposteriorz.pdf', bbox_inches='tight') 65 | plt.close() 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /adaptive_progress.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import progressbar 3 | 4 | """ 5 | From 2.3-dev of progressbar, not in release yet. 6 | """ 7 | 8 | class AdaptiveETA(progressbar.Timer): 9 | """Widget which attempts to estimate the time of arrival. 10 | 11 | Uses a weighted average of two estimates: 12 | 1) ETA based on the total progress and time elapsed so far 13 | 2) ETA based on the progress as per tha last 10 update reports 14 | 15 | The weight depends on the current progress so that to begin with the 16 | total progress is used and at the end only the most recent progress is 17 | used. 
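For example, at 25% progress the weight is sqrt(0.25) = 0.5, so both estimates contribute equally; close to completion the recent-progress estimate dominates.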
18 | """ 19 | 20 | TIME_SENSITIVE = True 21 | NUM_SAMPLES = 10 22 | 23 | def _update_samples(self, currval, elapsed): 24 | sample = (currval, elapsed) 25 | if not hasattr(self, 'samples'): 26 | self.samples = [sample] * (self.NUM_SAMPLES + 1) 27 | else: 28 | self.samples.append(sample) 29 | return self.samples.pop(0) 30 | 31 | def _eta(self, maxval, currval, elapsed): 32 | return elapsed * maxval / float(currval) - elapsed 33 | 34 | def update(self, pbar): 35 | """Updates the widget to show the ETA or total time when finished.""" 36 | if pbar.currval == 0: 37 | return 'ETA: --:--:--' 38 | elif pbar.finished: 39 | return 'Time: %s' % self.format_time(pbar.seconds_elapsed) 40 | else: 41 | elapsed = pbar.seconds_elapsed 42 | currval1, elapsed1 = self._update_samples(pbar.currval, elapsed) 43 | eta = self._eta(pbar.maxval, pbar.currval, elapsed) 44 | if pbar.currval > currval1: 45 | etasamp = self._eta(pbar.maxval - currval1, 46 | pbar.currval - currval1, 47 | elapsed - elapsed1) 48 | weight = (pbar.currval / float(pbar.maxval)) ** 0.5 49 | eta = (1 - weight) * eta + weight * etasamp 50 | return 'ETA: %s' % self.format_time(eta) 51 | 52 | 53 | -------------------------------------------------------------------------------- /pres/plotjointcontour.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy 3 | import matplotlib.pyplot as plt 4 | 5 | CX = [2, 2.2] 6 | CSX = [0.5, 0.5] 7 | CSY = [0.2, 0.2] 8 | CY = [1.1, 1.2] 9 | 10 | def likelihood(x, y): 11 | l = 0 12 | for cx, cy, csx, csy in zip(CX, CY, CSX, CSY): 13 | l = -0.5 * (((cx - x)/csx)**2 + ((cy - y)/csy)**2) 14 | yield l 15 | 16 | x = numpy.linspace(-2.5, 6.5, 100) 17 | y = numpy.linspace(-2.5, 6.5, 100) 18 | X, Y = numpy.meshgrid(x, y) 19 | XY = numpy.array(numpy.transpose([X.flatten(), Y.flatten()]), order='C') 20 | L1, L2 = likelihood(X, Y) 21 | Lsorted = L1[30:-30,30:-30].flatten() 22 | Lsorted.sort() 23 | levels = Lsorted[::Lsorted.size/7-1].tolist() # + [L.max()] 24 | levels = levels[2:] 25 | #levels = L.max() - numpy.arange(5) * 4 - 2 26 | plt.figure(figsize=(6, 3), frameon=False) 27 | plt.axis('off') 28 | plt.contour(X, Y, L1, levels) 29 | plt.contour(X, Y, L2, levels) 30 | plt.savefig('plotjointcontour.png', bbox_inches='tight') 31 | plt.savefig('plotjointcontour.pdf', bbox_inches='tight') 32 | plt.close() 33 | 34 | numpy.random.seed(1) 35 | N = 10000 36 | x = numpy.random.uniform(-2, 6, size=N) 37 | y = numpy.random.uniform(-2, 6, size=N) 38 | l1, l2 = likelihood(x, y) 39 | Nlive = 100 40 | for i in range(len(levels)): 41 | plt.figure(figsize=(6, 2.2), frameon=False) 42 | plt.axis('off') 43 | #plt.text(-2, 4, 'Iteration %d' % (i*100)) 44 | #plt.text(-2, 4, '(%d)' % (i+1)) 45 | mask1 = l1 > levels[i] 46 | mask2 = l2 > levels[i] 47 | maskboth = numpy.logical_and(mask1, mask2) 48 | maskone = numpy.logical_or(mask1, mask2) 49 | N1 = 0 50 | N2 = 0 51 | for j in range(N): 52 | if mask1[j] and mask2[j]: # joint 53 | plt.plot(x[j], y[j], '.', color='k') 54 | N1 += 1 55 | N2 += 1 56 | elif mask1[j] and N1 < Nlive: 57 | plt.plot(x[j], y[j], 'x', color='cyan') 58 | N1 += 1 59 | elif mask2[j] and N2 < Nlive: 60 | plt.plot(x[j], y[j], '+', color='magenta') 61 | N2 += 1 62 | else: 63 | pass 64 | if N1 >= Nlive and N2 >= Nlive: 65 | break 66 | plt.contour(X, Y, L1, levels[i:i+1], colors=['cyan'], linestyles=[':']) 67 | plt.contour(X, Y, L2, levels[i:i+1], colors=['magenta'], linestyles=[':']) 68 | plt.ylim(-2.5, 6.2) 69 | plt.xlim(-3, 7) 70 | 
plt.savefig('plotjointcontour_%d.png' % (i+1), bbox_inches='tight') 71 | plt.savefig('plotjointcontour_%d.pdf' % (i+1), bbox_inches='tight') 72 | plt.close() 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /gensimple_horns.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy 3 | import matplotlib.pyplot as plt 4 | import h5py 5 | from numpy import exp, pi, arctan 6 | import sys 7 | 8 | def gauss(x, A, mu, sig): 9 | xT = x.reshape((1,-1)) 10 | AT = A.reshape((-1,1)) 11 | muT = mu.reshape((-1,1)) 12 | sigT = sig.reshape((-1,1)) 13 | return AT * exp(-0.5 * ((muT - xT)/sigT)**2) 14 | 15 | x = numpy.linspace(400, 800, 200) 16 | 17 | N = 40 18 | N = int(sys.argv[1]) 19 | numpy.random.seed(N) 20 | z = arctan(numpy.random.uniform(-pi, pi, size=N)) * 0.1 21 | rest_wave = 656 22 | print('generating parameters ...') 23 | width_narrow = 5.0 * numpy.ones(N) 24 | mean_narrow = rest_wave * (1 + z) 25 | width_narrow = width_narrow 26 | noise_level = 0.01 27 | signal_level = 0.02 / numpy.random.power(3, size=N) 28 | height_narrow = signal_level 29 | 30 | print('generating signal ...') 31 | ym = gauss(A=height_narrow, mu=mean_narrow, x=x, sig=width_narrow) 32 | ym = numpy.transpose(ym) 33 | print(ym.shape) 34 | 35 | # add noise 36 | print('adding noise...') 37 | y = ym.copy() 38 | for i in range(N): 39 | y[:,i] += numpy.random.normal(0, noise_level, size=len(x)) 40 | 41 | print('plotting ...') 42 | #for i in range(min(N, 20)): 43 | # #plt.plot(x, y[:,i], '.-') 44 | # plt.plot(x, y[:,i], '-') 45 | #plt.savefig('gen_widths.pdf', bbox_inches='tight') 46 | #plt.close() 47 | colors = ['yellow', 'pink', 'cyan', 'magenta'] 48 | colors = ['magenta', 'cyan', 'pink', 'yellow'] 49 | for i in range(min(N, 4)): 50 | #plt.plot(x, y[:,i], '.-') 51 | plt.plot(rest_wave * (1 + z[i]), 1.1 * y[:,i].max() / noise_level, 'v', color=colors[i], ms=12, mew=0.5, mec='k') 52 | #plt.plot(rest_wave * (1 + z[i]), 4, 'v', color=colors[i], ms=12) 53 | plt.plot(x, y[:,i] / noise_level, '-', color=colors[i], lw=1) 54 | plt.ylabel('Detector signal') 55 | plt.xlabel('Wavelength [nm]') 56 | plt.savefig('genhorns.pdf', bbox_inches='tight') 57 | plt.close() 58 | 59 | 60 | #print x.shape, y.shape, z.shape 61 | with h5py.File('data_widths_%s.hdf5' % sys.argv[1], 'w') as f: 62 | f.create_dataset('x', data=x, compression='gzip', shuffle=True) 63 | f.create_dataset('y', data=y, compression='gzip', shuffle=True) 64 | f.create_dataset('z', data=z, compression='gzip', shuffle=True) 65 | f.create_dataset('mean_narrow', data=mean_narrow, compression='gzip', shuffle=True) 66 | f.create_dataset('width_narrow', data=width_narrow, compression='gzip', shuffle=True) 67 | f.create_dataset('height_narrow', data=height_narrow, compression='gzip', shuffle=True) 68 | 69 | 70 | -------------------------------------------------------------------------------- /gensimple_bright.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy 3 | import matplotlib.pyplot as plt 4 | import h5py 5 | from numpy import exp 6 | import sys 7 | 8 | def gauss(x, z, A, mu, sig): 9 | xT = x.reshape((1,-1)) 10 | zT = z.reshape((-1,1)) 11 | AT = A.reshape((-1,1)) 12 | muT = mu.reshape((-1,1)) 13 | sigT = sig.reshape((-1,1)) 14 | return AT * exp(-0.5 * ((muT - xT / (1. 
+ zT))/sigT)**2) 15 | 16 | x = numpy.linspace(400, 800, 200) 17 | 18 | N = 40 19 | N = int(sys.argv[1]) 20 | numpy.random.seed(N) 21 | z = numpy.zeros(N) + 0.01 22 | rest_wave = 440 23 | print('generating parameters ...') 24 | # in km/s 25 | width_broad = 4000 * rest_wave / 300000 * numpy.ones(N) 26 | width_narrow = 400 * rest_wave / 300000 * numpy.ones(N) 27 | # convert to nm 28 | mean_broad = rest_wave * numpy.ones(N) 29 | mean_narrow = rest_wave * numpy.ones(N) 30 | width_broad = width_broad 31 | width_narrow = width_narrow 32 | noise_level = 0.01 33 | #signal_level = numpy.random.exponential(size=N) * 0.4 34 | signal_level = numpy.ones(N) * 0.2 35 | #signal_level = numpy.random.uniform(size=N) * 0.5 36 | #is_type1 = numpy.random.uniform(size=N) < 0.5 37 | height_broad = 10**-1 * signal_level 38 | height_narrow = signal_level 39 | 40 | #X = numpy.array([x]) 41 | 42 | print('generating signal ...') 43 | ym = gauss(A=height_broad, mu=mean_broad, x=x, z=z, sig=width_broad) 44 | ym += gauss(A=height_narrow, mu=mean_narrow, x=x, z=z, sig=width_narrow) 45 | ym = numpy.transpose(ym) 46 | print(ym.shape) 47 | 48 | # add noise 49 | print('adding noise...') 50 | y = ym.copy() 51 | for i in range(N): 52 | y[:,i] += numpy.random.normal(0, noise_level, size=len(x)) 53 | 54 | print('plotting ...') 55 | for i in range(min(N, 20)): 56 | #plt.plot(x, y[:,i], '.-') 57 | plt.plot(x, y[:,i], '-') 58 | plt.savefig('gen_bright.pdf', bbox_inches='tight') 59 | plt.close() 60 | 61 | #print x.shape, y.shape, z.shape 62 | with h5py.File('data_bright_%s.hdf5' % sys.argv[1], 'w') as f: 63 | f.create_dataset('x', data=x, compression='gzip', shuffle=True) 64 | f.create_dataset('y', data=y, compression='gzip', shuffle=True) 65 | f.create_dataset('z', data=z, compression='gzip', shuffle=True) 66 | f.create_dataset('mean_broad', data=mean_broad, compression='gzip', shuffle=True) 67 | f.create_dataset('width_broad', data=width_broad, compression='gzip', shuffle=True) 68 | f.create_dataset('height_broad', data=height_broad, compression='gzip', shuffle=True) 69 | f.create_dataset('mean_narrow', data=mean_narrow, compression='gzip', shuffle=True) 70 | f.create_dataset('width_narrow', data=width_narrow, compression='gzip', shuffle=True) 71 | f.create_dataset('height_narrow', data=height_narrow, compression='gzip', shuffle=True) 72 | 73 | 74 | -------------------------------------------------------------------------------- /pres/plotcontour.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy 3 | import matplotlib.pyplot as plt 4 | #from nested_sampling.clustering.neighbors import find_rdistance, is_within_distance_of, count_within_distance_of, any_within_distance_of 5 | from nested_sampling.samplers.hiermetriclearn import ClusterResult, RadFriendsRegion 6 | 7 | 8 | CX = [0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4] 9 | CS = [0.2, 0.2, 0.2, 0.2, 0.15, 0.2, 0.15, 0.2, 0.2] 10 | CY = [0.2, 0, 0, 0, 0.1, 0.3, 1, 1.4, 2] 11 | CW = [1, 2, 2, 2, 2, 2, 20, 2, 2] 12 | 13 | CX = numpy.linspace(0, 4, 20) 14 | CY = CX*-0.2 + CX**2*0.3 15 | #plt.plot(x, x*-0.2 + x**2*0.2) 16 | CW = CX * 0 + 2 + 10*CY**2 17 | CW = 1./CW 18 | CW[0] = 0.5 19 | CW[1] = 1 20 | #CW[-5] = 20 21 | CS = CX * 0 + 0.2 22 | #CS[-5] = 0.12 23 | 24 | 25 | def likelihood(x, y): 26 | l = 0 27 | for cx, cy, cw, cs in zip(CX, CY, CW, CS): 28 | l += cw * numpy.exp(-0.5 * (((cx - x)/cs)**2 + ((cy - y)/cs)**2)) 29 | return numpy.log(l) 30 | 31 | 32 | x = numpy.linspace(-2.5, 6.5, 100) 
33 | y = numpy.linspace(-2.5, 6.5, 100) 34 | X, Y = numpy.meshgrid(x, y) 35 | XY = numpy.array(numpy.transpose([X.flatten(), Y.flatten()]), order='C') 36 | print(XY.dtype) 37 | L = likelihood(X, Y) 38 | Lsorted = L[30:-30,30:-30].flatten() 39 | Lsorted.sort() 40 | levels = Lsorted[::Lsorted.size/7-1].tolist() # + [L.max()] 41 | levels = levels[2:] 42 | #levels = L.max() - numpy.arange(5) * 4 - 2 43 | plt.figure(figsize=(6, 3), frameon=False) 44 | plt.axis('off') 45 | plt.contour(X, Y, L, levels) 46 | plt.savefig('plotcontour.png', bbox_inches='tight') 47 | plt.savefig('plotcontour.pdf', bbox_inches='tight') 48 | plt.close() 49 | 50 | numpy.random.seed(1) 51 | N = 10000 52 | x = numpy.random.uniform(-2, 6, size=N) 53 | y = numpy.random.uniform(-2, 6, size=N) 54 | l = likelihood(x, y) 55 | Nlive = 100 56 | for i in range(len(levels)): 57 | plt.figure(figsize=(6, 2.2), frameon=False) 58 | plt.axis('off') 59 | plt.text(-2, 4, 'Iteration %d' % (i*100)) 60 | #plt.text(-2, 4, '(%d)' % (i+1)) 61 | mask = l > levels[i] 62 | xlevel = x[mask][:Nlive] 63 | ylevel = y[mask][:Nlive] 64 | live_points = numpy.array(numpy.transpose([xlevel, ylevel]), order='C') 65 | plt.contour(X, Y, L, levels[i:i+1], colors=['k'], linestyles=[':']) 66 | plt.plot(xlevel, ylevel, '.', color='k') 67 | # do radfriends with these points 68 | region = RadFriendsRegion(live_points) 69 | mask = region.are_inside(XY) 70 | maskregion = mask.reshape(X.shape) 71 | plt.contour(X, Y, maskregion*1., [0.5], colors=['orange'], linestyles=['-']) 72 | 73 | plt.ylim(-2.5, 6.2) 74 | plt.xlim(-3, 7) 75 | plt.savefig('plotcontour_%d.png' % (i+1), bbox_inches='tight') 76 | plt.savefig('plotcontour_%d.pdf' % (i+1), bbox_inches='tight') 77 | plt.close() 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /gen.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy 3 | import matplotlib.pyplot as plt 4 | import h5py 5 | from numpy import exp 6 | 7 | def gauss(x, z, A, mu, sig): 8 | xT = x.reshape((1,-1)) 9 | zT = z.reshape((-1,1)) 10 | AT = A.reshape((-1,1)) 11 | muT = mu.reshape((-1,1)) 12 | sigT = sig.reshape((-1,1)) 13 | return AT * exp(-0.5 * ((muT - xT / (1. 
+ zT))/sigT)**2) 14 | 15 | x = numpy.linspace(400, 800, 200) 16 | 17 | N = 1000 18 | N = int(sys.argv[1]) 19 | numpy.random.seed(1) 20 | z = numpy.random.beta(2, 30, size=N) * 2 21 | #z = numpy.zeros(N) + 0.01 22 | rest_wave = 440 23 | # in km/s 24 | width_broad = 10**numpy.random.normal(3, 0.2, size=N) * rest_wave / 300000 25 | width_narrow = 10**numpy.random.normal(1, 0.2, size=N) * rest_wave / 300000 26 | print(width_narrow.min()) 27 | print(width_broad.min()) 28 | # convert to nm 29 | mean_broad = rest_wave * numpy.ones(N) 30 | mean_narrow = rest_wave * numpy.ones(N) 31 | width_broad = width_broad 32 | width_narrow = width_narrow 33 | noise_level = 0.01 34 | signal_level = numpy.random.exponential(size=N) * 10 35 | #signal_level = numpy.ones(N) * 10 36 | is_type1 = numpy.random.uniform(size=N) < 0.5 37 | #is_type1 = numpy.random.uniform(size=N) > 0 38 | height_broad = numpy.where(is_type1, 10**numpy.random.normal(0, 0.2, size=N), 10**numpy.random.normal(-2, 0.2, size=N)) * signal_level 39 | height_narrow = signal_level 40 | 41 | #X = numpy.array([x]) 42 | 43 | ym = gauss(A=height_broad, mu=mean_broad, x=x, z=z, sig=width_broad) 44 | ym += gauss(A=height_narrow, mu=mean_narrow, x=x, z=z, sig=width_narrow) 45 | ym = numpy.transpose(ym) 46 | print(ym.shape) 47 | 48 | # add noise 49 | print('adding noise') 50 | y = numpy.random.normal(0, noise_level, size=ym.shape) + ym 51 | print('plotting ...') 52 | for i in range(min(N, 20)): 53 | #plt.plot(x, y[:,i], '.-') 54 | plt.plot(x, y[:,i], '-') 55 | plt.savefig('gen.pdf', bbox_inches='tight') 56 | plt.close() 57 | 58 | print(x.shape, y.shape, z.shape) 59 | with h5py.File('data.hdf5', 'w') as f: 60 | f.create_dataset('x', data=x, compression='gzip', shuffle=True) 61 | f.create_dataset('y', data=y, compression='gzip', shuffle=True) 62 | f.create_dataset('z', data=z, compression='gzip', shuffle=True) 63 | f.create_dataset('mean_broad', data=mean_broad, compression='gzip', shuffle=True) 64 | f.create_dataset('mean_narrow', data=mean_narrow, compression='gzip', shuffle=True) 65 | f.create_dataset('width_broad', data=width_broad, compression='gzip', shuffle=True) 66 | f.create_dataset('height_broad', data=height_broad, compression='gzip', shuffle=True) 67 | f.create_dataset('width_broad', data=width_broad, compression='gzip', shuffle=True) 68 | f.create_dataset('height_broad', data=height_broad, compression='gzip', shuffle=True) 69 | 70 | 71 | -------------------------------------------------------------------------------- /cmuselike.c: -------------------------------------------------------------------------------- 1 | /*** 2 | 3 | Likelihood implementation in C 4 | -------------------------------- 5 | 6 | Copyright (c) 2017 Johannes Buchner 7 | 8 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | 16 | ***/ 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #ifdef PARALLEL 23 | #include 24 | #endif 25 | 26 | #define IFVERBOSE if(0) 27 | #define IFDEBUG if(0) 28 | #define adouble double 29 | #define bdouble double 30 | #define sqr(x) (pow(x,2)) 31 | 32 | // Parallelisation does not work at the moment, you are welcome to fix it 33 | // ret = lib.like(yd, vd, ypred, data_mask, ndata, nspec, Lout) 34 | int like( 35 | const void * yyp, const void * vvp, const void * ypredp, const void * data_maskp, 36 | const int ndata, const int nx, 37 | void * Loutp 38 | ) { 39 | const adouble * yy = (const adouble*) yyp; 40 | const adouble * vv = (const adouble*) vvp; 41 | const adouble * ypred = (const adouble*) ypredp; 42 | const bool * data_mask = (const bool*) data_maskp; 43 | adouble * Lout = (adouble*) Loutp; 44 | 45 | #ifdef PARALLEL 46 | #pragma omp parallel for 47 | #endif 48 | for (int i = 0; i < ndata; i++) { 49 | if (data_mask[i]) { 50 | // compute s 51 | double s1 = 0.; 52 | double s2 = 1e-10; 53 | for (int j = 0; j < nx; j++) { 54 | s1 += yy[i+j*ndata] * ypred[j] / vv[i+j*ndata]; 55 | s2 += pow(ypred[j], 2) / vv[i+j*ndata]; 56 | } 57 | double s = s1/s2; 58 | double chi = 0.; 59 | for (int j = 0; j < nx; j++) { 60 | chi += pow(yy[i+j*ndata] - s * ypred[j], 2) / vv[i+j*ndata]; 61 | } 62 | Lout[i] = -0.5 * chi; 63 | } 64 | } 65 | return 0; 66 | } 67 | 68 | -------------------------------------------------------------------------------- /gensimple.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy 3 | import matplotlib.pyplot as plt 4 | import h5py 5 | from numpy import exp 6 | import sys 7 | 8 | def gauss(x, z, A, mu, sig): 9 | xT = x.reshape((1,-1)) 10 | zT = z.reshape((-1,1)) 11 | AT = A.reshape((-1,1)) 12 | muT = mu.reshape((-1,1)) 13 | sigT = sig.reshape((-1,1)) 14 | return AT * exp(-0.5 * ((muT - xT / (1. 
+ zT))/sigT)**2) 15 | 16 | x = numpy.linspace(400, 800, 200) 17 | 18 | N = 40 19 | N = int(sys.argv[1]) 20 | numpy.random.seed(N) 21 | alpha, beta, scale = 2., 7., 1 22 | z = numpy.random.beta(alpha, beta, size=N) * scale 23 | #z = numpy.zeros(N) + 0.01 24 | rest_wave = 440 25 | print('generating parameters ...') 26 | # in km/s 27 | width_broad = 4000 * rest_wave / 300000 * numpy.ones(N) 28 | width_narrow = 400 * rest_wave / 300000 * numpy.ones(N) 29 | # convert to nm 30 | mean_broad = rest_wave * numpy.ones(N) 31 | mean_narrow = rest_wave * numpy.ones(N) 32 | width_broad = width_broad 33 | width_narrow = width_narrow 34 | noise_level = 0.01 35 | #signal_level = numpy.random.exponential(size=N) * 0.4 36 | #signal_level = numpy.ones(N) * 0.04 37 | signal_level = numpy.random.normal(0.5, 0.5, size=10*N) 38 | signal_level = signal_level[signal_level>0.2][:N] 39 | #signal_level = numpy.random.uniform(size=N) * 0.5 40 | #is_type1 = numpy.random.uniform(size=N) < 0.5 41 | height_broad = 10**-1 * signal_level 42 | height_narrow = signal_level 43 | 44 | #X = numpy.array([x]) 45 | 46 | print('generating signal ...') 47 | ym = gauss(A=height_broad, mu=mean_broad, x=x, z=z, sig=width_broad) 48 | ym += gauss(A=height_narrow, mu=mean_narrow, x=x, z=z, sig=width_narrow) 49 | ym = numpy.transpose(ym) 50 | print(ym.shape) 51 | 52 | # add noise 53 | print('adding noise...') 54 | y = ym.copy() 55 | for i in range(N): 56 | y[:,i] += numpy.random.normal(0, noise_level, size=len(x)) 57 | 58 | print('plotting ...') 59 | for i in range(min(N, 20)): 60 | #plt.plot(x, y[:,i], '.-') 61 | plt.plot(x, y[:,i], '-') 62 | plt.savefig('gen.pdf', bbox_inches='tight') 63 | plt.close() 64 | 65 | #print x.shape, y.shape, z.shape 66 | with h5py.File('data_%s.hdf5' % sys.argv[1], 'w') as f: 67 | f.create_dataset('x', data=x, compression='gzip', shuffle=True) 68 | f.create_dataset('y', data=y, compression='gzip', shuffle=True) 69 | f.create_dataset('z', data=z, compression='gzip', shuffle=True) 70 | f.create_dataset('mean_broad', data=mean_broad, compression='gzip', shuffle=True) 71 | f.create_dataset('width_broad', data=width_broad, compression='gzip', shuffle=True) 72 | f.create_dataset('height_broad', data=height_broad, compression='gzip', shuffle=True) 73 | f.create_dataset('mean_narrow', data=mean_narrow, compression='gzip', shuffle=True) 74 | f.create_dataset('width_narrow', data=width_narrow, compression='gzip', shuffle=True) 75 | f.create_dataset('height_narrow', data=height_narrow, compression='gzip', shuffle=True) 76 | 77 | 78 | -------------------------------------------------------------------------------- /gen_realistic.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import sys 3 | import numpy 4 | import matplotlib.pyplot as plt 5 | import h5py 6 | from numpy import exp 7 | 8 | def gauss(x, z, A, mu, sig): 9 | xT = x.reshape((1,-1)) 10 | zT = z.reshape((-1,1)) 11 | AT = A.reshape((-1,1)) 12 | muT = mu.reshape((-1,1)) 13 | sigT = sig.reshape((-1,1)) 14 | return AT * exp(-0.5 * ((muT - xT / (1. 
+ zT))/sigT)**2) 15 | 16 | x = numpy.linspace(400, 800, 1000) 17 | 18 | N = 10000 19 | numpy.random.seed(1) 20 | z = numpy.random.beta(2, 30, size=N) * 2 21 | #z = numpy.zeros(N) + 0.01 22 | rest_wave = 440 23 | # in km/s 24 | width_broad = 10**numpy.random.normal(3, 0.2, size=N) * rest_wave / 300000 25 | width_narrow = 10**numpy.random.normal(1, 0.2, size=N) * rest_wave / 300000 26 | # convert to nm 27 | mean_broad = rest_wave * numpy.ones(N) 28 | mean_narrow = rest_wave * numpy.ones(N) 29 | width_broad = width_broad 30 | width_narrow = width_narrow 31 | noise_level = 0.01 32 | #signal_level = numpy.random.exponential(size=N) * 10 33 | signal_level = 1./(numpy.random.power(1, size=N)*100 + 2) # bright 34 | #signal_level = 1./(numpy.random.power(1, size=N)*200 + 20) # faint, up to SNR of 5 35 | #signal_level = numpy.ones(N) * 10 36 | is_type1 = numpy.random.uniform(size=N) < 0.5 37 | #is_type1 = numpy.random.uniform(size=N) > 0 38 | height_broad = numpy.where(is_type1, 10**numpy.random.normal(0, 0.2, size=N), 10**numpy.random.normal(-2, 0.2, size=N)) * signal_level 39 | height_narrow = signal_level 40 | 41 | #X = numpy.array([x]) 42 | 43 | ym = gauss(A=height_broad, mu=mean_broad, x=x, z=z, sig=width_broad) 44 | ym += gauss(A=height_narrow, mu=mean_narrow, x=x, z=z, sig=width_narrow) 45 | ym = numpy.transpose(ym) 46 | print(ym.shape) 47 | 48 | # add noise 49 | print('adding noise') 50 | y = numpy.random.normal(0, noise_level, size=ym.shape) + ym 51 | print('truncating ...') 52 | N = int(sys.argv[1]) 53 | y = y[:,:N] 54 | print('plotting ...') 55 | for i in range(min(N, 20)): 56 | #plt.plot(x, y[:,i], '.-') 57 | plt.plot(x, y[:,i], '-') 58 | plt.savefig('gen_realistic.pdf', bbox_inches='tight') 59 | plt.close() 60 | 61 | print(x.shape, y.shape, z.shape) 62 | with h5py.File('data_realistic_%d.hdf5' % N, 'w') as f: 63 | f.create_dataset('x', data=x, compression='gzip', shuffle=True) 64 | f.create_dataset('y', data=y, compression='gzip', shuffle=True) 65 | f.create_dataset('z', data=z, compression='gzip', shuffle=True) 66 | f.create_dataset('mean_broad', data=mean_broad, compression='gzip', shuffle=True) 67 | f.create_dataset('width_broad', data=width_broad, compression='gzip', shuffle=True) 68 | f.create_dataset('height_broad', data=height_broad, compression='gzip', shuffle=True) 69 | f.create_dataset('width_narrow', data=width_narrow, compression='gzip', shuffle=True) 70 | f.create_dataset('height_narrow', data=height_narrow, compression='gzip', shuffle=True) 71 | f.create_dataset('mean_narrow', data=mean_narrow, compression='gzip', shuffle=True) 72 | 73 | 74 | -------------------------------------------------------------------------------- /gensimple_faint.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy 3 | import matplotlib.pyplot as plt 4 | import h5py 5 | from numpy import exp 6 | import sys 7 | 8 | def gauss(x, z, A, mu, sig): 9 | xT = x.reshape((1,-1)) 10 | zT = z.reshape((-1,1)) 11 | AT = A.reshape((-1,1)) 12 | muT = mu.reshape((-1,1)) 13 | sigT = sig.reshape((-1,1)) 14 | return AT * exp(-0.5 * ((muT - xT / (1. 
+ zT))/sigT)**2) 15 | 16 | x = numpy.linspace(400, 800, 200) 17 | 18 | N = 40 19 | N = int(sys.argv[1]) 20 | numpy.random.seed(N) 21 | alpha, beta, scale = 2., 7., 1 22 | z = numpy.random.beta(alpha, beta, size=N) * scale 23 | #z = numpy.zeros(N) + 0.01 24 | rest_wave = 440 25 | print('generating parameters ...') 26 | # in km/s 27 | width_broad = 4000 * rest_wave / 300000 * numpy.ones(N) 28 | width_narrow = 400 * rest_wave / 300000 * numpy.ones(N) 29 | # convert to nm 30 | mean_broad = rest_wave * numpy.ones(N) 31 | mean_narrow = rest_wave * numpy.ones(N) 32 | width_broad = width_broad 33 | width_narrow = width_narrow 34 | noise_level = 0.01 35 | #signal_level = numpy.random.exponential(size=N) * 0.4 36 | #signal_level = numpy.ones(N) * 0.04 37 | signal_level = numpy.random.normal(0.2, 0.2, size=10*N) 38 | signal_level = signal_level[signal_level>0.1][:N] 39 | #signal_level = numpy.random.uniform(size=N) * 0.5 40 | #is_type1 = numpy.random.uniform(size=N) < 0.5 41 | height_broad = 10**-1 * signal_level 42 | height_narrow = signal_level 43 | 44 | #X = numpy.array([x]) 45 | 46 | print('generating signal ...') 47 | ym = gauss(A=height_broad, mu=mean_broad, x=x, z=z, sig=width_broad) 48 | ym += gauss(A=height_narrow, mu=mean_narrow, x=x, z=z, sig=width_narrow) 49 | ym = numpy.transpose(ym) 50 | print(ym.shape) 51 | 52 | # add noise 53 | print('adding noise...') 54 | y = ym.copy() 55 | for i in range(N): 56 | y[:,i] += numpy.random.normal(0, noise_level, size=len(x)) 57 | 58 | print('plotting ...') 59 | colors = ['yellow', 'pink', 'cyan', 'magenta'] 60 | for i in range(min(N, 4)): 61 | #plt.plot(x, y[:,i], '.-') 62 | plt.plot( rest_wave * (1+z[i]), 0.15 * height_narrow[i] / noise_level, 'v', color=colors[i], ms=12) 63 | plt.plot(x, y[:,i] / noise_level, '-', color=colors[i]) 64 | plt.ylabel('Detector signal') 65 | plt.xlabel('Wavelength [nm]') 66 | plt.savefig('genfaint.pdf', bbox_inches='tight') 67 | plt.close() 68 | 69 | #print x.shape, y.shape, z.shape 70 | with h5py.File('data_faint_%s.hdf5' % sys.argv[1], 'w') as f: 71 | f.create_dataset('x', data=x, compression='gzip', shuffle=True) 72 | f.create_dataset('y', data=y, compression='gzip', shuffle=True) 73 | f.create_dataset('z', data=z, compression='gzip', shuffle=True) 74 | f.create_dataset('mean_broad', data=mean_broad, compression='gzip', shuffle=True) 75 | f.create_dataset('width_broad', data=width_broad, compression='gzip', shuffle=True) 76 | f.create_dataset('height_broad', data=height_broad, compression='gzip', shuffle=True) 77 | f.create_dataset('mean_narrow', data=mean_narrow, compression='gzip', shuffle=True) 78 | f.create_dataset('width_narrow', data=width_narrow, compression='gzip', shuffle=True) 79 | f.create_dataset('height_narrow', data=height_narrow, compression='gzip', shuffle=True) 80 | 81 | 82 | -------------------------------------------------------------------------------- /clike.c: -------------------------------------------------------------------------------- 1 | /*** 2 | 3 | Likelihood implementation in C 4 | -------------------------------- 5 | 6 | Copyright (c) 2017 Johannes Buchner 7 | 8 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 11 | 12 | 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | 16 | ***/ 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #ifdef PARALLEL 23 | #include 24 | #endif 25 | 26 | #define IFVERBOSE if(0) 27 | #define IFDEBUG if(0) 28 | #define adouble double 29 | #define bdouble double 30 | #define sqr(x) (pow(x,2)) 31 | 32 | // Parallelisation does not work at the moment, you are welcome to fix it 33 | 34 | int like( 35 | const void * xp, const void * yyp, const int ndata, const int nx, 36 | const double A, const double mu, const double sig, 37 | const double noise_level, 38 | const void * data_maskp, 39 | void * Loutp 40 | ) { 41 | const adouble * x = (const adouble*) xp; 42 | const adouble * yy = (const adouble*) yyp; 43 | const bool * data_mask = (const bool*) data_maskp; 44 | adouble * Lout = (adouble*) Loutp; 45 | 46 | { 47 | #ifdef PARALLEL 48 | int k = 0; 49 | #pragma omp parallel for 50 | // this is stupid because it does not actually safe model evaluations, 51 | // but at least it should run faster for our testing purposes. 52 | for (int i = 0; i < ndata; i++) { 53 | if (data_mask[i]) { 54 | Lout[k] = 0; 55 | for (int j = 0; j < nx; j++) { 56 | const double ypred = A * exp(-0.5 * sqr((mu - x[j])/sig)); 57 | IFVERBOSE printf("y %d %d: %f %f\n", i, j, yy[i + j*ndata], ypred); 58 | Lout[k] += sqr((ypred - yy[i + j*ndata]) / noise_level); 59 | } 60 | k++; 61 | } 62 | } 63 | #else 64 | for (int j = 0; j < nx; j++) { 65 | const double ypred = A * exp(-0.5 * sqr((mu - x[j])/sig)); 66 | 67 | int k = 0; 68 | for (int i = 0; i < ndata; i++) { 69 | IFVERBOSE printf("data_mask %d: %d\n", i, data_mask[i]); 70 | if (data_mask[i]) { 71 | IFVERBOSE printf("y %d %d: %f %f\n", i, j, yy[i + j*ndata], ypred); 72 | Lout[k] += sqr((ypred - yy[i + j*ndata]) / noise_level); 73 | k++; 74 | } 75 | } 76 | } 77 | #endif 78 | } 79 | IFVERBOSE { 80 | int k = 0; 81 | for (int i = 0; i < ndata; i++) { 82 | if (data_mask[i]) { 83 | printf("L %d: %f\n", k, Lout[k]); 84 | k++; 85 | } 86 | } 87 | } 88 | return 0; 89 | } 90 | 91 | -------------------------------------------------------------------------------- /clustering/sdml.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | """ 3 | 4 | Geometry learning algorithms 5 | ------------------------------- 6 | 7 | Copyright (c) 2017 Johannes Buchner 8 | 9 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 10 | 11 | 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 16 | 17 | """ 18 | 19 | 20 | import numpy as np 21 | import numpy 22 | from numpy import exp 23 | import scipy.linalg 24 | 25 | class IdentityMetric(object): 26 | """ 27 | Input is output. 28 | """ 29 | def fit(self, x): 30 | pass 31 | def transform(self, x): 32 | return x 33 | def untransform(self, y): 34 | return y 35 | def __eq__(self, other): 36 | return self.__dict__ == other.__dict__ 37 | 38 | class SimpleScaling(object): 39 | """ 40 | Whitens by subtracting the mean and scaling by the 41 | standard deviation of each axis. 42 | """ 43 | def __init__(self, verbose=False): 44 | self.verbose = verbose 45 | 46 | def fit(self, X, W=None): 47 | self.mean = numpy.mean(X, axis=0) 48 | X = X - self.mean 49 | self.scale = numpy.std(X, axis=0) 50 | if self.verbose: 'Scaling metric:', self.scale 51 | def transform(self, x): 52 | return (x - self.mean) / self.scale 53 | 54 | def untransform(self, y): 55 | return y * self.scale + self.mean 56 | 57 | def __eq__(self, other): 58 | return self.__dict__ == other.__dict__ 59 | 60 | class TruncatedScaling(object): 61 | """ 62 | Whitens by subtracting the mean and scaling by the 63 | standard deviation of each axis. The scaling is discretized on 64 | a log axis onto integers. 
65 | """ 66 | def __init__(self, verbose=False): 67 | self.verbose = verbose 68 | def fit(self, X, W=None): 69 | self.mean = numpy.mean(X, axis=0) 70 | X = X - self.mean 71 | #scale = numpy.max(X, axis=0) - numpy.min(X, axis=0) 72 | scale = numpy.std(X, axis=0) 73 | scalemax = scale.max() * 1.001 74 | scalemin = scale.min() 75 | # round onto discrete log scale to avoid random walk 76 | logscale = (-numpy.log2(scale / scalemax)).astype(int) 77 | self.scale = 2**(logscale.astype(float)) 78 | #print 'Scaling metric:', self.scale, '(from', scale, ')' 79 | if self.verbose: 'Discretized scaling metric:\n', logscale 80 | 81 | def transform(self, x): 82 | return (x - self.mean) / self.scale 83 | 84 | def untransform(self, y): 85 | return y * self.scale + self.mean 86 | 87 | def __eq__(self, other): 88 | return self.__dict__ == other.__dict__ 89 | 90 | -------------------------------------------------------------------------------- /elldrawer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | """ 3 | 4 | Implementation of MultiEllipsoidal sampling via nestle 5 | 6 | Copyright (c) 2017 Johannes Buchner 7 | 8 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | 16 | 17 | 18 | """ 19 | 20 | import numpy 21 | from numpy import exp, log, log10, pi 22 | from nestle import bounding_ellipsoid, bounding_ellipsoids, sample_ellipsoids 23 | from collections import defaultdict 24 | 25 | class MultiEllipsoidalConstrainer(object): 26 | def __init__(self, rebuild_every = 1000, verbose = False, enlarge=3.): 27 | self.iter = 0 28 | self.ndraws_since_rebuild = 0 29 | self.rebuild_every = int(rebuild_every) 30 | self.enlarge = enlarge 31 | self.verbose = verbose 32 | self.ells = None 33 | self.last_cluster_points = None 34 | 35 | def update(self, points): 36 | # volume is larger than standard Ellipsoid computation 37 | # because we have a superset of various likelihood contours 38 | # increase proportional to number of points 39 | pointvol = exp(-self.iter / self.nlive_points) * (len(points) * 1. 
/ self.nlive_points) / self.nlive_points 40 | self.ells = bounding_ellipsoids(numpy.asarray(points), pointvol=pointvol) 41 | for ell in self.ells: 42 | ell.scale_to_vol(ell.vol * self.enlarge) 43 | 44 | def generate(self, ndim): 45 | ntotal = 0 46 | N = 10000 47 | while True: 48 | u = sample_ellipsoids(self.ells, rstate=numpy.random) 49 | if not (numpy.all(u > 0.) and numpy.all(u < 1.)): 50 | continue 51 | yield u, ntotal 52 | 53 | def rebuild(self, u, ndim): 54 | if self.last_cluster_points is not None and \ 55 | len(self.last_cluster_points) == len(u) and \ 56 | numpy.all(self.last_cluster_points == u): 57 | # do nothing if everything stayed the same 58 | return 59 | 60 | self.update(points=u) 61 | self.last_cluster_points = u 62 | 63 | self.generator = self.generate(ndim) 64 | 65 | def _draw_constrained_prepare(self, Lmins, priortransform, loglikelihood, live_pointsu, ndim, **kwargs): 66 | rebuild = self.ndraws_since_rebuild > self.rebuild_every or self.ells is None 67 | if rebuild: 68 | print('rebuild triggered at call') 69 | self.rebuild(numpy.asarray(live_pointsu), ndim) 70 | self.ndraws_since_rebuild = 0 71 | assert self.generator is not None 72 | return rebuild 73 | 74 | def draw_constrained(self, Lmins, priortransform, loglikelihood, live_pointsu, ndim, iter, nlive_points, **kwargs): 75 | ntoaccept = 0 76 | self.iter = iter 77 | self.nlive_points = nlive_points 78 | #print 'MLFriends trying to replace', Lmins 79 | rebuild = self._draw_constrained_prepare(Lmins, priortransform, loglikelihood, live_pointsu, ndim, **kwargs) 80 | while True: 81 | #print ' starting generator ...' 82 | for u, ntotal in self.generator: 83 | assert (u >= 0).all() and (u <= 1).all(), u 84 | x = priortransform(u) 85 | L = loglikelihood(x) 86 | ntoaccept += 1 87 | self.ndraws_since_rebuild += 1 88 | 89 | if numpy.any(L > Lmins): 90 | # yay, we win 91 | #print 'accept after %d tries' % ntoaccept 92 | return u, x, L, ntoaccept 93 | 94 | # if running very inefficient, optimize clustering 95 | # if we haven't done so at the start 96 | if not rebuild and self.ndraws_since_rebuild > self.rebuild_every: 97 | rebuild = True 98 | print('Ellipsoid rebuild triggered after %d draws' % self.ndraws_since_rebuild) 99 | self.rebuild(numpy.asarray(live_pointsu), ndim) 100 | self.ndraws_since_rebuild = 0 101 | break 102 | 103 | -------------------------------------------------------------------------------- /cachedconstrainer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy 3 | from hiermetriclearn import MetricLearningFriendsConstrainer 4 | from elldrawer import MultiEllipsoidalConstrainer 5 | 6 | # use this for MLFriends (RadFriends, but with standardized Euclidean metric) 7 | def generate_fresh_constrainer_mlfriends(): 8 | return MetricLearningFriendsConstrainer( 9 | metriclearner = 'truncatedscaling', force_shrink=True, 10 | rebuild_every=1000, metric_rebuild_every=20, 11 | verbose=False) 12 | 13 | # use this for Ellipsoidal Sampling, like MultiNest 14 | def generate_fresh_constrainer_multiellipsoidal(): 15 | return MultiEllipsoidalConstrainer(rebuild_every=1000, enlarge=3.) 16 | 17 | generate_fresh_constrainer = generate_fresh_constrainer_multiellipsoidal 18 | 19 | class CachedConstrainer(object): 20 | """ 21 | This keeps metric learners if they are used (in the last three iterations). 22 | Otherwise, constructs a fresh one. 
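A minimal usage sketch (hypothetical call site; argument meanings are inferred from the code below, and the caller is assumed to be the multi-data-set sampler):

    cc = CachedConstrainer()
    # mask/realmask identify the current subset of data sets, points are the ids of the
    # live points they share, it is the current sampler iteration (hypothetical variables)
    draw_constrained = cc.get(mask, realmask, points, it)
    # the returned callable has the draw_constrained() signature of the underlying constrainer

A region built for the same subset within the last three iterations is reused; otherwise a fresh constrainer is created.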
23 | """ 24 | def __init__(self, sampler=None): 25 | self.iter = -1 26 | self.prev_prev_prev_generation = {} 27 | self.prev_prev_generation = {} 28 | self.prev_generation = {} 29 | self.curr_generation = {} 30 | self.last_mask = [] 31 | self.last_points = [] 32 | self.last_realmask = None 33 | self.sampler = sampler 34 | 35 | def get(self, mask, realmask, points, it): 36 | while self.iter < it: 37 | # new generation 38 | self.prev_prev_prev_generation = self.prev_prev_generation 39 | self.prev_prev_generation = self.prev_generation 40 | self.prev_generation = self.curr_generation 41 | self.curr_generation = {} 42 | self.last_mask = [] 43 | self.last_realmask = None 44 | self.last_points = [] 45 | self.iter += 1 46 | 47 | # if we only dropped a single (or a few) data sets 48 | # compared to the call just before, lets reuse the same 49 | # this happens in the focussed draw with 1000s of data sets 50 | # where a single data set can accept a point; 51 | # not worth to recompute the region. 52 | if self.last_realmask is not None and len(mask) < len(self.last_mask) and \ 53 | len(mask) > 0.80 * len(self.last_mask) and \ 54 | len(points) <= len(self.last_points) and \ 55 | len(points) > 0.90 * len(self.last_points) and \ 56 | numpy.mean(self.last_realmask == realmask) > 0.80 and \ 57 | numpy.in1d(points, self.last_points).all(): 58 | print('re-using previous, similar region (%.1f%% data set overlap, %.1f%% points overlap)' % (numpy.mean(self.last_realmask == realmask) * 100., len(points) * 100. / len(self.last_points), )) 59 | k = tuple(self.last_mask.tolist()) 60 | return self.curr_generation[k].draw_constrained 61 | print('not re-using region', (len(mask), len(self.last_mask), len(points), len(self.last_points), (len(mask) < len(self.last_mask), len(mask) > 0.80 * len(self.last_mask), len(points) > 0.90 * len(self.last_points), numpy.mean(self.last_realmask == realmask) ) )) 62 | 63 | # normal operation: 64 | k = tuple(mask.tolist()) 65 | self.last_realmask = realmask 66 | self.last_mask = mask 67 | self.last_points = points 68 | 69 | # try to recycle 70 | if k in self.curr_generation: 71 | pass 72 | elif k in self.prev_generation: 73 | print('re-using previous1 region') 74 | self.curr_generation[k] = self.prev_generation[k] 75 | elif k in self.prev_prev_generation: 76 | print('re-using previous2 region') 77 | self.curr_generation[k] = self.prev_prev_generation[k] 78 | elif k in self.prev_prev_prev_generation: 79 | print('re-using previous3 region') 80 | self.curr_generation[k] = self.prev_prev_prev_generation[k] 81 | else: 82 | # nothing found, so start from scratch 83 | self.curr_generation[k] = generate_fresh_constrainer() 84 | #self.curr_generation[k] = MetricLearningFriendsConstrainer( 85 | # metriclearner = 'truncatedscaling', force_shrink=True, 86 | # rebuild_every=1000, metric_rebuild_every=20, 87 | # verbose=False) 88 | self.curr_generation[k].sampler = self.sampler 89 | 90 | return self.curr_generation[k].draw_constrained 91 | 92 | def generate_individual_constrainer(rebuild_every=1000, metric_rebuild_every=20): 93 | individual_constrainers = {} 94 | individual_constrainers_lastiter = {} 95 | def individual_draw_constrained(i, it, sampler): 96 | if i not in individual_constrainers: 97 | #individual_constrainers[i] = MetricLearningFriendsConstrainer( 98 | # metriclearner = 'truncatedscaling', force_shrink=True, 99 | # rebuild_every=rebuild_every, metric_rebuild_every=metric_rebuild_every, 100 | # verbose=False) 101 | individual_constrainers[i] = generate_fresh_constrainer() 102 | 
individual_constrainers[i].sampler = sampler 103 | individual_constrainers_lastiter[i] = it 104 | if it > individual_constrainers_lastiter[i] + 5: 105 | # force rebuild 106 | individual_constrainers[i].region = None 107 | individual_constrainers_lastiter[i] = it 108 | return individual_constrainers[i].draw_constrained 109 | return individual_constrainers, individual_constrainers_lastiter, individual_draw_constrained 110 | 111 | def generate_superset_constrainer(): 112 | return generate_fresh_constrainer() 113 | #return MetricLearningFriendsConstrainer(metriclearner = 'truncatedscaling', 114 | # rebuild_every=1000, metric_rebuild_every=20, verbose=False, force_shrink=True) 115 | 116 | 117 | -------------------------------------------------------------------------------- /clustering/cneighbors.c: -------------------------------------------------------------------------------- 1 | /*** 2 | 3 | Neighbourhood helper functions accelerated with parallelised C 4 | --------------------------------------------------------------- 5 | 6 | Copyright (c) 2017 Johannes Buchner 7 | 8 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
15 | 16 | ***/ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #ifdef PARALLEL 23 | #include 24 | #endif 25 | 26 | #define IFVERBOSE if(0) 27 | #define IFDEBUG if(0) 28 | #define adouble double 29 | #define bdouble double 30 | #define sqr(x) (pow(x,2)) 31 | 32 | double most_distant_nearest_neighbor( 33 | const void * xxp, int nsamples, int ndim 34 | ) { 35 | const adouble * xx = (const adouble*) xxp; 36 | double nearest_ds[nsamples]; 37 | 38 | IFVERBOSE { 39 | for (int i = 0; i < nsamples; i++) { // one sample at a time 40 | printf("%d: ", i); 41 | for (int k = 0; k < ndim; k++) { 42 | printf("%e\t", xx[i*ndim + k]); 43 | } 44 | printf("\n"); 45 | } 46 | } 47 | #ifdef PARALLEL 48 | #pragma omp parallel for 49 | #endif 50 | for (int i = 0; i < nsamples; i++) { // one sample at a time 51 | // consider all other samples before i 52 | double nearest_d = 1e300; 53 | for (int j = 0; j < nsamples; j++) { 54 | if (j != i) { 55 | double d = 0; 56 | for (int k = 0; k < ndim; k++) { 57 | d += sqr(xx[i*ndim + k] - xx[j*ndim + k]); 58 | } 59 | if (d < nearest_d) { 60 | nearest_d = d; 61 | } 62 | } 63 | } 64 | IFVERBOSE printf("%d: %f\n", i, sqrt(nearest_d)); 65 | nearest_ds[i] = sqrt(nearest_d); 66 | } 67 | double furthest_d = nearest_ds[0]; 68 | 69 | for (int i = 1; i < nsamples; i++) { 70 | if (nearest_ds[i] > furthest_d) 71 | furthest_d = nearest_ds[i]; 72 | } 73 | IFVERBOSE printf("result: %f\n", furthest_d); 74 | return furthest_d; 75 | } 76 | 77 | int is_within_distance_of( 78 | const void * xxp, int nsamples, int ndim, double maxdistance, const void * yp 79 | ) { 80 | const adouble * xx = (const adouble*) xxp; 81 | const adouble * y = (const adouble*) yp; 82 | 83 | for (int i = 0; i < nsamples; i++) { // one sample at a time 84 | double d = 0; 85 | for (int k = 0; k < ndim; k++) { 86 | d += sqr(xx[i*ndim + k] - y[k]); 87 | } 88 | if (sqrt(d) < maxdistance) 89 | return 1; 90 | } 91 | return 0; 92 | } 93 | 94 | 95 | int count_within_distance_of( 96 | const void * xxp, int nsamples, int ndim, double maxdistance, 97 | const void * yyp, int nothers, void * outp, const int countmax 98 | ) { 99 | const adouble * xx = (const adouble*) xxp; 100 | const adouble * yy = (const adouble*) yyp; 101 | double * out = (double*) outp; 102 | 103 | for (int j = 0; j < nothers; j++) { // one sample at a time 104 | for (int i = 0; i < nsamples; i++) { // one sample at a time 105 | double d = 0; 106 | for (int k = 0; k < ndim; k++) { 107 | d += sqr(xx[i*ndim + k] - yy[j*ndim + k]); 108 | } 109 | if (sqrt(d) < maxdistance) { 110 | out[j]++; 111 | // printf("%d: %f\n", j, out[j]); 112 | if (countmax > 0 && out[j] >= countmax) { 113 | break; 114 | } 115 | } 116 | } 117 | } 118 | return 0; 119 | } 120 | 121 | /** 122 | * xxp are double points (nsamples x ndim) 123 | * choicep is whether the point is selected in the bootstrap round (nsamples x nbootstraps) 124 | */ 125 | double bootstrapped_maxdistance( 126 | const void * xxp, 127 | int nsamples, int ndim, 128 | const void * choicep, 129 | int nbootstraps 130 | ) { 131 | const adouble * xx = (const adouble*) xxp; 132 | const adouble * chosen = (const adouble*) choicep; 133 | 134 | double furthest_ds[nbootstraps]; 135 | double furthest_d_bs; 136 | 137 | #ifdef PARALLEL 138 | #pragma omp parallel for 139 | #endif 140 | for(int b = 0; b < nbootstraps; b++) { 141 | double nearest_ds[nsamples]; 142 | double furthest_d = 0; 143 | //printf("bootstrap round %d\n", b); 144 | // find one that was not chosen 145 | for (int i = 0; i < nsamples; i++) { 146 | if 
(chosen[i*nbootstraps + b] != 0) continue; 147 | //printf(" considering %d\n", i); 148 | double nearest_d = 1e300; 149 | for (int j = 0; j < nsamples; j++) { 150 | if (chosen[j*nbootstraps + b] == 0) continue; 151 | double d = 0; 152 | for (int k = 0; k < ndim; k++) { 153 | d += sqr(xx[i*ndim + k] - xx[j*ndim + k]); 154 | } 155 | if (d < nearest_d) { 156 | nearest_d = d; 157 | } 158 | } 159 | //printf(" %d: %f\n", i, sqrt(nearest_d)); 160 | nearest_ds[i] = sqrt(nearest_d); 161 | } 162 | for (int i = 1; i < nsamples; i++) { 163 | if (chosen[i*nbootstraps + b] != 0) continue; 164 | if (nearest_ds[i] > furthest_d) 165 | furthest_d = nearest_ds[i]; 166 | } 167 | //printf("bootstrap round %d gave %f\n", b, furthest_d); 168 | furthest_ds[b] = furthest_d; 169 | } 170 | 171 | furthest_d_bs = furthest_ds[0]; 172 | for (int i = 1; i < nbootstraps; i++) { 173 | if (furthest_ds[i] > furthest_d_bs) 174 | furthest_d_bs = furthest_ds[i]; 175 | } 176 | 177 | IFVERBOSE printf("result: %f\n", furthest_d_bs); 178 | return furthest_d_bs; 179 | } 180 | -------------------------------------------------------------------------------- /profile_generate_subsets.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy 3 | import sys 4 | import time 5 | import networkx 6 | import igraph 7 | 8 | def generate_subsets_reference(data_mask, live_pointsp, graph, _): 9 | # generate data subsets which share points. 10 | firstmember = numpy.where(data_mask)[0][0] 11 | if len(live_pointsp[:,firstmember]) == len(numpy.unique(live_pointsp[:,data_mask].flatten())): 12 | # trivial case: all live points are the same across data sets 13 | yield data_mask, live_pointsp[:,firstmember] 14 | return 15 | 16 | to_handle = data_mask.copy() 17 | while to_handle.any(): 18 | firstmember = numpy.where(to_handle)[0][0] 19 | to_handle[firstmember] = False 20 | members = [firstmember] 21 | # get live points of this member 22 | member_live_pointsp = live_pointsp[:,firstmember].tolist() 23 | # look through to_handle for entries and check if they have the points 24 | i = 0 25 | while True: 26 | if i >= len(member_live_pointsp) or not to_handle.any(): 27 | break 28 | p = member_live_pointsp[i] 29 | sharing = (live_pointsp[:,to_handle] == p).any(axis=0) 30 | #assert len(sharing) == to_handle.sum() 31 | newmembers = numpy.where(to_handle)[0][sharing] 32 | #assert numpy.all(newmembers == numpy.arange(len(to_handle))[to_handle][sharing]) 33 | 34 | #print 'new members:', newmembers 35 | members += newmembers.tolist() 36 | for newp in numpy.unique(live_pointsp[:,newmembers]): 37 | if newp not in member_live_pointsp: 38 | member_live_pointsp.append(newp) 39 | to_handle[newmembers] = False 40 | i = i + 1 41 | 42 | # now we have our members and live points 43 | member_data_mask = numpy.zeros(len(data_mask), dtype=bool) 44 | member_data_mask[members] = True 45 | #print 'returning:', member_data_mask, member_live_pointsp 46 | yield member_data_mask, member_live_pointsp 47 | 48 | def generate_subsets_graph_simple(data_mask, live_pointsp, graph, _): 49 | # generate data subsets which share points. 
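	# the graph argument is the bipartite graph built in the loading loop at the bottom of this
	# script: one node per data set (node type 0), one node per live point (node type 1), and an
	# edge wherever a data set currently holds that live point, so every connected component
	# corresponds to a group of data sets that share live points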
50 | firstmember = numpy.where(data_mask)[0][0] 51 | # then identify disjoint subgraphs 52 | for subgraph in networkx.connected_component_subgraphs(graph, copy=False): 53 | member_data_mask = numpy.zeros(len(data_mask), dtype=bool) 54 | member_live_pointsp = [] 55 | for nodetype, i in subgraph.nodes(): 56 | if nodetype == 0: 57 | member_data_mask[i] = True 58 | else: 59 | member_live_pointsp.append(i) 60 | yield member_data_mask, member_live_pointsp 61 | 62 | def generate_subsets_graph(data_mask, live_pointsp, graph, _): 63 | # generate data subsets which share points. 64 | firstmember = numpy.where(data_mask)[0][0] 65 | allp = numpy.unique(live_pointsp[:,data_mask].flatten()) 66 | if len(live_pointsp[:,firstmember]) == len(allp): 67 | # trivial case: all live points are the same across data sets 68 | yield data_mask, live_pointsp[:,firstmember] 69 | return 70 | 71 | subgraphs = list(networkx.connected_component_subgraphs(graph, copy=False)) 72 | if len(subgraphs) == 1: 73 | yield data_mask, allp 74 | return 75 | 76 | # then identify disjoint subgraphs 77 | for subgraph in subgraphs: 78 | print('networkx subgraph:', subgraph.nodes()) 79 | member_data_mask = numpy.zeros(len(data_mask), dtype=bool) 80 | member_live_pointsp = [] 81 | for nodetype, i in subgraph.nodes(): 82 | if nodetype == 0: 83 | member_data_mask[i] = True 84 | else: 85 | member_live_pointsp.append(i) 86 | yield member_data_mask, member_live_pointsp 87 | 88 | def generate_subsets_igraph(data_mask, live_pointsp, _graph, graph): 89 | # generate data subsets which share points. 90 | firstmember = numpy.where(data_mask)[0][0] 91 | allp = numpy.unique(live_pointsp[:,data_mask].flatten()) 92 | if len(live_pointsp[:,firstmember]) == len(allp): 93 | # trivial case: all live points are the same across data sets 94 | yield data_mask, live_pointsp[:,firstmember] 95 | return 96 | 97 | subgraphs = graph.clusters() 98 | if len(subgraphs) == 1: 99 | yield data_mask, allp 100 | return 101 | # then identify disjoint subgraphs 102 | for subgraph in subgraphs: 103 | #print 'igraph subgraph:', subgraph 104 | member_data_mask = numpy.zeros(len(data_mask), dtype=bool) 105 | member_live_pointsp = [] 106 | for vi in subgraph: 107 | att = graph.vs[vi].attributes() 108 | #print ' ', att 109 | if att['vtype'] == 0: 110 | i = att['nodeid'] 111 | member_data_mask[i] = True 112 | else: 113 | p = att['pointid'] 114 | member_live_pointsp.append(p) 115 | if member_data_mask.any(): 116 | yield member_data_mask, member_live_pointsp 117 | 118 | data_sets = [] 119 | 120 | t0 = time.time() 121 | for filename in sys.argv[1:]: 122 | data = numpy.load(filename) 123 | data_mask, live_pointsp = data['data_mask'], data['live_pointsp'] 124 | # create graph 125 | graph = networkx.Graph() 126 | graph2 = igraph.Graph() 127 | # pointing from live_point to member 128 | 129 | for p in numpy.unique(live_pointsp): 130 | graph2.add_vertex("p%d" % p, pointid=p, vtype=1) 131 | for i in numpy.where(data_mask)[0]: 132 | graph.add_edges_from((((0, i), (1, p)) for p in live_pointsp[:,i])) 133 | graph2.add_vertex("n%d" % i, nodeid=i, vtype=0) 134 | graph2.add_edges([("n%d" % i, "p%d" % p) for p in live_pointsp[:,i]]) 135 | 136 | data_sets.append((data_mask, live_pointsp, graph, graph2)) 137 | t1 = time.time() 138 | print('loading took %fs' % (t1 - t0)) 139 | 140 | prev_output = [] 141 | for implementation in [generate_subsets_reference, generate_subsets_graph_simple, generate_subsets_graph, generate_subsets_igraph]: 142 | print('running', implementation) 143 | output = [] 144 | t0 = 
time.time() 145 | for a, b, graph, graph2 in data_sets: 146 | out = list(implementation(a, b, graph, graph2)) 147 | output.append(out) 148 | t1 = time.time() 149 | print(' took %fs' % (t1 - t0)) 150 | #for a, b in zip(output, 151 | if prev_output != []: 152 | print('checking for correctness...') 153 | for memberlist1, memberlist2 in zip(output, prev_output): 154 | memberlist1 = sorted(memberlist1, key=lambda ml: (len(ml), ml[0][0])) 155 | memberlist2 = sorted(memberlist2, key=lambda ml: (len(ml), ml[0][0])) 156 | for (md, ml), (md2, ml2) in zip(memberlist1, memberlist2): 157 | #print len(md), md.sum(), len(md2), md2.sum(), len(ml), len(ml2) 158 | assert numpy.all(md == md2), (numpy.where(md), numpy.where(md2)) 159 | assert sorted(ml) == sorted(ml2) 160 | assert len(memberlist1) == len(memberlist2), (len(memberlist1), len(memberlist2)) 161 | prev_output = output 162 | 163 | 164 | 165 | 166 | -------------------------------------------------------------------------------- /musefuse_postprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | """ 3 | 4 | Main program 5 | --------------- 6 | 7 | Copyright (c) 2017 Johannes Buchner 8 | 9 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
16 | 17 | """ 18 | 19 | import numpy 20 | from numpy import exp 21 | import h5py 22 | import sys 23 | import json 24 | import os 25 | import time 26 | import astropy.io.fits as pyfits 27 | import matplotlib.pyplot as plt 28 | 29 | print('loading data...') 30 | f = pyfits.open(sys.argv[1]) 31 | datasection = f['DATA'] 32 | y = datasection.data # values 33 | y = y[:3600,:,:] 34 | noise_level = f['STAT'].data # variance 35 | noise_level = noise_level[:3600,:,:] 36 | nspec, npixx, npixy = y.shape 37 | 38 | print('applying subselection ...') 39 | ## replaced by mask 40 | #y = y[:,80:200,170:240] 41 | #noise_level = noise_level[:,80:200,170:240] 42 | #y = y[:,70:80,35:45] 43 | #noise_level = noise_level[:,70:80,35:45] 44 | 45 | regionfile = sys.argv[2] 46 | import pyregion 47 | region = pyregion.parse(open(regionfile).read()) 48 | mask = region.get_mask(shape=(npixx, npixy)) 49 | ymask = numpy.array([mask] * len(y)) 50 | xids, yids = numpy.where(mask) 51 | y = y[ymask] 52 | y = y.reshape((nspec, -1)) 53 | noise_level = noise_level[ymask] 54 | noise_level = noise_level.reshape((nspec, -1)) 55 | 56 | #nspec, npixx, npixy = y.shape 57 | print((y.shape)) 58 | outputimg = numpy.zeros((npixx, npixy)) * numpy.nan 59 | 60 | #y = y.reshape((nspec, -1)) 61 | #noise_level = noise_level.reshape((nspec, -1)) 62 | outputimg_flat = outputimg #.reshape((-1)) 63 | x = datasection.header['CD3_3'] * numpy.arange(nspec) + datasection.header['CRVAL3'] 64 | print(' finding NaNs...') 65 | good = numpy.isfinite(noise_level).all(axis=0) 66 | #assert good.shape == (npixx*npixy,), good.shape 67 | #goodids = numpy.where(good)[0] 68 | goodids = list(zip(xids[good], yids[good])) 69 | print((len(good), len(goodids))) 70 | 71 | y = y[:,good] 72 | noise_level = noise_level[:,good] 73 | ndata = os.environ.get('MAXDATA', len(goodids)) 74 | print(' truncating data to %d sets...' 
% ndata, goodids[:ndata]) 75 | ## truncate data 76 | y = y[:,:ndata] 77 | noise_level = noise_level[:,:ndata] 78 | goodids = goodids[:ndata] 79 | 80 | print((y.shape)) 81 | 82 | prefix = sys.argv[1] 83 | modelname = os.environ.get('MODEL', 'FULL') 84 | if modelname == 'ZSOL': 85 | paramnames = ['logSFtau', 'SFage', 'z', 'EBV'] 86 | prefix = prefix + '_zsol_' 87 | elif modelname == 'FULL': 88 | paramnames = ['Z', 'logSFtau', 'SFage', 'z', 'EBV'] 89 | prefix = prefix + '_full_' 90 | else: 91 | assert False 92 | 93 | filename = prefix + '.out_%d.hdf5' % ndata 94 | f = h5py.File(filename, 'r') 95 | 96 | nsamplesmax, nids, nparams = f['x'].shape 97 | assert nids == len(goodids), (nids, goodids) 98 | 99 | output_Z = outputimg_flat.copy() 100 | output_Zerr = outputimg_flat.copy() 101 | output_means = {} 102 | output_errs = {} 103 | for pi in range(nparams): 104 | output_means[pi] = outputimg_flat.copy() 105 | output_errs[pi] = outputimg_flat.copy() 106 | 107 | weights = numpy.transpose(f['w'].value + f['L'].value) 108 | print(weights.shape) 109 | 110 | #def pointfactory(): 111 | # x = f['x'].value 112 | # for i in range(nids): 113 | # yield x[:,i,:] 114 | points = numpy.swapaxes(f['x'].value, 0, 1) 115 | 116 | 117 | 118 | for i, (w, logZ, logZerr, x) in enumerate(zip(weights, f['logZ'].value, f['logZerr'].value, points)): 119 | xi, yi = goodids[i] 120 | mask = numpy.isfinite(w) 121 | jparent = numpy.where(mask)[0] 122 | w = w[jparent] 123 | w = numpy.exp(w - w.max()) 124 | w = w / w.sum() 125 | j = numpy.random.choice(jparent, size=4000, p=w) 126 | print(' %d/%d: spaxel %s: from %d samples drew %d unique posterior points' % (i+1, nids, (xi, yi), len(jparent), len(numpy.unique(j)))) 127 | 128 | print(' logZ = %.1f +- %.1f' % (logZ, logZerr)) 129 | output_Z[xi, yi] = logZ 130 | output_Zerr[xi, yi] = logZerr 131 | #x = f['x'][:,i,:] 132 | xequal = x[j,:] 133 | for k in range(nparams): 134 | v = xequal[:,k] 135 | output_means[k][xi, yi] = v.mean() 136 | output_errs[k][xi, yi] = v.std() 137 | print(' param %d = %.3f +- %.3f (%s)' % (k, v.mean(), v.std(), paramnames[k])) 138 | if i < 5: 139 | numpy.savetxt(prefix + '.outsamples_%d.txt' % i, xequal) 140 | #if i > 1000: break 141 | 142 | output_Z = output_Z.reshape((npixx, npixy)) 143 | output_Zerr = output_Zerr.reshape((npixx, npixy)) 144 | for pi in range(nparams): 145 | output_means[pi] = output_means[pi].reshape((npixx, npixy)) 146 | output_errs[pi] = output_errs[pi].reshape((npixx, npixy)) 147 | 148 | filename = prefix + '.outimg_%d.hdf5' % ndata 149 | print('writing image files ...') 150 | def makeimg(name, img, title=None): 151 | outfilename = prefix + '.outimg_%d_%s.pdf' % (ndata, name) 152 | print('creating %s ...' 
% outfilename) 153 | plt.figure() 154 | if title is None: 155 | title = name 156 | plt.title(title) 157 | plt.imshow(img, cmap=plt.cm.RdBu) 158 | plt.colorbar() 159 | plt.savefig(outfilename, bbox_inches='tight') 160 | plt.close() 161 | 162 | # store results 163 | with h5py.File(filename, 'w') as fimg: 164 | fimg.create_dataset('logZ', data=output_Z, compression='gzip', shuffle=True) 165 | makeimg('logZ', output_Z) 166 | fimg.create_dataset('logZerr', data=output_Zerr, compression='gzip', shuffle=True) 167 | makeimg('logZerr', output_Zerr) 168 | for k in range(nparams): 169 | fimg.create_dataset('param%d' % k, data=output_means[k], compression='gzip', shuffle=True) 170 | makeimg('param%d' % k, output_means[k], title=paramnames[k]) 171 | fimg.create_dataset('param%derr' % k, data=output_errs[k], compression='gzip', shuffle=True) 172 | makeimg('param%derr' % k, output_errs[k], title=paramnames[k] + ' errors') 173 | fimg.attrs['nparams'] = nparams 174 | 175 | 176 | -------------------------------------------------------------------------------- /clustering/radfriendsregion.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | """ 3 | 4 | RadFriends region with transforms 5 | ---------------------------------- 6 | 7 | Copyright (c) 2017 Johannes Buchner 8 | 9 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
16 | 17 | """ 18 | 19 | import numpy 20 | import scipy.spatial, scipy.cluster 21 | from .neighbors import find_rdistance, is_within_distance_of, count_within_distance_of, any_within_distance_of 22 | from collections import defaultdict 23 | 24 | class ClusterResult(object): 25 | def __init__(self, points, clusters, metric, verbose=False): 26 | self.ws = points 27 | self.clusters = clusters 28 | self.metric = metric 29 | if verbose: 30 | print('CLUSTERS:') 31 | for cluster in clusters: 32 | clusterpoints = metric.untransform(points[cluster,:]) 33 | print('CLUSTER:', clusterpoints.mean(axis=0), clusterpoints.std(axis=0)) 34 | 35 | def get_cluster_id(self, point): 36 | w = self.metric.transform(point) 37 | dists = scipy.spatial.distance.cdist(self.ws, [w], metric='euclidean') 38 | i = numpy.argmin(dists) 39 | for j, cluster in enumerate(self.clusters): 40 | if i in cluster: 41 | return j 42 | 43 | def get_cluster_ids(self, points): 44 | ws = self.metric.transform(points) 45 | dists = scipy.spatial.distance.cdist(self.ws, ws, metric='euclidean') 46 | i = numpy.argmin(dists, axis=0) 47 | assert len(i) == len(points) 48 | results = [] 49 | for ii in i: 50 | for j, cluster in enumerate(self.clusters): 51 | if ii in cluster: 52 | results.append(j) 53 | return results 54 | 55 | def get_n_clusters(self): 56 | return len(self.clusters) 57 | 58 | class RadFriendsRegion(object): 59 | def __init__(self, members, maxdistance=None, metric='euclidean', nbootstraps=10, verbose=False): 60 | self.members = members 61 | assert metric == 'euclidean' 62 | if maxdistance is None: 63 | maxdistance = find_rdistance(members, nbootstraps=nbootstraps, 64 | metric=metric, verbose=verbose) 65 | # print 'new RadFriendsRegion with r=', maxdistance 66 | self.maxdistance = maxdistance 67 | self.metric = metric 68 | self.verbose = verbose 69 | self.lo = numpy.min(self.members, axis=0) - self.maxdistance 70 | self.hi = numpy.max(self.members, axis=0) + self.maxdistance 71 | 72 | def add_members(self, us): 73 | self.members = numpy.vstack((self.members, us)) 74 | self.lo = numpy.min(self.members, axis=0) - self.maxdistance 75 | self.hi = numpy.max(self.members, axis=0) + self.maxdistance 76 | 77 | def are_near_members(self, us): 78 | dists = scipy.spatial.distance.cdist(self.members, us, metric=self.metric) 79 | dist_criterion = dists < self.maxdistance 80 | return dist_criterion 81 | 82 | def count_nearby_members(self, us): 83 | return count_within_distance_of(self.members, self.maxdistance, us) 84 | 85 | def get_nearby_member_ids(self, u): 86 | return numpy.where(self.are_near_members([u]))[0] 87 | 88 | def is_inside(self, u): 89 | # is it true for at least one? 90 | if not ((u >= self.lo).all() and (u <= self.hi).all()): 91 | return False 92 | return is_within_distance_of(self.members, self.maxdistance, u) 93 | #return self.are_near_members([u]).any() 94 | 95 | def are_inside(self, us): 96 | # is it true for at least one? 
97 | #return self.are_near_members(us).any(axis=0) 98 | return any_within_distance_of(self.members, self.maxdistance, us) 99 | 100 | def get_clusters(self): 101 | # agglomerate clustering of members 102 | dists = scipy.spatial.distance.cdist(self.members, self.members, metric=self.metric) 103 | connected = dists < self.maxdistance 104 | nmembers = len(self.members) 105 | cluster = dict([(i,i) for i in range(nmembers)]) 106 | for i in range(nmembers): 107 | neighbors = numpy.where(connected[i,:])[0] #[i+1:] 108 | for j in neighbors: 109 | cluster[j] = cluster[i] 110 | result = defaultdict(list) 111 | for element, cluster_nro in list(cluster.items()): 112 | result[cluster_nro].append(element) 113 | #print 'RadFriends: %d clusters' % len(result) 114 | return result 115 | 116 | 117 | def generate(self, nmax=0): 118 | members = self.members 119 | maxdistance = self.maxdistance 120 | nmembers, ndim = numpy.shape(self.members) 121 | # how many points to try to generate 122 | # if too small, many function calls, inefficient 123 | # if too large, large cdist matrices, spikes in memory use 124 | N = 1000 125 | verbose = self.verbose 126 | nall = 0 127 | ntotal = 0 128 | #print 'draw from radfriends' 129 | while nmax == 0 or nall < nmax: 130 | #print 'drew %d/%d so far' % (N, nmax) 131 | # draw from box 132 | # this can be efficient if there are a lot of points 133 | ntotal = ntotal + N 134 | nall += N 135 | us = numpy.random.uniform(self.lo, self.hi, size=(N, ndim)) 136 | mask = self.are_inside(us) 137 | #print 'accepted %d/%d [box draw]' % (mask.sum(), N) 138 | if mask.any(): 139 | yield us[mask,:], ntotal 140 | #for u in us[mask,:]: 141 | # #print 'box draw success:', ntotal 142 | # yield u, ntotal 143 | ntotal = 0 144 | 145 | # draw from points 146 | # this can be efficient in higher dimensions 147 | us = members[numpy.random.randint(0, len(members), N),:] 148 | ntotal = ntotal + N 149 | nall += N 150 | if verbose: print('chosen point', us) 151 | # draw direction around it 152 | direction = numpy.random.normal(0, 1, size=(N, ndim)) 153 | direction = direction / ((direction**2).sum(axis=1)**0.5).reshape((-1,1)) 154 | if verbose: print('chosen direction', direction) 155 | # choose radius: volume gets larger towards the outside 156 | # so give the correct weight with dimensionality 157 | radius = maxdistance * numpy.random.uniform(0, 1, size=(N,1))**(1./ndim) 158 | us = us + direction * radius 159 | #mask = numpy.logical_and((u >= self.lo).all(axis=0), (u <= self.hi).all(axis=0)) 160 | #if not mask.any(): 161 | # if verbose: print 'rejection because outside' 162 | # continue 163 | #us = us[mask,:] 164 | #if verbose: print 'using point', us 165 | # count the number of points this is close to 166 | nnear = self.count_nearby_members(us) 167 | if verbose: print('near', nnear) 168 | # accept with probability 1./nnear 169 | coin = numpy.random.uniform(size=len(us)) 170 | 171 | accept = coin < 1. 
/ nnear 172 | #print 'accepted %d/%d [point draw]' % (accept.sum(), N) 173 | if not accept.any(): 174 | if verbose: print('probabilistic rejection due to overlaps') 175 | continue 176 | #print ' overlaps accepted %d of %d, typically %.2f neighbours' % (accept.sum(), N, nnear.mean()) 177 | us = us[accept,:] 178 | yield us, ntotal 179 | #for u in us: 180 | # #print 'ball draw success:', ntotal 181 | # yield u, ntotal 182 | ntotal = 0 183 | 184 | -------------------------------------------------------------------------------- /multi_nested_integrator.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | """ 3 | 4 | Integrator 5 | ---------- 6 | 7 | Copyright (c) 2017 Johannes Buchner 8 | 9 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
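Overview (a plain-language summary of the code below): multi_nested_integrator() repeatedly draws replacement points from the multi-data-set sampler, accumulates logZ and the information H per data set while shrinking the log prior volume, estimates the remaining evidence roughly every 50 iterations with integrate_remainder(), and terminates (and cuts the sampler down to) those data sets whose evidence uncertainty has dropped below the requested tolerance.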
16 | 17 | """ 18 | 19 | import numpy 20 | from numpy import exp, log, log10, pi 21 | import progressbar 22 | from adaptive_progress import AdaptiveETA 23 | from numpy import logaddexp 24 | import sys 25 | 26 | def integrate_remainder(sampler, logwidth, logVolremaining, logZ, H, globalLmax): 27 | # logwidth remains the same now for each sample 28 | remainder = list(sampler.remainder()) 29 | logV = logwidth 30 | L0 = remainder[-1][2] 31 | L0 = globalLmax 32 | logLs = [Li - L0 for ui, xi, Li in remainder] 33 | Ls = numpy.exp(logLs) 34 | LsMax = Ls.copy() 35 | LsMax[-1] = numpy.exp(globalLmax - L0) 36 | Lmax = LsMax[1:].sum(axis=0) + LsMax[-1] 37 | #Lmax = Ls[1:].sum(axis=0) + Ls[-1] 38 | Lmin = Ls[:-1].sum(axis=0) + Ls[0] 39 | logLmid = log(Ls.sum(axis=0)) + L0 40 | logZmid = logaddexp(logZ, logV + logLmid) 41 | logZup = logaddexp(logZ, logV + log(Lmax) + L0) 42 | logZlo = logaddexp(logZ, logV + log(Lmin) + L0) 43 | logZerr = logZup - logZlo 44 | assert numpy.isfinite(H).all() 45 | assert numpy.isfinite(logZerr).all(), logZerr 46 | 47 | for i in range(len(remainder)): 48 | ui, xi, Li = remainder[i] 49 | wi = logwidth + Li 50 | logZnew = logaddexp(logZ, wi) 51 | #Hprev = H 52 | H = exp(wi - logZnew) * Li + exp(logZ - logZnew) * (H + logZ) - logZnew 53 | H[H < 0] = 0 54 | #assert (H>0).all(), (H, Hprev, wi, Li, logZ, logZnew) 55 | logZ = logZnew 56 | 57 | #assert numpy.isfinite(logZerr + (H / sampler.nlive_points)**0.5), (H, sampler.nlive_points, logZerr) 58 | 59 | return logV + logLmid, logZerr, logZmid, logZerr + (H / sampler.nlive_points)**0.5, logZerr + (H / sampler.nlive_points)**0.5 60 | 61 | """ 62 | Performs the Nested Sampling integration by calling the *sampler* multiple times 63 | until the *tolerance* is reached, or the maximum number of likelihood evaluations 64 | is exceeded. 65 | 66 | :param sampler: Sampler 67 | :param tolerance: uncertainty in log Z to compute to 68 | :param max_samples: maximum number of likelihood evaluations (None for no limit) 69 | 70 | @return dictionary containing the keys 71 | 72 | logZ, logZerr: log evidence and uncertainty, 73 | samples: all obtained samples, 74 | weights: posterior samples: 75 | list of prior coordinates, transformed coordinates, likelihood value 76 | and weight 77 | information: information H 78 | niterations: number of nested sampling iterations 79 | """ 80 | def multi_nested_integrator(multi_sampler, tolerance = 0.01, max_samples=None, min_samples = 0, need_robust_remainder_error=True): 81 | sampler = multi_sampler 82 | logVolremaining = 0 83 | logwidth = log(1 - exp(-1. / sampler.nlive_points)) 84 | weights = [] #[-1e300, 1]] 85 | 86 | widgets = ["|...|", 87 | progressbar.Bar(), progressbar.Percentage(), AdaptiveETA()] 88 | pbar = progressbar.ProgressBar(widgets = widgets, maxval=sampler.nlive_points) 89 | 90 | i = 0 91 | ndata = multi_sampler.ndata 92 | running = numpy.ones(ndata, dtype=bool) 93 | last_logwidth = numpy.zeros(ndata) 94 | last_logVolremaining = numpy.zeros(ndata) 95 | last_remainderZ = numpy.zeros(ndata) 96 | last_remainderZerr = numpy.zeros(ndata) 97 | logZerr = numpy.zeros(ndata) 98 | ui, xi, Li = next(sampler) 99 | wi = logwidth + Li 100 | logZ = wi 101 | H = Li - logZ 102 | remainder_tails = [[]] * ndata 103 | pbar.currval = i 104 | pbar.start() 105 | while True: 106 | i = i + 1 107 | logwidth = log(1 - exp(-1. / sampler.nlive_points)) + logVolremaining 108 | last_logwidth[running] = logwidth 109 | last_logVolremaining[running] = logwidth 110 | logVolremaining -= 1. 
/ sampler.nlive_points 111 | 112 | # fill up, otherwise set weight to zero 113 | Lifull = numpy.zeros(ndata) 114 | Lifull[:] = -numpy.inf 115 | Lifull[running] = Li 116 | uifull = numpy.zeros((ndata, ui.shape[1])) 117 | uifull[running,:] = ui 118 | xifull = numpy.zeros((ndata, ui.shape[1])) 119 | xifull[running,:] = xi 120 | weights.append([uifull, xifull, Lifull, numpy.where(running, logwidth, -numpy.inf), running]) 121 | 122 | logZerr[running] = (H[running] / sampler.nlive_points)**0.5 123 | 124 | sys.stdout.flush() 125 | pbar.update(i) 126 | 127 | # expected number of iterations: 128 | i_final = -sampler.nlive_points * (-sampler.Lmax + log(exp(numpy.max([tolerance - logZerr[running], logZerr[running] / 100.], axis=0) + logZ[running]) - exp(logZ[running]))) 129 | i_final = numpy.where(i_final < i+1, i+1, numpy.where(i_final > i+100000, i+100000, i_final)) 130 | max_value = max(i+1, i_final.max()) 131 | if hasattr(pbar, 'max_value'): 132 | pbar.max_value = max_value 133 | elif hasattr(pbar, 'maxval'): 134 | pbar.maxval = max_value 135 | 136 | if i > min_samples and i % 50 == 1 or (max_samples and i > max_samples): 137 | remainderZ, remainderZerr, totalZ, totalZerr, totalZerr_bootstrapped = integrate_remainder(sampler, logwidth, logVolremaining, logZ[running], H[running], sampler.Lmax) 138 | print('checking for termination:', remainderZ, remainderZerr, totalZ, totalZerr) 139 | # tolerance 140 | last_remainderZ[running] = remainderZ 141 | last_remainderZerr[running] = remainderZerr 142 | terminating = totalZerr < tolerance 143 | if max_samples and i > max_samples: 144 | terminating[:] = True 145 | widgets[0] = '|%d/%d samples+%d/%d|lnZ = %.2f +- %.3f + %.3f|L=%.2f^%.2f ' % ( 146 | i + 1, max_value, sampler.nlive_points, sampler.ndraws, logaddexp(logZ[running][0], remainderZ[0]), max(logZerr[running]), max(remainderZerr), Li[0], sampler.Lmax[0]) 147 | if terminating.any(): 148 | print('terminating %d, namely:' % terminating.sum(), list(numpy.where(terminating)[0])) 149 | for j, k in enumerate(numpy.where(running)[0]): 150 | if terminating[j]: 151 | remainder_tails[k] = [[ui, xi, Li, logwidth] for ui, xi, Li in sampler.remainder(j)] 152 | sampler.cut_down(~terminating) 153 | running[running] = ~terminating 154 | if not running.any(): 155 | break 156 | print(widgets[0]) 157 | ui, xi, Li = next(sampler) 158 | wi = logwidth + Li 159 | logZnew = logaddexp(logZ[running], wi) 160 | H[running] = exp(wi - logZnew) * Li + exp(logZ[running] - logZnew) * (H[running] + logZ[running]) - logZnew 161 | logZ[running] = logZnew 162 | 163 | # add tail 164 | # not needed for integral, but for posterior samples, otherwise there 165 | # is a hole in the most likely parameter ranges. 
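	# each of the remaining live points is appended below with the final logwidth as its
	# weight (the usual treatment of the surviving live points at termination); the remainder
	# evidence and its error from integrate_remainder() are then folded into logZ and logZerr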
166 | all_tails = numpy.ones(ndata, dtype=bool) 167 | for i in range(sampler.nlive_points): 168 | u, x, L, logwidth = list(zip(*[tail[i] for tail in remainder_tails])) 169 | weights.append([u, x, L, logwidth, all_tails]) 170 | logZerr = logZerr + last_remainderZerr 171 | logZ = logaddexp(logZ, last_remainderZ) 172 | 173 | return dict(logZ=logZ, logZerr=logZerr, 174 | weights=weights, information=H, 175 | niterations=i) 176 | 177 | __all__ = [multi_nested_integrator] 178 | 179 | -------------------------------------------------------------------------------- /pres/mnras_template.tex: -------------------------------------------------------------------------------- 1 | % mnras_template.tex 2 | % 3 | % LaTeX template for creating an MNRAS paper 4 | % 5 | % v3.0 released 14 May 2015 6 | % (version numbers match those of mnras.cls) 7 | % 8 | % Copyright (C) Royal Astronomical Society 2015 9 | % Authors: 10 | % Keith T. Smith (Royal Astronomical Society) 11 | 12 | % Change log 13 | % 14 | % v3.0 May 2015 15 | % Renamed to match the new package name 16 | % Version number matches mnras.cls 17 | % A few minor tweaks to wording 18 | % v1.0 September 2013 19 | % Beta testing only - never publicly released 20 | % First version: a simple (ish) template for creating an MNRAS paper 21 | 22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | % Basic setup. Most papers should leave these options alone. 24 | \documentclass[a4paper,fleqn,usenatbib]{mnras} 25 | 26 | % MNRAS is set in Times font. If you don't have this installed (most LaTeX 27 | % installations will be fine) or prefer the old Computer Modern fonts, comment 28 | % out the following line 29 | \usepackage{newtxtext,newtxmath} 30 | % Depending on your LaTeX fonts installation, you might get better results with one of these: 31 | %\usepackage{mathptmx} 32 | %\usepackage{txfonts} 33 | 34 | % Use vector fonts, so it zooms properly in on-screen viewing software 35 | % Don't change these lines unless you know what you are doing 36 | \usepackage[T1]{fontenc} 37 | \usepackage{ae,aecompl} 38 | 39 | 40 | %%%%% AUTHORS - PLACE YOUR OWN PACKAGES HERE %%%%% 41 | 42 | % Only include extra packages if you really need them. Common packages are: 43 | \usepackage{graphicx} % Including figure files 44 | \usepackage{amsmath} % Advanced maths commands 45 | \usepackage{amssymb} % Extra maths symbols 46 | 47 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 48 | 49 | %%%%% AUTHORS - PLACE YOUR OWN COMMANDS HERE %%%%% 50 | 51 | % Please keep new commands to a minimum, and use \newcommand not \def to avoid 52 | % overwriting existing commands. Example: 53 | %\newcommand{\pcm}{\,cm$^{-2}$} % per cm-squared 54 | 55 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 56 | 57 | %%%%%%%%%%%%%%%%%%% TITLE PAGE %%%%%%%%%%%%%%%%%%% 58 | 59 | % Title of the paper, and the short title which is used in the headers. 60 | % Keep the title short and informative. 61 | \title[Short title, max. 45 characters]{MNRAS \LaTeXe\ template -- title goes here} 62 | 63 | % The list of authors, and the short list which is used in the headers. 64 | % If you need two or more lines of authors, add an extra line using \newauthor 65 | \author[K. T. Smith et al.]{ 66 | Keith T. Smith,$^{1}$\thanks{E-mail: mn@ras.org.uk (KTS)} 67 | A. N. 
Other,$^{2}$ 68 | Third Author$^{2,3}$ 69 | and Fourth Author$^{3}$ 70 | \\ 71 | % List of institutions 72 | $^{1}$Royal Astronomical Society, Burlington House, Piccadilly, London W1J 0BQ, UK\\ 73 | $^{2}$Department, Institution, Street Address, City Postal Code, Country\\ 74 | $^{3}$Another Department, Different Institution, Street Address, City Postal Code, Country 75 | } 76 | 77 | % These dates will be filled out by the publisher 78 | \date{Accepted XXX. Received YYY; in original form ZZZ} 79 | 80 | % Enter the current year, for the copyright statements etc. 81 | \pubyear{2015} 82 | 83 | % Don't change these lines 84 | \begin{document} 85 | \label{firstpage} 86 | \pagerange{\pageref{firstpage}--\pageref{lastpage}} 87 | \maketitle 88 | 89 | % Abstract of the paper 90 | \begin{abstract} 91 | This is a simple template for authors to write new MNRAS papers. 92 | The abstract should briefly describe the aims, methods, and main results of the paper. 93 | It should be a single paragraph not more than 250 words (200 words for Letters). 94 | No references should appear in the abstract. 95 | \end{abstract} 96 | 97 | % Select between one and six entries from the list of approved keywords. 98 | % Don't make up new ones. 99 | \begin{keywords} 100 | keyword1 -- keyword2 -- keyword3 101 | \end{keywords} 102 | 103 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 104 | 105 | %%%%%%%%%%%%%%%%% BODY OF PAPER %%%%%%%%%%%%%%%%%% 106 | 107 | \section{Introduction} 108 | 109 | This is a simple template for authors to write new MNRAS papers. 110 | See \texttt{mnras\_sample.tex} for a more complex example, and \texttt{mnras\_guide.tex} 111 | for a full user guide. 112 | 113 | All papers should start with an Introduction section, which sets the work 114 | in context, cites relevant earlier studies in the field by \citet{Others2013}, 115 | and describes the problem the authors aim to solve \citep[e.g.][]{Author2012}. 116 | 117 | \section{Methods, Observations, Simulations etc.} 118 | 119 | Normally the next section describes the techniques the authors used. 120 | It is frequently split into subsections, such as Section~\ref{sec:maths} below. 121 | 122 | \subsection{Maths} 123 | \label{sec:maths} % used for referring to this section from elsewhere 124 | 125 | Simple mathematics can be inserted into the flow of the text e.g. $2\times3=6$ 126 | or $v=220$\,km\,s$^{-1}$, but more complicated expressions should be entered 127 | as a numbered equation: 128 | 129 | \begin{equation} 130 | x=\frac{-b\pm\sqrt{b^2-4ac}}{2a}. 131 | \label{eq:quadratic} 132 | \end{equation} 133 | 134 | Refer back to them as e.g. equation~(\ref{eq:quadratic}). 135 | 136 | \subsection{Figures and tables} 137 | 138 | Figures and tables should be placed at logical positions in the text. Don't 139 | worry about the exact layout, which will be handled by the publishers. 140 | 141 | Figures are referred to as e.g. Fig.~\ref{fig:example_figure}, and tables as 142 | e.g. Table~\ref{tab:example_table}. 143 | 144 | % Example figure 145 | \begin{figure} 146 | % To include a figure from a file named example.* 147 | % Allowable file formats are eps or ps if compiling using latex 148 | % or pdf, png, jpg if compiling using pdflatex 149 | \includegraphics[width=\columnwidth]{example} 150 | \caption{This is an example figure. Captions appear below each figure. 
151 | Give enough detail for the reader to understand what they're looking at, 152 | but leave detailed discussion to the main body of the text.} 153 | \label{fig:example_figure} 154 | \end{figure} 155 | 156 | % Example table 157 | \begin{table} 158 | \centering 159 | \caption{This is an example table. Captions appear above each table. 160 | Remember to define the quantities, symbols and units used.} 161 | \label{tab:example_table} 162 | \begin{tabular}{lccr} % four columns, alignment for each 163 | \hline 164 | A & B & C & D\\ 165 | \hline 166 | 1 & 2 & 3 & 4\\ 167 | 2 & 4 & 6 & 8\\ 168 | 3 & 5 & 7 & 9\\ 169 | \hline 170 | \end{tabular} 171 | \end{table} 172 | 173 | 174 | \section{Conclusions} 175 | 176 | The last numbered section should briefly summarise what has been done, and describe 177 | the final conclusions which the authors draw from their work. 178 | 179 | \section*{Acknowledgements} 180 | 181 | The Acknowledgements section is not numbered. Here you can thank helpful 182 | colleagues, acknowledge funding agencies, telescopes and facilities used etc. 183 | Try to keep it short. 184 | 185 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 186 | 187 | %%%%%%%%%%%%%%%%%%%% REFERENCES %%%%%%%%%%%%%%%%%% 188 | 189 | % The best way to enter references is to use BibTeX: 190 | 191 | %\bibliographystyle{mnras} 192 | %\bibliography{example} % if your bibtex file is called example.bib 193 | 194 | 195 | % Alternatively you could enter them by hand, like this: 196 | % This method is tedious and prone to error if you have lots of references 197 | \begin{thebibliography}{99} 198 | \bibitem[\protect\citeauthoryear{Author}{2012}]{Author2012} 199 | Author A.~N., 2013, Journal of Improbable Astronomy, 1, 1 200 | \bibitem[\protect\citeauthoryear{Others}{2013}]{Others2013} 201 | Others S., 2012, Journal of Interesting Stuff, 17, 198 202 | \end{thebibliography} 203 | 204 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 205 | 206 | %%%%%%%%%%%%%%%%% APPENDICES %%%%%%%%%%%%%%%%%%%%% 207 | 208 | \appendix 209 | 210 | \section{Some extra material} 211 | 212 | If you want to present additional material which would interrupt the flow of the main paper, 213 | it can be placed in an Appendix which appears after the list of references. 214 | 215 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 216 | 217 | 218 | % Don't change these lines 219 | \bsp % typesetting comment 220 | \label{lastpage} 221 | \end{document} 222 | 223 | % End of mnras_template.tex -------------------------------------------------------------------------------- /hiermetriclearn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | """ 3 | 4 | Implementation of RadFriends 5 | https://arxiv.org/abs/1407.5459 6 | Uses standardised euclidean distance, which makes it fast. 7 | 8 | Copyright (c) 2017 Johannes Buchner 9 | 10 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 11 | 12 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 13 | 14 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 17 | 18 | 19 | 20 | """ 21 | 22 | import numpy 23 | import scipy.spatial, scipy.cluster 24 | import matplotlib.pyplot as plt 25 | from clustering.neighbors import find_rdistance, is_within_distance_of, count_within_distance_of, any_within_distance_of 26 | from clustering.sdml import IdentityMetric, SimpleScaling, TruncatedScaling 27 | from collections import defaultdict 28 | from clustering.radfriendsregion import ClusterResult, RadFriendsRegion 29 | 30 | class MetricLearningFriendsConstrainer(object): 31 | def __init__(self, metriclearner, rebuild_every = 50, metric_rebuild_every = 50, verbose = False, 32 | keep_phantom_points=False, optimize_phantom_points=False, 33 | force_shrink=False): 34 | self.iter_since_metric_rebuild = 0 35 | self.ndraws_since_rebuild = 0 36 | self.region = None 37 | self.rebuild_every = int(rebuild_every) 38 | self.metric_rebuild_every = int(metric_rebuild_every) 39 | self.verbose = verbose 40 | self.force_shrink = force_shrink 41 | self.metriclearner = metriclearner 42 | self.metric = IdentityMetric() 43 | self.clusters = None 44 | self.direct_draws_efficient = True 45 | self.last_cluster_points = None 46 | self.prev_maxdistance = None 47 | 48 | def cluster(self, u, ndim, keepMetric=False): 49 | w = self.metric.transform(u) 50 | prev_region = self.region 51 | if keepMetric: 52 | self.region = RadFriendsRegion(members=w) 53 | if self.force_shrink and self.region.maxdistance > self.prev_maxdistance: 54 | self.region = RadFriendsRegion(members=w, maxdistance=self.prev_maxdistance) 55 | self.prev_maxdistance = self.region.maxdistance 56 | print('keeping metric, not reclustering.') 57 | return 58 | 59 | metric_updated = False 60 | clustermetric = self.metric 61 | print('computing distances for clustering...') 62 | # Overlay all clusters (shift by cluster mean) 63 | print('Metric update ...') 64 | cluster_mean = numpy.mean(u, axis=0) 65 | shifted_cluster_members = u - cluster_mean 66 | 67 | # Using original points and new metric, compute RadFriends bootstrapped distance and store 68 | if self.metriclearner == 'none': 69 | metric = self.metric # stay with identity matrix 70 | metric_updated = False 71 | elif self.metriclearner == 'simplescaling': 72 | metric = SimpleScaling() 73 | metric.fit(shifted_cluster_members) 74 | metric_updated = True 75 | elif self.metriclearner == 'truncatedscaling': 76 | metric = TruncatedScaling() 77 | metric.fit(shifted_cluster_members) 78 | metric_updated = self.metric == IdentityMetric() or not numpy.all(self.metric.scale == metric.scale) 79 | else: 80 | assert False, self.metriclearner 81 | 82 | self.metric = metric 83 | 84 | wnew = self.metric.transform(u) 85 | print('Region update ...') 86 | 87 | self.region = RadFriendsRegion(members=wnew) #, maxdistance=shifted_region.maxdistance) 
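		# force_shrink (next block): if the metric was not updated, the new RadFriends radius is
		# not allowed to exceed the previous region's maxdistance, which guards against occasional
		# over-estimates of the bootstrapped distance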
88 | if not metric_updated and self.force_shrink and self.prev_maxdistance is not None: 89 | if self.region.maxdistance > self.prev_maxdistance: 90 | self.region = RadFriendsRegion(members=w, maxdistance=self.prev_maxdistance) 91 | self.prev_maxdistance = self.region.maxdistance 92 | print('done.') 93 | 94 | def are_inside_cluster(self, points): 95 | w = self.metric.transform(points) 96 | return self.region.are_inside(w) 97 | 98 | def is_inside(self, point): 99 | if not ((point >= 0).all() and (point <= 1).all()): 100 | return False 101 | w = self.metric.transform(point) 102 | return self.region.is_inside(w) 103 | 104 | def generate(self, ndim): 105 | ntotal = 0 106 | N = 10000 107 | while True: 108 | #if numpy.random.uniform() < 0.01: 109 | if ndim < 40: 110 | # draw from radfriends directly 111 | for ws, n in self.region.generate(N): 112 | us = self.metric.untransform(ws) 113 | assert us.shape[1] == ndim, us.shape 114 | ntotal = ntotal + n 115 | mask = numpy.logical_and(us < 1, us > 0).all(axis=1) 116 | assert mask.shape == (len(us),), (mask.shape, us.shape) 117 | if mask.any(): 118 | #print 'radfriends draw in unit cube:', mask.sum(), ntotal 119 | for u in us[mask,:]: 120 | assert u.shape == (us[0].shape), (u.shape, us.shape, mask.shape) 121 | yield u, ntotal 122 | ntotal = 0 123 | #if all([0 <= ui <= 1 for ui in u]): 124 | # yield u, ntotal 125 | # ntotal = 0 126 | if numpy.random.uniform() < 0.1: 127 | # draw from unit cube 128 | # this can be efficient if volume still large 129 | ntotal = ntotal + N 130 | us = numpy.random.uniform(size=(N, ndim)) 131 | ws = self.metric.transform(us) 132 | nnear = self.region.are_inside(ws) 133 | #print ' %d of %d accepted' % (nnear.sum(), N) 134 | for u in us[nnear,:]: 135 | #print 'unit cube draw success:', ntotal 136 | yield u, ntotal 137 | ntotal = 0 138 | 139 | def rebuild(self, u, ndim, keepMetric=False): 140 | if self.last_cluster_points is not None and \ 141 | len(self.last_cluster_points) == len(u) and \ 142 | numpy.all(self.last_cluster_points == u): 143 | # do nothing if everything stayed the same 144 | return 145 | 146 | self.cluster(u=u, ndim=ndim, keepMetric=keepMetric) 147 | self.last_cluster_points = u 148 | 149 | print('maxdistance:', self.region.maxdistance) 150 | self.generator = self.generate(ndim) 151 | 152 | def _draw_constrained_prepare(self, Lmins, priortransform, loglikelihood, live_pointsu, ndim, **kwargs): 153 | rebuild = self.ndraws_since_rebuild > self.rebuild_every or self.region is None 154 | rebuild_metric = self.iter_since_metric_rebuild > self.metric_rebuild_every 155 | keepMetric = not rebuild_metric 156 | if rebuild: 157 | print('rebuild triggered at call') 158 | self.rebuild(numpy.asarray(live_pointsu), ndim, keepMetric=keepMetric) 159 | self.ndraws_since_rebuild = 0 160 | if rebuild_metric: 161 | self.iter_since_metric_rebuild = 0 162 | else: 163 | #print 'no rebuild: %d %d' % (self.iter_since_metric_rebuild, self.ndraws_since_rebuild) 164 | rebuild_metric = False 165 | assert self.generator is not None 166 | return rebuild, rebuild_metric 167 | 168 | def get_Lmax(self): 169 | if len(self.phantom_points_Ls) == 0: 170 | return None 171 | return max(self.phantom_points_Ls) 172 | 173 | def draw_constrained(self, Lmins, priortransform, loglikelihood, live_pointsu, ndim, **kwargs): 174 | ntoaccept = 0 175 | ntotalsum = 0 176 | self.iter_since_metric_rebuild += 1 177 | #print 'MLFriends trying to replace', Lmins 178 | rebuild, rebuild_metric = self._draw_constrained_prepare(Lmins, priortransform, loglikelihood, 
live_pointsu, ndim, **kwargs) 179 | while True: 180 | #print ' starting generator ...' 181 | for u, ntotal in self.generator: 182 | assert (u >= 0).all() and (u <= 1).all(), u 183 | ntotalsum += ntotal 184 | x = priortransform(u) 185 | L = loglikelihood(x) 186 | ntoaccept += 1 187 | self.ndraws_since_rebuild += 1 188 | 189 | #print 'ntotal:', ntotal 190 | if ntotal > 100000: 191 | self.direct_draws_efficient = False 192 | 193 | if numpy.any(L > Lmins): 194 | # yay, we win 195 | #print 'accept after %d tries' % ntoaccept 196 | return u, x, L, ntoaccept 197 | 198 | # if running very inefficient, optimize clustering 199 | # if we haven't done so at the start 200 | if not rebuild and self.ndraws_since_rebuild > self.rebuild_every: 201 | rebuild = True 202 | print('RadFriends rebuild triggered after %d draws' % self.ndraws_since_rebuild) 203 | self.rebuild(numpy.asarray(live_pointsu), ndim, keepMetric=True) 204 | self.ndraws_since_rebuild = 0 205 | break 206 | if not rebuild_metric and ntoaccept > 200: 207 | rebuild_metric = True 208 | print('RadFriends metric rebuild triggered after %d draws' % self.ndraws_since_rebuild) 209 | self.rebuild(numpy.asarray(live_pointsu), ndim, keepMetric=False) 210 | self.iter_since_metric_rebuild = 0 211 | break 212 | 213 | -------------------------------------------------------------------------------- /sample.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | """ 3 | 4 | Main program 5 | --------------- 6 | 7 | Copyright (c) 2017 Johannes Buchner 8 | 9 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
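Typical invocation (a sketch: the script reads an HDF5 file containing 'x' and 'y'
datasets as its first argument, the number of data sets to analyse jointly as its
second, and the environment variables listed further below are optional)::

    CONSTRAINER=MLFRIENDS NLIVE_POINTS=400 python sample.py <datafile.hdf5> <ndata>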
16 | 17 | """ 18 | 19 | import numpy 20 | from numpy import exp 21 | import h5py 22 | import sys 23 | import json 24 | import os 25 | import time 26 | 27 | print('loading data...') 28 | ndata = int(sys.argv[2]) 29 | with h5py.File(sys.argv[1], 'r') as f: 30 | x = numpy.array(f['x'].value) 31 | y = numpy.array(f['y'][:,:ndata]) 32 | 33 | 34 | """ 35 | 36 | Definition of the problem 37 | - parameter space (here: 3d) 38 | - likelihood function which consists of 39 | - model function ("slow predicting function") 40 | - data comparison 41 | 42 | """ 43 | 44 | nx, ndata = y.shape 45 | noise_level = 0.01 46 | params = ['A', 'mu', 'sig'] #, 'noise_level'] 47 | nparams = len(params) 48 | 49 | def gauss(x, z, A, mu, sig): 50 | return A * exp(-0.5 * ((mu - x / (1. + z))/sig)**2) 51 | 52 | def priortransform(cube): 53 | # definition of the parameter width, by transforming from a unit cube 54 | cube = cube.copy() 55 | cube[0] = 10**(cube[0] * 2 - 2) 56 | cube[1] = cube[1] * 400 + 400 57 | cube[2] = cube[2] * 2 58 | return cube 59 | 60 | # the following is a python-only implementation of the likelihood 61 | # @ params are the parameters (as transformed by priortransform) 62 | # @ data_mask is which data sets to consider. 63 | # returns a likelihood vector 64 | def multi_loglikelihood(params, data_mask): 65 | A, mu, log_sig_kms = params 66 | # predict the model 67 | sig = 10**log_sig_kms 68 | ypred = A * exp(-0.5 * ((mu - x)/sig)**2) 69 | # do the data comparison 70 | L = -0.5 * (((ypred.reshape((-1,1)) - y[:,data_mask])/noise_level)**2).sum(axis=0) 71 | return L 72 | 73 | #print multi_loglikelihood([0.88091237, 444.44207558, 2.77671952], numpy.ones(ndata)==1) 74 | #print multi_loglikelihood([1.65758829e-01, 4.45518543e+02, 3.25894638e+00], numpy.ones(ndata)==1) 75 | #print multi_loglikelihood([0.95572931, 443.99407818, 2.95764509], numpy.ones(ndata)==1) 76 | 77 | # The following is a C implementation of the likelihood 78 | from ctypes import * 79 | from numpy.ctypeslib import ndpointer 80 | 81 | if int(os.environ.get('OMP_NUM_THREADS', '1')) > 1 and False: # does not work correctly yet 82 | lib = cdll.LoadLibrary('./clike-parallel.so') 83 | else: 84 | lib = cdll.LoadLibrary('./clike.so') 85 | lib.like.argtypes = [ 86 | ndpointer(dtype=numpy.float64, ndim=1, flags='C_CONTIGUOUS'), 87 | ndpointer(dtype=numpy.float64, ndim=2, flags='C_CONTIGUOUS'), 88 | c_int, 89 | c_int, 90 | c_double, 91 | c_double, 92 | c_double, 93 | c_double, 94 | ndpointer(dtype=numpy.bool, ndim=1, flags='C_CONTIGUOUS'), 95 | ndpointer(dtype=numpy.float64, ndim=1, flags='C_CONTIGUOUS'), 96 | ] 97 | 98 | # @ params are the parameters (as transformed by priortransform) 99 | # @ data_mask is which data sets to consider. 
100 | # returns a likelihood vector 101 | def multi_loglikelihood(params, data_mask): 102 | A, mu, log_sig_kms = params 103 | sig = 10**log_sig_kms 104 | Lout = numpy.zeros(data_mask.sum()) 105 | # do everything in C and return the resulting likelihood vector 106 | ret = lib.like(x, y, ndata, nx, A, mu, sig, noise_level, data_mask, Lout) 107 | #assert numpy.isfinite(Lout).all(), (Lout, params) 108 | return -0.5 * Lout 109 | 110 | #print multi_loglikelihood([0.88091237, 444.44207558, 2.77671952], numpy.ones(ndata)==1) 111 | #print multi_loglikelihood([1.65758829e-01, 4.45518543e+02, 3.25894638e+00], numpy.ones(ndata)==1) 112 | #print multi_loglikelihood([0.95572931, 443.99407818, 2.95764509], numpy.ones(ndata)==1) 113 | 114 | """ 115 | 116 | After defining the problem, we use generic code to set up 117 | - Nested Sampling (Multi)Integrator 118 | - Our special sampler 119 | - RadFriends (constrained region draw) 120 | 121 | We start with the latter. 122 | """ 123 | 124 | 125 | from multi_nested_integrator import multi_nested_integrator 126 | from multi_nested_sampler import MultiNestedSampler 127 | 128 | import cachedconstrainer 129 | from cachedconstrainer import CachedConstrainer, generate_individual_constrainer, generate_superset_constrainer, MultiEllipsoidalConstrainer, MetricLearningFriendsConstrainer, generate_fresh_constrainer 130 | 131 | constrainer_type = os.environ.get('CONSTRAINER', 'MLFRIENDS') 132 | if constrainer_type == 'MLFRIENDS': 133 | def generate_fresh_constrainer(): 134 | return MetricLearningFriendsConstrainer( 135 | metriclearner = 'truncatedscaling', force_shrink=True, 136 | rebuild_every=1000, metric_rebuild_every=20, 137 | verbose=False) 138 | 139 | superset_constrainer = MetricLearningFriendsConstrainer( 140 | metriclearner = 'truncatedscaling', force_shrink=True, 141 | rebuild_every=1000, metric_rebuild_every=20, 142 | verbose=False) 143 | elif constrainer_type == 'MULTIELLIPSOIDS': 144 | def generate_fresh_constrainer(): 145 | return MultiEllipsoidalConstrainer(rebuild_every=1000) 146 | 147 | superset_constrainer = generate_fresh_constrainer() 148 | elif constrainer_type == 'SLICE': 149 | #from whitenedmcmc import FilteredMCMCConstrainer, HybridMLMultiEllipsoidConstrainer 150 | from whitenedmcmc import SliceConstrainer, FilteredMahalanobisHARMProposal, FilteredUnitIterateSliceProposal 151 | def generate_fresh_constrainer(): 152 | return SliceConstrainer(proposer=FilteredUnitIterateSliceProposal(), nsteps=nparams*5) 153 | superset_constrainer = generate_fresh_constrainer() 154 | else: 155 | assert False, constrainer_type 156 | 157 | cachedconstrainer.generate_fresh_constrainer = generate_fresh_constrainer 158 | 159 | cc = CachedConstrainer() 160 | focusset_constrainer = cc.get 161 | _, _, individual_draw_constrained = generate_individual_constrainer() 162 | numpy.random.seed(1) 163 | start_time = time.time() 164 | print('setting up integrator ...') 165 | nlive_points = int(os.environ.get('NLIVE_POINTS','400')) 166 | 167 | # constrained region draw functions 168 | # we try hard to keep information about current regions and subselected regions 169 | # because recomputing the regions is expensive if the likelihood is very fast. 
170 | # There are three constrainers: 171 | # - the one of the superset (all data sets) 172 | # - one for each data set if need a individual draw (focussed draw with only one) 173 | # - a memory for recent clusterings, because they might recur in the next iteration(s) 174 | # Note that this does caching not improve the algorithms efficiency 175 | # in fact, not recomputing regions keeps the regions larger, 176 | # leading potentially to slightly more rejections. 177 | # However, there is substantial execution speedup. 178 | 179 | 180 | # now set up sampler and pass the three constrainers 181 | 182 | sampler = MultiNestedSampler(nlive_points = nlive_points, 183 | priortransform=priortransform, multi_loglikelihood=multi_loglikelihood, 184 | ndim=nparams, ndata=ndata, 185 | superset_draw_constrained = superset_constrainer.draw_constrained, 186 | individual_draw_constrained = individual_draw_constrained, 187 | draw_constrained = focusset_constrainer, 188 | nsuperset_draws = int(os.environ.get('SUPERSET_DRAWS', '10')), 189 | use_graph = os.environ.get('USE_GRAPH', '1') == '1' 190 | ) 191 | 192 | superset_constrainer.sampler = sampler 193 | cc.sampler = sampler 194 | print('integrating ...') 195 | max_samples = int(os.environ.get('MAXSAMPLES', 0)) 196 | min_samples = int(os.environ.get('MINSAMPLES', 0)) 197 | results = multi_nested_integrator(tolerance=0.5, multi_sampler=sampler, min_samples=min_samples, max_samples=max_samples) 198 | duration = time.time() - start_time 199 | print('writing output files ...') 200 | prefix = '%s_%s_nlive%d_%d.out8' % (sys.argv[1], constrainer_type, nlive_points, ndata) 201 | # store results 202 | with h5py.File(prefix + '.hdf5', 'w') as f: 203 | f.create_dataset('logZ', data=results['logZ'], compression='gzip', shuffle=True) 204 | f.create_dataset('logZerr', data=results['logZerr'], compression='gzip', shuffle=True) 205 | u, x, L, w, mask = list(zip(*results['weights'])) 206 | f.create_dataset('u', data=u, compression='gzip', shuffle=True) 207 | f.create_dataset('x', data=x, compression='gzip', shuffle=True) 208 | f.create_dataset('L', data=L, compression='gzip', shuffle=True) 209 | f.create_dataset('w', data=w, compression='gzip', shuffle=True) 210 | f.create_dataset('mask', data=mask, compression='gzip', shuffle=True) 211 | f.create_dataset('ndraws', data=sampler.ndraws) 212 | print('logZ = %.1f +- %.1f' % (results['logZ'][0], results['logZerr'][0])) 213 | print('ndraws:', sampler.ndraws, 'niter:', len(w)) 214 | 215 | print('writing statistic ...') 216 | json.dump(dict(ndraws=sampler.ndraws, duration=duration, ndata=ndata, niter=len(w)), 217 | open(prefix + '.stats.json', 'w'), indent=4) 218 | print('done.') 219 | 220 | 221 | -------------------------------------------------------------------------------- /clustering/neighbors.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | """ 3 | 4 | Neighbourhood helper functions 5 | ------------------------------- 6 | 7 | Copyright (c) 2017 Johannes Buchner 8 | 9 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 12 | 13 | 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 16 | 17 | """ 18 | 19 | import numpy 20 | import scipy.spatial 21 | 22 | def initial_maxdistance_guess(u): 23 | n = len(u) 24 | distances = scipy.spatial.distance.cdist(u, u) 25 | nearest = [distances[i,:].argsort()[1] for i in range(n)] 26 | nearest = [numpy.abs(u[k,:] - u[i,:]) for i, k in enumerate(nearest)] 27 | # compute distance maximum 28 | maxdistance = numpy.max(nearest, axis=0) 29 | return maxdistance 30 | 31 | def update_maxdistance(u, ibootstrap, maxdistance, verbose = False): 32 | n, ndim = u.shape 33 | 34 | # bootstrap to find smallest maxdistance which includes 35 | # all points 36 | choice = list(set(numpy.random.choice(numpy.arange(n), size=n))) 37 | notchosen = set(range(n)) - set(choice) 38 | # check if included with our starting criterion 39 | for i in notchosen: 40 | dists = numpy.abs(u[i,:] - u[choice,:]) 41 | close = numpy.all(dists < maxdistance.reshape((1,-1)), axis=1) 42 | assert close.shape == (len(choice),), (close.shape, len(choice)) 43 | # find the point where we have to increase the least 44 | if not close.any(): 45 | # compute maxdists -- we already did that 46 | # compute extension to maxdistance 47 | #maxdistance_suggest = [numpy.max([maxdistance, d], axis=0) for d in dists] 48 | maxdistance_suggest = numpy.where(maxdistance > dists, dists, maxdistance) 49 | assert maxdistance_suggest.shape == (len(dists), ndim) 50 | # compute volume increase in comparison to maxdistance 51 | #increase = [(numpy.log(m) - numpy.log(maxdistance)).sum() for m in maxdistance_suggest] 52 | increase = numpy.log(maxdistance_suggest).sum(axis=1) - numpy.log(maxdistance).sum() 53 | 54 | # choose smallest 55 | nearest = numpy.argmin(increase) 56 | if verbose: print(ibootstrap, 'nearest:', u[i], u[nearest], increase[nearest]) 57 | # update maxdistance 58 | maxdistance = numpy.where(dists[nearest] > maxdistance, dists[nearest], maxdistance) 59 | if verbose: print(ibootstrap, 'extending:', maxdistance) 60 | else: 61 | # we got this one, everything is fine 62 | pass 63 | return maxdistance 64 | 65 | def find_maxdistance(u, verbose=False, nbootstraps=15): 66 | # find nearest point for every point 67 | if verbose: print('finding nearest neighbors:') 68 | maxdistance = initial_maxdistance_guess(u) 69 | #maxdistance = numpy.zeros(ndim) 70 | if verbose: print('initial:', maxdistance) 71 | for ibootstrap in range(nbootstraps): 72 | maxdistance = update_maxdistance(u, ibootstrap, maxdistance, verbose=verbose) 73 | return maxdistance 74 | 75 | def is_within_distance_of(members, maxdistance, u, metric='euclidean'): 76 | dists = 
scipy.spatial.distance.cdist(members, us, metric=metric) 77 | return (dists < maxdistance).any() 78 | 79 | def count_within_distance_of(members, maxdistance, us, metric='euclidean'): 80 | dists = scipy.spatial.distance.cdist(members, us, metric=metric) 81 | return (dists < maxdistance).sum(axis=0) 82 | 83 | def any_within_distance_of(members, maxdistance, us, metric='euclidean'): 84 | dists = scipy.spatial.distance.cdist(members, us, metric=metric) 85 | return (dists < maxdistance).any(axis=0) 86 | 87 | most_distant_nearest_neighbor = None 88 | bootstrapped_maxdistance = None 89 | try: 90 | import os 91 | from ctypes import * 92 | from numpy.ctypeslib import ndpointer 93 | 94 | if int(os.environ.get('OMP_NUM_THREADS', '1')) > 1: 95 | libname = 'cneighbors-parallel.so' 96 | else: 97 | libname = 'cneighbors.so' 98 | libfilename = os.path.join(os.path.dirname(os.path.abspath(__file__)), libname) 99 | lib = cdll.LoadLibrary(libfilename) 100 | lib.most_distant_nearest_neighbor.argtypes = [ 101 | ndpointer(dtype=numpy.float64, ndim=2, flags='C_CONTIGUOUS'), 102 | c_int, 103 | c_int, 104 | ] 105 | lib.most_distant_nearest_neighbor.restype = c_double 106 | 107 | def most_distant_nearest_neighbor(xx): 108 | i, m = xx.shape 109 | r = lib.most_distant_nearest_neighbor(xx, i, m) 110 | return r 111 | 112 | lib.is_within_distance_of.argtypes = [ 113 | ndpointer(dtype=numpy.float64, ndim=2, flags='C_CONTIGUOUS'), 114 | c_int, 115 | c_int, 116 | c_double, 117 | ndpointer(dtype=numpy.float64, ndim=1, flags='C_CONTIGUOUS'), 118 | ] 119 | lib.is_within_distance_of.restype = c_int 120 | 121 | def is_within_distance_of(xx, maxdistance, y): 122 | i, m = xx.shape 123 | r = lib.is_within_distance_of(xx, i, m, maxdistance, y) 124 | return r == 1 125 | 126 | lib.count_within_distance_of.argtypes = [ 127 | ndpointer(dtype=numpy.float64, ndim=2, flags='C_CONTIGUOUS'), 128 | c_int, 129 | c_int, 130 | c_double, 131 | ndpointer(dtype=numpy.float64, ndim=2, flags='C_CONTIGUOUS'), 132 | c_int, 133 | ndpointer(dtype=numpy.float64, ndim=1, flags='C_CONTIGUOUS'), 134 | c_int, 135 | ] 136 | 137 | def count_within_distance_of(xx, maxdistance, yy): 138 | i, m = xx.shape 139 | j = len(yy) 140 | counts = numpy.zeros(len(yy)) 141 | r = lib.count_within_distance_of(xx, i, m, maxdistance, yy, j, counts, 0) 142 | counts = counts.astype(int) 143 | # check 144 | #dists = scipy.spatial.distance.cdist(xx, yy, metric='euclidean') 145 | #counts_true = (dists < maxdistance).sum(axis=0) 146 | #assert (counts == counts_true).all(), (counts, counts_true) 147 | return counts 148 | 149 | def any_within_distance_of(xx, maxdistance, yy): 150 | i, m = xx.shape 151 | j = len(yy) 152 | counts = numpy.zeros(len(yy)) 153 | r = lib.count_within_distance_of(xx, i, m, maxdistance, yy, j, counts, 1) 154 | counts = counts > 0 155 | # check 156 | #dists = scipy.spatial.distance.cdist(xx, yy, metric='euclidean') 157 | #counts_true = (dists < maxdistance).any(axis=0) 158 | #assert (counts == counts_true).all(), (counts, counts_true) 159 | return counts 160 | 161 | lib.bootstrapped_maxdistance.argtypes = [ 162 | ndpointer(dtype=numpy.float64, ndim=2, flags='C_CONTIGUOUS'), 163 | c_int, 164 | c_int, 165 | ndpointer(dtype=numpy.float64, ndim=2, flags='C_CONTIGUOUS'), 166 | c_int, 167 | ] 168 | lib.bootstrapped_maxdistance.restype = c_double 169 | 170 | def bootstrapped_maxdistance(xx, nbootstraps): 171 | nsamples, ndim = xx.shape 172 | chosen = numpy.zeros((nsamples, nbootstraps)) 173 | for b in range(nbootstraps): 174 | 
chosen[numpy.random.choice(numpy.arange(nsamples), size=nsamples, replace=True),b] = 1. 175 | 176 | maxdistance = lib.bootstrapped_maxdistance(xx, nsamples, ndim, chosen, nbootstraps) 177 | return maxdistance 178 | 179 | except ImportError as e: 180 | print('Using slow, high-memory neighborhood function nearest_rdistance_guess because import failed:', e) 181 | except Exception as e: 182 | print('Using slow, high-memory neighborhood function nearest_rdistance_guess because:', e) 183 | 184 | 185 | def nearest_rdistance_guess(u, metric='euclidean'): 186 | if metric == 'euclidean' and most_distant_nearest_neighbor is not None: 187 | return most_distant_nearest_neighbor(u) 188 | n = len(u) 189 | distances = scipy.spatial.distance.cdist(u, u, metric=metric) 190 | numpy.fill_diagonal(distances, 1e300) 191 | nearest_neighbor_distance = numpy.min(distances, axis = 1) 192 | rdistance = numpy.max(nearest_neighbor_distance) 193 | #print 'distance to nearest:', rdistance, nearest_neighbor_distance 194 | return rdistance 195 | 196 | def initial_rdistance_guess(u, metric='euclidean', k = 10): 197 | n = len(u) 198 | distances = scipy.spatial.distance.cdist(u, u, metric=metric) 199 | if k == 1: 200 | # numpy.diag(distances) 201 | # nearest = [distances[i,:])[1:k] for i in range(n)] 202 | distances2 = distances + numpy.diag(1e100 * numpy.ones(len(distances))) 203 | nearest = distances2.min(axis=0) 204 | else: 205 | assert False, k 206 | nearest = [numpy.sort(distances[i,:])[1:k+1] for i in range(n)] 207 | # compute distance maximum 208 | rdistance = numpy.max(nearest) 209 | return rdistance 210 | 211 | def update_rdistance(u, ibootstrap, rdistance, verbose = False, metric='euclidean'): 212 | n, ndim = u.shape 213 | 214 | # bootstrap to find smallest rdistance which includes 215 | # all points 216 | choice = set(numpy.random.choice(numpy.arange(n), size=n)) 217 | mask = numpy.array([c in choice for c in numpy.arange(n)]) 218 | 219 | distances = scipy.spatial.distance.cdist(u[mask], u[-mask], metric=metric) 220 | assert distances.shape == (mask.sum(), (-mask).sum()) 221 | nearest_distance_to_members = distances.min(axis=0) 222 | if verbose: 223 | print('nearest distances:', nearest_distance_to_members.max(), nearest_distance_to_members) 224 | newrdistance = max(rdistance, nearest_distance_to_members.max()) 225 | if newrdistance > rdistance and verbose: 226 | print(ibootstrap, 'extending:', newrdistance) 227 | return newrdistance 228 | 229 | def find_rdistance(u, verbose=False, nbootstraps=15, metric='euclidean'): 230 | if metric == 'euclidean' and bootstrapped_maxdistance is not None: 231 | return bootstrapped_maxdistance(u, nbootstraps) 232 | # find nearest point for every point 233 | if verbose: print('finding nearest neighbors:') 234 | rdistance = 0 #initial_rdistance_guess(u) 235 | if verbose: print('initial:', rdistance) 236 | for ibootstrap in range(nbootstraps): 237 | rdistance = update_rdistance(u, ibootstrap, rdistance, verbose=verbose, metric=metric) 238 | return rdistance 239 | 240 | if __name__ == '__main__': 241 | nbootstraps = 10 242 | numpy.random.seed(1) 243 | u = numpy.random.uniform(size=(200,2)) 244 | for i in range(100): 245 | numpy.random.seed(i) 246 | a = bootstrapped_maxdistance(u, nbootstraps) 247 | numpy.random.seed(i) 248 | b = find_rdistance(u, nbootstraps=nbootstraps, metric='euclidean', verbose=False) 249 | print(a, b) 250 | assert numpy.allclose(a, b) 251 | 252 | -------------------------------------------------------------------------------- /whitenedmcmc.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | 3 | Implementation of MultiEllipsoidal sampling via nestle 4 | 5 | Copyright (c) 2017 Johannes Buchner 6 | 7 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | 15 | 16 | 17 | """ 18 | 19 | import numpy 20 | from numpy import exp, log, log10, pi 21 | from nestle import bounding_ellipsoid, bounding_ellipsoids, sample_ellipsoids 22 | from collections import defaultdict 23 | import scipy.spatial, scipy.cluster 24 | import matplotlib.pyplot as plt 25 | import numpy 26 | from numpy import exp, log, log10, pi, cos, sin 27 | from nestle import bounding_ellipsoid, bounding_ellipsoids, sample_ellipsoids 28 | 29 | def is_inside_unit_filter(u): 30 | return numpy.all(u >= 0) and numpy.all(u <= 1) 31 | 32 | 33 | class BaseProposal(object): 34 | """ 35 | Base class for proposal function. 36 | 37 | :param scale: Scale of proposal 38 | :param adapt: Adaptation rule to use for scale, when new_chain is called. 39 | 40 | If adapt is False, no adaptation is done. If adapt is 'Sivia', the rule 41 | of Sivia & Skilling (2006) is used. If adapt is something else, 42 | a crude thresholding adaptation is used to gain ~50% acceptance. 43 | """ 44 | def __init__(self, adapt = False, scale = 1.): 45 | self.accepts = [] 46 | self.adapt = adapt 47 | self.scale = scale 48 | """ 49 | Proposal function (to be overwritten) 50 | """ 51 | def propose(self, u, ndim, live_pointsu=None, is_inside_filter=None): 52 | return u 53 | """ 54 | Reset accept counters and adapt proposal (if activated). 55 | """ 56 | def new_chain(self, live_pointsu=None, is_inside_filter=None): 57 | if self.adapt and len(self.accepts) > 0: 58 | # adjust future scale based on acceptance rate 59 | m = numpy.mean(self.accepts) 60 | assert 0 <= m <= 1, (m, self.accepts) 61 | if self.adapt == 'sivia': 62 | if m > 0.5: self.scale *= exp(1./numpy.sum(self.accepts)) 63 | else: self.scale /= exp(1./(len(self.accepts) - numpy.sum(self.accepts))) 64 | elif self.adapt == 'sivia-neg-binom': 65 | # negative binomial rate estimator 66 | m = (sum(self.accepts) - 1) / (len(self.accepts) - 1.) 
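            # (k-1)/(n-1) is the standard unbiased estimate of a success
            # probability when sampling continues until a success (negative
            # binomial stopping). The update below then applies the same
            # Sivia & Skilling rule as the 'sivia' branch above: grow the scale
            # by exp(1/naccepted) if more than half were accepted, otherwise
            # shrink it by exp(1/nrejected).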
67 | if m > 0.5: self.scale *= exp(1./numpy.sum(self.accepts)) 68 | else: self.scale /= exp(1./(len(self.accepts) - numpy.sum(self.accepts))) 69 | elif self.adapt == 'step': 70 | #print 'adaptation:', m 71 | if m <= 0.1: 72 | self.scale /= 1.1 73 | elif m <= 0.3: 74 | self.scale /= 1.01 75 | elif m >= 0.9: 76 | self.scale *= 1.1 77 | elif m >= 0.7: 78 | self.scale *= 1.01 79 | else: 80 | assert False, self.adapt 81 | assert numpy.all(numpy.isfinite(self.scale)), self.scale 82 | self.accepts = [] 83 | 84 | """ 85 | Add a point to the record. 86 | :param accepted: True if accepted, False if rejected. 87 | """ 88 | def accept(self, accepted): 89 | self.accepts.append(accepted) 90 | 91 | """ 92 | Print some stats on the acceptance rate 93 | """ 94 | def stats(self): 95 | print('Proposal %s stats: %.2f%% accepts' % (repr(self), 96 | numpy.mean(self.accepts) * 100.)) 97 | 98 | class MultiScaleProposal(BaseProposal): 99 | """Proposal over multiple scales, inspired by DNest. 100 | Uses the formula 101 | 102 | :math:`x + n * 10^{l - s * u}` 103 | 104 | where l is the location, s is the scale and u is a uniform variate, 105 | and n is a normal variate. 106 | 107 | @see MultiScaleProposal 108 | """ 109 | def __init__(self, loc = -4.5, scale=1.5, adapt=False): 110 | # 10**(1.5 - 6 * u) (inspired by DNest) 111 | # a + (b - a) * u 112 | # a = 1.5, b = -4.5 113 | # a should increase for larger scales, decrease for smaller 114 | 115 | self.loc = loc 116 | BaseProposal.__init__(self, adapt=adapt, scale=scale) 117 | def __repr__(self): 118 | return 'MultiScaleProposal(loc=%s, scale=%s, adapt=%s)' % (self.loc, self.scale, self.adapt) 119 | def propose(self, u, ndim, live_pointsu=None, is_inside_filter=None): 120 | p = u + numpy.random.normal() * 10**(self.scale + (self.loc - self.scale) * numpy.random.uniform()) 121 | p[p > 1] = 1 122 | p[p < 0] = 0 123 | #p = p - numpy.floor(p) 124 | return p 125 | 126 | 127 | class FilteredUnitHARMProposal(BaseProposal): 128 | """ 129 | Unit HARM proposal. 130 | 131 | @see BaseProposal 132 | """ 133 | def __init__(self, adapt = False, scale = 1.): 134 | BaseProposal.__init__(self, adapt=False, scale=float(scale)) 135 | 136 | def generate_direction(self, u, ndim, points): 137 | # generate unit direction 138 | x = numpy.random.normal(size=ndim) 139 | d = x / (x**2).sum()**0.5 140 | return d 141 | def new_chain(self, u, ndim, points, is_inside_filter): 142 | BaseProposal.new_chain(self) 143 | self.new_direction(u, ndim, points, is_inside_filter) 144 | def new_direction(self, u, ndim, points, is_inside_filter): 145 | d = self.generate_direction(u, ndim, points) 146 | #print('initial scale:', self.scale) 147 | # find end points 148 | forward_scale = self.scale 149 | # find a scale that is too large 150 | while True: 151 | assert forward_scale > 0 152 | p_for = u + d * forward_scale 153 | if is_inside_filter(p_for): 154 | # we are proposing too small. We should be outside 155 | forward_scale *= 2 156 | #print('too small, stepping further', forward_scale) 157 | else: 158 | break 159 | 160 | backward_scale = self.scale 161 | # find a scale that is too large 162 | while True: 163 | assert backward_scale > 0 164 | p_rev = u - d * backward_scale 165 | if is_inside_filter(p_rev): 166 | # we are proposing too small. 
We should be outside 167 | #print('too small, stepping back', backward_scale) 168 | backward_scale *= 2 169 | else: 170 | break 171 | # remember scale for next time: 172 | self.backward_scale = -backward_scale 173 | self.forward_scale = forward_scale 174 | self.direction = d 175 | 176 | def propose(self, u, ndim, points, is_inside_filter): 177 | # generate a random point between the two points. 178 | while True: 179 | #print('slice range:', (self.backward_scale, self.forward_scale)) 180 | x = numpy.random.uniform(self.backward_scale, self.forward_scale) 181 | p = u + self.direction * x 182 | #assert self.forward_scale - self.backward_scale > 1e-100 183 | if x < 0: 184 | self.backward_scale = x 185 | else: 186 | self.forward_scale = x 187 | if is_inside_filter(p): 188 | if self.adapt: 189 | self.scale = self.forward_scale - self.backward_scale 190 | #print('adapting scale to', self.scale) 191 | return p 192 | 193 | def accept(self, accepted): 194 | # scale should not be modified 195 | pass 196 | 197 | def __repr__(self): 198 | return 'FilteredUnitHARMProposal(scale=%s, adapt=%s)' % (self.scale, self.adapt) 199 | 200 | class FilteredMahalanobisHARMProposal(FilteredUnitHARMProposal): 201 | """ 202 | Mahalanobis HARM proposal. 203 | 204 | @see BaseProposal 205 | """ 206 | 207 | def generate_direction(self, u, ndim, points): 208 | # generate direction from mahalanobis metric 209 | metric = numpy.cov(numpy.transpose(points)) 210 | assert metric.shape == (ndim,ndim), metric.shape 211 | x = numpy.random.multivariate_normal(numpy.zeros(ndim), metric) 212 | d = x / (x**2).sum()**0.5 213 | return d 214 | def __repr__(self): 215 | return 'FilteredMahalanobisHARMProposal(scale=%s, adapt=%s)' % (self.scale, self.adapt) 216 | 217 | class FilteredUnitRandomSliceProposal(FilteredUnitHARMProposal): 218 | """ 219 | Unit Slice sampling proposal, random component-wise. 220 | 221 | @see BaseProposal 222 | """ 223 | def generate_direction(self, u, ndim, points): 224 | # choose a random base vector 225 | d = numpy.zeros(ndim) 226 | i = numpy.random.randint(ndim) 227 | d[i] = 1 228 | return d 229 | def __repr__(self): 230 | return 'FilteredUnitRandomSliceProposal(scale=%s, adapt=%s)' % (self.scale, self.adapt) 231 | 232 | class FilteredUnitIterateSliceProposal(FilteredUnitHARMProposal): 233 | """ 234 | Unit Slice sampling proposal, iterative component-wise. 235 | 236 | @see BaseProposal 237 | """ 238 | def __init__(self, adapt = False, scale = 1.): 239 | BaseProposal.__init__(self, adapt=False, scale=float(scale)) 240 | self.curindex = 0 241 | 242 | def generate_direction(self, u, ndim, points): 243 | # choose next base vector 244 | d = numpy.zeros(ndim) 245 | self.curindex = (self.curindex + 1) % ndim 246 | d[self.curindex] = 1 247 | return d 248 | def __repr__(self): 249 | return 'FilteredUnitIterateSliceProposal(scale=%s, adapt=%s)' % (self.scale, self.adapt) 250 | 251 | class FilteredUnitRandomSliceProposal(FilteredUnitHARMProposal): 252 | """ 253 | Unit Slice sampling proposal, random component-wise. 
254 | 255 | @see BaseProposal 256 | """ 257 | def generate_direction(self, u, ndim, points): 258 | # choose a random base vector 259 | d = numpy.zeros(ndim) 260 | i = numpy.random.randint(ndim) 261 | d[i] = 1 262 | return d 263 | def __repr__(self): 264 | return 'FilteredUnitRandomSliceProposal(scale=%s, adapt=%s)' % (self.scale, self.adapt) 265 | 266 | class SliceConstrainer(object): 267 | """ 268 | Markov chain Monte Carlo proposals using the Metropolis update: 269 | Do a number of steps, while adhering to boundary. 270 | """ 271 | def __init__(self, proposer = MultiScaleProposal(), nsteps = 10, nmaxsteps = 10000): 272 | self.proposer = proposer 273 | self.sampler = None 274 | # number of new directions 275 | self.nsteps = nsteps 276 | # number of narrowings 277 | self.nmaxsteps = nmaxsteps 278 | 279 | def draw_constrained(self, Lmins, priortransform, loglikelihood, ndim, 280 | live_pointsu, **kwargs): 281 | i = numpy.random.randint(len(live_pointsu)) 282 | ui = live_pointsu[i] 283 | xi = None 284 | naccepts = 0 285 | nevals = 0 286 | # new direction 287 | for i in range(self.nsteps): 288 | self.proposer.new_chain(ui, ndim, live_pointsu, is_inside_unit_filter) 289 | # narrow in until we get an accept 290 | for n in range(self.nmaxsteps): 291 | u = self.proposer.propose(ui, ndim, live_pointsu, is_inside_unit_filter) 292 | x = priortransform(u) 293 | L = loglikelihood(x) 294 | nevals += 1 295 | # MH accept rule 296 | # accept = L > Li or numpy.random.uniform() < exp(L - Li) 297 | # Likelihood-difference independent, because we do 298 | # exploration of the prior (full diffusion). 299 | # but only accept in constrained region, because that 300 | # is what we are exploring now. 301 | # accept = L >= Lmin 302 | #### 303 | # For collaborative nested sampling it is sampling 304 | # from the super-contour, so only one needs to work: 305 | accept = numpy.any(L >= Lmins) 306 | 307 | # tell proposer so it can scale 308 | self.proposer.accept(accept) 309 | if accept: 310 | ui, xi, Li = u, x, L 311 | naccepts += 1 312 | break 313 | if numpy.all(Li < Lmins): 314 | print() 315 | print('ERROR: SliceConstrainer could not find a point matching constraint!') 316 | print('ERROR: Proposer stats:') 317 | self.proposer.stats() 318 | assert numpy.all(Li < Lmins), (Li, Lmins, self.nmaxsteps, numpy.mean(self.proposer.accepts), len(self.proposer.accepts)) 319 | if xi is None: 320 | xi = priortransform(ui) 321 | return ui, xi, Li, nevals 322 | 323 | def stats(self): 324 | return self.proposer.stats() 325 | 326 | -------------------------------------------------------------------------------- /friends.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import numpy 3 | import scipy.spatial, scipy.cluster 4 | import matplotlib.pyplot as plt 5 | from nested_sampling.clustering import clusterdetect 6 | from nested_sampling.clustering.neighbors import find_maxdistance, find_rdistance, initial_rdistance_guess, nearest_rdistance_guess 7 | 8 | class FriendsConstrainer(object): 9 | """ 10 | Rejection sampling pre-filtering method based on neighborhood to live points. 11 | 12 | "Distant" means in this implementation that the distance to a cluster member 13 | is large. 14 | The maximum distance to a cluster is computed by considering each 15 | cluster member and its k nearest neighbors in turn, and 16 | computing the maximum distance. 
17 | 18 | :param rebuild_every: After how many iterations should the clustering 19 | distance be re-computed? 20 | 21 | :param radial: 22 | if radial = True, then the normal euclidean distance is used. 23 | otherwise, the absolute coordinate difference in each dimension is used. 24 | 25 | :param metric: 26 | metric to use. Use 'chebyshev' for SupFriends, in which case then 27 | the supremum norm is used. Use 'euclidean' for RadFriends, via 28 | the euclidean norm. 29 | 30 | :param jackknife: 31 | if True, instead of leaving out a group of live points in 32 | the distance estimate, only one is left out in turn (jackknife resampling 33 | instead of bootstrap resampling). 34 | 35 | :param force_shrink: 36 | if True, the distance can only decrease between sampling steps. 37 | 38 | """ 39 | def __init__(self, rebuild_every = 50, radial = True, metric = 'euclidean', jackknife = False, 40 | force_shrink = False, 41 | hinter = None, verbose = False, 42 | keep_phantom_points=False, optimize_phantom_points=False): 43 | self.maxima = [] 44 | self.iter = 0 45 | self.region = None 46 | self.rebuild_every = rebuild_every 47 | self.radial = radial 48 | self.metric = metric 49 | self.file = None 50 | self.jackknife = jackknife 51 | self.force_shrink = force_shrink 52 | self.hinter = hinter 53 | self.verbose = verbose 54 | if keep_phantom_points: 55 | assert self.force_shrink, 'keep_phantom_points needs force_shrink=True' 56 | self.keep_phantom_points = keep_phantom_points 57 | self.optimize_phantom_points = optimize_phantom_points 58 | self.phantom_points = [] 59 | self.phantom_points_Ls = [] 60 | self.last_cluster_points = None 61 | 62 | def cluster(self, u, ndim, keepRadius=False): 63 | """ 64 | 65 | """ 66 | if self.verbose: print('building region ...') 67 | if len(u) > 10: 68 | if keepRadius and self.region is not None and 'maxdistance' in self.region: 69 | maxdistance = self.region['maxdistance'] 70 | else: 71 | if self.radial: 72 | if self.jackknife: 73 | #maxdistance = initial_rdistance_guess(u, k=1, metric=self.metric) 74 | maxdistance = nearest_rdistance_guess(u, metric=self.metric) 75 | else: 76 | maxdistance = find_rdistance(u, nbootstraps=20, metric=self.metric, verbose=self.verbose) 77 | else: 78 | maxdistance = find_maxdistance(u) 79 | if self.force_shrink and self.region is not None and 'maxdistance' in self.region: 80 | maxdistance = min(maxdistance, self.region['maxdistance']) 81 | if self.keep_phantom_points and len(self.phantom_points) > 0: 82 | # add phantoms to u now 83 | print('including phantom points in cluster members', self.phantom_points) 84 | u = numpy.vstack((u, self.phantom_points)) 85 | ulow = numpy.max([u.min(axis=0) - maxdistance, numpy.zeros(ndim)], axis=0) 86 | uhigh = numpy.min([u.max(axis=0) + maxdistance, numpy.ones(ndim)], axis=0) 87 | else: 88 | maxdistance = None 89 | ulow = numpy.zeros(ndim) 90 | uhigh = numpy.ones(ndim) 91 | if self.verbose: print('setting sampling region:', (ulow, uhigh), maxdistance) 92 | self.region = dict(members=u, maxdistance=maxdistance, ulow=ulow, uhigh=uhigh) 93 | self.generator = None 94 | 95 | def is_inside(self, u): 96 | """ 97 | Check if this new point is near or inside one of our clusters 98 | """ 99 | ndim = len(u) 100 | ulow = self.region['ulow'] 101 | uhigh = self.region['uhigh'] 102 | if not ((ulow <= u).all() and (uhigh >= u).all()): 103 | # does not even lie in our primitive rectangle 104 | # do not even need to compute the distances 105 | return False 106 | 107 | members = self.region['members'] 108 | maxdistance = 
self.region['maxdistance'] 109 | 110 | # if not initialized: no prefiltering 111 | if maxdistance is None: 112 | return True 113 | 114 | # compute distance to each member in each dimension 115 | if self.radial: 116 | dists = scipy.spatial.distance.cdist(members, [u], metric=self.metric) 117 | assert dists.shape == (len(members), 1) 118 | dist_criterion = dists < maxdistance 119 | else: 120 | dists = numpy.abs(u - members) 121 | assert dists.shape == (len(members), ndim), (dists.shape, ndim, len(members)) 122 | # nearer than maxdistance in all dimensions 123 | dist_criterion = numpy.all(dists < maxdistance, axis=1) 124 | assert dist_criterion.shape == (len(members),), (dist_criterion.shape, len(members)) 125 | # is it true for at least one? 126 | closeby = dist_criterion.any() 127 | if closeby: 128 | return True 129 | return False 130 | 131 | def are_inside_rect(self, u): 132 | """ 133 | Check if the new points are near or inside one of our clusters 134 | """ 135 | ulow = self.region['ulow'] 136 | uhigh = self.region['uhigh'] 137 | mask = numpy.logical_and(((ulow <= u).all(axis=1), (uhigh >= u).all(axis=1))) 138 | def are_inside_cluster(self, u, ndim): 139 | members = self.region['members'] 140 | maxdistance = self.region['maxdistance'] 141 | 142 | # if not initialized: no prefiltering 143 | if maxdistance is None: 144 | return numpy.ones(len(u), dtype=bool) 145 | 146 | # compute distance to each member in each dimension 147 | if self.radial: 148 | dists = scipy.spatial.distance.cdist(members, u, metric=self.metric) 149 | assert dists.shape == (len(members), len(u)) 150 | dist_criterion = dists < maxdistance 151 | else: 152 | raise NotImplementedError() 153 | # is it true for at least one? 154 | closeby = dist_criterion.any(axis=0) 155 | return closeby 156 | 157 | def generate(self, ndim): 158 | it = True 159 | verbose = False and self.verbose 160 | ntotal = 0 161 | # largest maxdistance where generating from full space makes sense 162 | full_maxdistance = 0.5 * (0.01)**(1./ndim) 163 | while True: 164 | maxdistance = self.region['maxdistance'] 165 | if maxdistance is None: 166 | # do a prefiltering rejection sampling first 167 | u = numpy.random.uniform(self.region['ulow'], self.region['uhigh'], size=ndim) 168 | yield u, ntotal 169 | ntotal = 0 170 | continue 171 | members = self.region['members'] 172 | it = numpy.random.uniform() < 0.01 173 | # depending on the region size compared to 174 | # the total space, one of the two methods will 175 | # be more efficient 176 | if it or not self.radial or maxdistance > full_maxdistance: 177 | it = True 178 | # for large regions 179 | # do a prefiltering rejection sampling first 180 | us = numpy.random.uniform(self.region['ulow'], self.region['uhigh'], size=(100, ndim)) 181 | ntotal += 100 182 | mask = self.are_inside_cluster(self.transform_points(us), ndim) 183 | if not mask.any(): 184 | continue 185 | us = us[mask] 186 | #indices = numpy.arange(len(mask))[mask] 187 | #for i in indices: 188 | # u = us[indices[i],:] 189 | for u in us: 190 | yield u, ntotal 191 | ntotal = 0 192 | else: 193 | # for small regions 194 | # draw from points 195 | us = members[numpy.random.randint(0, len(members), 100),:] 196 | ntotal += 100 197 | if verbose: print('chosen point', us) 198 | if self.metric == 'euclidean': 199 | # draw direction around it 200 | direction = numpy.random.normal(0, 1, size=(100, ndim)) 201 | direction = direction / ((direction**2).sum(axis=1)**0.5).reshape((-1,1)) 202 | if verbose: print('chosen direction', direction) 203 | # choose radius: 
volume gets larger towards the outside 204 | # so give the correct weight with dimensionality 205 | radius = maxdistance * numpy.random.uniform(0, 1, size=(100,1))**(1./ndim) 206 | us = us + direction * radius 207 | else: 208 | assert self.metric == 'chebyshev' 209 | us = us + numpy.random.uniform(-maxdistance, maxdistance, size=(100, ndim)) 210 | if verbose: print('using point', u) 211 | inside = numpy.logical_and((us >= 0).all(axis=1), (us <= 1).all(axis=1)) 212 | if not inside.any(): 213 | if verbose: print('outside boundaries', us, direction, maxdistance) 214 | continue 215 | us = us[inside] 216 | # count the number of points this is close to 217 | dists = scipy.spatial.distance.cdist(members, us, metric=self.metric) 218 | assert dists.shape == (len(members), len(us)) 219 | nnear = (dists < maxdistance).sum(axis=0) 220 | if verbose: print('near', nnear) 221 | #ntotal += 1 222 | # accept with probability 1./nnear 223 | coin = numpy.random.uniform(size=len(us)) 224 | 225 | accept = coin < 1. / nnear 226 | if not accept.any(): 227 | if verbose: print('probabilistic rejection due to overlaps') 228 | continue 229 | us = us[accept] 230 | for u in us: 231 | yield u, ntotal 232 | ntotal = 0 233 | 234 | def transform_new_points(self, us): 235 | return us 236 | def transform_points(self, us): 237 | return us 238 | def transform_point(self, u): 239 | return u 240 | 241 | def rebuild(self, u, ndim, keepRadius=False): 242 | if self.last_cluster_points is None or \ 243 | len(self.last_cluster_points) != len(u) or \ 244 | numpy.any(self.last_cluster_points != u): 245 | self.cluster(u=self.transform_new_points(u), ndim=ndim, keepRadius=keepRadius) 246 | self.last_cluster_points = u 247 | 248 | # reset generator 249 | self.generator = self.generate(ndim=ndim) 250 | def debug(self, ndim): 251 | if self.file is None: 252 | #self.file = open("friends_debug.txt", "a") 253 | import tempfile 254 | filename = tempfile.mktemp(dir='', 255 | prefix='friends%s-%s_' % ( 256 | '1' if self.jackknife else '', 257 | self.metric)) 258 | self.file = open(filename, 'w') 259 | self.file.write("{} {} {}\n".format(self.iter, self.region['maxdistance'], len(self.region['members']))) 260 | self.file.write("{} {} {} {}\n".format(self.iter, self.region['maxdistance'], len(self.region['members']), ndim)) 261 | def debugplot(self, u = None): 262 | print('creating plot...') 263 | n = len(self.region['members'][0]) / 2 264 | plt.figure(figsize=(6, n/2*4+1)) 265 | m = self.region['members'] 266 | d = self.region['maxdistance'] 267 | for i in range(n): 268 | plt.subplot(numpy.ceil(n / 2.), 2, 1+i) 269 | j = i * 2 270 | k = i * 2 + 1 271 | plt.plot(m[:,j], m[:,k], 'x', color='b', ms=1) 272 | plt.gca().add_artist(plt.Circle((m[0,j], m[0,k]), d, color='g', alpha=0.3)) 273 | if u is not None: 274 | plt.plot(u[j], u[k], 's', color='r') 275 | plt.gca().add_artist(plt.Circle((u[j], u[k]), d, color='r', alpha=0.3)) 276 | prefix='friends%s-%s_' % ('1' if self.jackknife else '', self.metric) 277 | plt.savefig(prefix + 'cluster.pdf') 278 | plt.close() 279 | print('creating plot... done') 280 | 281 | def draw_constrained(self, Lmins, priortransform, loglikelihood, live_pointsu, ndim, max_draws=None, **kwargs): 282 | # previous is [[u, x, L], ...] 
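        # Flow of this method: bump the iteration counter and rebuild the friends
        # region from the current live points every rebuild_every iterations; then
        # pull candidates from self.generator (which only yields unit-cube points
        # near the region members), optionally adjust them with the hinter, and
        # evaluate priortransform + loglikelihood. The first candidate whose
        # likelihood exceeds any of the Lmins thresholds (or any candidate, once
        # max_draws is exceeded) is returned; if more than ~1000 evaluations pass
        # without success, the region is rebuilt and sampling continues.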
283 | self.iter += 1 284 | rebuild = self.iter % self.rebuild_every == 1 285 | if rebuild or self.region is None: 286 | self.rebuild(numpy.asarray(live_pointsu), ndim, keepRadius=False) 287 | if self.generator is None: 288 | self.generator = self.generate(ndim=ndim) 289 | ntoaccept = 0 290 | ntotalsum = 0 291 | while True: 292 | for u, ntotal in self.generator: 293 | assert (u >= 0).all() and (u <= 1).all(), u 294 | ntotalsum += ntotal 295 | 296 | if self.hinter is not None: 297 | hints = self.hinter(u) 298 | if len(hints) == 0: 299 | # no way 300 | continue 301 | if len(hints) > 1: 302 | # choose a random solution, by size 303 | raise NotImplementedError("multiple solutions not implemented") 304 | hints = hints[numpy.random.randInt(len(hints))] 305 | else: 306 | hints = hints[0] 307 | 308 | for i, lo, hi in hints: 309 | u[i] = numpy.random.uniform(lo, hi) 310 | if not is_inside(self.transform_point(u)): 311 | # not sure if this is a good idea 312 | # it means we dont completely trust 313 | # the hinting function 314 | continue 315 | 316 | x = priortransform(u) 317 | L = loglikelihood(x) 318 | ntoaccept += 1 319 | 320 | if numpy.any(L > Lmins) or (max_draws is not None and ntotalsum > max_draws): 321 | # yay, we win 322 | if ntotalsum > 10000: 323 | if self.verbose: 324 | print('sampled %d points, evaluated %d ' % (ntotalsum, ntoaccept)) 325 | #self.debugplot(u) 326 | return u, x, L, ntoaccept 327 | 328 | # if running very inefficient, optimize clustering 329 | # if we haven't done so at the start 330 | if not rebuild and ntoaccept > 1000: 331 | #self.debugplot(u) 332 | break 333 | rebuild = True 334 | self.rebuild(numpy.asarray(live_pointsu), ndim, keepRadius=False) 335 | 336 | if __name__ == '__main__': 337 | friends = FriendsConstrainer(radial = True) 338 | 339 | u = numpy.random.uniform(0.45, 0.55, size=1000).reshape((-1, 2)) 340 | ndim = 2 341 | friends.cluster(u, ndim=ndim) 342 | Lmin = -1 343 | rv = scipy.stats.norm(0.515, 0.03) 344 | def priortransform(x): return x 345 | def loglikelihood(x): return rv.logpdf(x).sum() 346 | previous = [] 347 | colors = ['r', 'g', 'orange'] 348 | plt.figure("dists", figsize=(7,4)) 349 | plt.figure("plane", figsize=(5,5)) 350 | plt.plot(u[:,0], u[:,1], 'x') 351 | Lmins = [-5, 2, 2.5] #, 2.58] 352 | for j, (Lmin, color) in enumerate(zip(numpy.array(Lmins)*ndim, colors)): 353 | values = [] 354 | for i in range(200): 355 | friends.iter = 4 # avoid rebuild 356 | u, x, L, ntoaccept = friends.draw_constrained(Lmin, priortransform, loglikelihood, previous, ndim) 357 | plt.figure("plane") 358 | plt.plot(u[0], u[1], '+', color=color) 359 | values.append(u) 360 | values = numpy.array(values) 361 | plt.figure("dists") 362 | for k in range(ndim): 363 | plt.subplot(1, ndim, k + 1) 364 | plt.title('Lmin={}, dim={}'.format(Lmin, k)) 365 | plt.hist(values[:,k], cumulative=True, normed=True, 366 | color=color, bins=1000, histtype='step') 367 | plt.figure("plane") 368 | plt.savefig('friends_sampling_test.pdf', bbox_inches='tight') 369 | plt.close() 370 | plt.figure("dists") 371 | plt.savefig('friends_sampling_test_dists.pdf', bbox_inches='tight') 372 | plt.close() 373 | 374 | # another test: given a group of samples, assert that only neighbors are evaluated 375 | 376 | r = numpy.random.uniform(0.2, 0.25, size=400) 377 | phi = numpy.random.uniform(0, 1, size=400)**10 * 2*numpy.pi 378 | u = numpy.transpose([0.5 + r*numpy.cos(phi), 0.5 + r*numpy.sin(phi)]) 379 | friends.cluster(u, ndim=2) 380 | plt.figure(figsize=(10,5)) 381 | plt.subplot(1, 2, 1) 382 | plt.plot(u[:,0], 
u[:,1], 'x') 383 | suggested = [] 384 | def loglikelihood(x): 385 | r = ((x[0] - 0.5)**2 + (x[1] - 0.5)**2)**0.5 386 | #assert r < 0.5 387 | #assert r > 0.1 388 | suggested.append(r) 389 | if r > 0.2 and r < 0.25: 390 | plt.plot(x[0], x[1], 'o', color='green') 391 | return 100 392 | plt.plot(x[0], x[1], 'o', color='red') 393 | return -100 394 | 395 | ndim = 2 396 | taken = [] 397 | for i in range(100): 398 | friends.iter = 4 # avoid rebuild 399 | u, x, L, ntoaccept = friends.draw_constrained(Lmin, priortransform, loglikelihood, previous, ndim) 400 | r = ((x[0] - 0.5)**2 + (x[1] - 0.5)**2)**0.5 401 | taken.append(r) 402 | print('suggested:', u) 403 | plt.subplot(1, 2, 2) 404 | plt.hist(taken, cumulative=True, normed=True, 405 | color='g', bins=1000, histtype='step') 406 | plt.hist(suggested, cumulative=True, normed=True, 407 | color='r', bins=1000, histtype='step') 408 | #x = numpy.linspace(0, 1, 400) 409 | #y = x**ndim - (x - min(suggested) / max(suggested))**ndim 410 | #y /= max(y) 411 | #plt.plot(x * (max(suggested) - min(suggested)) + min(suggested), y, '--', color='grey') 412 | 413 | plt.savefig('friends_sampling_test_sampling.pdf', bbox_inches='tight') 414 | plt.close() 415 | 416 | 417 | 418 | 419 | -------------------------------------------------------------------------------- /multi_nested_sampler.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | """ 3 | 4 | Sampler 5 | ---------- 6 | 7 | Copyright (c) 2017 Johannes Buchner 8 | 9 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
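Bookkeeping used throughout this class (a summary of what __init__ below sets up)::

    pointpile    : (npoints, ndim) unit-cube coordinates of every point drawn so far
    pointpilex   : the same points after priortransform
    live_pointsp : (nlive, ndata) indices into pointpile; one column of live points per data set
    live_pointsL : (nlive, ndata) log-likelihood of each live point under each data set
    shelves[d]   : spare (p, u, x, L) candidates banked for data set d, pruned once
                   they no longer exceed that data set's lowest live-point likelihood

Data sets that share live points can be advanced together; the generate_subsets_*
methods below find these connected groups.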
16 | 17 | 18 | 19 | """ 20 | import numpy 21 | from numpy import exp, log, log10, pi 22 | import progressbar 23 | import igraph 24 | from collections import defaultdict 25 | 26 | status_symbols = { 27 | 0:' ', 28 | 1:u"\u2581", 29 | 2:u"\u2582", 30 | 3:u"\u2583", 31 | 4:u"\u2584", 5:u"\u2584", 32 | 6:u"\u2585", 7:u"\u2585", 33 | 8:u"\u2586", 9:u"\u2586", 34 | 10:u"\u2587", 11:u"\u2587", 12:u"\u2587", 13:u"\u2587", 14:u"\u2587", 35 | 15:u"\u2588", 16:u"\u2588", 17:u"\u2588", 18:u"\u2588", 19:u"\u2588", 36 | } 37 | 38 | def find_nsmallest(n, arr1, arr2): 39 | # old version 40 | arr = numpy.hstack((arr1, arr2)) 41 | arr.sort() 42 | return arr[n] 43 | 44 | def find_nsmallest(n, arr1, arr2): 45 | # new version, faster because it does not need to sort everything 46 | arr = numpy.concatenate((arr1, arr2)) 47 | return numpy.partition(arr, n)[n] 48 | 49 | class MultiNestedSampler(object): 50 | """ 51 | Samples points, always replacing the worst live point, forever. 52 | 53 | This implementation always removes and replaces one point (r=1), 54 | and does so linearly (no parallelisation). 55 | 56 | This class is implemented as an iterator. 57 | """ 58 | def __init__(self, priortransform, multi_loglikelihood, superset_draw_constrained, individual_draw_constrained, draw_constrained, 59 | ndata, ndim, nlive_points = 200, draw_global_uniform = None, 60 | nsuperset_draws = 10, use_graph=False): 61 | self.nlive_points = nlive_points 62 | self.nsuperset_draws = nsuperset_draws 63 | self.priortransform = priortransform 64 | self.real_multi_loglikelihood = multi_loglikelihood 65 | self.multi_loglikelihood = multi_loglikelihood 66 | self.superset_draw_constrained = superset_draw_constrained 67 | self.individual_draw_constrained = individual_draw_constrained 68 | self.draw_constrained = draw_constrained 69 | #self.samples = [] 70 | self.global_iter = 0 71 | self.ndim = ndim 72 | self.ndata = ndata 73 | self.superpoints = [] 74 | # lazy building of graph 75 | self.use_graph = use_graph 76 | self.membership_graph = None 77 | self.last_graph = None 78 | self.last_graph_selection = None 79 | self.point_data_map = None 80 | # draw N starting points from prior 81 | pointpile = [] 82 | pointpilex = [] 83 | live_pointsp = [None] * nlive_points 84 | #live_pointsu = [None] * nlive_points 85 | #live_pointsx = [None] * nlive_points 86 | live_pointsL = [None] * nlive_points 87 | 88 | print('generating initial %d live points' % (nlive_points)) 89 | data_mask = numpy.ones(ndata) == 1 90 | 91 | for i in range(nlive_points): 92 | u = self.draw_global_uniform() 93 | x = priortransform(u) 94 | L = multi_loglikelihood(x, data_mask=data_mask) 95 | p = len(pointpile) 96 | live_pointsp[i] = [p]*ndata 97 | pointpile.append(u) 98 | pointpilex.append(x) 99 | #self.global_iter += 1 100 | #live_pointsu[i] = [u]*ndata 101 | #live_pointsx[i] = [x]*ndata 102 | live_pointsL[i] = L 103 | self.superpoints.append(p) 104 | #self.samples.append([live_pointsu[i], live_pointsx[i], live_pointsL[i]]) 105 | print('generated %d live points' % (nlive_points)) 106 | self.pointpile = numpy.array(pointpile) 107 | self.pointpilex = numpy.array(pointpilex) 108 | self.live_pointsp = numpy.array(live_pointsp) 109 | #self.live_pointsu = numpy.array(live_pointsu) 110 | #self.live_pointsx = numpy.array(live_pointsx) 111 | self.live_pointsL = numpy.array(live_pointsL) 112 | self.Lmax = self.live_pointsL.max(axis=0) 113 | self.data_mask_all = numpy.ones(self.ndata) == 1 114 | self.real_data_mask_all = numpy.ones(self.ndata) == 1 115 | assert self.Lmax.shape == 
(ndata,) 116 | self.ndraws = nlive_points 117 | self.shelves = [[] for _ in range(ndata)] 118 | 119 | self.dump_iter = 1 120 | 121 | def draw_global_uniform(self): 122 | return numpy.random.uniform(0, 1, size=self.ndim) 123 | 124 | def get_unique_points(self, allpoints): 125 | d = allpoints.reshape((-1,self.ndim)) 126 | b = d.view(numpy.dtype((numpy.void, d.dtype.itemsize * d.shape[1]))) 127 | _, idx = numpy.unique(b, return_index=True) 128 | return d[idx] 129 | 130 | def get_unique_pointsp(self, allpoints): 131 | idx = numpy.unique(allpoints) 132 | return self.pointpile[idx], idx 133 | 134 | def prepare(self): 135 | live_pointsL = self.live_pointsL 136 | Lmins = live_pointsL.min(axis=0) 137 | Lmini = live_pointsL.argmin(axis=0) 138 | # clean up shelves 139 | for d in range(self.ndata): 140 | self.shelves[d] = [(pj, uj, xj, Lj) for (pj, uj, xj, Lj) in self.shelves[d] if Lj > Lmins[d]] 141 | all_global_live_pointsu, all_global_live_pointsp = self.get_unique_pointsp(self.live_pointsp) 142 | all_Lmin = live_pointsL.min() 143 | return all_global_live_pointsu, all_global_live_pointsp, all_Lmin, Lmins, Lmini 144 | 145 | def shelf_status(self): 146 | print('shelf status: %s' % ''.join([status_symbols.get(len(shelf), 'X') for shelf in self.shelves])) 147 | 148 | def cut_down(self, surviving): 149 | # delete some data sets 150 | self.live_pointsp = self.live_pointsp[:,surviving] 151 | self.live_pointsL = self.live_pointsL[:,surviving] 152 | self.shelves = [shelf for s, shelf in zip(surviving, self.shelves) if s] 153 | self.ndata = surviving.sum() 154 | self.Lmax = self.live_pointsL.max(axis=0) 155 | self.data_mask_all = numpy.ones(self.ndata) == 1 156 | self.real_data_mask_all[self.real_data_mask_all] = surviving 157 | def multi_loglikelihood_subset(params, mask): 158 | subset_mask = self.real_data_mask_all.copy() 159 | subset_mask[subset_mask] = mask 160 | return self.real_multi_loglikelihood(params, subset_mask) 161 | 162 | self.multi_loglikelihood = multi_loglikelihood_subset 163 | # rebuild graph because igraph does not support renaming nodes 164 | self.membership_graph = None 165 | self.point_data_map = None 166 | self.last_graph = None 167 | self.last_graph_selection = None 168 | #if self.point_data_map is not None: 169 | # for d, s in enumerate(surviving) 170 | # if s: continue 171 | # for p in self.live_pointsp[:,d]: 172 | # self.point_data_map[p].add(d) 173 | 174 | 175 | def rebuild_graph(self): 176 | if self.membership_graph is None: 177 | print('constructing graph...') 178 | graph = igraph.Graph(directed=False) 179 | # pointing from live_point to member 180 | for i in numpy.where(self.data_mask_all)[0]: 181 | graph.add_vertex("n%d" % i, id=i, vtype=0) 182 | for p in range(len(self.pointpile)): 183 | graph.add_vertex("p%d" % p, id=p, vtype=1) 184 | edges = [] 185 | for i in numpy.where(self.data_mask_all)[0]: 186 | #graph.add_vertex("n%d" % i, id=i, vtype=0) 187 | edges += [("n%d" % i, "p%d" % p) for p in self.live_pointsp[:,i]] 188 | print('connecting graph ...') 189 | graph.add_edges(edges) 190 | print('constructing graph done.') 191 | self.membership_graph = graph 192 | 193 | def rebuild_map(self): 194 | if self.point_data_map is None: 195 | print('constructing map...') 196 | # pointing from live_point to member 197 | self.point_data_map = defaultdict(set) 198 | for i in range(self.ndata): 199 | for p in self.live_pointsp[:,i]: 200 | self.point_data_map[p].add(i) 201 | print('constructing map done.') 202 | 203 | 204 | def generate_subsets_nograph(self, data_mask, allp): 205 | # 
generate data subsets which share points. 206 | selected = numpy.where(data_mask)[0] 207 | all_selected = len(selected) == len(data_mask) 208 | firstmember = selected[0] 209 | if len(selected) == 1: 210 | # trivial case: 211 | # requested only a single slot, so return its live points 212 | yield data_mask, self.live_pointsp[:,firstmember] 213 | return 214 | 215 | if not all_selected: 216 | allp = numpy.unique(self.live_pointsp[:,selected].flatten()) 217 | 218 | if len(allp) < 2 * self.nlive_points: 219 | print('generate_subsets: only %d unique live points known, so connected' % len(allp)) 220 | # if fewer than 2*nlive unique points are known, 221 | # some must be shared between data sets. 222 | # So no disjoint data sets 223 | yield data_mask, allp 224 | return 225 | 226 | if len(self.superpoints) > 0: 227 | print('generate_subsets: %d superpoints known, so connected' % len(self.superpoints)) 228 | # there are some points shared by all data sets 229 | # so no disjoint data sets 230 | yield data_mask, allp 231 | return 232 | 233 | self.rebuild_map() 234 | to_handle = data_mask.copy() 235 | while to_handle.any(): 236 | firstmember = numpy.where(to_handle)[0][0] 237 | to_handle[firstmember] = False 238 | members = [firstmember] 239 | # get live points of this member 240 | member_live_pointsp = self.live_pointsp[:,firstmember].tolist() 241 | # look through to_handle for entries and check if they have the points 242 | i = 0 243 | while True: 244 | if i >= len(member_live_pointsp) or not to_handle.any(): 245 | break 246 | p = member_live_pointsp[i] 247 | newmembers = [m for m in self.point_data_map[p] if to_handle[m]] 248 | print(newmembers) 249 | members += newmembers 250 | for newp in numpy.unique(self.live_pointsp[:,newmembers]): 251 | if newp not in member_live_pointsp: 252 | member_live_pointsp.append(newp) 253 | to_handle[newmembers] = False 254 | i = i + 1 255 | 256 | # now we have our members and live points 257 | member_data_mask = numpy.zeros(len(data_mask), dtype=bool) 258 | member_data_mask[members] = True 259 | #print 'returning:', member_data_mask, member_live_pointsp 260 | yield member_data_mask, member_live_pointsp 261 | 262 | def generate_subsets_graph(self, data_mask, allp): 263 | # generate data subsets which share points. 264 | selected = numpy.where(data_mask)[0] 265 | all_selected = len(selected) == len(data_mask) 266 | firstmember = selected[0] 267 | if len(selected) == 1: 268 | # trivial case: 269 | # requested only a single slot, so return its live points 270 | yield data_mask, self.live_pointsp[:,firstmember] 271 | return 272 | 273 | if not all_selected: 274 | allp = numpy.unique(self.live_pointsp[:,selected].flatten()) 275 | 276 | if len(allp) < 2 * self.nlive_points: 277 | print('generate_subsets: only %d unique live points known, so connected' % len(allp)) 278 | # if fewer than 2*nlive unique points are known, 279 | # some must be shared between data sets. 
280 | # So no disjoint data sets 281 | yield data_mask, allp 282 | return 283 | 284 | if len(self.superpoints) > 0: 285 | print('generate_subsets: %d superpoints known, so connected' % len(self.superpoints)) 286 | # there are some points shared by all data sets 287 | # so no disjoint data sets 288 | yield data_mask, allp 289 | return 290 | 291 | self.rebuild_graph() 292 | if all_selected: 293 | graph = self.membership_graph 294 | else: 295 | graph = self._generate_subsets_graph_create_subgraph(data_mask, allp) 296 | 297 | for sub_data_mask, sub_points in self._generate_subsets_graph_subgraphs(graph, data_mask, all_selected, allp): 298 | yield sub_data_mask, sub_points 299 | 300 | def _generate_subsets_graph_create_subgraph(self, data_mask, allp): 301 | # need to look at the subgraph with only the selected 302 | # dataset nodes 303 | members = ['n%d' % v for v, sel in enumerate(data_mask) if sel] 304 | members += ['p%d' % p for p in allp] 305 | # if the previous graph had all these nodes (or more) 306 | if self.last_graph is not None and self.last_graph_selection[data_mask].all(): 307 | # re-using previously cut-down graph 308 | # this may speed things up because we have to cut less 309 | print('generate_subsets: re-using previous graph') 310 | prevgraph = self.last_graph 311 | else: 312 | # not a super-set, need to start with whole graph 313 | prevgraph = self.membership_graph 314 | 315 | graph = prevgraph.subgraph(members) 316 | self.last_graph = graph 317 | self.last_graph_selection = data_mask 318 | return graph 319 | 320 | 321 | def _generate_subsets_graph_subgraphs(self, graph, data_mask, all_selected, allp): 322 | # we could test here with graph.is_connected() first 323 | # but if it is connected, then it takes as long as clusters() 324 | # and if it not connected, we have to call clusters() anyways. 325 | subgraphs = graph.clusters() 326 | assert len(subgraphs) > 0 327 | 328 | # single-node subgraphs can occur when 329 | # a live point is not used anymore 330 | # a real subgraph has to have a data point and its live points, 331 | # so at least nlive_points+1 entries 332 | subgraphs = [subgraph for subgraph in subgraphs if len(subgraph) > 1] 333 | 334 | if len(subgraphs) == 1: 335 | yield data_mask, allp 336 | return 337 | 338 | # then identify disjoint subgraphs 339 | for subgraph in subgraphs: 340 | member_data_mask = numpy.zeros(len(data_mask), dtype=bool) 341 | member_live_pointsp = [] 342 | for vi in subgraph: 343 | att = graph.vs[vi].attributes() 344 | #print ' ', att 345 | if att['vtype'] == 0: 346 | i = att['id'] 347 | member_data_mask[i] = True 348 | else: 349 | p = att['id'] 350 | member_live_pointsp.append(p) 351 | if member_data_mask.any(): 352 | yield member_data_mask, member_live_pointsp 353 | #else: 354 | # print 'skipping node-free subgraph:', [self.membership_graph.vs[vi].attributes()['name'] for vi in subgraph] 355 | # print graph 356 | 357 | def __next__(self): 358 | # select worst point, lowest likelihood and replace 359 | live_pointsL = self.live_pointsL 360 | superset_membersets = None 361 | 362 | print('iteration %d' % self.global_iter) 363 | all_global_live_pointsu, all_global_live_pointsp, all_Lmin, Lmins, Lmini = self.prepare() 364 | iter = 0 365 | while True: 366 | iter += 1 367 | empty_mask = numpy.array([len(self.shelves[d]) == 0 for d in range(self.ndata)]) 368 | if not empty_mask.any(): 369 | # all have something in their shelves 370 | break 371 | 372 | # if superset draws enabled, do some of these first. 
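# The first self.nsuperset_draws passes through this loop make "super-set"
# draws: the constrained region is built from the live points of all data
# sets at once, and an accepted point can fill many shelves in one go.
# Once those passes are used up, sample_subset becomes True and the draws
# focus only on the data sets whose shelves are still empty (empty_mask),
# rebuilding the region for that subset. The loop ends as soon as every
# shelf holds at least one candidate replacement point.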
373 | sample_subset = iter > self.nsuperset_draws 374 | 375 | if sample_subset: 376 | # subset draw: focus on filling empty ones 377 | data_mask = empty_mask 378 | # cut_level = 5 4 3 2 1 0 0 0 0 379 | #cut_level = max(0, 5 - (iter - self.nsuperset_draws)) 380 | #data_mask = numpy.array([len(self.shelves[d]) <= cut_level for d in range(self.ndata)]) 381 | global_live_pointsu, global_live_pointsp = self.get_unique_pointsp(self.live_pointsp[:,data_mask]) 382 | else: 383 | # super-set draw, try to fill all/any 384 | data_mask = self.data_mask_all 385 | global_live_pointsu = all_global_live_pointsu 386 | global_live_pointsp = all_global_live_pointsp 387 | Lmin = all_Lmin 388 | use_rebuilding_draw = sample_subset 389 | 390 | self.shelf_status() 391 | # if the data sets do not share any live points, 392 | # it does not make sense to analyse them jointly 393 | # so we break them up into membersets here, stringing 394 | # together those that do. 395 | 396 | # if a previous superset draw did the decomposition already, 397 | # just reuse it 398 | if superset_membersets is not None and not sample_subset: 399 | membersets = superset_membersets 400 | elif self.use_graph: 401 | membersets = list(self.generate_subsets_graph(data_mask, global_live_pointsp)) 402 | else: 403 | membersets = list(self.generate_subsets_nograph(data_mask, global_live_pointsp)) 404 | 405 | if not sample_subset and superset_membersets is None: 406 | # store superset decomposition 407 | superset_membersets = membersets 408 | 409 | assert len(membersets) > 0 410 | if len(membersets) > 1: 411 | # if the data is split, regions need to be 412 | # rebuilt for every group 413 | use_rebuilding_draw = True 414 | 415 | for ji, (joint_data_mask, joint_live_pointsp) in enumerate(membersets): 416 | print('live point set %d/%d: %d from %d datasets, %s' % ( 417 | ji+1, len(membersets), len(joint_live_pointsp), 418 | joint_data_mask.sum(), 419 | 'focussed set constrained draw' if sample_subset else 'super-set constrained draw')) 420 | joint_live_pointsu = self.pointpile[joint_live_pointsp] 421 | #print 'members:', joint_data_mask.shape, joint_live_pointsu.shape 422 | max_draws = 1000 423 | njoints = joint_data_mask.sum() 424 | joint_indices = numpy.where(joint_data_mask)[0] 425 | firstd = joint_indices[0] 426 | # if it is the only dataset and we need an entry here, try longer 427 | if njoints == 1 and len(self.shelves[firstd]) == 0: 428 | max_draws = 100000 429 | 430 | # if there is more than one memberset and this one is full, 431 | # we do not need to do anything 432 | # this should be a rare occasion 433 | if len(membersets) > 1 and not sample_subset and all([len(self.shelves[d]) > 0 for d in joint_indices]): 434 | continue 435 | 436 | # Lmin needs to be corrected. It is the lowest L, but 437 | # this may not be useful for making a draw. 
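# Concretely: if data set d already has n candidates on its shelf, a new
# point is only useful if it would still be accepted after those n
# candidates have replaced the n worst live points. The effective
# threshold is therefore the (n+1)-th smallest value among the current
# live-point likelihoods and the shelved likelihoods combined, which is
# what find_nsmallest() returns via numpy.partition (0-based index n).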
438 | Lmins_higher = Lmins[joint_indices].copy() 439 | for j, d in enumerate(joint_indices): 440 | n = len(self.shelves[d]) 441 | if n == 0: 442 | # relevant only for non-empty shelves 443 | continue 444 | # to insert at position n 445 | # there must be n elements smaller 446 | # in self.shelves[d] and self.live_pointsL[:,d] 447 | Lmins_higher[j] = find_nsmallest(n, live_pointsL[:,d], [Li for _, _, _, Li in self.shelves[d]]) 448 | 449 | if njoints == 1: 450 | # only a single data set, we can keep the same region for longer 451 | real_firstd = numpy.where(self.real_data_mask_all)[0][firstd] 452 | draw_constrained = self.individual_draw_constrained(real_firstd, self.global_iter, sampler=self) 453 | elif use_rebuilding_draw: 454 | # a subset, perhaps different then last iteration 455 | # need to reconstruct the region from scratch 456 | real_joint_indices = numpy.where(self.real_data_mask_all)[0][joint_indices] 457 | draw_constrained = self.draw_constrained(real_joint_indices, self.real_data_mask_all, joint_live_pointsp, self.global_iter) 458 | else: 459 | # full data set, can keep longer 460 | draw_constrained = self.superset_draw_constrained 461 | 462 | uj, xj, Lj, n = draw_constrained( 463 | Lmins=Lmins_higher, 464 | priortransform=self.priortransform, 465 | loglikelihood=lambda params: self.multi_loglikelihood(params, joint_data_mask), 466 | ndim=self.ndim, 467 | draw_global_uniform=self.draw_global_uniform, 468 | live_pointsu = joint_live_pointsu, 469 | max_draws=max_draws, 470 | iter=self.global_iter, 471 | nlive_points=self.nlive_points 472 | ) 473 | 474 | # we have a new draw 475 | self.ndraws += int(n) 476 | ppi = len(self.pointpile) 477 | if self.membership_graph is not None: 478 | self.membership_graph.add_vertex("p%d" % ppi, id=ppi, vtype=1) 479 | self.pointpile = numpy.vstack((self.pointpile, [uj])) 480 | self.pointpilex = numpy.vstack((self.pointpilex, [xj])) 481 | nfilled = 0 482 | for j, d in enumerate(numpy.where(joint_data_mask)[0]): 483 | if Lj[j] > Lmins_higher[j]: 484 | self.shelves[d].append((ppi, uj, xj, Lj[j])) 485 | nfilled += 1 486 | if nfilled == self.ndata: 487 | # new point is a superpoint, accepted by all 488 | self.superpoints.append(ppi) 489 | print('accept after %d tries, filled %d shelves' % (n, nfilled)) 490 | 491 | # we got a new point 492 | #print 'new point:', Lmins[data_mask], (Lj>Lmins[data_mask])*1 493 | 494 | # pop: for every data entry, advance one point 495 | print('advancing all...') 496 | self.global_iter += 1 497 | pj_old = self.live_pointsp[Lmini,numpy.arange(self.ndata)] 498 | uis = self.pointpile[pj_old] 499 | xis = self.pointpilex[pj_old] 500 | Lis = live_pointsL[Lmini, numpy.arange(self.ndata)] 501 | if self.membership_graph is not None: 502 | print(' deleting edges...') 503 | self.membership_graph.delete_edges([("n%d" % d, "p%d" % pj) for d, pj in enumerate(pj_old)]) 504 | if self.point_data_map is not None: 505 | for d, pj in enumerate(pj_old): 506 | self.point_data_map[pj].remove(d) 507 | # point assignment changed, so can not re-use any more directly 508 | self.last_graph = None 509 | self.last_graph_selection = None 510 | if self.superpoints: 511 | print(' dropping superpoints ...') 512 | for pj in numpy.unique(pj_old): 513 | # no longer a superpoint, because it is no 514 | # longer shared by all data sets 515 | if pj in self.superpoints: 516 | self.superpoints.remove(pj) 517 | new_edges = None if self.membership_graph is None else [] 518 | print(' replacing dead points ...') 519 | for d in range(self.ndata): 520 | i = Lmini[d] 521 | 
pj, uj, xj, Lj = self.shelves[d].pop(0) 522 | self.live_pointsp[i,d] = pj 523 | live_pointsL[i,d] = Lj 524 | if new_edges is not None: 525 | new_edges.append(("n%d" % d, "p%d" % pj)) 526 | if self.point_data_map is not None: 527 | self.point_data_map[pj].add(d) 528 | if self.membership_graph is not None: 529 | print(' adding edges ...') 530 | self.membership_graph.add_edges(new_edges) 531 | self.Lmax = live_pointsL.max(axis=0) 532 | assert self.Lmax.shape == (self.ndata,) 533 | print('advancing done.') 534 | return numpy.asarray(uis), numpy.asarray(xis), numpy.asarray(Lis) 535 | 536 | def remainder(self, d=None): 537 | if d is None: 538 | print('sorting remainder...') 539 | indices = numpy.empty((self.ndata, self.nlive_points), dtype=int) 540 | for d in range(self.ndata): 541 | indices[d,:] = numpy.argsort(self.live_pointsL[:,d]) 542 | ds = numpy.arange(self.ndata) 543 | print('building remainder...') 544 | for i in range(self.nlive_points): 545 | j = indices[ds,i] 546 | p = self.live_pointsp[j,ds] 547 | u = self.pointpile[p] 548 | x = self.pointpilex[p] 549 | L = self.live_pointsL[j,ds] 550 | #u = [self.pointpile[self.live_pointsp[indices[d][i],d]] for d in range(self.ndata)] 551 | #x = [self.pointpilex[self.live_pointsp[indices[d][i],d]] for d in range(self.ndata)] 552 | #L = numpy.asarray([self.live_pointsL[indices[d][i],d] for d in range(self.ndata)]) 553 | yield u, x, L 554 | print('remainder done.') 555 | else: 556 | indices = numpy.argsort(self.live_pointsL[:,d]) 557 | for i in indices: 558 | u = self.pointpile[self.live_pointsp[i,d]] 559 | x = self.pointpilex[self.live_pointsp[i,d]] 560 | L = self.live_pointsL[i,d] 561 | yield u, x, L 562 | #yield self.live_pointsu[i,d], self.live_pointsx[i,d], self.live_pointsL[i,d] 563 | 564 | next = __next__ 565 | 566 | def __iter__(self): 567 | while True: yield self.__next__() 568 | 569 | __all__ = [MultiNestedSampler] 570 | 571 | -------------------------------------------------------------------------------- /musefuse.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | """ 3 | 4 | Main program 5 | --------------- 6 | 7 | Copyright (c) 2017 Johannes Buchner 8 | 9 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
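This driver loads a MUSE data cube (the DATA extension holds the fluxes, STAT the variances), keeps only the spaxels inside a ds9 region file, down-weights noisy wavelength ranges, and then fits every selected spectrum with a stellar-population model parametrised by metallicity Z, star-formation timescale logSFtau, age SFage (in Gyr), redshift z and dust attenuation EBV. All spaxels are analysed jointly with the MultiNestedSampler from multi_nested_sampler.py, and the per-spaxel evidences and posterior samples are written to an HDF5 file.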
16 | 17 | """ 18 | 19 | import numpy 20 | from numpy import exp 21 | import h5py 22 | import sys 23 | import json 24 | import os 25 | import time 26 | import astropy.io.fits as pyfits 27 | import matplotlib.pyplot as plt 28 | 29 | do_plotting = False 30 | 31 | print('loading data...') 32 | f = pyfits.open(sys.argv[1]) 33 | datasection = f['DATA'] 34 | y = datasection.data # values 35 | y = y[:3600,:,:] 36 | nspec, npixx, npixy = y.shape 37 | noise_level = f['STAT'].data # variance 38 | noise_level = noise_level[:3600,:,:] 39 | good = numpy.isfinite(noise_level).all(axis=0) 40 | print(' %.2f%% good...' % (100*good.mean())) 41 | #print numpy.where(~numpy.isfinite(noise_level[:,40,40])) 42 | #print noise_level[~numpy.isfinite(noise_level[:,40,40]),40,40] 43 | 44 | if do_plotting: 45 | print('plotting image...') 46 | plt.figure(figsize=(20,20)) 47 | plt.imshow(y[0,:,:]) 48 | plt.savefig('musefuse_img0.png', bbox_inches='tight') 49 | plt.close() 50 | 51 | regionfile = sys.argv[2] 52 | import pyregion 53 | region = pyregion.parse(open(regionfile).read()) 54 | mask = region.get_mask(shape=(npixx, npixy)) 55 | 56 | maskx = mask.any(axis=0) 57 | masky = mask.any(axis=1) 58 | i = numpy.where(maskx)[0] 59 | ilo, ihi = i.min(), i.max() + 1 60 | j = numpy.where(masky)[0] 61 | jlo, jhi = j.min(), j.max() + 1 62 | print((mask.sum(), ilo, ihi, jlo, jhi, y.shape, npixx, npixy)) 63 | #ndata = mask.sum() 64 | 65 | #ymask = mask.reshape((1, npixx, npixy)) 66 | ymask = numpy.array([mask] * len(y)) 67 | y[~ymask] = numpy.nan 68 | if do_plotting: 69 | print('plotting selection ...') 70 | plt.figure(figsize=(20,20)) 71 | plt.imshow(y[0,ilo:ihi,jlo:jhi]) 72 | plt.colorbar() 73 | plt.savefig('musefuse_sel_img0.png', bbox_inches='tight') 74 | plt.close() 75 | 76 | print('applying subselection ...') 77 | y = y[ymask] 78 | noise_level = noise_level[ymask] 79 | print(' subselection gave %s ...' % (y.shape)) 80 | y = y.reshape((nspec, -1)) 81 | noise_level = noise_level.reshape((nspec, -1)) 82 | x = datasection.header['CD3_3'] * numpy.arange(nspec) + datasection.header['CRVAL3'] 83 | wavelength = x 84 | #good = numpy.logical_and(numpy.isfinite(noise_level).all(axis=0), numpy.isfinite(y).all(axis=0)) 85 | print(' finding NaNs...') 86 | good = numpy.isfinite(noise_level).all(axis=0) 87 | print(' found %d finite spaxels ...' % (good.sum())) 88 | #assert good.shape == (ymask.sum(),), good.shape 89 | goodids = numpy.where(good)[0] 90 | #numpy.random.shuffle(goodids) 91 | 92 | ndata = int(os.environ.get('MAXDATA', len(goodids))) 93 | print(' truncating data to %d sets...' % ndata, goodids[:ndata]) 94 | ## truncate data 95 | y = y[:,goodids[:ndata]] 96 | noise_level = noise_level[:,goodids[:ndata]] 97 | assert (noise_level>0).all(), noise_level 98 | 99 | assert y.shape == (nspec, ndata), (y.shape, nspec, ndata) 100 | assert noise_level.shape == (nspec, ndata) 101 | 102 | assert ndata > 0, 'No valid data!?' 
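# The cleaning step below works on the variance spectra: for each channel j
# a +/- 10 channel window is taken, and channels whose variance deviates
# from the running median by more than 5x the median absolute deviation
# would get their variance inflated by 1e10 (that branch is currently
# switched off with "if False"). In addition, a few fixed channel ranges
# (presumably sky or artefact regions) always get 1e10 added to their
# variance, which effectively removes them from the chi^2 in the
# likelihood functions further down.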
103 | 104 | #noise_level[noise_level > 2 * numpy.median(vd[:,i]] = 1000 105 | 106 | print(' cleaning data') 107 | noise_level2 = noise_level.copy() 108 | w = 10 109 | for j in range(nspec): 110 | lo = j - w 111 | hi = j + w 112 | if lo < 0: 113 | lo = 0 114 | if hi > nspec: 115 | hi = nspec 116 | seg = noise_level[lo:hi,:] 117 | med = numpy.median(seg, axis=0) 118 | diff = numpy.abs(med.reshape((1, -1)) - seg) 119 | meddiff = numpy.median(diff, axis=0) 120 | diff = numpy.abs(noise_level[j,:] - med) 121 | v = (diff > 5 * meddiff) * 1e10 122 | #k = j 123 | if False and v.any(): 124 | print(' updating noise level at', j) #, meddiff, diff 125 | for k in range(max(0, j-3), min(nspec-1, j+3)+1): 126 | noise_level2[k,:] += v 127 | 128 | noise_level2[1600:1670,:] += 1e10 129 | noise_level2[1730:1780,:] += 1e10 130 | noise_level2[1950:2000,:] += 1e10 131 | noise_level2[1750+500:2200+500,:] += 1e10 132 | noise_level2[2300+500:2500+500,:] += 1e10 133 | #noise_level2[noise_level2 > noise_level.max()] = noise_level.max() 134 | 135 | if do_plotting: 136 | for i in range(ndata): 137 | plt.figure() 138 | xi = numpy.arange(len(y[:,i])) 139 | plt.plot(xi, y[:,i], color='k', lw=1) 140 | sigma0 = noise_level[:,i]**0.5 141 | plt.fill_between(xi, y[:,i] - sigma0, y[:,i] + sigma0, alpha=0.3, color='red') 142 | sigma = noise_level2[:,i]**0.5 143 | plt.fill_between(xi, y[:,i] - sigma, y[:,i] + sigma, alpha=0.3, color='gray') 144 | idx = numpy.where(noise_level2[:,i] != noise_level[:,i])[0] 145 | lo, hi = y[:,i].min(), y[:,i].max() 146 | plt.plot(xi, lo+sigma0, color='b') 147 | plt.plot(xi, lo+0*sigma0, color='b') 148 | plt.vlines(idx, lo, hi, color='g', alpha=0.1, lw=0.1) 149 | plt.ylim(lo, hi) 150 | #plt.xlim(500, 3500) 151 | plt.savefig('musefuse_data%d.pdf' % (i+1), bbox_inches='tight') 152 | plt.close() 153 | 154 | noise_level = noise_level2 155 | 156 | """ 157 | 158 | Definition of the problem 159 | - parameter space (here: 3d) 160 | - likelihood function which consists of 161 | - model function ("slow predicting function") 162 | - data comparison 163 | 164 | """ 165 | 166 | paramnames = ['Z', 'logSFtau', 'SFage', 'z', 'EBV'] #, 'misfit'] 167 | nparams = len(paramnames) 168 | 169 | zlo = float(sys.argv[3]) 170 | zhi = float(sys.argv[4]) 171 | filenames = sys.argv[5:] 172 | grid = [] 173 | 174 | for iZ, filename in enumerate(filenames): 175 | print(filename) 176 | data = numpy.loadtxt(filename) 177 | model_wavelength = data[:,0] 178 | model_templates = data[:,1:].transpose() 179 | grid.append(model_templates) 180 | 181 | inversewavelength_grid = numpy.linspace(1/10000., 1/4000., 2000) 182 | # sigma is applied on that grid 183 | # to convert to km/s, we need the wavelength, e.g. 
at 4000 and the element size 184 | inversewavelength_gridwidth_A = 0.24 / 5 # A at 4000 (the end of this grid) 185 | 186 | Zs = numpy.log10([0.0001, 0.0004, 0.004, 0.008, 0.02, 0.05, 0.1]) 187 | sftaus = numpy.log10(numpy.array([1, 4, 10, 40, 100, 400, 1000, 4000]) * 1.e6) 188 | sfages = numpy.linspace(0, 13, 26) 189 | ages = numpy.array([0.000E+00, 1.000E+05, 1.412E+05, 1.585E+05, 1.778E+05, 1.995E+05, 2.239E+05, 2.512E+05, 2.818E+05, 3.162E+05, 3.548E+05, 3.981E+05, 4.467E+05, 5.012E+05, 5.623E+05, 6.310E+05, 7.080E+05, 7.943E+05, 8.913E+05, 1.000E+06, 1.047E+06, 1.096E+06, 1.148E+06, 1.202E+06, 1.259E+06, 1.318E+06, 1.380E+06, 1.445E+06, 1.514E+06, 1.585E+06, 1.660E+06, 1.738E+06, 1.820E+06, 1.906E+06, 1.995E+06, 2.089E+06, 2.188E+06, 2.291E+06, 2.399E+06, 2.512E+06, 2.630E+06, 2.754E+06, 2.884E+06, 3.020E+06, 3.162E+06, 3.311E+06, 3.467E+06, 3.631E+06, 3.802E+06, 3.981E+06, 4.169E+06, 4.365E+06, 4.571E+06, 4.786E+06, 5.012E+06, 5.248E+06, 5.495E+06, 5.754E+06, 6.026E+06, 6.310E+06, 6.607E+06, 6.918E+06, 7.244E+06, 7.586E+06, 7.943E+06, 8.318E+06, 8.710E+06, 9.120E+06, 9.550E+06, 1.000E+07, 1.047E+07, 1.096E+07, 1.148E+07, 1.202E+07, 1.259E+07, 1.318E+07, 1.380E+07, 1.445E+07, 1.514E+07, 1.585E+07, 1.660E+07, 1.738E+07, 1.820E+07, 1.906E+07, 1.995E+07, 2.089E+07, 2.188E+07, 2.291E+07, 2.399E+07, 2.512E+07, 2.630E+07, 2.754E+07, 2.900E+07, 3.000E+07, 3.100E+07, 3.200E+07, 3.300E+07, 3.400E+07, 3.500E+07, 3.600E+07, 3.700E+07, 3.800E+07, 3.900E+07, 4.000E+07, 4.250E+07, 4.500E+07, 4.750E+07, 5.000E+07, 5.250E+07, 5.500E+07, 5.709E+07, 6.405E+07, 7.187E+07, 8.064E+07, 9.048E+07, 1.015E+08, 1.139E+08, 1.278E+08, 1.434E+08, 1.609E+08, 1.805E+08, 2.026E+08, 2.273E+08, 2.550E+08, 2.861E+08, 3.210E+08, 3.602E+08, 4.042E+08, 4.535E+08, 5.088E+08, 5.709E+08, 6.405E+08, 7.187E+08, 8.064E+08, 9.048E+08, 1.015E+09, 1.139E+09, 1.278E+09, 1.434E+09, 1.609E+09, 1.680E+09, 1.700E+09, 1.800E+09, 1.900E+09, 2.000E+09, 2.100E+09, 2.200E+09, 2.300E+09, 2.400E+09, 2.500E+09, 2.600E+09, 2.750E+09, 3.000E+09, 3.250E+09, 3.500E+09, 3.750E+09, 4.000E+09, 4.250E+09, 4.500E+09, 4.750E+09, 5.000E+09, 5.250E+09, 5.500E+09, 5.750E+09, 6.000E+09, 6.250E+09, 6.500E+09, 6.750E+09, 7.000E+09, 7.250E+09, 7.500E+09, 7.750E+09, 8.000E+09, 8.250E+09, 8.500E+09, 8.750E+09, 9.000E+09, 9.250E+09, 9.500E+09, 9.750E+09, 1.000E+10, 1.025E+10, 1.050E+10, 1.075E+10, 1.100E+10, 1.125E+10, 1.150E+10, 1.175E+10, 1.200E+10, 1.225E+10, 1.250E+10, 1.275E+10, 1.300E+10, 1.325E+10, 1.350E+10, 1.375E+10, 1.400E+10, 1.425E+10, 1.450E+10, 1.475E+10, 1.500E+10, 1.525E+10, 1.550E+10, 1.575E+10, 1.600E+10, 1.625E+10, 1.650E+10, 1.675E+10, 1.700E+10, 1.725E+10, 1.750E+10, 1.775E+10, 1.800E+10, 1.825E+10, 1.850E+10, 1.875E+10, 1.900E+10, 1.925E+10, 1.950E+10, 1.975E+10, 2.000E+10])[::2] 190 | 191 | nZ = len(Zs) 192 | nSFage = len(sfages) 193 | nSFtau = len(sftaus) 194 | #nspec2 = models.shape 195 | #assert nspec2 == nspec 196 | #models /= 1e-10 + models[:,:,:,2000].reshape((nZ, nSFage, nSFtau, 1)) # normalise somewhere to one 197 | 198 | """ 199 | nspec = 3000 200 | #models = models[:,:,:,500:3500] 201 | y = y[500:3500,:] 202 | wavelength = wavelength[500:3500] 203 | noise_level = noise_level[500:3500,:] 204 | """ 205 | y = y.astype(numpy.float64).copy() 206 | noise_level = noise_level.astype(numpy.float64).copy() 207 | 208 | wavelength = wavelength / 10. 209 | model_wavelength = model_wavelength / 10. 
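# The block below tabulates the Calzetti (2000) starburst attenuation curve
# k(lambda) on the model wavelength grid (now in nm, after the /10 above):
#   k = 2.659*(-2.156 + 1.509e3/l - 0.198e6/l^2 + 0.011e9/l^3) + 4.05   for l < 630 nm
#   k = 2.659*(-1.857 + 1.040e3/l) + 4.05                               for l >= 630 nm
# i.e. the usual form with lambda in micron, rewritten for nanometres.
# model() further down uses the SSP grid loaded above: it weights the
# templates with a delayed-exponential star-formation history
# sfh(t) ~ t/tau^2 * exp(-t/tau), multiplies by the age-bin widths,
# normalises the summed template, applies this attenuation curve scaled by
# EBV, and finally redshifts and interpolates onto the observed wavelengths.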
210 | calzetti_result = numpy.zeros_like(model_wavelength) 211 | mask = (model_wavelength < 630) 212 | calzetti_result[mask] = 2.659 * (-2.156 + 1.509e3 / model_wavelength[mask] - 213 | 0.198e6 / model_wavelength[mask] ** 2 + 214 | 0.011e9 / model_wavelength[mask] ** 3) + 4.05 215 | 216 | # Attenuation between 630 nm and 2200 nm 217 | mask = (model_wavelength >= 630) 218 | calzetti_result[mask] = 2.659 * (-1.857 + 1.040e3 / model_wavelength[mask]) + 4.05 219 | 220 | import scipy.interpolate, scipy.ndimage 221 | 222 | def model(Z, SFtau, sfage, z, EBV): 223 | iZ = numpy.where(Zs <= Z)[-1][-1] 224 | #print(' selecting Z: %d' % iZ) 225 | model_templates = grid[iZ] 226 | #print(' template max value:', model_templates.max(), model_templates.shape) 227 | assert numpy.all(model_templates>=0), model_templates 228 | # convolve the template 229 | 230 | # SFage = 0-13 (Gyrs). 231 | #print(' selecting sfage: %.2f' % sfage) 232 | # ----123456789SFage________ --age--> 233 | tsinceSF = sfage * 1.e9 - ages 234 | tsinceSF[tsinceSF <= 0] = 0 235 | # star formation history is a (delayed) exponential decline. 236 | SFtau = float(SFtau) 237 | #print(' selecting SFtau: %.2f' % SFtau) 238 | sfh = tsinceSF / SFtau**2 * numpy.exp(-tsinceSF/SFtau) 239 | sfh /= sfh.max() 240 | assert numpy.all(sfh>=0), sfh 241 | #print(' ages: ', ages) 242 | #print(' tsinceSF: ', tsinceSF) 243 | #print(' sfh: ', sfh) 244 | # before sfage, no stars 245 | age_weight = ages[1:] - ages[:-1] 246 | assert numpy.all(age_weight>=0), age_weight 247 | 248 | # weight stellar templates with this SFH 249 | #print(model_templates.shape, sfh.shape, age_weight.shape) 250 | template = numpy.sum(model_templates[:-1] * \ 251 | sfh[:-1].reshape((-1,1)) * age_weight.reshape((-1,1)), axis=0) 252 | assert template.shape == (len(model_wavelength),), template.shape 253 | #print(' template max value after sfh convolution:', template.max()) 254 | # normalise template at the highest wavelength 255 | template /= 1e-10 + template[2050] 256 | 257 | # apply calzetti extinction law at restframe 258 | template = template * 10**(-2.5 * calzetti_result * EBV) 259 | #print(' template max value after extinction:', template.max()) 260 | 261 | #template = numpy.interp(x=inversewavelength_grid, xp=1./model_wavelength[::-1], fp=template[::-1]) 262 | # 263 | ## add Doppler blurring 264 | ## sigma_4000 is something like a readshift: 265 | ## f = f_0 * (1 + v/c) 266 | #sigma = 1 + v / 300000. 
267 | ## if sigma is 1A at 4000A, then on the 1/lam grid it should be this wide: 268 | #sigma_grid = sigma * 4000 / inversewavelength_gridwidth_A 269 | ## convolve: 270 | #template = scipy.ndimage.filters.gaussian_filter1d(template, sigma_grid) 271 | 272 | # convert back to lambda 273 | 274 | # redshift / Doppler shift 275 | # interpolate template onto data grid 276 | # we go to the model at the restframe wavelength, which is bluer 277 | # template = numpy.interp(x=wavelength / (1 + z), xp=inversewavelength_grid, fp=template) 278 | template = numpy.interp(x=wavelength / (1 + z), xp=model_wavelength, fp=template) 279 | #print(' template max value after redshifting:', template.max()) 280 | 281 | #template = model_interp([Z, sfage, SFtau])[0] 282 | assert template.shape == (nspec,), template.shape 283 | #assert numpy.all(numpy.isfinite(exttemplate)), exttemplate 284 | return template 285 | 286 | if True: 287 | #O = 20 288 | Z, SFtau, SFage, z, EBV = -2, 1.e8, 1, 0, 0 289 | for Z in [-4, -2, -1]: 290 | ypred = model(Z, SFtau, SFage, z, EBV) 291 | plt.plot(wavelength, ypred, label='Z=%s' % Z) 292 | plt.legend(loc='best') 293 | plt.savefig('musefuse_model_Z.pdf', bbox_inches='tight') 294 | plt.close() 295 | Z = -2 296 | for SFtau in [6., 6.1, 6.3, 6.5, 7., 8., 9.]: 297 | ypred = model(Z, 10**SFtau, SFage, z, EBV) 298 | plt.plot(wavelength, ypred, label='SFtau=${10}^{%s}$' % SFtau) 299 | plt.legend(loc='best') 300 | plt.savefig('musefuse_model_SFtau.pdf', bbox_inches='tight') 301 | plt.close() 302 | SFtau = 1e8 303 | for SFage in [0.001, 0.01, 0.1, 1, 6, 12]: 304 | ypred = model(Z, SFtau, SFage, z, EBV) 305 | plt.plot(wavelength, ypred, label='SFage=%s' % SFage) 306 | plt.legend(loc='best') 307 | plt.savefig('musefuse_model_SFage.pdf', bbox_inches='tight') 308 | plt.close() 309 | SFage = 1 310 | for z in [0, 0.1, 0.2, 0.3, 0.4, 0.5]: 311 | ypred = model(Z, SFtau, SFage, z, EBV) 312 | plt.plot(wavelength, ypred, label='z=%s' % z) 313 | plt.legend(loc='best') 314 | plt.savefig('musefuse_model_z.pdf', bbox_inches='tight') 315 | plt.close() 316 | z = 0. 317 | for EBV in [0, 0.5, 1]: 318 | ypred = model(Z, SFtau, SFage, z, EBV) 319 | plt.plot(wavelength, ypred, label='EBV=%s' % EBV) 320 | plt.legend(loc='best') 321 | plt.savefig('musefuse_model_EBV.pdf', bbox_inches='tight') 322 | plt.close() 323 | 324 | 325 | def priortransform(cube): 326 | # definition of the parameter width, by transforming from a unit cube 327 | cube = cube.copy() 328 | #cube[0] = 10**(cube[0] * 4 - 2) # plateau 329 | cube[0] = cube[0] * (Zs.max() - Zs.min()) + Zs.min() 330 | cube[1] = cube[1] * (sftaus.max() - sftaus.min()) + sftaus.min() 331 | cube[2] = cube[2] * (sfages.max() - sfages.min()) + sfages.min() 332 | #cube[4] = cube[4] * 3 + 1 # v (km/s) 333 | cube[3] = cube[3] * (zhi - zlo) + zlo # z 334 | cube[4] = cube[4] * 2 # E(B-V) 335 | #cube[8] = cube[8] * 4 - 1 # misfit 336 | return cube 337 | 338 | def priortransform_simple(cube): 339 | # definition of the parameter width, by transforming from a unit cube 340 | cube = cube.copy() 341 | #cube[0] = 10**(cube[0] * 4 - 2) # plateau 342 | cube[0] = cube[0] * (sftaus.max() - sftaus.min()) + sftaus.min() 343 | cube[1] = cube[1] * (sfages.max() - sfages.min()) + sfages.min() 344 | cube[2] = cube[2] * (zhi - zlo) + zlo # z 345 | cube[3] = cube[3] * 2 # E(B-V) 346 | return cube 347 | 348 | # the following is a python-only implementation of the likelihood 349 | # @ params are the parameters (as transformed by priortransform) 350 | # @ data_mask is which data sets to consider. 
351 | # returns a likelihood vector 352 | Lmax = -1e100 353 | Lmax = -1e100 * numpy.ones(ndata) 354 | def multi_loglikelihood(params, data_mask): 355 | global Lmax 356 | O, Z, logSFtau, SFage, z, EBV = params 357 | SFtau = 10**logSFtau 358 | # predict the model 359 | ypred = model(Z, SFtau, SFage, z, EBV) 360 | # do the data comparison 361 | #print ypred.shape, y.shape, data_mask 362 | ndata = data_mask.sum() 363 | if (ypred == 0).all(): 364 | # give low probability to solutions with no stars 365 | return numpy.ones(ndata) * -1e100 366 | ypred += O 367 | 368 | yd = y[:,data_mask] 369 | vd = noise_level[:,data_mask] #+ 10**logvar 370 | #vd[vd > 2 * numpy.median(vd)] = 1000 371 | 372 | # simple likelihood, would need a normalisation factor: 373 | # L = -0.5 * numpy.nansum((ypred.reshape((-1,1)) - yd)**2/vd, axis=0) 374 | L = numpy.zeros(ndata) 375 | 376 | for i in numpy.arange(ndata): 377 | # scaled likelihood, like LePhare 378 | # s = sum[OjMj/sigmaj^2] / sum[Mj^2/sigmaj^2] 379 | s = numpy.nansum(yd[:,i] * ypred / vd[:,i]) / (numpy.nansum(ypred**2 / vd[:,i]) + 1e-10) 380 | assert numpy.isfinite(s), (s, ypred, ypred**2, yd[:,i], vd[:,i]) 381 | # chi2 = sum[(Oi - s*Mi)^2 / sigmai^2] 382 | chi2 = numpy.nansum((yd[:,i] - s * ypred)**2 / vd[:,i]) # + numpy.log(2*numpy.pi*vd)) 383 | L[i] = -0.5 * chi2 + numpy.random.uniform() * 1e-5 384 | j = numpy.where(data_mask)[0][i] 385 | if L[i] > Lmax[j]: 386 | Lmax[j] = L[i] 387 | print('plotting...') 388 | plt.figure(figsize=(20,20)) 389 | plt.subplot(3, 1, 1) 390 | plt.title(str(params) + ' : chi2:' + str(chi2)) 391 | #mask = vd[:,i] < 2 * numpy.median(vd[:,i]) 392 | #mask = numpy.isfinite(vd[:,i]) 393 | mask = Ellipsis 394 | plt.plot(wavelength, yd[mask,i], color='k', alpha=0.5) 395 | plt.plot(wavelength, s * ypred[mask], color='r') 396 | plt.ylim(yd[mask,i].min(), yd[mask,i].max()) 397 | plt.subplot(3, 1, 2) 398 | plt.plot(wavelength, ypred[mask], color='k') 399 | plt.subplot(3, 1, 3) 400 | plt.plot(wavelength, vd[mask,i], color='k') 401 | plt.yscale('log') 402 | plt.savefig('musefuse_bestfit_%d.pdf' % (i+1), bbox_inches='tight') 403 | plt.close() 404 | time.sleep(0.1) 405 | #print chi2 406 | assert L.shape == (ndata,), (L.shape, ypred.shape, y.shape, data_mask) 407 | return L 408 | 409 | def multi_loglikelihood_vectorized(params, data_mask): 410 | global Lmax 411 | O, Z, logSFtau, SFage, z, EBV = params 412 | SFtau = 10**logSFtau 413 | # predict the model 414 | ypred = model(Z, SFtau, SFage, z, EBV) 415 | # do the data comparison 416 | ndata = data_mask.sum() 417 | if (ypred == 0).all(): 418 | # give low probability to solutions with no stars 419 | return numpy.ones(ndata) * -1e100 420 | ypred += O 421 | 422 | yd = y[:,data_mask] 423 | vd = noise_level[:,data_mask] 424 | assert numpy.isfinite(yd).all() 425 | assert numpy.isfinite(vd).all() 426 | assert numpy.isfinite(ypred).all() 427 | 428 | ypreds = ypred.reshape((-1,1)) 429 | s = numpy.sum(yd * ypreds / vd, axis=0) / (numpy.sum(ypreds**2 / vd, axis=0) + 1e-10) 430 | assert s.shape == (ndata,), s.shape 431 | assert numpy.isfinite(s).all() 432 | chi2 = numpy.sum((yd - s.reshape((1,-1)) * ypreds)**2 / vd, axis=0) 433 | L = -0.5 * chi2 + numpy.random.uniform() * 1e-5 434 | 435 | assert L.shape == (ndata,), (L.shape, ypred.shape, y.shape, data_mask.sum()) 436 | 437 | #for j, i in enumerate(numpy.where(L > Lmax[data_mask])[0]): 438 | for j, i in enumerate(numpy.where(data_mask)[0]): 439 | if not (L[j] > Lmax[i]): continue 440 | Lmax[i] = L[j] 441 | if i % (1 + ndata // 3) != 0: continue 442 | 
print('updating bestfit plot of %d ... chi2: %.2f' % (i+1, chi2[j])) 443 | #print ' ', yd.shape, yd[:,j].shape, ypred.shape 444 | plt.figure(figsize=(20,20)) 445 | plt.subplot(3, 1, 1) 446 | plt.title('%s : chi2: %.2f' % (params, chi2[j])) 447 | #mask = vd[:,i] < 2 * numpy.median(vd[:,i]) 448 | #mask = numpy.isfinite(vd[:,i]) 449 | mask = Ellipsis 450 | plt.plot(wavelength, yd[mask,j], color='k', alpha=0.5) 451 | plt.plot(wavelength, s[j] * ypred[mask], color='r') 452 | plt.ylim(yd[mask,j].min(), yd[mask,j].max()) 453 | plt.subplot(3, 1, 2) 454 | plt.plot(wavelength, ypred[mask], color='k') 455 | plt.subplot(3, 1, 3) 456 | plt.plot(wavelength, vd[mask,j], color='k') 457 | plt.yscale('log') 458 | plt.savefig('musefuse_bestfit_%d.pdf' % (i+1), bbox_inches='tight') 459 | plt.close() 460 | time.sleep(0.1) 461 | 462 | return L 463 | 464 | def multi_loglikelihood_vectorized_short(params, data_mask): 465 | O, Z, logSFtau, SFage, z, EBV = params 466 | SFtau = 10**logSFtau 467 | # predict the model 468 | ypred = model(Z, SFtau, SFage, z, EBV) 469 | # do the data comparison 470 | if (ypred == 0).all(): 471 | # give low probability to solutions with no stars 472 | return numpy.ones(data_mask.sum()) * -1e100 473 | ypred += O 474 | 475 | yd = y[:,data_mask] 476 | vd = noise_level[:,data_mask] 477 | ypreds = ypred.reshape((-1,1)) 478 | s = numpy.sum(yd * ypreds / vd, axis=0) / (numpy.sum(ypreds**2 / vd, axis=0) + 1e-10) 479 | chi2 = numpy.sum((yd - s.reshape((1,-1)) * ypreds)**2 / vd, axis=0) 480 | L = -0.5 * chi2 + numpy.random.uniform() * 1e-5 481 | return L 482 | 483 | import numexpr as ne 484 | def multi_loglikelihood_numexpr(params, data_mask): 485 | O, Z, logSFtau, SFage, z, EBV = params 486 | SFtau = 10**logSFtau 487 | # predict the model 488 | ypred = model(Z, SFtau, SFage, z, EBV) 489 | # do the data comparison 490 | if (ypred == 0).all(): 491 | # give low probability to solutions with no stars 492 | return numpy.ones(data_mask.sum()) * -1e100 493 | ypred += O 494 | 495 | yd = y[:,data_mask] 496 | vd = noise_level[:,data_mask] 497 | ypreds = ypred.reshape((-1,1)) 498 | s1 = ne.evaluate("sum(yd * ypreds / vd, axis=0)") 499 | s2 = ne.evaluate("sum(ypreds**2 / vd, axis=0)") 500 | s = ne.evaluate("s1 / (s2 + 1e-10)").reshape((1,-1)) 501 | return ne.evaluate("sum((yd - s * ypreds)**2 / (-2 * vd), axis=0)") 502 | 503 | from ctypes import * 504 | from numpy.ctypeslib import ndpointer 505 | if int(os.environ.get('OMP_NUM_THREADS', '1')) > 1: 506 | lib = cdll.LoadLibrary('./cmuselike-parallel.so') 507 | else: 508 | lib = cdll.LoadLibrary('./cmuselike.so') 509 | lib.like.argtypes = [ 510 | ndpointer(dtype=numpy.float64, ndim=2, flags='C_CONTIGUOUS'), 511 | ndpointer(dtype=numpy.float64, ndim=2, flags='C_CONTIGUOUS'), 512 | ndpointer(dtype=numpy.float64, ndim=1, flags='C_CONTIGUOUS'), 513 | ndpointer(dtype=numpy.bool, ndim=1, flags='C_CONTIGUOUS'), 514 | c_int, 515 | c_int, 516 | ndpointer(dtype=numpy.float64, ndim=1, flags='C_CONTIGUOUS'), 517 | ] 518 | 519 | Lout = numpy.zeros(ndata) 520 | def multi_loglikelihood_clike(params, data_mask): 521 | global Lout 522 | #O = 0 523 | Z, logSFtau, SFage, z, EBV = params 524 | SFtau = 10**logSFtau 525 | # predict the model 526 | ypred = model(Z, SFtau, SFage, z, EBV) 527 | # do the data comparison 528 | if not numpy.any(ypred): 529 | # give low probability to solutions with no stars 530 | return numpy.ones(data_mask.sum()) * -1e100 531 | #ypred += O 532 | 533 | # do everything in C and return the resulting likelihood vector 534 | ret = lib.like(y, noise_level, 
ypred, data_mask, ndata, nspec, Lout) 535 | return Lout[data_mask] + numpy.random.normal(0, 1e-5, size=data_mask.sum()) 536 | 537 | def multi_loglikelihood_simple_clike(params, data_mask): 538 | logSFtau, SFage, z, EBV = params 539 | #Z = 0.012 # solar 540 | Z = 0.004 # Patricio2018 541 | params = Z, logSFtau, SFage, z, EBV 542 | return multi_loglikelihood_clike(params, data_mask) 543 | 544 | if False: 545 | data_mask_all = numpy.ones(ndata) == 1 546 | print('testing vectorised code...') 547 | for i in range(100): 548 | cube = numpy.random.uniform(size=nparams) 549 | params = priortransform(cube) 550 | L = multi_loglikelihood(params, data_mask_all) 551 | L2 = multi_loglikelihood_vectorized(params, data_mask_all) 552 | assert numpy.allclose(L, L2), (L, L2, cube, params) 553 | L2 = multi_loglikelihood_vectorized_short(params, data_mask_all) 554 | assert numpy.allclose(L, L2), (L, L2, cube, params) 555 | L2 = multi_loglikelihood_numexpr(params, data_mask_all) 556 | assert numpy.allclose(L, L2), (L, L2, cube, params) 557 | L2 = multi_loglikelihood_clike(params, data_mask_all) 558 | assert numpy.allclose(L, L2), (L, L2, cube, params) 559 | test_cubes = [priortransform(numpy.random.uniform(size=nparams)) for i in range(1000)] 560 | a = time.time() 561 | [multi_loglikelihood(cube, data_mask_all) for cube in test_cubes] 562 | print('original python code:', time.time() - a) 563 | a = time.time() 564 | [multi_loglikelihood_vectorized(cube, data_mask_all) for cube in test_cubes] 565 | print('vectorised python code:', time.time() - a) 566 | a = time.time() 567 | [multi_loglikelihood_vectorized_short(cube, data_mask_all) for cube in test_cubes] 568 | print('shortened vectorised python code:', time.time() - a) 569 | a = time.time() 570 | [multi_loglikelihood_numexpr(cube, data_mask_all) for cube in test_cubes] 571 | print('numexpr code:', time.time() - a) 572 | a = time.time() 573 | [multi_loglikelihood_clike(cube, data_mask_all) for cube in test_cubes] 574 | print('C code:', time.time() - a) 575 | 576 | #multi_loglikelihood = multi_loglikelihood_vectorized_short 577 | #multi_loglikelihood = multi_loglikelihood_numexpr 578 | multi_loglikelihood = multi_loglikelihood_clike 579 | 580 | prefix = sys.argv[1] 581 | 582 | modelname = os.environ.get('MODEL', 'FULL') 583 | if modelname == 'ZSOL': 584 | paramnames = ['logSFtau', 'SFage', 'z', 'EBV'] 585 | nparams = len(paramnames) 586 | prefix = prefix + '_zsol_' 587 | print('Switching to Zsol model') 588 | multi_loglikelihood = multi_loglikelihood_simple_clike 589 | priortransform = priortransform_simple 590 | elif modelname == 'FULL': 591 | prefix = prefix + '_full_' 592 | pass 593 | else: 594 | assert False, modelname 595 | 596 | """ 597 | 598 | After defining the problem, we use generic code to set up 599 | - Nested Sampling (Multi)Integrator 600 | - Our special sampler 601 | - RadFriends (constrained region draw) 602 | 603 | We start with the latter. 
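For reference, the command line assembled above is: argv[1] the MUSE cube (also reused as the output prefix), argv[2] the region file, argv[3] and argv[4] the redshift range zlo/zhi, and argv[5:] one template grid file per metallicity. Behaviour can be tuned through the environment variables MAXDATA, MODEL (FULL or ZSOL), NLIVE_POINTS, SUPERSET_DRAWS, USE_GRAPH, MAXSAMPLES, MINSAMPLES and OMP_NUM_THREADS (the latter selects the parallel cmuselike library). A typical call, with purely illustrative file names, might look like

    MODEL=FULL NLIVE_POINTS=400 MAXDATA=100 python musefuse.py cube.fits sources.reg 0.0 0.7 templates_Z*.dat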
604 | """ 605 | 606 | 607 | from multi_nested_integrator import multi_nested_integrator 608 | from multi_nested_sampler import MultiNestedSampler 609 | from cachedconstrainer import CachedConstrainer, generate_individual_constrainer, generate_superset_constrainer 610 | 611 | superset_constrainer = generate_superset_constrainer() 612 | 613 | cc = CachedConstrainer() 614 | focusset_constrainer = cc.get 615 | _, _, individual_draw_constrained = generate_individual_constrainer() 616 | numpy.random.seed(1) 617 | start_time = time.time() 618 | print('setting up integrator ...') 619 | nlive_points = int(os.environ.get('NLIVE_POINTS','400')) 620 | 621 | # constrained region draw functions 622 | # we try hard to keep information about current regions and subselected regions 623 | # because recomputing the regions is expensive if the likelihood is very fast. 624 | # There are three constrainers: 625 | # - the one of the superset (all data sets) 626 | # - one for each data set if need a individual draw (focussed draw with only one) 627 | # - a memory for recent clusterings, because they might recur in the next iteration(s) 628 | # Note that this does caching not improve the algorithms efficiency 629 | # in fact, not recomputing regions keeps the regions larger, 630 | # leading potentially to slightly more rejections. 631 | # However, there is substantial execution speedup. 632 | 633 | 634 | # now set up sampler and pass the three constrainers 635 | 636 | sampler = MultiNestedSampler(nlive_points = nlive_points, 637 | priortransform=priortransform, multi_loglikelihood=multi_loglikelihood, 638 | ndim=nparams, ndata=ndata, 639 | superset_draw_constrained = superset_constrainer.draw_constrained, 640 | individual_draw_constrained = individual_draw_constrained, 641 | draw_constrained = focusset_constrainer, 642 | nsuperset_draws = int(os.environ.get('SUPERSET_DRAWS', '10')), 643 | use_graph = os.environ.get('USE_GRAPH', '1') == '1' 644 | ) 645 | 646 | superset_constrainer.sampler = sampler 647 | cc.sampler = sampler 648 | print('integrating ...') 649 | max_samples = int(os.environ.get('MAXSAMPLES', 100000)) 650 | min_samples = int(os.environ.get('MINSAMPLES', 0)) 651 | results = multi_nested_integrator(tolerance=0.5, multi_sampler=sampler, min_samples=min_samples, max_samples=max_samples) 652 | duration = time.time() - start_time 653 | print('writing output files ...') 654 | # store results 655 | with h5py.File(prefix + '.out_%d.hdf5' % ndata, 'w') as f: 656 | f.create_dataset('logZ', data=results['logZ'], compression='gzip', shuffle=True) 657 | f.create_dataset('logZerr', data=results['logZerr'], compression='gzip', shuffle=True) 658 | u, x, L, w, mask = list(zip(*results['weights'])) 659 | f.create_dataset('u', data=u, compression='gzip', shuffle=True) 660 | f.create_dataset('x', data=x, compression='gzip', shuffle=True) 661 | f.create_dataset('L', data=L, compression='gzip', shuffle=True) 662 | f.create_dataset('w', data=w, compression='gzip', shuffle=True) 663 | f.create_dataset('mask', data=mask, compression='gzip', shuffle=True) 664 | f.create_dataset('ndraws', data=sampler.ndraws) 665 | f.create_dataset('fiberids', data=goodids[:ndata], compression='gzip', shuffle=True) 666 | f.create_dataset('duration', data=duration) 667 | f.create_dataset('ndata', data=ndata) 668 | 669 | print('logZ = %.1f +- %.1f' % (results['logZ'][0], results['logZerr'][0])) 670 | print('ndraws:', sampler.ndraws, 'niter:', len(w)) 671 | 672 | print('writing statistic ...') 673 | json.dump(dict(ndraws=sampler.ndraws, 
duration=duration, ndata=ndata, niter=len(w)), 674 | open(prefix + '.out_%d.stats.json' % ndata, 'w'), indent=4) 675 | print('done.') 676 | 677 | 678 | --------------------------------------------------------------------------------
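A minimal sketch of how the HDF5 output written by musefuse.py can be read back; the file name is a placeholder for whatever name the prefix + '.out_%d.hdf5' pattern above produced, and only dataset names created above are used.

import h5py

with h5py.File('cube.fits_full_.out_100.hdf5', 'r') as f:      # placeholder name
    logZ, logZerr = f['logZ'][()], f['logZerr'][()]             # per-dataset evidence estimates
    x, L, w = f['x'][()], f['L'][()], f['w'][()]                # dead points, likelihoods, weights
    fiberids = f['fiberids'][()]                                # spaxel ids the columns refer to
    print('ndata:', f['ndata'][()], 'ndraws:', f['ndraws'][()], 'duration [s]:', f['duration'][()])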